mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-08 18:12:55 +00:00
move common to common crate, create fastfield_codecs crate
move common to common crate create fastfield_codecs crate add bitpacker to fast field codecs add linear interpolation to fast field codecs add tests
This commit is contained in:
@@ -35,6 +35,8 @@ crossbeam = "0.8"
|
||||
futures = { version = "0.3.15", features = ["thread-pool"] }
|
||||
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
common = { version="0.1", path="./common" }
|
||||
fastfield_codecs = { version="0.1", path="./fastfield_codecs" }
|
||||
stable_deref_trait = "1.2"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = "1.2"
|
||||
@@ -88,7 +90,7 @@ unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar", "bitpacker"]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
10
common/Cargo.toml
Normal file
10
common/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[package]
|
||||
name = "common"
|
||||
version = "0.1.0"
|
||||
authors = ["Pascal Seitz <pascal.seitz@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
7
common/src/lib.rs
Normal file
7
common/src/lib.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
mod serialize;
|
||||
mod vint;
|
||||
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::common::Endianness;
|
||||
use crate::common::VInt;
|
||||
use crate::Endianness;
|
||||
use crate::VInt;
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
14
fastfield_codecs/Cargo.toml
Normal file
14
fastfield_codecs/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "fastfield_codecs"
|
||||
version = "0.1.0"
|
||||
authors = ["Pascal Seitz <pascal.seitz@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { path = "../common/" }
|
||||
tantivy-bitpacker = { path = "../bitpacker/" }
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8.3"
|
||||
5
fastfield_codecs/src/bitpacked/mod.rs
Normal file
5
fastfield_codecs/src/bitpacked/mod.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
mod reader;
|
||||
mod serialize;
|
||||
|
||||
pub use reader::BitpackedFastFieldReader;
|
||||
pub use serialize::BitpackedFastFieldSerializer;
|
||||
33
fastfield_codecs/src/bitpacked/reader.rs
Normal file
33
fastfield_codecs/src/bitpacked/reader.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use common::BinarySerializable;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader<'data> {
|
||||
bytes: &'data [u8],
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl<'data> BitpackedFastFieldReader<'data> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result<Self> {
|
||||
let min_value = u64::deserialize(&mut bytes)?;
|
||||
let amplitude = u64::deserialize(&mut bytes)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
bytes,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
pub fn get_u64(&self, doc: u64) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes)
|
||||
}
|
||||
}
|
||||
93
fastfield_codecs/src/bitpacked/serialize.rs
Normal file
93
fastfield_codecs/src/bitpacked/serialize.rs
Normal file
@@ -0,0 +1,93 @@
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializer {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn create(
|
||||
write: &'a mut W,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::codec_id().0;
|
||||
(ratio, name)
|
||||
}
|
||||
fn codec_id() -> (&'static str, u8) {
|
||||
("Bitpacked", 1)
|
||||
}
|
||||
}
|
||||
47
fastfield_codecs/src/lib.rs
Normal file
47
fastfield_codecs/src/lib.rs
Normal file
@@ -0,0 +1,47 @@
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
let max_value = data.iter().cloned().max().unwrap_or(0);
|
||||
FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: data.len() as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
|
||||
pub trait FastFieldDataAccess: Clone {
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
fn get(&self, doc: u32) -> u64;
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldSerializerEstimate {
|
||||
/// returns an estimate of the compression ratio.
|
||||
fn estimate(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str);
|
||||
/// the unique (name, id) of the compressor. Used to distinguish when de/serializing.
|
||||
fn codec_id() -> (&'static str, u8);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FastFieldStats {
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
pub num_vals: u64,
|
||||
}
|
||||
123
fastfield_codecs/src/linearinterpol/mod.rs
Normal file
123
fastfield_codecs/src/linearinterpol/mod.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
mod serialize;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
use crate::FastFieldDataAccess;
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearinterpolFastFieldReader<'data> {
|
||||
bytes: &'data [u8],
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub first_value: u64,
|
||||
pub rel_max_value: u64,
|
||||
pub offset: u64,
|
||||
pub slope: f64,
|
||||
}
|
||||
|
||||
impl<'data> LinearinterpolFastFieldReader<'data> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result<Self> {
|
||||
let rel_max_value = u64::deserialize(&mut bytes)?;
|
||||
let offset = u64::deserialize(&mut bytes)?;
|
||||
let first_value = u64::deserialize(&mut bytes)?;
|
||||
let last_value = u64::deserialize(&mut bytes)?;
|
||||
let num_vals = u64::deserialize(&mut bytes)?;
|
||||
let slope = (last_value as f64 - first_value as f64) / (num_vals as u64 - 1) as f64;
|
||||
|
||||
let num_bits = compute_num_bits(rel_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearinterpolFastFieldReader {
|
||||
bytes,
|
||||
first_value,
|
||||
rel_max_value,
|
||||
offset,
|
||||
bit_unpacker,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
pub fn get_u64(&self, doc: u64) -> u64 {
|
||||
let calculated_value = self.first_value + (doc as f64 * self.slope) as u64;
|
||||
calculated_value + self.bit_unpacker.get(doc, &self.bytes) - self.offset
|
||||
|
||||
//self.offset + self.min_value + self.bit_unpacker.get(doc, &self.bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn create_and_validate(data: &[u64]) -> (u64, u64) {
|
||||
let mut out = vec![];
|
||||
serialize::LinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, val) in data.iter().enumerate() {
|
||||
assert_eq!(reader.get_u64(doc as u64), *val);
|
||||
}
|
||||
(reader.rel_max_value, reader.offset)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
|
||||
let (rel_max_value, offset) = create_and_validate(&data);
|
||||
|
||||
assert_eq!(offset, 0);
|
||||
assert_eq!(rel_max_value, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_with_offset() {
|
||||
//let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
|
||||
let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_no_structure() {
|
||||
let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for i in 0..50000 {
|
||||
let mut data = (0..1 + rand::random::<u8>() as usize)
|
||||
.map(|num| rand::random::<i64>() as u64 / 2 as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
}
|
||||
}
|
||||
87
fastfield_codecs/src/linearinterpol/serialize.rs
Normal file
87
fastfield_codecs/src/linearinterpol/serialize.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
// TODO not suitable if max is larger than i64::MAX / 2
|
||||
|
||||
impl LinearInterpolFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
pub fn create(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
data_iter2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
//let first_val = stats.min_value;
|
||||
//let last_val = stats.max_value;
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
let slope = (last_val as f64 - first_val as f64) / (stats.num_vals as u64 - 1) as f64;
|
||||
// offset to ensure all values are positive
|
||||
let offset = data_iter1
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
val as i64 - calculated_value as i64
|
||||
})
|
||||
.min()
|
||||
.unwrap()
|
||||
.abs() as u64;
|
||||
|
||||
//calc new max
|
||||
let rel_max = data_iter2
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
(val + offset) - calculated_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
let amplitude = rel_max;
|
||||
amplitude.serialize(write)?;
|
||||
offset.serialize(write)?;
|
||||
first_val.serialize(write)?;
|
||||
last_val.serialize(write)?;
|
||||
stats.num_vals.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::codec_id().0;
|
||||
(ratio, name)
|
||||
}
|
||||
fn codec_id() -> (&'static str, u8) {
|
||||
("LinearInterpol", 2)
|
||||
}
|
||||
}
|
||||
@@ -1,18 +1,19 @@
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
mod counting_writer;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
//mod serialize;
|
||||
//mod vint;
|
||||
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use self::vint::{
|
||||
//pub use self::serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::{
|
||||
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
|
||||
};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
@@ -103,8 +104,9 @@ pub fn u64_to_f64(val: u64) -> f64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
//pub use super::serialize::test::fixed_size_test;
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use proptest::prelude::*;
|
||||
use std::f64;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
@@ -118,6 +120,12 @@ pub(crate) mod test {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||
|
||||
@@ -56,6 +56,12 @@ impl OwnedBytes {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Returns the underlying slice of data.
|
||||
/// `Deref` and `AsRef` are also available.
|
||||
#[inline]
|
||||
pub fn into_slice(self) -> &'static [u8] {
|
||||
self.data
|
||||
}
|
||||
/// Returns the len of the slice.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
|
||||
@@ -215,6 +215,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::common::HasLen;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::BitpackedFastFieldReader;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
@@ -223,7 +224,6 @@ mod tests {
|
||||
use crate::schema::FAST;
|
||||
use crate::schema::{Document, IntOptions};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
use common::HasLen;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
@@ -67,9 +68,9 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let bytes = file.read_bytes()?;
|
||||
let (mut id_bytes, data_bytes) = bytes.split(1);
|
||||
let id = u8::deserialize(&mut id_bytes)?;
|
||||
let _id = u8::deserialize(&mut id_bytes)?;
|
||||
|
||||
Ok(DynamicFastFieldReader::Bitpacked(
|
||||
BitpackedFastFieldReader::open_from_bytes(data_bytes)?,
|
||||
@@ -106,10 +107,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader<Item: FastValue> {
|
||||
bytes: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
reader: BitpackedReader<'static>,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
@@ -121,22 +119,15 @@ impl<Item: FastValue> BitpackedFastFieldReader<Item> {
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(mut bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let min_value = u64::deserialize(&mut bytes)?;
|
||||
let amplitude = u64::deserialize(&mut bytes)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let reader = BitpackedReader::open_from_bytes(bytes.into_slice())?;
|
||||
Ok(BitpackedFastFieldReader {
|
||||
bytes,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
reader,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
|
||||
Item::from_u64(self.reader.get_u64(doc))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
@@ -194,7 +185,7 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.min_value_u64)
|
||||
Item::from_u64(self.reader.min_value_u64)
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
@@ -203,7 +194,7 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.max_value_u64)
|
||||
Item::from_u64(self.reader.max_value_u64)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,22 +7,13 @@ use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
pub use bitpacked::BitpackedFastFieldSerializer;
|
||||
//pub use bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::FastFieldDataAccess;
|
||||
pub use fastfield_codecs::FastFieldSerializerEstimate;
|
||||
pub use fastfield_codecs::FastFieldStats;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// FastFieldReader is the trait to access fast field data.
|
||||
pub trait FastFieldDataAccess: Clone {
|
||||
//type IteratorType: Iterator<Item = u64>;
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
fn get(&self, doc: DocId) -> u64;
|
||||
}
|
||||
|
||||
/// `CompositeFastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
@@ -129,13 +120,6 @@ impl CompositeFastFieldSerializer {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FastFieldStats {
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
pub num_vals: u64,
|
||||
}
|
||||
|
||||
/// The FastFieldSerializer trait is the common interface
|
||||
/// implemented by every fastfield serializer variant.
|
||||
///
|
||||
@@ -148,18 +132,6 @@ pub trait FastFieldSerializer {
|
||||
fn close_field(self) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldSerializerEstimate {
|
||||
/// returns an estimate of the compression ratio.
|
||||
fn estimate(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str);
|
||||
/// the unique (name, id) of the compressor. Used to distinguish when de/serializing.
|
||||
fn codec_id() -> (&'static str, u8);
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user