mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
Blop
This commit is contained in:
@@ -17,10 +17,12 @@ use std::sync::Arc;
|
||||
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use compact_space::CompactSpaceDecompressor;
|
||||
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
use monotonic_mapping::{
|
||||
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
|
||||
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
|
||||
};
|
||||
pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
use serialize::{Header, U128Header};
|
||||
|
||||
mod bitpacked;
|
||||
@@ -37,8 +39,6 @@ mod gcd;
|
||||
pub mod serialize;
|
||||
|
||||
pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn};
|
||||
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
#[cfg(test)]
|
||||
pub use self::serialize::tests::serialize_and_load;
|
||||
pub use self::serialize::{serialize_column_values, NormalizedHeader};
|
||||
|
||||
@@ -20,7 +20,7 @@ mod value;
|
||||
|
||||
pub use column::{BytesColumn, Column, StrColumn};
|
||||
pub use column_index::ColumnIndex;
|
||||
pub use column_values::ColumnValues;
|
||||
pub use column_values::{ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
pub use columnar::{
|
||||
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
|
||||
MergeDocOrder,
|
||||
|
||||
@@ -3,254 +3,3 @@ use std::marker::PhantomData;
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
|
||||
pub use columnar::ColumnValues as Column;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::monotonic_mapping::StrictlyMonotonicFn;
|
||||
|
||||
/// VecColumn provides `Column` over a slice.
|
||||
pub struct VecColumn<'a, T = u64> {
|
||||
values: &'a [T],
|
||||
min_value: T,
|
||||
max_value: T,
|
||||
}
|
||||
|
||||
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
|
||||
fn get_val(&self, position: u32) -> T {
|
||||
self.values[position as usize]
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
|
||||
Box::new(self.values.iter().copied())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> T {
|
||||
self.min_value
|
||||
}
|
||||
|
||||
fn max_value(&self) -> T {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
self.values.len() as u32
|
||||
}
|
||||
|
||||
fn get_range(&self, start: u64, output: &mut [T]) {
|
||||
output.copy_from_slice(&self.values[start as usize..][..output.len()])
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
|
||||
where V: AsRef<[T]> + ?Sized
|
||||
{
|
||||
fn from(values: &'a V) -> Self {
|
||||
let values = values.as_ref();
|
||||
let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
|
||||
Self {
|
||||
values,
|
||||
min_value,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct MonotonicMappingColumn<C, T, Input> {
|
||||
from_column: C,
|
||||
monotonic_mapping: T,
|
||||
_phantom: PhantomData<Input>,
|
||||
}
|
||||
|
||||
/// Creates a view of a column transformed by a strictly monotonic mapping. See
|
||||
/// [`StrictlyMonotonicFn`].
|
||||
///
|
||||
/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
|
||||
/// monotonic_mapping.mapping() is expected to be injective, and we should always have
|
||||
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
|
||||
///
|
||||
/// The inverse of the mapping is required for:
|
||||
/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> `
|
||||
/// The user provides the original value range and we need to monotonic map them in the same way the
|
||||
/// serialization does before calling the underlying column.
|
||||
///
|
||||
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
|
||||
/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
|
||||
/// monotonic_mapping during serialization.
|
||||
pub fn monotonic_map_column<C, T, Input, Output>(
|
||||
from_column: C,
|
||||
monotonic_mapping: T,
|
||||
) -> impl Column<Output>
|
||||
where
|
||||
C: Column<Input>,
|
||||
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
|
||||
Input: PartialOrd + Send + Sync + Copy + Debug,
|
||||
Output: PartialOrd + Send + Sync + Copy + Debug,
|
||||
{
|
||||
MonotonicMappingColumn {
|
||||
from_column,
|
||||
monotonic_mapping,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
|
||||
where
|
||||
C: Column<Input>,
|
||||
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
|
||||
Input: PartialOrd + Send + Sync + Copy + Debug,
|
||||
Output: PartialOrd + Send + Sync + Copy + Debug,
|
||||
{
|
||||
#[inline]
|
||||
fn get_val(&self, idx: u32) -> Output {
|
||||
let from_val = self.from_column.get_val(idx);
|
||||
self.monotonic_mapping.mapping(from_val)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> Output {
|
||||
let from_min_value = self.from_column.min_value();
|
||||
self.monotonic_mapping.mapping(from_min_value)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> Output {
|
||||
let from_max_value = self.from_column.max_value();
|
||||
self.monotonic_mapping.mapping(from_max_value)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
self.from_column.num_vals()
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
|
||||
Box::new(
|
||||
self.from_column
|
||||
.iter()
|
||||
.map(|el| self.monotonic_mapping.mapping(el)),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_docids_for_value_range(
|
||||
&self,
|
||||
range: RangeInclusive<Output>,
|
||||
doc_id_range: Range<u32>,
|
||||
positions: &mut Vec<u32>,
|
||||
) {
|
||||
if range.start() > &self.max_value() || range.end() < &self.min_value() {
|
||||
return;
|
||||
}
|
||||
let range = self.monotonic_mapping.inverse_coerce(range);
|
||||
if range.start() > range.end() {
|
||||
return;
|
||||
}
|
||||
self.from_column
|
||||
.get_docids_for_value_range(range, doc_id_range, positions)
|
||||
}
|
||||
|
||||
// We voluntarily do not implement get_range as it yields a regression,
|
||||
// and we do not have any specialized implementation anyway.
|
||||
}
|
||||
|
||||
/// Wraps an iterator into a `Column`.
|
||||
pub struct IterColumn<T>(T);
|
||||
|
||||
impl<T> From<T> for IterColumn<T>
|
||||
where T: Iterator + Clone + ExactSizeIterator
|
||||
{
|
||||
fn from(iter: T) -> Self {
|
||||
IterColumn(iter)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Column<T::Item> for IterColumn<T>
|
||||
where
|
||||
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
|
||||
T::Item: PartialOrd + fmt::Debug,
|
||||
{
|
||||
fn get_val(&self, idx: u32) -> T::Item {
|
||||
self.0.clone().nth(idx as usize).unwrap()
|
||||
}
|
||||
|
||||
fn min_value(&self) -> T::Item {
|
||||
self.0.clone().next().unwrap()
|
||||
}
|
||||
|
||||
fn max_value(&self) -> T::Item {
|
||||
self.0.clone().last().unwrap()
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u32 {
|
||||
self.0.len() as u32
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {
|
||||
Box::new(self.0.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::monotonic_mapping::{
|
||||
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
|
||||
StrictlyMonotonicMappingToInternalGCDBaseval,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_monotonic_mapping() {
|
||||
let vals = &[3u64, 5u64][..];
|
||||
let col = VecColumn::from(vals);
|
||||
let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
|
||||
assert_eq!(mapped.min_value(), 1u64);
|
||||
assert_eq!(mapped.max_value(), 3u64);
|
||||
assert_eq!(mapped.num_vals(), 2);
|
||||
assert_eq!(mapped.num_vals(), 2);
|
||||
assert_eq!(mapped.get_val(0), 1);
|
||||
assert_eq!(mapped.get_val(1), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_as_col() {
|
||||
let col = IterColumn::from(10..100);
|
||||
assert_eq!(col.num_vals(), 90);
|
||||
assert_eq!(col.max_value(), 99);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_monotonic_mapping_iter() {
|
||||
let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
|
||||
let col = VecColumn::from(&vals);
|
||||
let mapped = monotonic_map_column(
|
||||
col,
|
||||
StrictlyMonotonicMappingInverter::from(
|
||||
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
|
||||
),
|
||||
);
|
||||
let val_i64s: Vec<u64> = mapped.iter().collect();
|
||||
for i in 0..100 {
|
||||
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_monotonic_mapping_get_range() {
|
||||
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
|
||||
let col = VecColumn::from(&vals);
|
||||
let mapped = monotonic_map_column(
|
||||
col,
|
||||
StrictlyMonotonicMappingInverter::from(
|
||||
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
|
||||
),
|
||||
);
|
||||
|
||||
assert_eq!(mapped.min_value(), 0u64);
|
||||
assert_eq!(mapped.max_value(), 9900u64);
|
||||
assert_eq!(mapped.num_vals(), 100);
|
||||
let val_u64s: Vec<u64> = mapped.iter().collect();
|
||||
assert_eq!(val_u64s.len(), 100);
|
||||
for i in 0..100 {
|
||||
assert_eq!(val_u64s[i as usize], mapped.get_val(i));
|
||||
assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
|
||||
}
|
||||
let mut buf = [0u64; 20];
|
||||
mapped.get_range(7, &mut buf[..]);
|
||||
assert_eq!(&val_u64s[7..][..20], &buf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,208 +14,206 @@ extern crate more_asserts;
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
extern crate test;
|
||||
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
// use common::{BinarySerializable, OwnedBytes};
|
||||
// use compact_space::CompactSpaceDecompressor;
|
||||
// use format_version::read_format_version;
|
||||
// use monotonic_mapping::{
|
||||
// StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
|
||||
// StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
|
||||
// };
|
||||
// use null_index_footer::read_null_index_footer;
|
||||
// use serialize::{Header, U128Header};
|
||||
|
||||
use common::{BinarySerializable, OwnedBytes};
|
||||
use compact_space::CompactSpaceDecompressor;
|
||||
use format_version::read_format_version;
|
||||
use monotonic_mapping::{
|
||||
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
|
||||
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
|
||||
};
|
||||
use null_index_footer::read_null_index_footer;
|
||||
use serialize::{Header, U128Header};
|
||||
|
||||
mod bitpacked;
|
||||
mod blockwise_linear;
|
||||
mod compact_space;
|
||||
mod format_version;
|
||||
mod line;
|
||||
mod linear;
|
||||
mod monotonic_mapping;
|
||||
mod monotonic_mapping_u128;
|
||||
#[allow(dead_code)]
|
||||
mod null_index;
|
||||
mod null_index_footer;
|
||||
// mod bitpacked;
|
||||
// mod blockwise_linear;
|
||||
// mod compact_space;
|
||||
// mod format_version;
|
||||
// mod line;
|
||||
// mod linear;
|
||||
// mod monotonic_mapping;
|
||||
// mod monotonic_mapping_u128;
|
||||
// #[allow(dead_code)]
|
||||
// mod null_index;
|
||||
// mod null_index_footer;
|
||||
|
||||
mod column;
|
||||
mod gcd;
|
||||
pub mod serialize;
|
||||
pub use column::Column;
|
||||
|
||||
use self::bitpacked::BitpackedCodec;
|
||||
use self::blockwise_linear::BlockwiseLinearCodec;
|
||||
pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
|
||||
use self::linear::LinearCodec;
|
||||
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
pub use self::serialize::{
|
||||
estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
|
||||
};
|
||||
// mod gcd;
|
||||
// pub mod serialize;
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
/// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data.
|
||||
pub enum FastFieldCodecType {
|
||||
/// Bitpack all values in the value range. The number of bits is defined by the amplitude
|
||||
/// `column.max_value() - column.min_value()`
|
||||
Bitpacked = 1,
|
||||
/// Linear interpolation puts a line between the first and last value and then bitpacks the
|
||||
/// values by the offset from the line. The number of bits is defined by the max deviation from
|
||||
/// the line.
|
||||
Linear = 2,
|
||||
/// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements.
|
||||
BlockwiseLinear = 3,
|
||||
}
|
||||
// use self::bitpacked::BitpackedCodec;
|
||||
// use self::blockwise_linear::BlockwiseLinearCodec;
|
||||
// pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
|
||||
// use self::linear::LinearCodec;
|
||||
// pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
|
||||
// pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
|
||||
// pub use self::serialize::{
|
||||
// estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
|
||||
// };
|
||||
|
||||
impl BinarySerializable for FastFieldCodecType {
|
||||
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
self.to_code().serialize(wrt)
|
||||
}
|
||||
// #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
// #[repr(u8)]
|
||||
// /// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted
|
||||
// data. pub enum FastFieldCodecType {
|
||||
// /// Bitpack all values in the value range. The number of bits is defined by the amplitude
|
||||
// /// `column.max_value() - column.min_value()`
|
||||
// Bitpacked = 1,
|
||||
// /// Linear interpolation puts a line between the first and last value and then bitpacks the
|
||||
// /// values by the offset from the line. The number of bits is defined by the max deviation
|
||||
// from /// the line.
|
||||
// Linear = 2,
|
||||
// /// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements.
|
||||
// BlockwiseLinear = 3,
|
||||
// }
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let code = u8::deserialize(reader)?;
|
||||
let codec_type: Self = Self::from_code(code)
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
|
||||
Ok(codec_type)
|
||||
}
|
||||
}
|
||||
// impl BinarySerializable for FastFieldCodecType {
|
||||
// fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
// self.to_code().serialize(wrt)
|
||||
// }
|
||||
|
||||
impl FastFieldCodecType {
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
// fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
// let code = u8::deserialize(reader)?;
|
||||
// let codec_type: Self = Self::from_code(code)
|
||||
// .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code
|
||||
// `{code}.`"))?; Ok(codec_type)
|
||||
// }
|
||||
// }
|
||||
|
||||
pub(crate) fn from_code(code: u8) -> Option<Self> {
|
||||
match code {
|
||||
1 => Some(Self::Bitpacked),
|
||||
2 => Some(Self::Linear),
|
||||
3 => Some(Self::BlockwiseLinear),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
// impl FastFieldCodecType {
|
||||
// pub(crate) fn to_code(self) -> u8 {
|
||||
// self as u8
|
||||
// }
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
|
||||
pub enum U128FastFieldCodecType {
|
||||
/// This codec takes a large number space (u128) and reduces it to a compact number space, by
|
||||
/// removing the holes.
|
||||
CompactSpace = 1,
|
||||
}
|
||||
// pub(crate) fn from_code(code: u8) -> Option<Self> {
|
||||
// match code {
|
||||
// 1 => Some(Self::Bitpacked),
|
||||
// 2 => Some(Self::Linear),
|
||||
// 3 => Some(Self::BlockwiseLinear),
|
||||
// _ => None,
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
impl BinarySerializable for U128FastFieldCodecType {
|
||||
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
self.to_code().serialize(wrt)
|
||||
}
|
||||
// #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
// #[repr(u8)]
|
||||
// /// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted
|
||||
// data. pub enum U128FastFieldCodecType {
|
||||
// /// This codec takes a large number space (u128) and reduces it to a compact number space, by
|
||||
// /// removing the holes.
|
||||
// CompactSpace = 1,
|
||||
// }
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let code = u8::deserialize(reader)?;
|
||||
let codec_type: Self = Self::from_code(code)
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
|
||||
Ok(codec_type)
|
||||
}
|
||||
}
|
||||
// impl BinarySerializable for U128FastFieldCodecType {
|
||||
// fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
// self.to_code().serialize(wrt)
|
||||
// }
|
||||
|
||||
impl U128FastFieldCodecType {
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
// fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
// let code = u8::deserialize(reader)?;
|
||||
// let codec_type: Self = Self::from_code(code)
|
||||
// .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code
|
||||
// `{code}.`"))?; Ok(codec_type)
|
||||
// }
|
||||
// }
|
||||
|
||||
pub(crate) fn from_code(code: u8) -> Option<Self> {
|
||||
match code {
|
||||
1 => Some(Self::CompactSpace),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
// impl U128FastFieldCodecType {
|
||||
// pub(crate) fn to_code(self) -> u8 {
|
||||
// self as u8
|
||||
// }
|
||||
|
||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
|
||||
bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
let header = U128Header::deserialize(&mut bytes)?;
|
||||
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
|
||||
let reader = CompactSpaceDecompressor::open(bytes)?;
|
||||
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
|
||||
StrictlyMonotonicMappingToInternal::<Item>::new().into();
|
||||
Ok(Arc::new(monotonic_map_column(reader, inverted)))
|
||||
}
|
||||
// pub(crate) fn from_code(code: u8) -> Option<Self> {
|
||||
// match code {
|
||||
// 1 => Some(Self::CompactSpace),
|
||||
// _ => None,
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
/// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
|
||||
bytes: OwnedBytes,
|
||||
) -> io::Result<Arc<dyn Column<T>>> {
|
||||
let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
let header = Header::deserialize(&mut bytes)?;
|
||||
match header.codec_type {
|
||||
FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
|
||||
FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes, &header),
|
||||
FastFieldCodecType::BlockwiseLinear => {
|
||||
open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
|
||||
}
|
||||
}
|
||||
}
|
||||
// /// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
// pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
|
||||
// bytes: OwnedBytes,
|
||||
// ) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
// let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
// let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
// let header = U128Header::deserialize(&mut bytes)?;
|
||||
// assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
|
||||
// let reader = CompactSpaceDecompressor::open(bytes)?;
|
||||
// let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
|
||||
// StrictlyMonotonicMappingToInternal::<Item>::new().into();
|
||||
// Ok(Arc::new(monotonic_map_column(reader, inverted)))
|
||||
// }
|
||||
|
||||
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
|
||||
bytes: OwnedBytes,
|
||||
header: &Header,
|
||||
) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
let normalized_header = header.normalized();
|
||||
let reader = C::open_from_bytes(bytes, normalized_header)?;
|
||||
let min_value = header.min_value;
|
||||
if let Some(gcd) = header.gcd {
|
||||
let mapping = StrictlyMonotonicMappingInverter::from(
|
||||
StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
|
||||
);
|
||||
Ok(Arc::new(monotonic_map_column(reader, mapping)))
|
||||
} else {
|
||||
let mapping = StrictlyMonotonicMappingInverter::from(
|
||||
StrictlyMonotonicMappingToInternalBaseval::new(min_value),
|
||||
);
|
||||
Ok(Arc::new(monotonic_map_column(reader, mapping)))
|
||||
}
|
||||
}
|
||||
// /// Returns the correct codec reader wrapped in the `Arc` for the data.
|
||||
// pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
|
||||
// bytes: OwnedBytes,
|
||||
// ) -> io::Result<Arc<dyn Column<T>>> {
|
||||
// let (bytes, _format_version) = read_format_version(bytes)?;
|
||||
// let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
|
||||
// let header = Header::deserialize(&mut bytes)?;
|
||||
// match header.codec_type {
|
||||
// FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes,
|
||||
// &header), FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes,
|
||||
// &header), FastFieldCodecType::BlockwiseLinear => {
|
||||
// open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
trait FastFieldCodec: 'static {
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const CODEC_TYPE: FastFieldCodecType;
|
||||
// fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
|
||||
// bytes: OwnedBytes,
|
||||
// header: &Header,
|
||||
// ) -> io::Result<Arc<dyn Column<Item>>> {
|
||||
// let normalized_header = header.normalized();
|
||||
// let reader = C::open_from_bytes(bytes, normalized_header)?;
|
||||
// let min_value = header.min_value;
|
||||
// if let Some(gcd) = header.gcd {
|
||||
// let mapping = StrictlyMonotonicMappingInverter::from(
|
||||
// StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
|
||||
// );
|
||||
// Ok(Arc::new(monotonic_map_column(reader, mapping)))
|
||||
// } else {
|
||||
// let mapping = StrictlyMonotonicMappingInverter::from(
|
||||
// StrictlyMonotonicMappingToInternalBaseval::new(min_value),
|
||||
// );
|
||||
// Ok(Arc::new(monotonic_map_column(reader, mapping)))
|
||||
// }
|
||||
// }
|
||||
|
||||
type Reader: Column<u64> + 'static;
|
||||
// /// The FastFieldSerializerEstimate trait is required on all variants
|
||||
// /// of fast field compressions, to decide which one to choose.
|
||||
// trait FastFieldCodec: 'static {
|
||||
// /// A codex needs to provide a unique name and id, which is
|
||||
// /// used for debugging and de/serialization.
|
||||
// const CODEC_TYPE: FastFieldCodecType;
|
||||
|
||||
/// Reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
|
||||
// type Reader: Column<u64> + 'static;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
///
|
||||
/// The column iterator should be preferred over using column `get_val` method for
|
||||
/// performance reasons.
|
||||
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;
|
||||
// /// Reads the metadata and returns the CodecReader
|
||||
// fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// If the codec is not applicable, returns `None`.
|
||||
///
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(column: &dyn Column) -> Option<f32>;
|
||||
}
|
||||
// /// Serializes the data using the serializer into write.
|
||||
// ///
|
||||
// /// The column iterator should be preferred over using column `get_val` method for
|
||||
// /// performance reasons.
|
||||
// fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;
|
||||
|
||||
/// The list of all available codecs for u64 convertible data.
|
||||
pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
||||
FastFieldCodecType::Bitpacked,
|
||||
FastFieldCodecType::BlockwiseLinear,
|
||||
FastFieldCodecType::Linear,
|
||||
];
|
||||
// /// Returns an estimate of the compression ratio.
|
||||
// /// If the codec is not applicable, returns `None`.
|
||||
// ///
|
||||
// /// The baseline is uncompressed 64bit data.
|
||||
// ///
|
||||
// /// It could make sense to also return a value representing
|
||||
// /// computational complexity.
|
||||
// fn estimate(column: &dyn Column) -> Option<f32>;
|
||||
// }
|
||||
|
||||
// /// The list of all available codecs for u64 convertible data.
|
||||
// pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
|
||||
// FastFieldCodecType::Bitpacked,
|
||||
// FastFieldCodecType::BlockwiseLinear,
|
||||
// FastFieldCodecType::Linear,
|
||||
// ];
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -1,222 +0,0 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::io::BufRead;
|
||||
use std::net::{IpAddr, Ipv6Addr};
|
||||
use std::str::FromStr;
|
||||
|
||||
use common::OwnedBytes;
|
||||
use fastfield_codecs::{open_u128, serialize_u128, Column, FastFieldCodecType, VecColumn};
|
||||
use itertools::Itertools;
|
||||
use measure_time::print_time;
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
fn print_set_stats(ip_addrs: &[u128]) {
|
||||
println!("NumIps\t{}", ip_addrs.len());
|
||||
let ip_addr_set: HashSet<u128> = ip_addrs.iter().cloned().collect();
|
||||
println!("NumUniqueIps\t{}", ip_addr_set.len());
|
||||
let ratio_unique = ip_addr_set.len() as f64 / ip_addrs.len() as f64;
|
||||
println!("RatioUniqueOverTotal\t{ratio_unique:.4}");
|
||||
|
||||
// histogram
|
||||
let mut ip_addrs = ip_addrs.to_vec();
|
||||
ip_addrs.sort();
|
||||
let mut cnts: Vec<usize> = ip_addrs
|
||||
.into_iter()
|
||||
.dedup_with_count()
|
||||
.map(|(cnt, _)| cnt)
|
||||
.collect();
|
||||
cnts.sort();
|
||||
|
||||
let top_256_cnt: usize = cnts.iter().rev().take(256).sum();
|
||||
let top_128_cnt: usize = cnts.iter().rev().take(128).sum();
|
||||
let top_64_cnt: usize = cnts.iter().rev().take(64).sum();
|
||||
let top_8_cnt: usize = cnts.iter().rev().take(8).sum();
|
||||
let total: usize = cnts.iter().sum();
|
||||
|
||||
println!("{}", total);
|
||||
println!("{}", top_256_cnt);
|
||||
println!("{}", top_128_cnt);
|
||||
println!("Percentage Top8 {:02}", top_8_cnt as f32 / total as f32);
|
||||
println!("Percentage Top64 {:02}", top_64_cnt as f32 / total as f32);
|
||||
println!("Percentage Top128 {:02}", top_128_cnt as f32 / total as f32);
|
||||
println!("Percentage Top256 {:02}", top_256_cnt as f32 / total as f32);
|
||||
|
||||
let mut cnts: Vec<(usize, usize)> = cnts.into_iter().dedup_with_count().collect();
|
||||
cnts.sort_by(|a, b| {
|
||||
if a.1 == b.1 {
|
||||
a.0.cmp(&b.0)
|
||||
} else {
|
||||
b.1.cmp(&a.1)
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn ip_dataset() -> Vec<u128> {
|
||||
let mut ip_addr_v4 = 0;
|
||||
|
||||
let stdin = std::io::stdin();
|
||||
let ip_addrs: Vec<u128> = stdin
|
||||
.lock()
|
||||
.lines()
|
||||
.flat_map(|line| {
|
||||
let line = line.unwrap();
|
||||
let line = line.trim();
|
||||
let ip_addr = IpAddr::from_str(line.trim()).ok()?;
|
||||
if ip_addr.is_ipv4() {
|
||||
ip_addr_v4 += 1;
|
||||
}
|
||||
let ip_addr_v6: Ipv6Addr = match ip_addr {
|
||||
IpAddr::V4(v4) => v4.to_ipv6_mapped(),
|
||||
IpAddr::V6(v6) => v6,
|
||||
};
|
||||
Some(ip_addr_v6)
|
||||
})
|
||||
.map(|ip_v6| u128::from_be_bytes(ip_v6.octets()))
|
||||
.collect();
|
||||
|
||||
println!("IpAddrsAny\t{}", ip_addrs.len());
|
||||
println!("IpAddrsV4\t{}", ip_addr_v4);
|
||||
|
||||
ip_addrs
|
||||
}
|
||||
|
||||
fn bench_ip() {
|
||||
let dataset = ip_dataset();
|
||||
print_set_stats(&dataset);
|
||||
|
||||
// Chunks
|
||||
{
|
||||
let mut data = vec![];
|
||||
for dataset in dataset.chunks(500_000) {
|
||||
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
|
||||
}
|
||||
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
|
||||
println!("Compression 50_000 chunks {:.4}", compression);
|
||||
println!(
|
||||
"Num Bits per elem {:.2}",
|
||||
(data.len() * 8) as f32 / dataset.len() as f32
|
||||
);
|
||||
}
|
||||
|
||||
let mut data = vec![];
|
||||
{
|
||||
print_time!("creation");
|
||||
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap();
|
||||
}
|
||||
|
||||
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
|
||||
println!("Compression {:.2}", compression);
|
||||
println!(
|
||||
"Num Bits per elem {:.2}",
|
||||
(data.len() * 8) as f32 / dataset.len() as f32
|
||||
);
|
||||
|
||||
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
|
||||
// Sample some ranges
|
||||
let mut doc_values = Vec::new();
|
||||
for value in dataset.iter().take(1110).skip(1100).cloned() {
|
||||
doc_values.clear();
|
||||
print_time!("get range");
|
||||
decompressor.get_docids_for_value_range(
|
||||
value..=value,
|
||||
0..decompressor.num_vals(),
|
||||
&mut doc_values,
|
||||
);
|
||||
println!("{:?}", doc_values.len());
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
if env::args().nth(1).unwrap() == "bench_ip" {
|
||||
bench_ip();
|
||||
return;
|
||||
}
|
||||
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let results: Vec<(f32, f32, FastFieldCodecType)> = [
|
||||
serialize_with_codec(&data, FastFieldCodecType::Bitpacked),
|
||||
serialize_with_codec(&data, FastFieldCodecType::Linear),
|
||||
serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear),
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (est, comp, codec_type) in results {
|
||||
let est_cell = est.to_string();
|
||||
let ratio_cell = comp.to_string();
|
||||
let style = if comp == best_compression_ratio_codec.1 {
|
||||
"Fb"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
table.add_row(Row::new(vec![
|
||||
Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
]));
|
||||
}
|
||||
}
|
||||
|
||||
table.printstd();
|
||||
}
|
||||
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
let data = (1000..=200_000_u64).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Autoincrement"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
let data = (1..=200_000_u64)
|
||||
.map(|num| {
|
||||
let num = (num as f32 + num as f32).log10() as u64;
|
||||
current_cumulative += num;
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
let data = (1..=200_000_u64)
|
||||
.map(|num| {
|
||||
let num = (200_000.0 - num as f32).log10() as u64;
|
||||
current_cumulative += num;
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing convex"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec(
|
||||
data: &[u64],
|
||||
codec_type: FastFieldCodecType,
|
||||
) -> Option<(f32, f32, FastFieldCodecType)> {
|
||||
let col = VecColumn::from(data);
|
||||
let estimation = fastfield_codecs::estimate(&col, codec_type)?;
|
||||
let mut out = Vec::new();
|
||||
fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?;
|
||||
let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32;
|
||||
Some((estimation, actual_compression, codec_type))
|
||||
}
|
||||
@@ -21,14 +21,14 @@
|
||||
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use columnar::MonotonicallyMappableToU64;
|
||||
pub use fastfield_codecs::Column;
|
||||
|
||||
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
|
||||
// pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
// pub use self::facet_reader::FacetReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::{Column, CompositeFastFieldSerializer};
|
||||
pub use self::writer::FastFieldsWriter;
|
||||
use crate::schema::{Type, Value};
|
||||
use crate::DateTime;
|
||||
@@ -38,7 +38,6 @@ mod alive_bitset;
|
||||
mod error;
|
||||
// mod facet_reader;
|
||||
mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for types that provide a zero value.
|
||||
@@ -71,7 +70,7 @@ impl MakeZero for Ipv6Addr {
|
||||
/// Trait for types that are allowed for fast fields:
|
||||
/// (u64, i64 and f64, bool, DateTime).
|
||||
pub trait FastValue:
|
||||
MonotonicallyMappableToU64 + Copy + Send + Sync + PartialOrd + 'static
|
||||
Copy + Send + Sync + columnar::MonotonicallyMappableToU64 + PartialOrd + 'static
|
||||
{
|
||||
/// Returns the `schema::Type` for this FastValue.
|
||||
fn to_type() -> Type;
|
||||
@@ -100,21 +99,21 @@ impl FastValue for bool {
|
||||
Type::Bool
|
||||
}
|
||||
}
|
||||
impl FastValue for DateTime {
|
||||
fn to_type() -> Type {
|
||||
Type::Date
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicallyMappableToU64 for DateTime {
|
||||
impl columnar::MonotonicallyMappableToU64 for DateTime {
|
||||
fn to_u64(self) -> u64 {
|
||||
self.timestamp_micros.to_u64()
|
||||
}
|
||||
|
||||
fn from_u64(val: u64) -> Self {
|
||||
let timestamp_micros = i64::from_u64(val);
|
||||
DateTime { timestamp_micros }
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for DateTime {
|
||||
fn to_type() -> Type {
|
||||
Type::Date
|
||||
DateTime {
|
||||
timestamp_micros: MonotonicallyMappableToU64::from_u64(val),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,7 +165,6 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::{HasLen, TerminatingWrite};
|
||||
use fastfield_codecs::{open, FastFieldCodecType};
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
|
||||
@@ -4,15 +4,12 @@ use std::sync::Arc;
|
||||
|
||||
use columnar::{
|
||||
BytesColumn, ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
|
||||
HasAssociatedColumnType, NumericalType, StrColumn,
|
||||
HasAssociatedColumnType, StrColumn,
|
||||
};
|
||||
use fastfield_codecs::{open, open_u128, Column};
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
|
||||
use crate::schema::{Field, FieldType, Schema};
|
||||
use crate::directory::FileSlice;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::{DateTime, TantivyError};
|
||||
|
||||
/// Provides access to all of the BitpackedFastFieldReader.
|
||||
///
|
||||
|
||||
@@ -1,122 +0,0 @@
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
|
||||
pub use fastfield_codecs::Column;
|
||||
use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES};
|
||||
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
|
||||
/// `CompositeFastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
/// Fast fields have different encodings like bit-packing.
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `create_auto_detect_u64_fast_field(...)`
|
||||
/// * `create_auto_detect_u64_fast_field(...)`
|
||||
/// * ...
|
||||
/// * `let bytes_fastfield = new_bytes_fast_field(...)`
|
||||
/// * `bytes_fastfield.write_all(...)`
|
||||
/// * `bytes_fastfield.write_all(...)`
|
||||
/// * `bytes_fastfield.flush()`
|
||||
/// * ...
|
||||
/// * `close()`
|
||||
pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
codec_types: Vec<FastFieldCodecType>,
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
/// New fast field serializer with all codec types
|
||||
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
|
||||
Self::from_write_with_codec(write, &ALL_CODEC_TYPES)
|
||||
}
|
||||
|
||||
/// New fast field serializer with allowed codec types
|
||||
pub fn from_write_with_codec(
|
||||
write: WritePtr,
|
||||
codec_types: &[FastFieldCodecType],
|
||||
) -> io::Result<CompositeFastFieldSerializer> {
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(CompositeFastFieldSerializer {
|
||||
composite_write,
|
||||
codec_types: codec_types.to_vec(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64 + fmt::Debug>(
|
||||
&mut self,
|
||||
field: Field,
|
||||
fastfield_accessor: impl Column<T>,
|
||||
) -> io::Result<()> {
|
||||
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx<
|
||||
T: MonotonicallyMappableToU64 + fmt::Debug,
|
||||
>(
|
||||
&mut self,
|
||||
field: Field,
|
||||
fastfield_accessor: impl Column<T>,
|
||||
idx: usize,
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
fastfield_codecs::serialize(fastfield_accessor, field_write, &self.codec_types)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec of the the provided
|
||||
/// will be chosen.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<
|
||||
T: MonotonicallyMappableToU64 + fmt::Debug,
|
||||
>(
|
||||
&mut self,
|
||||
field: Field,
|
||||
fastfield_accessor: impl Column<T>,
|
||||
idx: usize,
|
||||
codec_types: &[FastFieldCodecType],
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
fastfield_codecs::serialize(fastfield_accessor, field_write, codec_types)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serialize data into a new u128 fast field. The codec will be compact space compressor,
|
||||
/// which is optimized for scanning the fast field for a given range.
|
||||
pub fn create_u128_fast_field_with_idx<F: Fn() -> I, I: Iterator<Item = u128>>(
|
||||
&mut self,
|
||||
field: Field,
|
||||
iter_gen: F,
|
||||
num_vals: u32,
|
||||
idx: usize,
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
fastfield_codecs::serialize_u128(iter_gen, num_vals, field_write)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field. Use the returned writer to write data into the
|
||||
/// bytes field. To associate the bytes with documents a seperate index must be created on
|
||||
/// index 0. See bytes/writer.rs::serialize for an example.
|
||||
///
|
||||
/// The bytes will be stored as is, no compression will be applied.
|
||||
pub fn new_bytes_fast_field(&mut self, field: Field) -> impl Write + '_ {
|
||||
self.composite_write.for_field_with_idx(field, 1)
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
@@ -3,12 +3,10 @@ use std::io;
|
||||
|
||||
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
|
||||
use common;
|
||||
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use rustc_hash::FxHashMap;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
use super::FastFieldType;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
|
||||
|
||||
@@ -801,7 +801,6 @@ mod tests {
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
use proptest::prelude::*;
|
||||
use proptest::prop_oneof;
|
||||
use proptest::strategy::Strategy;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use columnar::MonotonicallyMappableToU64;
|
||||
use murmurhash32::murmurhash2;
|
||||
use rustc_hash::FxHashMap;
|
||||
|
||||
|
||||
@@ -1,20 +1,15 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fastfield_codecs::VecColumn;
|
||||
use itertools::Itertools;
|
||||
use measure_time::debug_time;
|
||||
|
||||
use super::flat_map_with_buffer::FlatMapWithBufferIter;
|
||||
// use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn;
|
||||
use crate::core::{Segment, SegmentReader};
|
||||
use crate::directory::WritePtr;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::{
|
||||
AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldNotAvailableError,
|
||||
};
|
||||
use crate::fastfield::{AliveBitSet, Column, FastFieldNotAvailableError};
|
||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
|
||||
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
|
||||
// use crate::indexer::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueColumn;
|
||||
|
||||
@@ -2,7 +2,6 @@ use common::TerminatingWrite;
|
||||
|
||||
use crate::core::{Segment, SegmentComponent};
|
||||
use crate::directory::WritePtr;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
use crate::store::StoreWriter;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use columnar::MonotonicallyMappableToU64;
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
use std::net::Ipv6Addr;
|
||||
use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use columnar::Column;
|
||||
use columnar::{Column, MonotonicallyMappableToU128};
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
|
||||
use super::map_bound;
|
||||
use crate::query::range_query::fast_field_range_query::RangeDocSet;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
use std::ops::{Bound, RangeInclusive};
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU64;
|
||||
use columnar::MonotonicallyMappableToU64;
|
||||
|
||||
use super::fast_field_range_query::RangeDocSet;
|
||||
use super::map_bound;
|
||||
|
||||
@@ -132,7 +132,7 @@ mod tests {
|
||||
use std::net::{IpAddr, Ipv6Addr};
|
||||
use std::str::FromStr;
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
use columnar::MonotonicallyMappableToU128;
|
||||
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::query::{Query, QueryParser, TermQuery};
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::hash::{Hash, Hasher};
|
||||
use std::net::Ipv6Addr;
|
||||
use std::{fmt, str};
|
||||
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
use columnar::MonotonicallyMappableToU128;
|
||||
|
||||
use super::Field;
|
||||
use crate::fastfield::FastValue;
|
||||
|
||||
@@ -319,8 +319,8 @@ mod binary_serialize {
|
||||
use std::io::{self, Read, Write};
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use columnar::MonotonicallyMappableToU128;
|
||||
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
|
||||
use fastfield_codecs::MonotonicallyMappableToU128;
|
||||
|
||||
use super::Value;
|
||||
use crate::schema::Facet;
|
||||
|
||||
Reference in New Issue
Block a user