diff --git a/columnar/src/column_values/mod.rs b/columnar/src/column_values/mod.rs index a3bb4f65d..f5b0d632b 100644 --- a/columnar/src/column_values/mod.rs +++ b/columnar/src/column_values/mod.rs @@ -17,10 +17,12 @@ use std::sync::Arc; use common::{BinarySerializable, OwnedBytes}; use compact_space::CompactSpaceDecompressor; +pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; use monotonic_mapping::{ StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal, StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval, }; +pub use monotonic_mapping_u128::MonotonicallyMappableToU128; use serialize::{Header, U128Header}; mod bitpacked; @@ -37,8 +39,6 @@ mod gcd; pub mod serialize; pub use self::column::{monotonic_map_column, ColumnValues, IterColumn, VecColumn}; -pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; -pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; #[cfg(test)] pub use self::serialize::tests::serialize_and_load; pub use self::serialize::{serialize_column_values, NormalizedHeader}; diff --git a/columnar/src/lib.rs b/columnar/src/lib.rs index 958e0378e..44214cba7 100644 --- a/columnar/src/lib.rs +++ b/columnar/src/lib.rs @@ -20,7 +20,7 @@ mod value; pub use column::{BytesColumn, Column, StrColumn}; pub use column_index::ColumnIndex; -pub use column_values::ColumnValues; +pub use column_values::{ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64}; pub use columnar::{ merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType, MergeDocOrder, diff --git a/fastfield_codecs/src/column.rs b/fastfield_codecs/src/column.rs index 807e59855..a5a945b70 100644 --- a/fastfield_codecs/src/column.rs +++ b/fastfield_codecs/src/column.rs @@ -3,254 +3,3 @@ use std::marker::PhantomData; use std::ops::{Range, RangeInclusive}; pub use columnar::ColumnValues as Column; -use tantivy_bitpacker::minmax; - -use crate::monotonic_mapping::StrictlyMonotonicFn; - -/// VecColumn provides `Column` over a slice. -pub struct VecColumn<'a, T = u64> { - values: &'a [T], - min_value: T, - max_value: T, -} - -impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column for VecColumn<'a, T> { - fn get_val(&self, position: u32) -> T { - self.values[position as usize] - } - - fn iter(&self) -> Box + '_> { - Box::new(self.values.iter().copied()) - } - - fn min_value(&self) -> T { - self.min_value - } - - fn max_value(&self) -> T { - self.max_value - } - - fn num_vals(&self) -> u32 { - self.values.len() as u32 - } - - fn get_range(&self, start: u64, output: &mut [T]) { - output.copy_from_slice(&self.values[start as usize..][..output.len()]) - } -} - -impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T> -where V: AsRef<[T]> + ?Sized -{ - fn from(values: &'a V) -> Self { - let values = values.as_ref(); - let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default(); - Self { - values, - min_value, - max_value, - } - } -} - -struct MonotonicMappingColumn { - from_column: C, - monotonic_mapping: T, - _phantom: PhantomData, -} - -/// Creates a view of a column transformed by a strictly monotonic mapping. See -/// [`StrictlyMonotonicFn`]. -/// -/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3] -/// monotonic_mapping.mapping() is expected to be injective, and we should always have -/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el -/// -/// The inverse of the mapping is required for: -/// `fn get_positions_for_value_range(&self, range: RangeInclusive) -> Vec ` -/// The user provides the original value range and we need to monotonic map them in the same way the -/// serialization does before calling the underlying column. -/// -/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping -/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as -/// monotonic_mapping during serialization. -pub fn monotonic_map_column( - from_column: C, - monotonic_mapping: T, -) -> impl Column -where - C: Column, - T: StrictlyMonotonicFn + Send + Sync, - Input: PartialOrd + Send + Sync + Copy + Debug, - Output: PartialOrd + Send + Sync + Copy + Debug, -{ - MonotonicMappingColumn { - from_column, - monotonic_mapping, - _phantom: PhantomData, - } -} - -impl Column for MonotonicMappingColumn -where - C: Column, - T: StrictlyMonotonicFn + Send + Sync, - Input: PartialOrd + Send + Sync + Copy + Debug, - Output: PartialOrd + Send + Sync + Copy + Debug, -{ - #[inline] - fn get_val(&self, idx: u32) -> Output { - let from_val = self.from_column.get_val(idx); - self.monotonic_mapping.mapping(from_val) - } - - fn min_value(&self) -> Output { - let from_min_value = self.from_column.min_value(); - self.monotonic_mapping.mapping(from_min_value) - } - - fn max_value(&self) -> Output { - let from_max_value = self.from_column.max_value(); - self.monotonic_mapping.mapping(from_max_value) - } - - fn num_vals(&self) -> u32 { - self.from_column.num_vals() - } - - fn iter(&self) -> Box + '_> { - Box::new( - self.from_column - .iter() - .map(|el| self.monotonic_mapping.mapping(el)), - ) - } - - fn get_docids_for_value_range( - &self, - range: RangeInclusive, - doc_id_range: Range, - positions: &mut Vec, - ) { - if range.start() > &self.max_value() || range.end() < &self.min_value() { - return; - } - let range = self.monotonic_mapping.inverse_coerce(range); - if range.start() > range.end() { - return; - } - self.from_column - .get_docids_for_value_range(range, doc_id_range, positions) - } - - // We voluntarily do not implement get_range as it yields a regression, - // and we do not have any specialized implementation anyway. -} - -/// Wraps an iterator into a `Column`. -pub struct IterColumn(T); - -impl From for IterColumn -where T: Iterator + Clone + ExactSizeIterator -{ - fn from(iter: T) -> Self { - IterColumn(iter) - } -} - -impl Column for IterColumn -where - T: Iterator + Clone + ExactSizeIterator + Send + Sync, - T::Item: PartialOrd + fmt::Debug, -{ - fn get_val(&self, idx: u32) -> T::Item { - self.0.clone().nth(idx as usize).unwrap() - } - - fn min_value(&self) -> T::Item { - self.0.clone().next().unwrap() - } - - fn max_value(&self) -> T::Item { - self.0.clone().last().unwrap() - } - - fn num_vals(&self) -> u32 { - self.0.len() as u32 - } - - fn iter(&self) -> Box + '_> { - Box::new(self.0.clone()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::monotonic_mapping::{ - StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval, - StrictlyMonotonicMappingToInternalGCDBaseval, - }; - - #[test] - fn test_monotonic_mapping() { - let vals = &[3u64, 5u64][..]; - let col = VecColumn::from(vals); - let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2)); - assert_eq!(mapped.min_value(), 1u64); - assert_eq!(mapped.max_value(), 3u64); - assert_eq!(mapped.num_vals(), 2); - assert_eq!(mapped.num_vals(), 2); - assert_eq!(mapped.get_val(0), 1); - assert_eq!(mapped.get_val(1), 3); - } - - #[test] - fn test_range_as_col() { - let col = IterColumn::from(10..100); - assert_eq!(col.num_vals(), 90); - assert_eq!(col.max_value(), 99); - } - - #[test] - fn test_monotonic_mapping_iter() { - let vals: Vec = (10..110u64).map(|el| el * 10).collect(); - let col = VecColumn::from(&vals); - let mapped = monotonic_map_column( - col, - StrictlyMonotonicMappingInverter::from( - StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100), - ), - ); - let val_i64s: Vec = mapped.iter().collect(); - for i in 0..100 { - assert_eq!(val_i64s[i as usize], mapped.get_val(i)); - } - } - - #[test] - fn test_monotonic_mapping_get_range() { - let vals: Vec = (0..100u64).map(|el| el * 10).collect(); - let col = VecColumn::from(&vals); - let mapped = monotonic_map_column( - col, - StrictlyMonotonicMappingInverter::from( - StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0), - ), - ); - - assert_eq!(mapped.min_value(), 0u64); - assert_eq!(mapped.max_value(), 9900u64); - assert_eq!(mapped.num_vals(), 100); - let val_u64s: Vec = mapped.iter().collect(); - assert_eq!(val_u64s.len(), 100); - for i in 0..100 { - assert_eq!(val_u64s[i as usize], mapped.get_val(i)); - assert_eq!(val_u64s[i as usize], vals[i as usize] * 10); - } - let mut buf = [0u64; 20]; - mapped.get_range(7, &mut buf[..]); - assert_eq!(&val_u64s[7..][..20], &buf); - } -} diff --git a/fastfield_codecs/src/lib.rs b/fastfield_codecs/src/lib.rs index b1ade7426..1345572c7 100644 --- a/fastfield_codecs/src/lib.rs +++ b/fastfield_codecs/src/lib.rs @@ -14,208 +14,206 @@ extern crate more_asserts; #[cfg(all(test, feature = "unstable"))] extern crate test; -use std::io::Write; -use std::sync::Arc; -use std::{fmt, io}; +// use common::{BinarySerializable, OwnedBytes}; +// use compact_space::CompactSpaceDecompressor; +// use format_version::read_format_version; +// use monotonic_mapping::{ +// StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal, +// StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval, +// }; +// use null_index_footer::read_null_index_footer; +// use serialize::{Header, U128Header}; -use common::{BinarySerializable, OwnedBytes}; -use compact_space::CompactSpaceDecompressor; -use format_version::read_format_version; -use monotonic_mapping::{ - StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal, - StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval, -}; -use null_index_footer::read_null_index_footer; -use serialize::{Header, U128Header}; - -mod bitpacked; -mod blockwise_linear; -mod compact_space; -mod format_version; -mod line; -mod linear; -mod monotonic_mapping; -mod monotonic_mapping_u128; -#[allow(dead_code)] -mod null_index; -mod null_index_footer; +// mod bitpacked; +// mod blockwise_linear; +// mod compact_space; +// mod format_version; +// mod line; +// mod linear; +// mod monotonic_mapping; +// mod monotonic_mapping_u128; +// #[allow(dead_code)] +// mod null_index; +// mod null_index_footer; mod column; -mod gcd; -pub mod serialize; +pub use column::Column; -use self::bitpacked::BitpackedCodec; -use self::blockwise_linear::BlockwiseLinearCodec; -pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn}; -use self::linear::LinearCodec; -pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; -pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; -pub use self::serialize::{ - estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader, -}; +// mod gcd; +// pub mod serialize; -#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] -#[repr(u8)] -/// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data. -pub enum FastFieldCodecType { - /// Bitpack all values in the value range. The number of bits is defined by the amplitude - /// `column.max_value() - column.min_value()` - Bitpacked = 1, - /// Linear interpolation puts a line between the first and last value and then bitpacks the - /// values by the offset from the line. The number of bits is defined by the max deviation from - /// the line. - Linear = 2, - /// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements. - BlockwiseLinear = 3, -} +// use self::bitpacked::BitpackedCodec; +// use self::blockwise_linear::BlockwiseLinearCodec; +// pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn}; +// use self::linear::LinearCodec; +// pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; +// pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128; +// pub use self::serialize::{ +// estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader, +// }; -impl BinarySerializable for FastFieldCodecType { - fn serialize(&self, wrt: &mut W) -> io::Result<()> { - self.to_code().serialize(wrt) - } +// #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] +// #[repr(u8)] +// /// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted +// data. pub enum FastFieldCodecType { +// /// Bitpack all values in the value range. The number of bits is defined by the amplitude +// /// `column.max_value() - column.min_value()` +// Bitpacked = 1, +// /// Linear interpolation puts a line between the first and last value and then bitpacks the +// /// values by the offset from the line. The number of bits is defined by the max deviation +// from /// the line. +// Linear = 2, +// /// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements. +// BlockwiseLinear = 3, +// } - fn deserialize(reader: &mut R) -> io::Result { - let code = u8::deserialize(reader)?; - let codec_type: Self = Self::from_code(code) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?; - Ok(codec_type) - } -} +// impl BinarySerializable for FastFieldCodecType { +// fn serialize(&self, wrt: &mut W) -> io::Result<()> { +// self.to_code().serialize(wrt) +// } -impl FastFieldCodecType { - pub(crate) fn to_code(self) -> u8 { - self as u8 - } +// fn deserialize(reader: &mut R) -> io::Result { +// let code = u8::deserialize(reader)?; +// let codec_type: Self = Self::from_code(code) +// .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code +// `{code}.`"))?; Ok(codec_type) +// } +// } - pub(crate) fn from_code(code: u8) -> Option { - match code { - 1 => Some(Self::Bitpacked), - 2 => Some(Self::Linear), - 3 => Some(Self::BlockwiseLinear), - _ => None, - } - } -} +// impl FastFieldCodecType { +// pub(crate) fn to_code(self) -> u8 { +// self as u8 +// } -#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] -#[repr(u8)] -/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data. -pub enum U128FastFieldCodecType { - /// This codec takes a large number space (u128) and reduces it to a compact number space, by - /// removing the holes. - CompactSpace = 1, -} +// pub(crate) fn from_code(code: u8) -> Option { +// match code { +// 1 => Some(Self::Bitpacked), +// 2 => Some(Self::Linear), +// 3 => Some(Self::BlockwiseLinear), +// _ => None, +// } +// } +// } -impl BinarySerializable for U128FastFieldCodecType { - fn serialize(&self, wrt: &mut W) -> io::Result<()> { - self.to_code().serialize(wrt) - } +// #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] +// #[repr(u8)] +// /// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted +// data. pub enum U128FastFieldCodecType { +// /// This codec takes a large number space (u128) and reduces it to a compact number space, by +// /// removing the holes. +// CompactSpace = 1, +// } - fn deserialize(reader: &mut R) -> io::Result { - let code = u8::deserialize(reader)?; - let codec_type: Self = Self::from_code(code) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?; - Ok(codec_type) - } -} +// impl BinarySerializable for U128FastFieldCodecType { +// fn serialize(&self, wrt: &mut W) -> io::Result<()> { +// self.to_code().serialize(wrt) +// } -impl U128FastFieldCodecType { - pub(crate) fn to_code(self) -> u8 { - self as u8 - } +// fn deserialize(reader: &mut R) -> io::Result { +// let code = u8::deserialize(reader)?; +// let codec_type: Self = Self::from_code(code) +// .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code +// `{code}.`"))?; Ok(codec_type) +// } +// } - pub(crate) fn from_code(code: u8) -> Option { - match code { - 1 => Some(Self::CompactSpace), - _ => None, - } - } -} +// impl U128FastFieldCodecType { +// pub(crate) fn to_code(self) -> u8 { +// self as u8 +// } -/// Returns the correct codec reader wrapped in the `Arc` for the data. -pub fn open_u128( - bytes: OwnedBytes, -) -> io::Result>> { - let (bytes, _format_version) = read_format_version(bytes)?; - let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?; - let header = U128Header::deserialize(&mut bytes)?; - assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace); - let reader = CompactSpaceDecompressor::open(bytes)?; - let inverted: StrictlyMonotonicMappingInverter> = - StrictlyMonotonicMappingToInternal::::new().into(); - Ok(Arc::new(monotonic_map_column(reader, inverted))) -} +// pub(crate) fn from_code(code: u8) -> Option { +// match code { +// 1 => Some(Self::CompactSpace), +// _ => None, +// } +// } +// } -/// Returns the correct codec reader wrapped in the `Arc` for the data. -pub fn open( - bytes: OwnedBytes, -) -> io::Result>> { - let (bytes, _format_version) = read_format_version(bytes)?; - let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?; - let header = Header::deserialize(&mut bytes)?; - match header.codec_type { - FastFieldCodecType::Bitpacked => open_specific_codec::(bytes, &header), - FastFieldCodecType::Linear => open_specific_codec::(bytes, &header), - FastFieldCodecType::BlockwiseLinear => { - open_specific_codec::(bytes, &header) - } - } -} +// /// Returns the correct codec reader wrapped in the `Arc` for the data. +// pub fn open_u128( +// bytes: OwnedBytes, +// ) -> io::Result>> { +// let (bytes, _format_version) = read_format_version(bytes)?; +// let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?; +// let header = U128Header::deserialize(&mut bytes)?; +// assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace); +// let reader = CompactSpaceDecompressor::open(bytes)?; +// let inverted: StrictlyMonotonicMappingInverter> = +// StrictlyMonotonicMappingToInternal::::new().into(); +// Ok(Arc::new(monotonic_map_column(reader, inverted))) +// } -fn open_specific_codec( - bytes: OwnedBytes, - header: &Header, -) -> io::Result>> { - let normalized_header = header.normalized(); - let reader = C::open_from_bytes(bytes, normalized_header)?; - let min_value = header.min_value; - if let Some(gcd) = header.gcd { - let mapping = StrictlyMonotonicMappingInverter::from( - StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value), - ); - Ok(Arc::new(monotonic_map_column(reader, mapping))) - } else { - let mapping = StrictlyMonotonicMappingInverter::from( - StrictlyMonotonicMappingToInternalBaseval::new(min_value), - ); - Ok(Arc::new(monotonic_map_column(reader, mapping))) - } -} +// /// Returns the correct codec reader wrapped in the `Arc` for the data. +// pub fn open( +// bytes: OwnedBytes, +// ) -> io::Result>> { +// let (bytes, _format_version) = read_format_version(bytes)?; +// let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?; +// let header = Header::deserialize(&mut bytes)?; +// match header.codec_type { +// FastFieldCodecType::Bitpacked => open_specific_codec::(bytes, +// &header), FastFieldCodecType::Linear => open_specific_codec::(bytes, +// &header), FastFieldCodecType::BlockwiseLinear => { +// open_specific_codec::(bytes, &header) +// } +// } +// } -/// The FastFieldSerializerEstimate trait is required on all variants -/// of fast field compressions, to decide which one to choose. -trait FastFieldCodec: 'static { - /// A codex needs to provide a unique name and id, which is - /// used for debugging and de/serialization. - const CODEC_TYPE: FastFieldCodecType; +// fn open_specific_codec( +// bytes: OwnedBytes, +// header: &Header, +// ) -> io::Result>> { +// let normalized_header = header.normalized(); +// let reader = C::open_from_bytes(bytes, normalized_header)?; +// let min_value = header.min_value; +// if let Some(gcd) = header.gcd { +// let mapping = StrictlyMonotonicMappingInverter::from( +// StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value), +// ); +// Ok(Arc::new(monotonic_map_column(reader, mapping))) +// } else { +// let mapping = StrictlyMonotonicMappingInverter::from( +// StrictlyMonotonicMappingToInternalBaseval::new(min_value), +// ); +// Ok(Arc::new(monotonic_map_column(reader, mapping))) +// } +// } - type Reader: Column + 'static; +// /// The FastFieldSerializerEstimate trait is required on all variants +// /// of fast field compressions, to decide which one to choose. +// trait FastFieldCodec: 'static { +// /// A codex needs to provide a unique name and id, which is +// /// used for debugging and de/serialization. +// const CODEC_TYPE: FastFieldCodecType; - /// Reads the metadata and returns the CodecReader - fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result; +// type Reader: Column + 'static; - /// Serializes the data using the serializer into write. - /// - /// The column iterator should be preferred over using column `get_val` method for - /// performance reasons. - fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>; +// /// Reads the metadata and returns the CodecReader +// fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result; - /// Returns an estimate of the compression ratio. - /// If the codec is not applicable, returns `None`. - /// - /// The baseline is uncompressed 64bit data. - /// - /// It could make sense to also return a value representing - /// computational complexity. - fn estimate(column: &dyn Column) -> Option; -} +// /// Serializes the data using the serializer into write. +// /// +// /// The column iterator should be preferred over using column `get_val` method for +// /// performance reasons. +// fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>; -/// The list of all available codecs for u64 convertible data. -pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [ - FastFieldCodecType::Bitpacked, - FastFieldCodecType::BlockwiseLinear, - FastFieldCodecType::Linear, -]; +// /// Returns an estimate of the compression ratio. +// /// If the codec is not applicable, returns `None`. +// /// +// /// The baseline is uncompressed 64bit data. +// /// +// /// It could make sense to also return a value representing +// /// computational complexity. +// fn estimate(column: &dyn Column) -> Option; +// } + +// /// The list of all available codecs for u64 convertible data. +// pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [ +// FastFieldCodecType::Bitpacked, +// FastFieldCodecType::BlockwiseLinear, +// FastFieldCodecType::Linear, +// ]; #[cfg(test)] mod tests { diff --git a/fastfield_codecs/src/main.rs b/fastfield_codecs/src/main.rs deleted file mode 100644 index aad421866..000000000 --- a/fastfield_codecs/src/main.rs +++ /dev/null @@ -1,222 +0,0 @@ -#[macro_use] -extern crate prettytable; -use std::collections::HashSet; -use std::env; -use std::io::BufRead; -use std::net::{IpAddr, Ipv6Addr}; -use std::str::FromStr; - -use common::OwnedBytes; -use fastfield_codecs::{open_u128, serialize_u128, Column, FastFieldCodecType, VecColumn}; -use itertools::Itertools; -use measure_time::print_time; -use prettytable::{Cell, Row, Table}; - -fn print_set_stats(ip_addrs: &[u128]) { - println!("NumIps\t{}", ip_addrs.len()); - let ip_addr_set: HashSet = ip_addrs.iter().cloned().collect(); - println!("NumUniqueIps\t{}", ip_addr_set.len()); - let ratio_unique = ip_addr_set.len() as f64 / ip_addrs.len() as f64; - println!("RatioUniqueOverTotal\t{ratio_unique:.4}"); - - // histogram - let mut ip_addrs = ip_addrs.to_vec(); - ip_addrs.sort(); - let mut cnts: Vec = ip_addrs - .into_iter() - .dedup_with_count() - .map(|(cnt, _)| cnt) - .collect(); - cnts.sort(); - - let top_256_cnt: usize = cnts.iter().rev().take(256).sum(); - let top_128_cnt: usize = cnts.iter().rev().take(128).sum(); - let top_64_cnt: usize = cnts.iter().rev().take(64).sum(); - let top_8_cnt: usize = cnts.iter().rev().take(8).sum(); - let total: usize = cnts.iter().sum(); - - println!("{}", total); - println!("{}", top_256_cnt); - println!("{}", top_128_cnt); - println!("Percentage Top8 {:02}", top_8_cnt as f32 / total as f32); - println!("Percentage Top64 {:02}", top_64_cnt as f32 / total as f32); - println!("Percentage Top128 {:02}", top_128_cnt as f32 / total as f32); - println!("Percentage Top256 {:02}", top_256_cnt as f32 / total as f32); - - let mut cnts: Vec<(usize, usize)> = cnts.into_iter().dedup_with_count().collect(); - cnts.sort_by(|a, b| { - if a.1 == b.1 { - a.0.cmp(&b.0) - } else { - b.1.cmp(&a.1) - } - }); -} - -fn ip_dataset() -> Vec { - let mut ip_addr_v4 = 0; - - let stdin = std::io::stdin(); - let ip_addrs: Vec = stdin - .lock() - .lines() - .flat_map(|line| { - let line = line.unwrap(); - let line = line.trim(); - let ip_addr = IpAddr::from_str(line.trim()).ok()?; - if ip_addr.is_ipv4() { - ip_addr_v4 += 1; - } - let ip_addr_v6: Ipv6Addr = match ip_addr { - IpAddr::V4(v4) => v4.to_ipv6_mapped(), - IpAddr::V6(v6) => v6, - }; - Some(ip_addr_v6) - }) - .map(|ip_v6| u128::from_be_bytes(ip_v6.octets())) - .collect(); - - println!("IpAddrsAny\t{}", ip_addrs.len()); - println!("IpAddrsV4\t{}", ip_addr_v4); - - ip_addrs -} - -fn bench_ip() { - let dataset = ip_dataset(); - print_set_stats(&dataset); - - // Chunks - { - let mut data = vec![]; - for dataset in dataset.chunks(500_000) { - serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); - } - let compression = data.len() as f64 / (dataset.len() * 16) as f64; - println!("Compression 50_000 chunks {:.4}", compression); - println!( - "Num Bits per elem {:.2}", - (data.len() * 8) as f32 / dataset.len() as f32 - ); - } - - let mut data = vec![]; - { - print_time!("creation"); - serialize_u128(|| dataset.iter().cloned(), dataset.len() as u32, &mut data).unwrap(); - } - - let compression = data.len() as f64 / (dataset.len() * 16) as f64; - println!("Compression {:.2}", compression); - println!( - "Num Bits per elem {:.2}", - (data.len() * 8) as f32 / dataset.len() as f32 - ); - - let decompressor = open_u128::(OwnedBytes::new(data)).unwrap(); - // Sample some ranges - let mut doc_values = Vec::new(); - for value in dataset.iter().take(1110).skip(1100).cloned() { - doc_values.clear(); - print_time!("get range"); - decompressor.get_docids_for_value_range( - value..=value, - 0..decompressor.num_vals(), - &mut doc_values, - ); - println!("{:?}", doc_values.len()); - } -} - -fn main() { - if env::args().nth(1).unwrap() == "bench_ip" { - bench_ip(); - return; - } - - let mut table = Table::new(); - - // Add a row per time - table.add_row(row!["", "Compression Ratio", "Compression Estimation"]); - - for (data, data_set_name) in get_codec_test_data_sets() { - let results: Vec<(f32, f32, FastFieldCodecType)> = [ - serialize_with_codec(&data, FastFieldCodecType::Bitpacked), - serialize_with_codec(&data, FastFieldCodecType::Linear), - serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear), - ] - .into_iter() - .flatten() - .collect(); - let best_compression_ratio_codec = results - .iter() - .min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap()) - .cloned() - .unwrap(); - - table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")])); - for (est, comp, codec_type) in results { - let est_cell = est.to_string(); - let ratio_cell = comp.to_string(); - let style = if comp == best_compression_ratio_codec.1 { - "Fb" - } else { - "" - }; - table.add_row(Row::new(vec![ - Cell::new(&format!("{codec_type:?}")).style_spec("bFg"), - Cell::new(&ratio_cell).style_spec(style), - Cell::new(&est_cell).style_spec(""), - ])); - } - } - - table.printstd(); -} - -pub fn get_codec_test_data_sets() -> Vec<(Vec, &'static str)> { - let mut data_and_names = vec![]; - - let data = (1000..=200_000_u64).collect::>(); - data_and_names.push((data, "Autoincrement")); - - let mut current_cumulative = 0; - let data = (1..=200_000_u64) - .map(|num| { - let num = (num as f32 + num as f32).log10() as u64; - current_cumulative += num; - current_cumulative - }) - .collect::>(); - // let data = (1..=200000_u64).map(|num| num + num).collect::>(); - data_and_names.push((data, "Monotonically increasing concave")); - - let mut current_cumulative = 0; - let data = (1..=200_000_u64) - .map(|num| { - let num = (200_000.0 - num as f32).log10() as u64; - current_cumulative += num; - current_cumulative - }) - .collect::>(); - data_and_names.push((data, "Monotonically increasing convex")); - - let data = (1000..=200_000_u64) - .map(|num| num + rand::random::() as u64) - .collect::>(); - data_and_names.push((data, "Almost monotonically increasing")); - - data_and_names -} - -pub fn serialize_with_codec( - data: &[u64], - codec_type: FastFieldCodecType, -) -> Option<(f32, f32, FastFieldCodecType)> { - let col = VecColumn::from(data); - let estimation = fastfield_codecs::estimate(&col, codec_type)?; - let mut out = Vec::new(); - fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?; - let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32; - Some((estimation, actual_compression, codec_type)) -} diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 297c2a9ff..4f863466c 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -21,14 +21,14 @@ use std::net::Ipv6Addr; -use fastfield_codecs::MonotonicallyMappableToU64; +use columnar::MonotonicallyMappableToU64; +pub use fastfield_codecs::Column; pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet}; // pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::error::{FastFieldNotAvailableError, Result}; // pub use self::facet_reader::FacetReader; pub use self::readers::FastFieldReaders; -pub use self::serializer::{Column, CompositeFastFieldSerializer}; pub use self::writer::FastFieldsWriter; use crate::schema::{Type, Value}; use crate::DateTime; @@ -38,7 +38,6 @@ mod alive_bitset; mod error; // mod facet_reader; mod readers; -mod serializer; mod writer; /// Trait for types that provide a zero value. @@ -71,7 +70,7 @@ impl MakeZero for Ipv6Addr { /// Trait for types that are allowed for fast fields: /// (u64, i64 and f64, bool, DateTime). pub trait FastValue: - MonotonicallyMappableToU64 + Copy + Send + Sync + PartialOrd + 'static + Copy + Send + Sync + columnar::MonotonicallyMappableToU64 + PartialOrd + 'static { /// Returns the `schema::Type` for this FastValue. fn to_type() -> Type; @@ -100,21 +99,21 @@ impl FastValue for bool { Type::Bool } } +impl FastValue for DateTime { + fn to_type() -> Type { + Type::Date + } +} -impl MonotonicallyMappableToU64 for DateTime { +impl columnar::MonotonicallyMappableToU64 for DateTime { fn to_u64(self) -> u64 { self.timestamp_micros.to_u64() } fn from_u64(val: u64) -> Self { - let timestamp_micros = i64::from_u64(val); - DateTime { timestamp_micros } - } -} - -impl FastValue for DateTime { - fn to_type() -> Type { - Type::Date + DateTime { + timestamp_micros: MonotonicallyMappableToU64::from_u64(val), + } } } @@ -166,7 +165,6 @@ mod tests { use std::sync::Arc; use common::{HasLen, TerminatingWrite}; - use fastfield_codecs::{open, FastFieldCodecType}; use once_cell::sync::Lazy; use rand::prelude::SliceRandom; use rand::rngs::StdRng; diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index 20249b9f9..653941faa 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -4,15 +4,12 @@ use std::sync::Arc; use columnar::{ BytesColumn, ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle, - HasAssociatedColumnType, NumericalType, StrColumn, + HasAssociatedColumnType, StrColumn, }; -use fastfield_codecs::{open, open_u128, Column}; +use fastfield_codecs::Column; -use crate::directory::{CompositeFile, FileSlice}; -use crate::fastfield::{FastFieldNotAvailableError, FastValue}; -use crate::schema::{Field, FieldType, Schema}; +use crate::directory::FileSlice; use crate::space_usage::PerFieldSpaceUsage; -use crate::{DateTime, TantivyError}; /// Provides access to all of the BitpackedFastFieldReader. /// diff --git a/src/fastfield/serializer/mod.rs b/src/fastfield/serializer/mod.rs deleted file mode 100644 index 6f22f7c7a..000000000 --- a/src/fastfield/serializer/mod.rs +++ /dev/null @@ -1,122 +0,0 @@ -use std::fmt; -use std::io::{self, Write}; - -pub use fastfield_codecs::Column; -use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES}; - -use crate::directory::{CompositeWrite, WritePtr}; -use crate::schema::Field; - -/// `CompositeFastFieldSerializer` is in charge of serializing -/// fastfields on disk. -/// -/// Fast fields have different encodings like bit-packing. -/// -/// `FastFieldWriter`s are in charge of pushing the data to -/// the serializer. -/// The serializer expects to receive the following calls. -/// -/// * `create_auto_detect_u64_fast_field(...)` -/// * `create_auto_detect_u64_fast_field(...)` -/// * ... -/// * `let bytes_fastfield = new_bytes_fast_field(...)` -/// * `bytes_fastfield.write_all(...)` -/// * `bytes_fastfield.write_all(...)` -/// * `bytes_fastfield.flush()` -/// * ... -/// * `close()` -pub struct CompositeFastFieldSerializer { - composite_write: CompositeWrite, - codec_types: Vec, -} - -impl CompositeFastFieldSerializer { - /// New fast field serializer with all codec types - pub fn from_write(write: WritePtr) -> io::Result { - Self::from_write_with_codec(write, &ALL_CODEC_TYPES) - } - - /// New fast field serializer with allowed codec types - pub fn from_write_with_codec( - write: WritePtr, - codec_types: &[FastFieldCodecType], - ) -> io::Result { - let composite_write = CompositeWrite::wrap(write); - Ok(CompositeFastFieldSerializer { - composite_write, - codec_types: codec_types.to_vec(), - }) - } - - /// Serialize data into a new u64 fast field. The best compression codec will be chosen - /// automatically. - pub fn create_auto_detect_u64_fast_field( - &mut self, - field: Field, - fastfield_accessor: impl Column, - ) -> io::Result<()> { - self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0) - } - - /// Serialize data into a new u64 fast field. The best compression codec will be chosen - /// automatically. - pub fn create_auto_detect_u64_fast_field_with_idx< - T: MonotonicallyMappableToU64 + fmt::Debug, - >( - &mut self, - field: Field, - fastfield_accessor: impl Column, - idx: usize, - ) -> io::Result<()> { - let field_write = self.composite_write.for_field_with_idx(field, idx); - fastfield_codecs::serialize(fastfield_accessor, field_write, &self.codec_types)?; - Ok(()) - } - - /// Serialize data into a new u64 fast field. The best compression codec of the the provided - /// will be chosen. - pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs< - T: MonotonicallyMappableToU64 + fmt::Debug, - >( - &mut self, - field: Field, - fastfield_accessor: impl Column, - idx: usize, - codec_types: &[FastFieldCodecType], - ) -> io::Result<()> { - let field_write = self.composite_write.for_field_with_idx(field, idx); - fastfield_codecs::serialize(fastfield_accessor, field_write, codec_types)?; - Ok(()) - } - - /// Serialize data into a new u128 fast field. The codec will be compact space compressor, - /// which is optimized for scanning the fast field for a given range. - pub fn create_u128_fast_field_with_idx I, I: Iterator>( - &mut self, - field: Field, - iter_gen: F, - num_vals: u32, - idx: usize, - ) -> io::Result<()> { - let field_write = self.composite_write.for_field_with_idx(field, idx); - fastfield_codecs::serialize_u128(iter_gen, num_vals, field_write)?; - - Ok(()) - } - - /// Start serializing a new [u8] fast field. Use the returned writer to write data into the - /// bytes field. To associate the bytes with documents a seperate index must be created on - /// index 0. See bytes/writer.rs::serialize for an example. - /// - /// The bytes will be stored as is, no compression will be applied. - pub fn new_bytes_fast_field(&mut self, field: Field) -> impl Write + '_ { - self.composite_write.for_field_with_idx(field, 1) - } - - /// Closes the serializer - /// - /// After this call the data must be persistently saved on disk. - pub fn close(self) -> io::Result<()> { - self.composite_write.close() - } -} diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index a8a3251ac..3a0679694 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -3,12 +3,10 @@ use std::io; use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue}; use common; -use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use rustc_hash::FxHashMap; use tantivy_bitpacker::BlockedBitpacker; use super::FastFieldType; -use crate::fastfield::CompositeFastFieldSerializer; use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::UnorderedTermId; use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value}; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 0d0746910..37fca01ff 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -801,7 +801,6 @@ mod tests { use std::collections::{HashMap, HashSet}; use std::net::Ipv6Addr; - use fastfield_codecs::MonotonicallyMappableToU128; use proptest::prelude::*; use proptest::prop_oneof; use proptest::strategy::Strategy; diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index be3c56bb8..1dbb29f25 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -1,4 +1,4 @@ -use fastfield_codecs::MonotonicallyMappableToU64; +use columnar::MonotonicallyMappableToU64; use murmurhash32::murmurhash2; use rustc_hash::FxHashMap; diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 455c08309..d7e03adbd 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,20 +1,15 @@ use std::collections::HashMap; -use std::io::Write; use std::sync::Arc; -use fastfield_codecs::VecColumn; use itertools::Itertools; use measure_time::debug_time; -use super::flat_map_with_buffer::FlatMapWithBufferIter; // use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn; use crate::core::{Segment, SegmentReader}; use crate::directory::WritePtr; use crate::docset::{DocSet, TERMINATED}; use crate::error::DataCorruption; -use crate::fastfield::{ - AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldNotAvailableError, -}; +use crate::fastfield::{AliveBitSet, Column, FastFieldNotAvailableError}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::indexer::doc_id_mapping::SegmentDocIdMapping; // use crate::indexer::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueColumn; diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index f96a9a594..6dcb442ff 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -2,7 +2,6 @@ use common::TerminatingWrite; use crate::core::{Segment, SegmentComponent}; use crate::directory::WritePtr; -use crate::fastfield::CompositeFastFieldSerializer; use crate::fieldnorm::FieldNormsSerializer; use crate::postings::InvertedIndexSerializer; use crate::store::StoreWriter; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c3efdbf7b..f0471aa15 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -1,4 +1,4 @@ -use fastfield_codecs::MonotonicallyMappableToU64; +use columnar::MonotonicallyMappableToU64; use itertools::Itertools; use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}; diff --git a/src/query/range_query/range_query_ip_fastfield.rs b/src/query/range_query/range_query_ip_fastfield.rs index a7627a92f..e449d2337 100644 --- a/src/query/range_query/range_query_ip_fastfield.rs +++ b/src/query/range_query/range_query_ip_fastfield.rs @@ -5,9 +5,8 @@ use std::net::Ipv6Addr; use std::ops::{Bound, RangeInclusive}; -use columnar::Column; +use columnar::{Column, MonotonicallyMappableToU128}; use common::BinarySerializable; -use fastfield_codecs::MonotonicallyMappableToU128; use super::map_bound; use crate::query::range_query::fast_field_range_query::RangeDocSet; diff --git a/src/query/range_query/range_query_u64_fastfield.rs b/src/query/range_query/range_query_u64_fastfield.rs index 29df77e3a..2077a55ad 100644 --- a/src/query/range_query/range_query_u64_fastfield.rs +++ b/src/query/range_query/range_query_u64_fastfield.rs @@ -4,7 +4,7 @@ use std::ops::{Bound, RangeInclusive}; -use fastfield_codecs::MonotonicallyMappableToU64; +use columnar::MonotonicallyMappableToU64; use super::fast_field_range_query::RangeDocSet; use super::map_bound; diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index bc13fafc2..89b3a7e14 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -132,7 +132,7 @@ mod tests { use std::net::{IpAddr, Ipv6Addr}; use std::str::FromStr; - use fastfield_codecs::MonotonicallyMappableToU128; + use columnar::MonotonicallyMappableToU128; use crate::collector::{Count, TopDocs}; use crate::query::{Query, QueryParser, TermQuery}; diff --git a/src/schema/term.rs b/src/schema/term.rs index ad393b1db..fc4505f72 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -3,7 +3,7 @@ use std::hash::{Hash, Hasher}; use std::net::Ipv6Addr; use std::{fmt, str}; -use fastfield_codecs::MonotonicallyMappableToU128; +use columnar::MonotonicallyMappableToU128; use super::Field; use crate::fastfield::FastValue; diff --git a/src/schema/value.rs b/src/schema/value.rs index 6e101c9a4..73bedc23d 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -319,8 +319,8 @@ mod binary_serialize { use std::io::{self, Read, Write}; use std::net::Ipv6Addr; + use columnar::MonotonicallyMappableToU128; use common::{f64_to_u64, u64_to_f64, BinarySerializable}; - use fastfield_codecs::MonotonicallyMappableToU128; use super::Value; use crate::schema::Facet;