This commit is contained in:
Paul Masurel
2023-01-23 17:15:43 +09:00
parent 0e66423de8
commit d29d63a829
19 changed files with 203 additions and 815 deletions

View File

@@ -21,14 +21,14 @@
use std::net::Ipv6Addr;
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
pub use fastfield_codecs::Column;
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
// pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result};
// pub use self::facet_reader::FacetReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::{Column, CompositeFastFieldSerializer};
pub use self::writer::FastFieldsWriter;
use crate::schema::{Type, Value};
use crate::DateTime;
@@ -38,7 +38,6 @@ mod alive_bitset;
mod error;
// mod facet_reader;
mod readers;
mod serializer;
mod writer;
/// Trait for types that provide a zero value.
@@ -71,7 +70,7 @@ impl MakeZero for Ipv6Addr {
/// Trait for types that are allowed for fast fields:
/// (u64, i64 and f64, bool, DateTime).
pub trait FastValue:
MonotonicallyMappableToU64 + Copy + Send + Sync + PartialOrd + 'static
Copy + Send + Sync + columnar::MonotonicallyMappableToU64 + PartialOrd + 'static
{
/// Returns the `schema::Type` for this FastValue.
fn to_type() -> Type;
@@ -100,21 +99,21 @@ impl FastValue for bool {
Type::Bool
}
}
/// Fast-value support for `DateTime`: the associated schema type is `Type::Date`.
impl FastValue for DateTime {
/// Returns `Type::Date`, the `schema::Type` matching `DateTime` values.
fn to_type() -> Type {
Type::Date
}
}
impl MonotonicallyMappableToU64 for DateTime {
impl columnar::MonotonicallyMappableToU64 for DateTime {
fn to_u64(self) -> u64 {
self.timestamp_micros.to_u64()
}
fn from_u64(val: u64) -> Self {
let timestamp_micros = i64::from_u64(val);
DateTime { timestamp_micros }
}
}
impl FastValue for DateTime {
fn to_type() -> Type {
Type::Date
DateTime {
timestamp_micros: MonotonicallyMappableToU64::from_u64(val),
}
}
}
@@ -166,7 +165,6 @@ mod tests {
use std::sync::Arc;
use common::{HasLen, TerminatingWrite};
use fastfield_codecs::{open, FastFieldCodecType};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;

View File

@@ -4,15 +4,12 @@ use std::sync::Arc;
use columnar::{
BytesColumn, ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
HasAssociatedColumnType, NumericalType, StrColumn,
HasAssociatedColumnType, StrColumn,
};
use fastfield_codecs::{open, open_u128, Column};
use fastfield_codecs::Column;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::{Field, FieldType, Schema};
use crate::directory::FileSlice;
use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError};
/// Provides access to all of the BitpackedFastFieldReader.
///

View File

@@ -1,122 +0,0 @@
use std::fmt;
use std::io::{self, Write};
pub use fastfield_codecs::Column;
use fastfield_codecs::{FastFieldCodecType, MonotonicallyMappableToU64, ALL_CODEC_TYPES};
use crate::directory::{CompositeWrite, WritePtr};
use crate::schema::Field;
/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields have different encodings like bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `create_auto_detect_u64_fast_field(...)`
/// * `create_auto_detect_u64_fast_field(...)`
/// * ...
/// * `let bytes_fastfield = new_bytes_fast_field(...)`
/// * `bytes_fastfield.write_all(...)`
/// * `bytes_fastfield.write_all(...)`
/// * `bytes_fastfield.flush()`
/// * ...
/// * `close()`
pub struct CompositeFastFieldSerializer {
/// Composite writer wrapping the underlying `WritePtr`; every fast field is
/// written through the per-`(field, idx)` writer obtained from it.
composite_write: CompositeWrite<WritePtr>,
/// Codec candidates handed to `fastfield_codecs::serialize` by the
/// `create_auto_detect_u64_fast_field*` methods that take no explicit codec list.
codec_types: Vec<FastFieldCodecType>,
}
impl CompositeFastFieldSerializer {
/// New fast field serializer with all codec types
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
Self::from_write_with_codec(write, &ALL_CODEC_TYPES)
}
/// New fast field serializer with allowed codec types
pub fn from_write_with_codec(
write: WritePtr,
codec_types: &[FastFieldCodecType],
) -> io::Result<CompositeFastFieldSerializer> {
let composite_write = CompositeWrite::wrap(write);
Ok(CompositeFastFieldSerializer {
composite_write,
codec_types: codec_types.to_vec(),
})
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64 + fmt::Debug>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(field, fastfield_accessor, 0)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
fastfield_codecs::serialize(fastfield_accessor, field_write, &self.codec_types)?;
Ok(())
}
/// Serialize data into a new u64 fast field. The best compression codec of the the provided
/// will be chosen.
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self,
field: Field,
fastfield_accessor: impl Column<T>,
idx: usize,
codec_types: &[FastFieldCodecType],
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
fastfield_codecs::serialize(fastfield_accessor, field_write, codec_types)?;
Ok(())
}
/// Serialize data into a new u128 fast field. The codec will be compact space compressor,
/// which is optimized for scanning the fast field for a given range.
pub fn create_u128_fast_field_with_idx<F: Fn() -> I, I: Iterator<Item = u128>>(
&mut self,
field: Field,
iter_gen: F,
num_vals: u32,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
fastfield_codecs::serialize_u128(iter_gen, num_vals, field_write)?;
Ok(())
}
/// Start serializing a new [u8] fast field. Use the returned writer to write data into the
/// bytes field. To associate the bytes with documents a seperate index must be created on
/// index 0. See bytes/writer.rs::serialize for an example.
///
/// The bytes will be stored as is, no compression will be applied.
pub fn new_bytes_fast_field(&mut self, field: Field) -> impl Write + '_ {
self.composite_write.for_field_with_idx(field, 1)
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}

View File

@@ -3,12 +3,10 @@ use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::FastFieldType;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};

View File

@@ -801,7 +801,6 @@ mod tests {
use std::collections::{HashMap, HashSet};
use std::net::Ipv6Addr;
use fastfield_codecs::MonotonicallyMappableToU128;
use proptest::prelude::*;
use proptest::prop_oneof;
use proptest::strategy::Strategy;

View File

@@ -1,4 +1,4 @@
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;

View File

@@ -1,20 +1,15 @@
use std::collections::HashMap;
use std::io::Write;
use std::sync::Arc;
use fastfield_codecs::VecColumn;
use itertools::Itertools;
use measure_time::debug_time;
use super::flat_map_with_buffer::FlatMapWithBufferIter;
// use super::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueIndexColumn;
use crate::core::{Segment, SegmentReader};
use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{
AliveBitSet, Column, CompositeFastFieldSerializer, FastFieldNotAvailableError,
};
use crate::fastfield::{AliveBitSet, Column, FastFieldNotAvailableError};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
// use crate::indexer::sorted_doc_id_multivalue_column::RemappedDocIdMultiValueColumn;

View File

@@ -2,7 +2,6 @@ use common::TerminatingWrite;
use crate::core::{Segment, SegmentComponent};
use crate::directory::WritePtr;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
use crate::store::StoreWriter;

View File

@@ -1,4 +1,4 @@
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
use itertools::Itertools;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};

View File

@@ -5,9 +5,8 @@
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::Column;
use columnar::{Column, MonotonicallyMappableToU128};
use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128;
use super::map_bound;
use crate::query::range_query::fast_field_range_query::RangeDocSet;

View File

@@ -4,7 +4,7 @@
use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use columnar::MonotonicallyMappableToU64;
use super::fast_field_range_query::RangeDocSet;
use super::map_bound;

View File

@@ -132,7 +132,7 @@ mod tests {
use std::net::{IpAddr, Ipv6Addr};
use std::str::FromStr;
use fastfield_codecs::MonotonicallyMappableToU128;
use columnar::MonotonicallyMappableToU128;
use crate::collector::{Count, TopDocs};
use crate::query::{Query, QueryParser, TermQuery};

View File

@@ -3,7 +3,7 @@ use std::hash::{Hash, Hasher};
use std::net::Ipv6Addr;
use std::{fmt, str};
use fastfield_codecs::MonotonicallyMappableToU128;
use columnar::MonotonicallyMappableToU128;
use super::Field;
use crate::fastfield::FastValue;

View File

@@ -319,8 +319,8 @@ mod binary_serialize {
use std::io::{self, Read, Write};
use std::net::Ipv6Addr;
use columnar::MonotonicallyMappableToU128;
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use fastfield_codecs::MonotonicallyMappableToU128;
use super::Value;
use crate::schema::Facet;