From c57fa3f7468ca854cbdaf6185d67c5b70ffe006a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 27 Dec 2022 10:34:51 +0900 Subject: [PATCH] Review comments --- columnar/Cargo.toml | 1 - columnar/README.md | 38 +++++----- columnar/src/column_type_header.rs | 108 ++++++++++++++++------------- columnar/src/dictionary.rs | 8 +-- columnar/src/lib.rs | 13 ++-- columnar/src/reader/mod.rs | 29 +++++--- columnar/src/value.rs | 14 ++-- columnar/src/writer/mod.rs | 60 ++++++++-------- 8 files changed, 146 insertions(+), 125 deletions(-) diff --git a/columnar/Cargo.toml b/columnar/Cargo.toml index 4e5053ae6..b67c4cbfd 100644 --- a/columnar/Cargo.toml +++ b/columnar/Cargo.toml @@ -10,7 +10,6 @@ serde_json = "1" thiserror = "1" fnv = "1" sstable = { path = "../sstable", package = "tantivy-sstable" } -zstd = "0.12" common = { path = "../common", package = "tantivy-common" } fastfield_codecs = { path = "../fastfield_codecs"} itertools = "0.10" diff --git a/columnar/README.md b/columnar/README.md index 0c6f53653..f11fe94fa 100644 --- a/columnar/README.md +++ b/columnar/README.md @@ -16,42 +16,36 @@ and different cardinality `(required, optional, multivalued)`. # Coercion rules -Users can create a columnar by appending rows to a writer. -Nothing prevents a user from recording values with different to a same `column_key`. +Users can create a columnar by inserting rows to a `ColumnarWriter`, +and serializing it into a `Write` object. +Nothing prevents a user from recording values with different type to the same `column_name`. In that case, `tantivy-columnar`'s behavior is as follows: -- Values that corresponds to different JsonValue type are mapped to different columns. For instance, String values are treated independently from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name. -- Only one column for a given json value type is emitted. If number values with different number types are recorded (e.g. 
u64, i64, f64), `tantivy-columnar` will pick the first type that can represents the set of appended value, with the following prioriy order (`i64`, `u64`, `f64`). `i64` is picked over `u64` as it is likely to yield less change of types. Most use cases strictly requiring `u64` show the restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative value. +- JsonValues are grouped into 3 types (String, Number, bool). +Values that correspond to different groups are mapped to different columns. For instance, String values are treated independently +from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name. +- Only one column for a given json value type is emitted. If number values with different number types are recorded (e.g. u64, i64, f64), +`tantivy-columnar` will pick the first type that can represent the set of appended values, with the following priority order (`i64`, `u64`, `f64`). +`i64` is picked over `u64` as it is likely to yield fewer changes of type. Most use cases strictly requiring `u64` show the +restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative values. # Columnar format -Because this columnar format tries to avoid some coercion. -There can be several columns (with different type) associated to a single `column_name`. - -Each column is associated to `column_key`. -The format of that key is: +This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above). +The `(column_name, column_type)` pair however uniquely identifies a column. +That pair is serialized as a column `column_key`.
The format of that key is: `[column_name][ZERO_BYTE][column_type_header: u8]` ``` COLUMNAR:= [COLUMNAR_DATA] - [COLUMNAR_INDEX] + [COLUMNAR_KEY_TO_DATA_INDEX] [COLUMNAR_FOOTER]; # Columns are sorted by their column key. COLUMNAR_DATA:= - [COLUMN]+; - -COLUMN:= - COMPRESSED_COLUMN | NON_COMPRESSED_COLUMN; - -# COLUMN_DATA is compressed when it exceeds a threshold of 100KB. - -COMPRESSED_COLUMN := [b'1'][zstd(COLUMN_DATA)] -NON_COMPRESSED_COLUMN:= [b'0'][COLUMN_DATA] - -COLUMNAR_INDEX := [RANGE_SSTABLE_BYTES] + [COLUMN_DATA]+; COLUMNAR_FOOTER := [RANGE_SSTABLE_BYTES_LEN: 8 bytes little endian] @@ -63,7 +57,7 @@ sorted by column key. A sstable associates `(column names, column_cardinality, column_type) to range of bytes. -Column name may not contain the zero byte. +Column name may not contain the zero byte `\0`. Listing all columns associated to `column_name` can therefore be done by listing all keys prefixed by diff --git a/columnar/src/column_type_header.rs b/columnar/src/column_type_header.rs index 094c2d75c..cb2b54d31 100644 --- a/columnar/src/column_type_header.rs +++ b/columnar/src/column_type_header.rs @@ -1,8 +1,11 @@ use crate::utils::{place_bits, select_bits}; use crate::value::NumericalType; +use crate::InvalidData; /// Enum describing the number of values that can exist per document /// (or per row if you will). +/// +/// The cardinality must fit on 2 bits. 
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)] #[repr(u8)] pub enum Cardinality { @@ -20,16 +23,20 @@ impl Cardinality { self as u8 } - pub(crate) fn try_from_code(code: u8) -> Option { + pub(crate) fn try_from_code(code: u8) -> Result { match code { - 0 => Some(Cardinality::Required), - 1 => Some(Cardinality::Optional), - 2 => Some(Cardinality::Multivalued), - _ => None, + 0 => Ok(Cardinality::Required), + 1 => Ok(Cardinality::Optional), + 2 => Ok(Cardinality::Multivalued), + _ => Err(InvalidData), } } } +/// The column type represents the column type and can fit on 6-bits. +/// +/// - bits[0..3]: Column category type. +/// - bits[3..6]: Numerical type if necessary. #[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)] pub enum ColumnType { Bytes, @@ -40,73 +47,79 @@ pub enum ColumnType { impl ColumnType { /// Encoded over 6 bits. pub(crate) fn to_code(self) -> u8 { - let high_type; - let low_code: u8; + let column_type_category; + let numerical_type_code: u8; match self { ColumnType::Bytes => { - high_type = GeneralType::Str; - low_code = 0u8; + column_type_category = ColumnTypeCategory::Str; + numerical_type_code = 0u8; } ColumnType::Numerical(numerical_type) => { - high_type = GeneralType::Numerical; - low_code = numerical_type.to_code(); + column_type_category = ColumnTypeCategory::Numerical; + numerical_type_code = numerical_type.to_code(); } ColumnType::Bool => { - high_type = GeneralType::Bool; - low_code = 0u8; + column_type_category = ColumnTypeCategory::Bool; + numerical_type_code = 0u8; } } - place_bits::<3, 6>(high_type.to_code()) | place_bits::<0, 3>(low_code) + place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code) } - pub(crate) fn try_from_code(code: u8) -> Option { + pub(crate) fn try_from_code(code: u8) -> Result { if select_bits::<6, 8>(code) != 0u8 { - return None; + return Err(InvalidData); } - let high_code = select_bits::<3, 6>(code); - let low_code = select_bits::<0, 3>(code); 
- let high_type = GeneralType::try_from_code(high_code)?; - match high_type { - GeneralType::Bool => { - if low_code != 0u8 { - return None; + let column_type_category_code = select_bits::<0, 3>(code); + let numerical_type_code = select_bits::<3, 6>(code); + let column_type_category = ColumnTypeCategory::try_from_code(column_type_category_code)?; + match column_type_category { + ColumnTypeCategory::Bool => { + if numerical_type_code != 0u8 { + return Err(InvalidData); } - Some(ColumnType::Bool) + Ok(ColumnType::Bool) } - GeneralType::Str => { - if low_code != 0u8 { - return None; + ColumnTypeCategory::Str => { + if numerical_type_code != 0u8 { + return Err(InvalidData); } - Some(ColumnType::Bytes) + Ok(ColumnType::Bytes) } - GeneralType::Numerical => { - let numerical_type = NumericalType::try_from_code(low_code)?; - Some(ColumnType::Numerical(numerical_type)) + ColumnTypeCategory::Numerical => { + let numerical_type = NumericalType::try_from_code(numerical_type_code)?; + Ok(ColumnType::Numerical(numerical_type)) } } } } -/// This corresponds to the JsonType. +/// Column types are grouped into different categories that +/// corresponds to the different types of `JsonValue` types. +/// +/// The columnar writer will apply coercion rules to make sure that +/// at most one column exist per `ColumnTypeCategory`. +/// +/// See also [README.md]. 
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)] #[repr(u8)] -pub(crate) enum GeneralType { +pub(crate) enum ColumnTypeCategory { Bool = 0u8, Str = 1u8, Numerical = 2u8, } -impl GeneralType { +impl ColumnTypeCategory { pub fn to_code(self) -> u8 { self as u8 } - pub fn try_from_code(code: u8) -> Option { + pub fn try_from_code(code: u8) -> Result { match code { - 0u8 => Some(Self::Bool), - 1u8 => Some(Self::Str), - 2u8 => Some(Self::Numerical), - _ => None, + 0u8 => Ok(Self::Bool), + 1u8 => Ok(Self::Str), + 2u8 => Ok(Self::Numerical), + _ => Err(InvalidData), } } } @@ -115,12 +128,12 @@ impl GeneralType { /// This is encoded over one-byte and added to a column key in the /// columnar sstable. /// -/// Cardinality is encoded as the first two highest two bits. -/// The low 6 bits encode the column type. +/// - [0..6] bits: encodes the column type +/// - [6..8] bits: encodes the cardinality #[derive(Eq, Hash, PartialEq, Debug, Copy, Clone)] pub struct ColumnTypeAndCardinality { - pub cardinality: Cardinality, pub typ: ColumnType, + pub cardinality: Cardinality, } impl ColumnTypeAndCardinality { @@ -128,13 +141,13 @@ impl ColumnTypeAndCardinality { place_bits::<6, 8>(self.cardinality.to_code()) | place_bits::<0, 6>(self.typ.to_code()) } - pub fn try_from_code(code: u8) -> Option { + pub fn try_from_code(code: u8) -> Result { let typ_code = select_bits::<0, 6>(code); let cardinality_code = select_bits::<6, 8>(code); let cardinality = Cardinality::try_from_code(cardinality_code)?; let typ = ColumnType::try_from_code(typ_code)?; assert_eq!(typ.to_code(), typ_code); - Some(ColumnTypeAndCardinality { cardinality, typ }) + Ok(ColumnTypeAndCardinality { cardinality, typ }) } } @@ -149,7 +162,7 @@ mod tests { fn test_column_type_header_to_code() { let mut column_type_header_set: HashSet = HashSet::new(); for code in u8::MIN..=u8::MAX { - if let Some(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) { + if let Ok(column_type_header) = 
ColumnTypeAndCardinality::try_from_code(code) { assert_eq!(column_type_header.to_code(), code); assert!(column_type_header_set.insert(column_type_header)); } @@ -165,7 +178,7 @@ mod tests { fn test_column_type_to_code() { let mut column_type_set: HashSet = HashSet::new(); for code in u8::MIN..=u8::MAX { - if let Some(column_type) = ColumnType::try_from_code(code) { + if let Ok(column_type) = ColumnType::try_from_code(code) { assert_eq!(column_type.to_code(), code); assert!(column_type_set.insert(column_type)); } @@ -177,8 +190,7 @@ mod tests { fn test_cardinality_to_code() { let mut num_cardinality = 0; for code in u8::MIN..=u8::MAX { - let cardinality_opt = Cardinality::try_from_code(code); - if let Some(cardinality) = cardinality_opt { + if let Ok(cardinality) = Cardinality::try_from_code(code) { assert_eq!(cardinality.to_code(), code); num_cardinality += 1; } diff --git a/columnar/src/dictionary.rs b/columnar/src/dictionary.rs index d14038d43..82ccb91df 100644 --- a/columnar/src/dictionary.rs +++ b/columnar/src/dictionary.rs @@ -3,11 +3,11 @@ use std::io; use fnv::FnvHashMap; use sstable::SSTable; -pub(crate) struct IdMapping { +pub(crate) struct TermIdMapping { unordered_to_ord: Vec, } -impl IdMapping { +impl TermIdMapping { pub fn to_ord(&self, unordered: UnorderedId) -> OrderedId { self.unordered_to_ord[unordered.0 as usize] } @@ -48,7 +48,7 @@ impl DictionaryBuilder { /// Serialize the dictionary into an fst, and returns the /// `UnorderedId -> TermOrdinal` map. 
- pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result { + pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result { let mut terms: Vec<(&[u8], UnorderedId)> = self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect(); terms.sort_unstable_by_key(|(key, _)| *key); @@ -61,7 +61,7 @@ impl DictionaryBuilder { unordered_to_ord[unordered_id.0 as usize] = ordered_id; } sstable_builder.finish()?; - Ok(IdMapping { unordered_to_ord }) + Ok(TermIdMapping { unordered_to_ord }) } } diff --git a/columnar/src/lib.rs b/columnar/src/lib.rs index 4a3b55d2b..0e28de4da 100644 --- a/columnar/src/lib.rs +++ b/columnar/src/lib.rs @@ -12,6 +12,9 @@ pub use writer::ColumnarWriter; pub type DocId = u32; +#[derive(Copy, Clone, Debug)] +pub struct InvalidData; + #[cfg(test)] mod tests { use std::ops::Range; @@ -26,8 +29,8 @@ mod tests { #[test] fn test_dataframe_writer_bytes() { let mut dataframe_writer = ColumnarWriter::default(); - dataframe_writer.record_str(1u32, "my_string", b"hello"); - dataframe_writer.record_str(3u32, "my_string", b"helloeee"); + dataframe_writer.record_str(1u32, "my_string", "hello"); + dataframe_writer.record_str(3u32, "my_string", "helloeee"); let mut buffer: Vec = Vec::new(); dataframe_writer.serialize(5, &mut buffer).unwrap(); let columnar_fileslice = FileSlice::from(buffer); @@ -36,7 +39,7 @@ mod tests { let cols: Vec<(ColumnTypeAndCardinality, Range)> = columnar.read_columns("my_string").unwrap(); assert_eq!(cols.len(), 1); - assert_eq!(cols[0].1, 0..159); + assert_eq!(cols[0].1, 0..158); } #[test] @@ -58,7 +61,7 @@ mod tests { typ: ColumnType::Bool } ); - assert_eq!(cols[0].1, 0..22); + assert_eq!(cols[0].1, 0..21); } #[test] @@ -81,6 +84,6 @@ mod tests { // - vals 8 //< due to padding? could have been 1byte?. 
// - null footer 6 bytes // - version footer 3 bytes // Should be file-wide - assert_eq!(cols[0].1, 0..32); + assert_eq!(cols[0].1, 0..31); } } diff --git a/columnar/src/reader/mod.rs b/columnar/src/reader/mod.rs index 51429b62c..2907ac805 100644 --- a/columnar/src/reader/mod.rs +++ b/columnar/src/reader/mod.rs @@ -9,7 +9,7 @@ use crate::column_type_header::ColumnTypeAndCardinality; fn io_invalid_data(msg: String) -> io::Error { io::Error::new(io::ErrorKind::InvalidData, msg) - // {key_bytes:?}"))); + // {key_bytes:?}"))); } /// The ColumnarReader makes it possible to access a set of columns @@ -50,7 +50,7 @@ impl ColumnarReader { let key_bytes: &[u8] = stream.key(); let column_code: u8 = key_bytes.last().cloned().unwrap(); let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code) - .ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?; + .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?; let range = stream.value().clone(); let column_name = String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 1]); let range_len = range.end - range.start; @@ -64,15 +64,26 @@ impl ColumnarReader { Ok(results) } - /// Get all columns for the given field_name. + /// Get all columns for the given column name. + /// + /// There can be more than one column associated to a given column name, provided they have + /// different types. // TODO fix ugly API pub fn read_columns( &self, - field_name: &str, + column_name: &str, ) -> io::Result)>> { - let mut start_key = field_name.to_string(); + // Each column is associated to a given `column_key`, + // that starts with `column_name\0column_header`. + // + // Listing the columns associated to the given column name is therefore equivalent to listing + // `column_key` with the prefix `column_name\0`. + // + // This is in turn equivalent to searching for the range + // `[column_name\0..column_name\1)`.
+ let mut start_key = column_name.to_string(); start_key.push('\0'); - let mut end_key = field_name.to_string(); + let mut end_key = column_name.to_string(); end_key.push(1u8 as char); let mut stream = self .column_dictionary @@ -83,12 +94,10 @@ impl ColumnarReader { let mut results = Vec::new(); while stream.advance() { let key_bytes: &[u8] = stream.key(); - if !key_bytes.starts_with(start_key.as_bytes()) { - return Err(io_invalid_data(format!("Invalid key found. key: {key_bytes:?} field_name:{field_name:?}"))); - } + assert!(key_bytes.starts_with(start_key.as_bytes())); let column_code: u8 = key_bytes.last().cloned().unwrap(); let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code) - .ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?; + .map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?; let range = stream.value().clone(); results.push((column_type_and_cardinality, range)); } diff --git a/columnar/src/value.rs b/columnar/src/value.rs index ba1879487..258e80b18 100644 --- a/columnar/src/value.rs +++ b/columnar/src/value.rs @@ -1,3 +1,5 @@ +use crate::InvalidData; + #[derive(Copy, Clone, Debug, PartialEq)] pub enum NumericalValue { I64(i64), @@ -49,12 +51,12 @@ impl NumericalType { self as u8 } - pub fn try_from_code(code: u8) -> Option { + pub fn try_from_code(code: u8) -> Result { match code { - 0 => Some(NumericalType::I64), - 1 => Some(NumericalType::U64), - 2 => Some(NumericalType::F64), - _ => None, + 0 => Ok(NumericalType::I64), + 1 => Ok(NumericalType::U64), + 2 => Ok(NumericalType::F64), + _ => Err(InvalidData), } } } @@ -112,7 +114,7 @@ mod tests { fn test_numerical_type_code() { let mut num_numerical_type = 0; for code in u8::MIN..=u8::MAX { - if let Some(numerical_type) = NumericalType::try_from_code(code) { + if let Ok(numerical_type) = NumericalType::try_from_code(code) { assert_eq!(numerical_type.to_code(), code); num_numerical_type += 1; } diff --git 
a/columnar/src/writer/mod.rs b/columnar/src/writer/mod.rs index a32318066..e2d6c0847 100644 --- a/columnar/src/writer/mod.rs +++ b/columnar/src/writer/mod.rs @@ -11,17 +11,13 @@ use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn}; use serializer::ColumnarSerializer; use stacker::{Addr, ArenaHashMap, MemoryArena}; -use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality, GeneralType}; -use crate::dictionary::{DictionaryBuilder, IdMapping, UnorderedId}; +use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality, ColumnTypeCategory}; +use crate::dictionary::{DictionaryBuilder, TermIdMapping, UnorderedId}; use crate::value::{Coerce, NumericalType, NumericalValue}; use crate::writer::column_writers::{ColumnWriter, NumericalColumnWriter, StrColumnWriter}; use crate::writer::value_index::{IndexBuilder, SpareIndexBuilders}; use crate::{Cardinality, DocId}; -/// Threshold above which a column data will be compressed -/// using ZSTD. -const COLUMN_COMPRESSION_THRESHOLD: usize = 100_000; - /// This is a set of buffers that are only here /// to limit the amount of allocation. #[derive(Default)] @@ -34,6 +30,20 @@ struct SpareBuffers { column_buffer: Vec, } +/// Makes it possible to create a new columnar. +/// +/// ```rust +/// use tantivy_columnar::ColumnarWriter; +/// fn main() { +/// let mut columnar_writer = ColumnarWriter::default(); +/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack"); +/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64); +/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple"); +/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats. 
+/// let mut wrt: Vec = Vec::new(); +/// columnar_writer.serialize(2u32, &mut wrt).unwrap(); +/// } +/// ``` pub struct ColumnarWriter { numerical_field_hash_map: ArenaHashMap, bool_field_hash_map: ArenaHashMap, @@ -58,11 +68,11 @@ impl Default for ColumnarWriter { } impl ColumnarWriter { - pub fn record_numerical( + pub fn record_numerical + Copy>( &mut self, doc: DocId, column_name: &str, - numerical_value: NumericalValue, + numerical_value: T, ) { assert!( !column_name.as_bytes().contains(&0u8), @@ -73,7 +83,7 @@ impl ColumnarWriter { column_name.as_bytes(), |column_opt: Option| { let mut column: NumericalColumnWriter = column_opt.unwrap_or_default(); - column.record_numerical_value(doc, numerical_value, arena); + column.record_numerical_value(doc, numerical_value.into(), arena); column }, ); @@ -95,7 +105,7 @@ impl ColumnarWriter { ); } - pub fn record_str(&mut self, doc: DocId, column_name: &str, value: &[u8]) { + pub fn record_str(&mut self, doc: DocId, column_name: &str, value: &str) { assert!( !column_name.as_bytes().contains(&0u8), "key may not contain the 0 byte" @@ -113,7 +123,7 @@ impl ColumnarWriter { dictionaries.push(DictionaryBuilder::default()); StrColumnWriter::with_dictionary_id(dictionary_id) }); - column.record_bytes(doc, value, dictionaries, arena); + column.record_bytes(doc, value.as_bytes(), dictionaries, arena); column }, ); @@ -121,27 +131,27 @@ impl ColumnarWriter { pub fn serialize(&mut self, num_docs: DocId, wrt: &mut dyn io::Write) -> io::Result<()> { let mut serializer = ColumnarSerializer::new(wrt); - let mut field_columns: Vec<(&[u8], GeneralType, Addr)> = self + let mut field_columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self .numerical_field_hash_map .iter() - .map(|(term, addr, _)| (term, GeneralType::Numerical, addr)) + .map(|(term, addr, _)| (term, ColumnTypeCategory::Numerical, addr)) .collect(); field_columns.extend( self.bytes_field_hash_map .iter() - .map(|(term, addr, _)| (term, GeneralType::Str, addr)), + .map(|(term, 
addr, _)| (term, ColumnTypeCategory::Str, addr)), ); field_columns.extend( self.bool_field_hash_map .iter() - .map(|(term, addr, _)| (term, GeneralType::Bool, addr)), + .map(|(term, addr, _)| (term, ColumnTypeCategory::Bool, addr)), ); field_columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type)); let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries); let mut symbol_byte_buffer: Vec = Vec::new(); for (column_name, bytes_or_numerical, addr) in field_columns { match bytes_or_numerical { - GeneralType::Bool => { + ColumnTypeCategory::Bool => { let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr); let cardinality = column_writer.get_cardinality(num_docs); let column_type_and_cardinality = ColumnTypeAndCardinality { @@ -158,7 +168,7 @@ impl ColumnarWriter { column_serializer, )?; } - GeneralType::Str => { + ColumnTypeCategory::Str => { let str_column_writer: StrColumnWriter = self.bytes_field_hash_map.read(addr); let dictionary_builder = &dictionaries[str_column_writer.dictionary_id as usize]; @@ -178,7 +188,7 @@ impl ColumnarWriter { column_serializer, )?; } - GeneralType::Numerical => { + ColumnTypeCategory::Numerical => { let numerical_column_writer: NumericalColumnWriter = self.numerical_field_hash_map.read(addr); let (numerical_type, cardinality) = @@ -206,15 +216,7 @@ impl ColumnarWriter { } fn compress_and_write_column(column_bytes: &[u8], wrt: &mut W) -> io::Result<()> { - if column_bytes.len() >= COLUMN_COMPRESSION_THRESHOLD { - wrt.write_all(&[1])?; - let mut encoder = zstd::Encoder::new(wrt, 3)?; - encoder.write_all(column_bytes)?; - encoder.finish()?; - } else { - wrt.write_all(&[0])?; - wrt.write_all(column_bytes)?; - } + wrt.write_all(column_bytes)?; Ok(()) } @@ -233,13 +235,13 @@ fn serialize_bytes_column( .. 
} = buffers; column_buffer.clear(); - let id_mapping: IdMapping = dictionary_builder.serialize(column_buffer)?; + let term_id_mapping: TermIdMapping = dictionary_builder.serialize(column_buffer)?; let dictionary_num_bytes: u32 = column_buffer.len() as u32; let operation_iterator = operation_it.map(|symbol: ColumnOperation| { // We map unordered ids to ordered ids. match symbol { ColumnOperation::Value(unordered_id) => { - let ordered_id = id_mapping.to_ord(unordered_id); + let ordered_id = term_id_mapping.to_ord(unordered_id); ColumnOperation::Value(ordered_id.0 as u64) } ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),