diff --git a/columnar/src/columnar/column_type.rs b/columnar/src/columnar/column_type.rs index a723cb015..aaaca69de 100644 --- a/columnar/src/columnar/column_type.rs +++ b/columnar/src/columnar/column_type.rs @@ -3,24 +3,22 @@ use std::net::Ipv6Addr; use crate::value::NumericalType; use crate::InvalidData; -/// The column type represents the column type and can fit on 6-bits. -/// -/// - bits[0..3]: Column category type. -/// - bits[3..6]: Numerical type if necessary. -#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)] +/// The column type represents the column type. +/// Any changes need to be propagated to `COLUMN_TYPES`. +#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy, Ord, PartialOrd)] #[repr(u8)] pub enum ColumnType { I64 = 0u8, U64 = 1u8, F64 = 2u8, - Bytes = 10u8, - Str = 14u8, - Bool = 18u8, - IpAddr = 22u8, - DateTime = 26u8, + Bytes = 3u8, + Str = 4u8, + Bool = 5u8, + IpAddr = 6u8, + DateTime = 7u8, } -#[cfg(test)] +// The order needs to match _exactly_ the order in the enum const COLUMN_TYPES: [ColumnType; 8] = [ ColumnType::I64, ColumnType::U64, @@ -38,18 +36,7 @@ impl ColumnType { } pub(crate) fn try_from_code(code: u8) -> Result { - use ColumnType::*; - match code { - 0u8 => Ok(I64), - 1u8 => Ok(U64), - 2u8 => Ok(F64), - 10u8 => Ok(Bytes), - 14u8 => Ok(Str), - 18u8 => Ok(Bool), - 22u8 => Ok(IpAddr), - 26u8 => Ok(Self::DateTime), - _ => Err(InvalidData), - } + COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData) } } @@ -64,18 +51,6 @@ impl From for ColumnType { } impl ColumnType { - /// get column type category - pub(crate) fn column_type_category(self) -> ColumnTypeCategory { - match self { - ColumnType::I64 | ColumnType::U64 | ColumnType::F64 => ColumnTypeCategory::Numerical, - ColumnType::Bytes => ColumnTypeCategory::Bytes, - ColumnType::Str => ColumnTypeCategory::Str, - ColumnType::Bool => ColumnTypeCategory::Bool, - ColumnType::IpAddr => ColumnTypeCategory::IpAddr, - ColumnType::DateTime => ColumnTypeCategory::DateTime, - } - } - pub fn 
numerical_type(&self) -> Option { match self { ColumnType::I64 => Some(NumericalType::I64), @@ -154,70 +129,20 @@ impl HasAssociatedColumnType for Ipv6Addr { } } -/// Column types are grouped into different categories that -/// corresponds to the different types of `JsonValue` types. -/// -/// The columnar writer will apply coercion rules to make sure that -/// at most one column exist per `ColumnTypeCategory`. -/// -/// See also [README.md]. -#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug)] -#[repr(u8)] -pub enum ColumnTypeCategory { - Bool, - Str, - Numerical, - DateTime, - Bytes, - IpAddr, -} - -impl From for ColumnTypeCategory { - fn from(column_type: ColumnType) -> Self { - match column_type { - ColumnType::I64 => ColumnTypeCategory::Numerical, - ColumnType::U64 => ColumnTypeCategory::Numerical, - ColumnType::F64 => ColumnTypeCategory::Numerical, - ColumnType::Bytes => ColumnTypeCategory::Bytes, - ColumnType::Str => ColumnTypeCategory::Str, - ColumnType::Bool => ColumnTypeCategory::Bool, - ColumnType::IpAddr => ColumnTypeCategory::IpAddr, - ColumnType::DateTime => ColumnTypeCategory::DateTime, - } - } -} - #[cfg(test)] mod tests { - use std::collections::HashSet; - use super::*; use crate::Cardinality; #[test] fn test_column_type_to_code() { - let mut column_type_set: HashSet = HashSet::new(); - for code in u8::MIN..=u8::MAX { - if let Ok(column_type) = ColumnType::try_from_code(code) { - assert_eq!(column_type.to_code(), code); - assert!(column_type_set.insert(column_type)); + for (code, expected_column_type) in super::COLUMN_TYPES.iter().copied().enumerate() { + if let Ok(column_type) = ColumnType::try_from_code(code as u8) { + assert_eq!(column_type, expected_column_type); } } - assert_eq!(column_type_set.len(), super::COLUMN_TYPES.len()); - } - - #[test] - fn test_column_category_sort_consistent_with_column_type_sort() { - // This is a very important property because we - // we need to serialize colunmn in the right order. 
- let mut column_types: Vec = super::COLUMN_TYPES.iter().copied().collect(); - column_types.sort_by_key(|col| col.to_code()); - let column_categories: Vec = column_types - .into_iter() - .map(ColumnTypeCategory::from) - .collect(); - for (prev, next) in column_categories.iter().zip(column_categories.iter()) { - assert!(prev <= next); + for code in COLUMN_TYPES.len() as u8..=u8::MAX { + assert!(ColumnType::try_from_code(code as u8).is_err()); } } diff --git a/columnar/src/columnar/merge.rs b/columnar/src/columnar/merge.rs index 3255dc309..9778b7135 100644 --- a/columnar/src/columnar/merge.rs +++ b/columnar/src/columnar/merge.rs @@ -1,9 +1,9 @@ use std::collections::HashMap; use std::io; -use super::column_type::ColumnTypeCategory; use crate::columnar::ColumnarReader; use crate::dynamic_column::DynamicColumn; +use crate::ColumnType; pub enum MergeDocOrder { /// Columnar tables are simply stacked one above the other. @@ -35,7 +35,40 @@ pub fn merge_columnar( } } -pub fn collect_columns( +/// Column types are grouped into different categories. +/// After merge, all columns belonging to the same category are coerced to +/// the same column type. +/// +/// In practice, only Numerical columns are coerced into one type today. +/// +/// See also [README.md]. 
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] +#[repr(u8)] +enum ColumnTypeCategory { + Bool, + Str, + Numerical, + DateTime, + Bytes, + IpAddr, +} + +impl From for ColumnTypeCategory { + fn from(column_type: ColumnType) -> Self { + match column_type { + ColumnType::I64 => ColumnTypeCategory::Numerical, + ColumnType::U64 => ColumnTypeCategory::Numerical, + ColumnType::F64 => ColumnTypeCategory::Numerical, + ColumnType::Bytes => ColumnTypeCategory::Bytes, + ColumnType::Str => ColumnTypeCategory::Str, + ColumnType::Bool => ColumnTypeCategory::Bool, + ColumnType::IpAddr => ColumnTypeCategory::IpAddr, + ColumnType::DateTime => ColumnTypeCategory::DateTime, + } + } +} + +fn collect_columns( columnar_readers: &[&ColumnarReader], ) -> io::Result>>> { // Each column name may have multiple types of column associated. @@ -51,7 +84,7 @@ pub fn collect_columns( .or_default(); let columns = column_type_to_handles - .entry(handle.column_type().column_type_category()) + .entry(handle.column_type().into()) .or_default(); columns.push(handle.open()?); } @@ -62,10 +95,9 @@ pub fn collect_columns( Ok(field_name_to_group) } -/// Cast numerical type columns to the same type -pub(crate) fn normalize_columns( - map: &mut HashMap>>, -) { +/// Coerce numerical type columns to the same type +/// TODO rename to `coerce_columns` +fn normalize_columns(map: &mut HashMap>>) { for (_field_name, type_category_to_columns) in map.iter_mut() { for (type_category, columns) in type_category_to_columns { if type_category == &ColumnTypeCategory::Numerical { diff --git a/columnar/src/columnar/writer/column_writers.rs b/columnar/src/columnar/writer/column_writers.rs index 6186029a6..00dc2b6bc 100644 --- a/columnar/src/columnar/writer/column_writers.rs +++ b/columnar/src/columnar/writer/column_writers.rs @@ -184,10 +184,12 @@ impl CompatibleNumericalTypes { } impl NumericalColumnWriter { - pub fn column_type_and_cardinality(&self, num_docs: RowId) -> (NumericalType, Cardinality) { - let numerical_type 
= self.compatible_numerical_types.to_numerical_type(); - let cardinality = self.column_writer.get_cardinality(num_docs); - (numerical_type, cardinality) + pub fn numerical_type(&self) -> NumericalType { + self.compatible_numerical_types.to_numerical_type() + } + + pub fn cardinality(&self, num_docs: RowId) -> Cardinality { + self.column_writer.get_cardinality(num_docs) } pub fn record_numerical_value( diff --git a/columnar/src/columnar/writer/mod.rs b/columnar/src/columnar/writer/mod.rs index 741042a7a..c1e2dac26 100644 --- a/columnar/src/columnar/writer/mod.rs +++ b/columnar/src/columnar/writer/mod.rs @@ -15,7 +15,7 @@ use crate::column_index::SerializableColumnIndex; use crate::column_values::{ ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn, }; -use crate::columnar::column_type::{ColumnType, ColumnTypeCategory}; +use crate::columnar::column_type::ColumnType; use crate::columnar::writer::column_writers::{ ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter, }; @@ -276,35 +276,40 @@ impl ColumnarWriter { } pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> { let mut serializer = ColumnarSerializer::new(wrt); - let mut columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self + let mut columns: Vec<(&[u8], ColumnType, Addr)> = self .numerical_field_hash_map .iter() - .map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Numerical, addr)) + .map(|(column_name, addr, _)| { + let numerical_column_writer: NumericalColumnWriter = + self.numerical_field_hash_map.read(addr); + let column_type = numerical_column_writer.numerical_type().into(); + (column_name, column_type, addr) + }) .collect(); columns.extend( self.bytes_field_hash_map .iter() - .map(|(term, addr, _)| (term, ColumnTypeCategory::Bytes, addr)), + .map(|(term, addr, _)| (term, ColumnType::Bytes, addr)), ); columns.extend( self.str_field_hash_map .iter() - .map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Str, 
addr)), + .map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)), ); columns.extend( self.bool_field_hash_map .iter() - .map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Bool, addr)), + .map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)), ); columns.extend( self.ip_addr_field_hash_map .iter() - .map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::IpAddr, addr)), + .map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)), ); columns.extend( self.datetime_field_hash_map .iter() - .map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::DateTime, addr)), + .map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)), ); columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type)); @@ -312,8 +317,12 @@ impl ColumnarWriter { let mut symbol_byte_buffer: Vec = Vec::new(); for (column_name, column_type, addr) in columns { match column_type { - ColumnTypeCategory::Bool => { - let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr); + ColumnType::Bool | ColumnType::DateTime => { + let column_writer: ColumnWriter = if column_type == ColumnType::Bool { + self.bool_field_hash_map.read(addr) + } else { + self.datetime_field_hash_map.read(addr) + }; let cardinality = column_writer.get_cardinality(num_docs); let mut column_serializer = serializer.serialize_column(column_name, ColumnType::Bool); @@ -325,7 +334,7 @@ impl ColumnarWriter { &mut column_serializer, )?; } - ColumnTypeCategory::IpAddr => { + ColumnType::IpAddr => { let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr); let cardinality = column_writer.get_cardinality(num_docs); let mut column_serializer = @@ -338,32 +347,35 @@ impl ColumnarWriter { &mut column_serializer, )?; } - ColumnTypeCategory::Bytes | ColumnTypeCategory::Str => { - let (column_type, str_column_writer): (ColumnType, StrOrBytesColumnWriter) = - if column_type == ColumnTypeCategory::Bytes { - 
(ColumnType::Bytes, self.bytes_field_hash_map.read(addr)) + ColumnType::Bytes | ColumnType::Str => { + let str_or_bytes_column_writer: StrOrBytesColumnWriter = + if column_type == ColumnType::Bytes { + self.bytes_field_hash_map.read(addr) } else { - (ColumnType::Str, self.str_field_hash_map.read(addr)) + self.str_field_hash_map.read(addr) }; let dictionary_builder = - &dictionaries[str_column_writer.dictionary_id as usize]; - let cardinality = str_column_writer.column_writer.get_cardinality(num_docs); + &dictionaries[str_or_bytes_column_writer.dictionary_id as usize]; + let cardinality = str_or_bytes_column_writer + .column_writer + .get_cardinality(num_docs); let mut column_serializer = serializer.serialize_column(column_name, column_type); serialize_bytes_or_str_column( cardinality, num_docs, dictionary_builder, - str_column_writer.operation_iterator(arena, &mut symbol_byte_buffer), + str_or_bytes_column_writer + .operation_iterator(arena, &mut symbol_byte_buffer), buffers, &mut column_serializer, )?; } - ColumnTypeCategory::Numerical => { + ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => { let numerical_column_writer: NumericalColumnWriter = self.numerical_field_hash_map.read(addr); - let (numerical_type, cardinality) = - numerical_column_writer.column_type_and_cardinality(num_docs); + let numerical_type = column_type.numerical_type().unwrap(); + let cardinality = numerical_column_writer.cardinality(num_docs); let mut column_serializer = serializer.serialize_column(column_name, ColumnType::from(numerical_type)); serialize_numerical_column( @@ -375,20 +387,6 @@ impl ColumnarWriter { &mut column_serializer, )?; } - ColumnTypeCategory::DateTime => { - let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr); - let cardinality = column_writer.get_cardinality(num_docs); - let mut column_serializer = - serializer.serialize_column(column_name, ColumnType::DateTime); - serialize_numerical_column( - cardinality, - num_docs, - NumericalType::I64, - 
column_writer.operation_iterator(arena, &mut symbol_byte_buffer), - buffers, - &mut column_serializer, - )?; - } }; } serializer.finalize()?;