mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-22 07:50:38 +00:00
feat: Update parquet writer and indexer to support the flat format (#6866)
* feat: implements method to write flat batch for ParquetWriter
  Signed-off-by: evenyag <realevenyag@gmail.com>
* feat: add update method for flat RecordBatch in Indexer
  Signed-off-by: evenyag <realevenyag@gmail.com>
* feat: calls indexer to write flat batch in ParquetWriter
  Signed-off-by: evenyag <realevenyag@gmail.com>
* fix: handle empty projection for flat format
  Signed-off-by: evenyag <realevenyag@gmail.com>
* fix: eval array in precise_filter_flat
  Signed-off-by: evenyag <realevenyag@gmail.com>
* feat: cache column lookup result in inverted indexer
  Signed-off-by: evenyag <realevenyag@gmail.com>
* test: add test
  Signed-off-by: evenyag <realevenyag@gmail.com>
* feat: support dict type in dense codec
  Signed-off-by: evenyag <realevenyag@gmail.com>
* test: remove read part in test as it needs modifying the reader
  Signed-off-by: evenyag <realevenyag@gmail.com>
* feat: support dictionary type in other methods for dense codec
  Signed-off-by: evenyag <realevenyag@gmail.com>
* refactor: fulltext use string array directly
  Signed-off-by: evenyag <realevenyag@gmail.com>
---------
Signed-off-by: evenyag <realevenyag@gmail.com>
This commit is contained in:
@@ -59,6 +59,15 @@ impl SortField {
|
||||
|
||||
pub fn estimated_size(&self) -> usize {
|
||||
match &self.data_type {
|
||||
ConcreteDataType::Dictionary(dict_type) => {
|
||||
Self::estimated_size_by_type(dict_type.value_type())
|
||||
}
|
||||
data_type => Self::estimated_size_by_type(data_type),
|
||||
}
|
||||
}
|
||||
|
||||
fn estimated_size_by_type(data_type: &ConcreteDataType) -> usize {
|
||||
match data_type {
|
||||
ConcreteDataType::Boolean(_) => 2,
|
||||
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
|
||||
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
|
||||
@@ -88,16 +97,29 @@ impl SortField {
|
||||
&self,
|
||||
serializer: &mut Serializer<&mut Vec<u8>>,
|
||||
value: &ValueRef,
|
||||
) -> Result<()> {
|
||||
match self.data_type() {
|
||||
ConcreteDataType::Dictionary(dict_type) => {
|
||||
Self::serialize_by_type(dict_type.value_type(), serializer, value)
|
||||
}
|
||||
data_type => Self::serialize_by_type(data_type, serializer, value),
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_by_type(
|
||||
data_type: &ConcreteDataType,
|
||||
serializer: &mut Serializer<&mut Vec<u8>>,
|
||||
value: &ValueRef,
|
||||
) -> Result<()> {
|
||||
macro_rules! cast_value_and_serialize {
|
||||
(
|
||||
$self: ident;
|
||||
$data_type: ident;
|
||||
$serializer: ident;
|
||||
$(
|
||||
$ty: ident, $f: ident
|
||||
),*
|
||||
) => {
|
||||
match &$self.data_type {
|
||||
match $data_type {
|
||||
$(
|
||||
ConcreteDataType::$ty(_) => {
|
||||
paste!{
|
||||
@@ -139,13 +161,13 @@ impl SortField {
|
||||
ConcreteDataType::Dictionary(_) |
|
||||
ConcreteDataType::Null(_) => {
|
||||
return error::NotSupportedFieldSnafu {
|
||||
data_type: $self.data_type.clone()
|
||||
data_type: $data_type.clone()
|
||||
}.fail()
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
cast_value_and_serialize!(self; serializer;
|
||||
cast_value_and_serialize!(data_type; serializer;
|
||||
Boolean, boolean,
|
||||
Binary, binary,
|
||||
Int8, i8,
|
||||
@@ -172,16 +194,28 @@ impl SortField {
|
||||
|
||||
/// Deserialize a value from the deserializer.
|
||||
pub fn deserialize<B: Buf>(&self, deserializer: &mut Deserializer<B>) -> Result<Value> {
|
||||
match &self.data_type {
|
||||
ConcreteDataType::Dictionary(dict_type) => {
|
||||
Self::deserialize_by_type(dict_type.value_type(), deserializer)
|
||||
}
|
||||
data_type => Self::deserialize_by_type(data_type, deserializer),
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_by_type<B: Buf>(
|
||||
data_type: &ConcreteDataType,
|
||||
deserializer: &mut Deserializer<B>,
|
||||
) -> Result<Value> {
|
||||
macro_rules! deserialize_and_build_value {
|
||||
(
|
||||
$self: ident;
|
||||
$data_type: ident;
|
||||
$serializer: ident;
|
||||
$(
|
||||
$ty: ident, $f: ident
|
||||
),*
|
||||
) => {
|
||||
|
||||
match &$self.data_type {
|
||||
match $data_type {
|
||||
$(
|
||||
ConcreteDataType::$ty(_) => {
|
||||
Ok(Value::from(Option::<$f>::deserialize(deserializer).context(error::DeserializeFieldSnafu)?))
|
||||
@@ -235,7 +269,7 @@ impl SortField {
|
||||
}
|
||||
};
|
||||
}
|
||||
deserialize_and_build_value!(self; deserializer;
|
||||
deserialize_and_build_value!(data_type; deserializer;
|
||||
Boolean, bool,
|
||||
Int8, i8,
|
||||
Int16, i16,
|
||||
@@ -267,7 +301,20 @@ impl SortField {
|
||||
return Ok(1);
|
||||
}
|
||||
|
||||
let to_skip = match &self.data_type {
|
||||
match &self.data_type {
|
||||
ConcreteDataType::Dictionary(dict_type) => {
|
||||
Self::skip_deserialize_by_type(dict_type.value_type(), bytes, deserializer)
|
||||
}
|
||||
data_type => Self::skip_deserialize_by_type(data_type, bytes, deserializer),
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_deserialize_by_type(
|
||||
data_type: &ConcreteDataType,
|
||||
bytes: &[u8],
|
||||
deserializer: &mut Deserializer<&[u8]>,
|
||||
) -> Result<usize> {
|
||||
let to_skip = match data_type {
|
||||
ConcreteDataType::Boolean(_) => 2,
|
||||
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
|
||||
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
|
||||
@@ -629,6 +676,51 @@ mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
fn test_memcmp_dictionary() {
    // Builds a dictionary type with int32 keys and string values.
    let string_dict = || {
        ConcreteDataType::dictionary_datatype(
            ConcreteDataType::int32_datatype(),
            ConcreteDataType::string_datatype(),
        )
    };
    // Builds a dictionary type with the given key type and int64 values.
    let int64_dict = |key_type: ConcreteDataType| {
        ConcreteDataType::dictionary_datatype(key_type, ConcreteDataType::int64_datatype())
    };

    // Dictionary<i32, string> holding a string value.
    check_encode_and_decode(&[string_dict()], vec![Value::String("hello".into())]);

    // Dictionary<i32, i64> holding an integer value.
    check_encode_and_decode(
        &[int64_dict(ConcreteDataType::int32_datatype())],
        vec![Value::Int64(42)],
    );

    // Dictionary column holding a null value.
    check_encode_and_decode(&[string_dict()], vec![Value::Null]);

    // Two dictionary columns in the same row.
    check_encode_and_decode(
        &[
            string_dict(),
            int64_dict(ConcreteDataType::int16_datatype()),
        ],
        vec![Value::String("world".into()), Value::Int64(123)],
    );
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_multiple_rows() {
|
||||
check_encode_and_decode(
|
||||
@@ -691,6 +783,10 @@ mod tests {
|
||||
ConcreteDataType::interval_month_day_nano_datatype(),
|
||||
ConcreteDataType::decimal128_default_datatype(),
|
||||
ConcreteDataType::vector_datatype(3),
|
||||
ConcreteDataType::dictionary_datatype(
|
||||
ConcreteDataType::int32_datatype(),
|
||||
ConcreteDataType::string_datatype(),
|
||||
),
|
||||
],
|
||||
vec![
|
||||
Value::Boolean(true),
|
||||
@@ -715,6 +811,7 @@ mod tests {
|
||||
Value::IntervalMonthDayNano(IntervalMonthDayNano::new(1, 1, 15)),
|
||||
Value::Decimal128(Decimal128::from(16)),
|
||||
Value::Binary(Bytes::from(vec![0; 12])),
|
||||
Value::String("dict_value".into()),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user