feat: Update parquet writer and indexer to support the flat format (#6866)

* feat: implements method to write flat batch for ParquetWriter

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add update method for flat RecordBatch in Indexer

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: calls indexer to write flat batch in ParquetWriter

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: handle empty projection for flat format

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: eval array in precise_filter_flat

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: cache column lookup result in inverted indexer

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add test

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support dict type in dense codec

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: remove read part in test as it needs modifying the reader

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: support dictionary type in other methods for dense codec

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: fulltext use string array directly

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
This commit is contained in:
Yingwen
2025-09-02 20:48:34 +08:00
committed by GitHub
parent d394f38d18
commit 8fc42aeb27
11 changed files with 815 additions and 39 deletions

View File

@@ -59,6 +59,15 @@ impl SortField {
/// Returns the estimated encoded size of this field.
///
/// A dictionary field is sized by its value type; every other field is
/// sized by its own data type.
pub fn estimated_size(&self) -> usize {
    // Unwrap the dictionary wrapper (if any) to get the type that is actually sized.
    let effective_type = if let ConcreteDataType::Dictionary(dict_type) = &self.data_type {
        dict_type.value_type()
    } else {
        &self.data_type
    };
    Self::estimated_size_by_type(effective_type)
}
fn estimated_size_by_type(data_type: &ConcreteDataType) -> usize {
match data_type {
ConcreteDataType::Boolean(_) => 2,
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
@@ -88,16 +97,29 @@ impl SortField {
&self,
serializer: &mut Serializer<&mut Vec<u8>>,
value: &ValueRef,
) -> Result<()> {
match self.data_type() {
ConcreteDataType::Dictionary(dict_type) => {
Self::serialize_by_type(dict_type.value_type(), serializer, value)
}
data_type => Self::serialize_by_type(data_type, serializer, value),
}
}
fn serialize_by_type(
data_type: &ConcreteDataType,
serializer: &mut Serializer<&mut Vec<u8>>,
value: &ValueRef,
) -> Result<()> {
macro_rules! cast_value_and_serialize {
(
$self: ident;
$data_type: ident;
$serializer: ident;
$(
$ty: ident, $f: ident
),*
) => {
match &$self.data_type {
match $data_type {
$(
ConcreteDataType::$ty(_) => {
paste!{
@@ -139,13 +161,13 @@ impl SortField {
ConcreteDataType::Dictionary(_) |
ConcreteDataType::Null(_) => {
return error::NotSupportedFieldSnafu {
data_type: $self.data_type.clone()
data_type: $data_type.clone()
}.fail()
}
}
};
}
cast_value_and_serialize!(self; serializer;
cast_value_and_serialize!(data_type; serializer;
Boolean, boolean,
Binary, binary,
Int8, i8,
@@ -172,16 +194,28 @@ impl SortField {
/// Deserializes a single value for this field from `deserializer`.
///
/// A dictionary field is decoded using its value type; every other field
/// is decoded using its own data type.
pub fn deserialize<B: Buf>(&self, deserializer: &mut Deserializer<B>) -> Result<Value> {
    // Unwrap the dictionary wrapper (if any) to get the type that drives decoding.
    let effective_type = if let ConcreteDataType::Dictionary(dict_type) = &self.data_type {
        dict_type.value_type()
    } else {
        &self.data_type
    };
    Self::deserialize_by_type(effective_type, deserializer)
}
fn deserialize_by_type<B: Buf>(
data_type: &ConcreteDataType,
deserializer: &mut Deserializer<B>,
) -> Result<Value> {
macro_rules! deserialize_and_build_value {
(
$self: ident;
$data_type: ident;
$serializer: ident;
$(
$ty: ident, $f: ident
),*
) => {
match &$self.data_type {
match $data_type {
$(
ConcreteDataType::$ty(_) => {
Ok(Value::from(Option::<$f>::deserialize(deserializer).context(error::DeserializeFieldSnafu)?))
@@ -235,7 +269,7 @@ impl SortField {
}
};
}
deserialize_and_build_value!(self; deserializer;
deserialize_and_build_value!(data_type; deserializer;
Boolean, bool,
Int8, i8,
Int16, i16,
@@ -267,7 +301,20 @@ impl SortField {
return Ok(1);
}
let to_skip = match &self.data_type {
match &self.data_type {
ConcreteDataType::Dictionary(dict_type) => {
Self::skip_deserialize_by_type(dict_type.value_type(), bytes, deserializer)
}
data_type => Self::skip_deserialize_by_type(data_type, bytes, deserializer),
}
}
fn skip_deserialize_by_type(
data_type: &ConcreteDataType,
bytes: &[u8],
deserializer: &mut Deserializer<&[u8]>,
) -> Result<usize> {
let to_skip = match data_type {
ConcreteDataType::Boolean(_) => 2,
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
@@ -629,6 +676,51 @@ mod tests {
)
}
#[test]
fn test_memcmp_dictionary() {
    // Builds Dictionary<i32, string>, the type shared by most cases below.
    let string_dict = || {
        ConcreteDataType::dictionary_datatype(
            ConcreteDataType::int32_datatype(),
            ConcreteDataType::string_datatype(),
        )
    };
    // Builds Dictionary<key_type, i64> for the requested key type.
    let int64_dict = |key_type: ConcreteDataType| {
        ConcreteDataType::dictionary_datatype(key_type, ConcreteDataType::int64_datatype())
    };

    // Dictionary<i32, string> with a string value.
    check_encode_and_decode(&[string_dict()], vec![Value::String("hello".into())]);

    // Dictionary<i32, i64> with an integer value.
    check_encode_and_decode(
        &[int64_dict(ConcreteDataType::int32_datatype())],
        vec![Value::Int64(42)],
    );

    // Dictionary column holding a null value.
    check_encode_and_decode(&[string_dict()], vec![Value::Null]);

    // Two dictionary columns with different key and value types in one row.
    check_encode_and_decode(
        &[
            string_dict(),
            int64_dict(ConcreteDataType::int16_datatype()),
        ],
        vec![Value::String("world".into()), Value::Int64(123)],
    );
}
#[test]
fn test_encode_multiple_rows() {
check_encode_and_decode(
@@ -691,6 +783,10 @@ mod tests {
ConcreteDataType::interval_month_day_nano_datatype(),
ConcreteDataType::decimal128_default_datatype(),
ConcreteDataType::vector_datatype(3),
ConcreteDataType::dictionary_datatype(
ConcreteDataType::int32_datatype(),
ConcreteDataType::string_datatype(),
),
],
vec![
Value::Boolean(true),
@@ -715,6 +811,7 @@ mod tests {
Value::IntervalMonthDayNano(IntervalMonthDayNano::new(1, 1, 15)),
Value::Decimal128(Decimal128::from(16)),
Value::Binary(Bytes::from(vec![0; 12])),
Value::String("dict_value".into()),
],
);
}