feat: precise filter for mito parquet reader (#3178)

* impl SimpleFilterEvaluator

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* time index and field filter

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* finish parquet filter

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove empty Batch

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix fmt

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update metric

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* use projected schema from batch

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* correct naming

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove unnecessary error

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-01-18 14:59:48 +08:00
committed by GitHub
parent 63205907fb
commit cde5a36f5e
17 changed files with 652 additions and 32 deletions

View File

@@ -6,6 +6,7 @@ license.workspace = true
[dependencies]
arc-swap = "1.6"
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
datafusion-common.workspace = true

View File

@@ -107,6 +107,16 @@ pub enum Error {
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Error occurs when performing arrow computation"))]
ArrowCompute {
#[snafu(source)]
error: datatypes::arrow::error::ArrowError,
location: Location,
},
#[snafu(display("Unsupported operation: {}", reason))]
UnsupportedOperation { reason: String, location: Location },
}
impl ErrorExt for Error {
@@ -120,10 +130,13 @@ impl ErrorExt for Error {
| Error::Format { .. }
| Error::InitRecordbatchStream { .. }
| Error::ColumnNotExists { .. }
| Error::ProjectArrowRecordBatch { .. } => StatusCode::Internal,
| Error::ProjectArrowRecordBatch { .. }
| Error::ArrowCompute { .. } => StatusCode::Internal,
Error::External { source, .. } => source.status_code(),
Error::UnsupportedOperation { .. } => StatusCode::Unsupported,
Error::SchemaConversion { source, .. } | Error::CastVector { source, .. } => {
source.status_code()
}

View File

@@ -0,0 +1,258 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Utilities for precisely evaluating simple filter predicates against scalars, arrays and vectors.
use datafusion::logical_expr::{Expr, Operator};
use datafusion_common::arrow::array::{ArrayRef, Datum, Scalar};
use datafusion_common::arrow::buffer::BooleanBuffer;
use datafusion_common::arrow::compute::kernels::cmp;
use datafusion_common::ScalarValue;
use datatypes::vectors::VectorRef;
use snafu::ResultExt;
use crate::error::{ArrowComputeSnafu, Result, UnsupportedOperationSnafu};
/// An in-place expression evaluator for simple filters. Only the following
/// predicate shapes are supported:
/// - `col` `op` `literal`
/// - `literal` `op` `col`
///
/// where `op` is one of `=`, `!=`, `>`, `>=`, `<`, `<=`.
///
/// The struct stores the predicate in normalized form, i.e. always as
/// `col` `op` `literal`. The data of `col` is not stored here; it is supplied
/// by the caller at evaluation time.
#[derive(Debug)]
pub struct SimpleFilterEvaluator {
/// Name of the referenced column.
column_name: String,
/// The literal value, kept as an Arrow scalar so it can be handed directly
/// to the comparison kernels.
literal: Scalar<ArrayRef>,
/// The comparison operator, already swapped if the original predicate was
/// written as `literal` `op` `col`.
op: Operator,
}
impl SimpleFilterEvaluator {
pub fn try_new(predicate: &Expr) -> Option<Self> {
match predicate {
Expr::BinaryExpr(binary) => {
// check if the expr is in the supported form
match binary.op {
Operator::Eq
| Operator::NotEq
| Operator::Lt
| Operator::LtEq
| Operator::Gt
| Operator::GtEq => {}
_ => return None,
}
// swap the expr if it is in the form of `literal` `op` `col`
let mut op = binary.op;
let (lhs, rhs) = match (&*binary.left, &*binary.right) {
(Expr::Column(ref col), Expr::Literal(ref lit)) => (col, lit),
(Expr::Literal(ref lit), Expr::Column(ref col)) => {
// safety: The previous check ensures the operator is able to swap.
op = op.swap().unwrap();
(col, lit)
}
_ => return None,
};
Some(Self {
column_name: lhs.name.clone(),
literal: rhs.clone().to_scalar(),
op,
})
}
_ => None,
}
}
/// Get the name of the referenced column.
pub fn column_name(&self) -> &str {
&self.column_name
}
pub fn evaluate_scalar(&self, input: &ScalarValue) -> Result<bool> {
let result = self.evaluate_datum(&input.to_scalar())?;
Ok(result.value(0))
}
pub fn evaluate_array(&self, input: &ArrayRef) -> Result<BooleanBuffer> {
self.evaluate_datum(input)
}
pub fn evaluate_vector(&self, input: &VectorRef) -> Result<BooleanBuffer> {
self.evaluate_datum(&input.to_arrow_array())
}
fn evaluate_datum(&self, input: &impl Datum) -> Result<BooleanBuffer> {
let result = match self.op {
Operator::Eq => cmp::eq(input, &self.literal),
Operator::NotEq => cmp::neq(input, &self.literal),
Operator::Lt => cmp::lt(input, &self.literal),
Operator::LtEq => cmp::lt_eq(input, &self.literal),
Operator::Gt => cmp::gt(input, &self.literal),
Operator::GtEq => cmp::gt_eq(input, &self.literal),
_ => {
return UnsupportedOperationSnafu {
reason: format!("{:?}", self.op),
}
.fail()
}
};
result
.context(ArrowComputeSnafu)
.map(|array| array.values().clone())
}
}
#[cfg(test)]
mod test {
    use std::sync::Arc;

    use datafusion::logical_expr::BinaryExpr;
    use datafusion_common::Column;

    use super::*;

    /// Builds an unqualified column reference expression.
    fn column(name: &str) -> Expr {
        Expr::Column(Column {
            relation: None,
            name: name.to_string(),
        })
    }

    /// Builds a non-null `Int64` literal expression.
    fn int64_literal(value: i64) -> Expr {
        Expr::Literal(ScalarValue::Int64(Some(value)))
    }

    /// Builds the binary expression `left op right`.
    fn binary(left: Expr, op: Operator, right: Expr) -> Expr {
        Expr::BinaryExpr(BinaryExpr {
            left: Box::new(left),
            op,
            right: Box::new(right),
        })
    }

    #[test]
    fn unsupported_filter_op() {
        let rejected = [
            // `+` is not a comparison operator
            binary(column("foo"), Operator::Plus, int64_literal(1)),
            // comparing two literals is not supported
            binary(int64_literal(1), Operator::Eq, int64_literal(1)),
            // comparing two columns is not supported
            binary(column("foo"), Operator::Eq, column("bar")),
            // compound expressions are not supported
            binary(
                binary(column("foo"), Operator::Eq, int64_literal(1)),
                Operator::Eq,
                int64_literal(1),
            ),
        ];

        for expr in &rejected {
            assert!(SimpleFilterEvaluator::try_new(expr).is_none());
        }
    }

    #[test]
    fn supported_filter_op() {
        // plain equality: `foo = 1`
        let equal = binary(column("foo"), Operator::Eq, int64_literal(1));
        assert!(SimpleFilterEvaluator::try_new(&equal).is_some());

        // operands are swapped: `1 < foo` is normalized to `foo > 1`
        let swapped = binary(int64_literal(1), Operator::Lt, column("foo"));
        let evaluator = SimpleFilterEvaluator::try_new(&swapped).unwrap();
        assert_eq!(evaluator.op, Operator::Gt);
        assert_eq!(evaluator.column_name(), "foo");
    }

    #[test]
    fn run_on_array() {
        let predicate = binary(column("foo"), Operator::Eq, int64_literal(1));
        let evaluator = SimpleFilterEvaluator::try_new(&predicate).unwrap();

        let cases = [
            (vec![1, 2, 3], vec![true, false, false]),
            (vec![1, 1, 1], vec![true, true, true]),
        ];
        for (values, expected) in cases {
            let input: ArrayRef = Arc::new(datatypes::arrow::array::Int64Array::from(values));
            let mask = evaluator.evaluate_array(&input).unwrap();
            assert_eq!(mask, BooleanBuffer::from(expected));
        }

        // an empty input yields an empty mask
        let empty: ArrayRef = Arc::new(datatypes::arrow::array::Int64Array::new_null(0));
        let mask = evaluator.evaluate_array(&empty).unwrap();
        assert_eq!(mask, BooleanBuffer::from(vec![]));
    }

    #[test]
    fn run_on_scalar() {
        let predicate = binary(column("foo"), Operator::Lt, int64_literal(1));
        let evaluator = SimpleFilterEvaluator::try_new(&predicate).unwrap();

        let cases = [
            (ScalarValue::Int64(Some(1)), false),
            (ScalarValue::Int64(Some(0)), true),
            // a null input never satisfies the predicate
            (ScalarValue::Int64(None), false),
        ];
        for (input, expected) in &cases {
            let outcome = evaluator.evaluate_scalar(input).unwrap();
            assert_eq!(outcome, *expected);
        }
    }
}

View File

@@ -14,6 +14,7 @@
pub mod adapter;
pub mod error;
pub mod filter;
mod recordbatch;
pub mod util;