feat: refine log query AST (#5316)

* draft

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* impl planner part

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* feat: tweak aggr func

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix clippy

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* todo about context

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename log expr

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sign todo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2025-01-15 20:11:24 +08:00
committed by GitHub
parent 121ec7936f
commit 4f29e50ef3
2 changed files with 110 additions and 27 deletions

View File

@@ -24,16 +24,73 @@ use crate::error::{
/// GreptimeDB's log query request.
#[derive(Debug, Serialize, Deserialize)]
pub struct LogQuery {
// Global query parameters
/// A fully qualified table name to query logs from.
pub table: TableName,
/// Specifies the time range for the log query. See [`TimeFilter`] for more details.
pub time_filter: TimeFilter,
/// Columns with filters to query.
pub columns: Vec<ColumnFilters>,
/// Controls row skipping and fetch count for logs.
/// Controls row skipping and fetch on the result set.
pub limit: Limit,
/// Adjacent lines to return.
/// Columns to return in the result set.
///
/// The columns can be either from the original log or derived from processing exprs.
/// Default (empty) means all columns.
///
/// TODO(ruihang): Do we need negative select?
pub columns: Vec<String>,
// Filters
/// Conjunction of filters to apply for the raw logs.
///
/// Filters here can only refer to the columns from the original log.
pub filters: Vec<ColumnFilters>,
/// Adjacent lines to return. Applies to all filters above.
///
/// TODO(ruihang): Do we need per-filter context?
pub context: Context,
// Processors
/// Expressions to calculate after filter.
pub exprs: Vec<LogExpr>,
}
/// Expression to calculate on log after filtering.
#[derive(Debug, Serialize, Deserialize)]
pub enum LogExpr {
NamedIdent(String),
PositionalIdent(usize),
Literal(String),
ScalarFunc {
name: String,
args: Vec<LogExpr>,
},
AggrFunc {
name: String,
args: Vec<LogExpr>,
/// Optional range function parameter. Stands for the time range for both step and align.
range: Option<String>,
by: Vec<LogExpr>,
},
Decompose {
expr: Box<LogExpr>,
/// JSON, CSV, etc.
schema: String,
/// Fields with type name to extract from the decomposed value.
fields: Vec<(String, String)>,
},
BinaryOp {
left: Box<LogExpr>,
op: String,
right: Box<LogExpr>,
},
Alias {
expr: Box<LogExpr>,
alias: String,
},
Filter {
expr: Box<LogExpr>,
filter: ContentFilter,
},
}
impl Default for LogQuery {
@@ -41,9 +98,11 @@ impl Default for LogQuery {
Self {
table: TableName::new("", "", ""),
time_filter: Default::default(),
columns: vec![],
filters: vec![],
limit: Limit::default(),
context: Default::default(),
columns: vec![],
exprs: vec![],
}
}
}
@@ -232,6 +291,7 @@ pub struct ColumnFilters {
#[derive(Debug, Serialize, Deserialize)]
pub enum ContentFilter {
// Search-based filters
/// Only match the exact content.
///
/// For example, if the content is "pale blue dot", the filter "pale" or "pale blue" will match.
@@ -246,6 +306,14 @@ pub enum ContentFilter {
Contains(String),
/// Match the content with a regex pattern. The pattern should be a valid Rust regex.
Regex(String),
// Value-based filters
/// Content exists, a.k.a. not null.
Exist,
Between(String, String),
// TODO(ruihang): arithmetic operations
// Compound filters
Compound(Vec<ContentFilter>, BinaryOperator),
}

View File

@@ -69,13 +69,11 @@ impl LogQueryPlanner {
// Time filter
filters.push(self.build_time_filter(&query.time_filter, &schema)?);
// Column filters and projections
let mut projected_columns = Vec::new();
for column_filter in &query.columns {
// Column filters
for column_filter in &query.filters {
if let Some(expr) = self.build_column_filter(column_filter)? {
filters.push(expr);
}
projected_columns.push(col(&column_filter.column_name));
}
// Apply filters
@@ -87,9 +85,12 @@ impl LogQueryPlanner {
}
// Apply projections
plan_builder = plan_builder
.project(projected_columns)
.context(DataFusionPlanningSnafu)?;
if !query.columns.is_empty() {
let projected_columns = query.columns.iter().map(col).collect::<Vec<_>>();
plan_builder = plan_builder
.project(projected_columns)
.context(DataFusionPlanningSnafu)?;
}
// Apply limit
plan_builder = plan_builder
@@ -159,6 +160,17 @@ impl LogQueryPlanner {
}
.build(),
),
log_query::ContentFilter::Exist => {
Ok(col(&column_filter.column_name).is_not_null())
}
log_query::ContentFilter::Between(lower, upper) => {
Ok(col(&column_filter.column_name)
.gt_eq(lit(ScalarValue::Utf8(Some(escape_like_pattern(lower)))))
.and(
col(&column_filter.column_name)
.lt_eq(lit(ScalarValue::Utf8(Some(escape_like_pattern(upper))))),
))
}
log_query::ContentFilter::Compound(..) => Err::<Expr, _>(
UnimplementedSnafu {
feature: "compound filter",
@@ -267,7 +279,7 @@ mod tests {
end: Some("2021-01-02T00:00:00Z".to_string()),
span: None,
},
columns: vec![ColumnFilters {
filters: vec![ColumnFilters {
column_name: "message".to_string(),
filters: vec![ContentFilter::Contains("error".to_string())],
}],
@@ -276,13 +288,14 @@ mod tests {
fetch: Some(100),
},
context: Context::None,
columns: vec![],
exprs: vec![],
};
let plan = planner.query_to_plan(log_query).await.unwrap();
let expected = "Limit: skip=0, fetch=100 [message:Utf8]\
\n Projection: greptime.public.test_table.message [message:Utf8]\
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
let expected = "Limit: skip=0, fetch=100 [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
@@ -380,7 +393,7 @@ mod tests {
end: Some("2021-01-02T00:00:00Z".to_string()),
span: None,
},
columns: vec![ColumnFilters {
filters: vec![ColumnFilters {
column_name: "message".to_string(),
filters: vec![ContentFilter::Contains("error".to_string())],
}],
@@ -389,13 +402,14 @@ mod tests {
fetch: None,
},
context: Context::None,
columns: vec![],
exprs: vec![],
};
let plan = planner.query_to_plan(log_query).await.unwrap();
let expected = "Limit: skip=10, fetch=1000 [message:Utf8]\
\n Projection: greptime.public.test_table.message [message:Utf8]\
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
let expected = "Limit: skip=10, fetch=1000 [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
assert_eq!(plan.display_indent_schema().to_string(), expected);
}
@@ -413,7 +427,7 @@ mod tests {
end: Some("2021-01-02T00:00:00Z".to_string()),
span: None,
},
columns: vec![ColumnFilters {
filters: vec![ColumnFilters {
column_name: "message".to_string(),
filters: vec![ContentFilter::Contains("error".to_string())],
}],
@@ -422,13 +436,14 @@ mod tests {
fetch: None,
},
context: Context::None,
columns: vec![],
exprs: vec![],
};
let plan = planner.query_to_plan(log_query).await.unwrap();
let expected = "Limit: skip=0, fetch=1000 [message:Utf8]\
\n Projection: greptime.public.test_table.message [message:Utf8]\
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
let expected = "Limit: skip=0, fetch=1000 [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n Filter: greptime.public.test_table.timestamp >= Utf8(\"2021-01-01T00:00:00Z\") AND greptime.public.test_table.timestamp <= Utf8(\"2021-01-02T00:00:00Z\") AND greptime.public.test_table.message LIKE Utf8(\"%error%\") [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]\
\n TableScan: greptime.public.test_table [message:Utf8, timestamp:Timestamp(Millisecond, None), host:Utf8;N]";
assert_eq!(plan.display_indent_schema().to_string(), expected);
}