From 7e573e497c5c4675e3eee69ce5be668bdf22fc0d Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 20 Aug 2025 11:51:10 -0700 Subject: [PATCH] feat: simplify more regex patterns in promql (#6747) * feat: simplify more regex patterns in promql Signed-off-by: Ruihang Xia * add sqlness cases Signed-off-by: Ruihang Xia * update sqlness case Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/query/src/promql/planner.rs | 50 ++++++-- .../standalone/common/promql/regex.result | 107 ++++++++++++++++++ .../cases/standalone/common/promql/regex.sql | 38 +++++++ 3 files changed, 184 insertions(+), 11 deletions(-) diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index ef3890307b..2ef0d75462 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -1304,20 +1304,48 @@ impl PromPlanner { MatchOp::NotEqual => col.not_eq(lit), MatchOp::Re(re) => { // TODO(ruihang): a more programmatic way to handle this in datafusion - if re.as_str() == ".*" { + + // This is a hack to handle `.+` and `.*`, and is not strictly correct: + // `.` doesn't match newline (`\n`). Given this is in PromQL context, + // most of the time it's fine. 
+ if re.as_str() == "^(?:.*)$" { continue; } - DfExpr::BinaryExpr(BinaryExpr { - left: Box::new(col), - op: Operator::RegexMatch, - right: Box::new(re.as_str().lit()), - }) + if re.as_str() == "^(?:.+)$" { + col.not_eq(DfExpr::Literal( + ScalarValue::Utf8(Some(String::new())), + None, + )) + } else { + DfExpr::BinaryExpr(BinaryExpr { + left: Box::new(col), + op: Operator::RegexMatch, + right: Box::new(DfExpr::Literal( + ScalarValue::Utf8(Some(re.as_str().to_string())), + None, + )), + }) + } + } + MatchOp::NotRe(re) => { + if re.as_str() == "^(?:.*)$" { + DfExpr::Literal(ScalarValue::Boolean(Some(false)), None) + } else if re.as_str() == "^(?:.+)$" { + col.eq(DfExpr::Literal( + ScalarValue::Utf8(Some(String::new())), + None, + )) + } else { + DfExpr::BinaryExpr(BinaryExpr { + left: Box::new(col), + op: Operator::RegexNotMatch, + right: Box::new(DfExpr::Literal( + ScalarValue::Utf8(Some(re.as_str().to_string())), + None, + )), + }) + } } - MatchOp::NotRe(re) => DfExpr::BinaryExpr(BinaryExpr { - left: Box::new(col), - op: Operator::RegexNotMatch, - right: Box::new(re.as_str().lit()), - }), }; exprs.push(expr); } diff --git a/tests/cases/standalone/common/promql/regex.result b/tests/cases/standalone/common/promql/regex.result index 8758c77ba7..76bdf5a222 100644 --- a/tests/cases/standalone/common/promql/regex.result +++ b/tests/cases/standalone/common/promql/regex.result @@ -55,6 +55,113 @@ TQL EVAL (0, 100, '15s') test{host=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237 | 1970-01-01T00:01:30 | 10.0.160.237:8080 | 1 | +---------------------+-------------------+-----+ +-- Some radical regex optimization +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host=~".*"}; + ++-+-+-+ +| stage | node | plan_| 
++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[1000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: tags=["host"] REDACTED +|_|_|_SortExec: expr=[host@1 ASC, ts@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeriesScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries", "projection": ["ts", "host", "val"], "filters": ["ts >= TimestampMillisecond(-300000, None)", "ts <= TimestampMillisecond(300000, None)"], "REDACTED +|_|_|_| +|_|_| Total rows: 2_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host=~".+"}; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[1000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: tags=["host"] REDACTED +|_|_|_SortExec: expr=[host@1 ASC, ts@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeriesScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries", "projection": ["ts", "host", "val"], "filters": ["host != Utf8(\"\")", "ts >= TimestampMillisecond(-300000, None)", "ts <= 
TimestampMillisecond(300000, None)"], "REDACTED +|_|_|_| +|_|_| Total rows: 2_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host!~".*"}; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[1000], time index=[ts] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_PromSeriesDivideExec: tags=["host"] REDACTED +|_|_|_SortExec: expr=[host@1 ASC, ts@0 ASC], preserve_partitioning=[false] REDACTED +|_|_|_EmptyExec REDACTED +|_|_|_| +|_|_| Total rows: 0_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host!~".+"}; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[1000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: tags=["host"] REDACTED +|_|_|_SortExec: expr=[host@1 ASC, ts@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeriesScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries", 
"projection": ["ts", "host", "val"], "filters": ["host = Utf8(\"\")", "ts >= TimestampMillisecond(-300000, None)", "ts <= TimestampMillisecond(300000, None)"], "REDACTED +|_|_|_| +|_|_| Total rows: 0_| ++-+-+-+ + DROP TABLE test; Affected Rows: 0 diff --git a/tests/cases/standalone/common/promql/regex.sql b/tests/cases/standalone/common/promql/regex.sql index 84b1ff09c9..3b7be2a5ac 100644 --- a/tests/cases/standalone/common/promql/regex.sql +++ b/tests/cases/standalone/common/promql/regex.sql @@ -17,4 +17,42 @@ TQL EVAL (0, 100, '15s') test{host=~"10\\.0\\.160\\.237:808|nonexistence"}; TQL EVAL (0, 100, '15s') test{host=~"(10\\.0\\.160\\.237:8080|10\\.0\\.160\\.237:9090)"}; +-- Some radical regex optimization + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host=~".*"}; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host=~".+"}; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host!~".*"}; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE 
region=\d+\(\d+,\s+\d+\) region=REDACTED +TQL ANALYZE VERBOSE (0, 0, '1s') test{host!~".+"}; + DROP TABLE test;