feat: query mem limiter (#7078)

* feat: query mem limiter * feat: config docs * feat: frontend query limit config * fix: unused imports Signed-off-by: jeremyhi <fengjiachun@gmail.com> * feat: add metrics for query memory tracker Signed-off-by: jeremyhi <fengjiachun@gmail.com> * fix: right postion for tracker Signed-off-by: jeremyhi <fengjiachun@gmail.com> * fix: avoid race condition Signed-off-by: jeremyhi <fengjiachun@gmail.com> * feat: soft and hard limit Signed-off-by: jeremyhi <fengjiachun@gmail.com> * feat: docs Signed-off-by: jeremyhi <fengjiachun@gmail.com> * fix: when soft_limit == 0 Signed-off-by: jeremyhi <fengjiachun@gmail.com> * feat: upgrade limit algorithm Signed-off-by: jeremyhi <fengjiachun@gmail.com> * fix: remove batch window Signed-off-by: jeremyhi <fengjiachun@gmail.com> * chore: batch mem size Signed-off-by: jeremyhi <fengjiachun@gmail.com> * feat: refine limit algorithm Signed-off-by: jeremyhi <fengjiachun@gmail.com> * fix: get sys mem Signed-off-by: jeremyhi <fengjiachun@gmail.com> * chore: minor change * feat: up tracker to the top stream * feat: estimated_size for batch Signed-off-by: jeremyhi <fengjiachun@gmail.com> * chore: minor refactor * feat: scan_memory_limit connect to max_concurrent_queries Signed-off-by: jeremyhi <fengjiachun@gmail.com> * chore: make callback clearly * feat: add unlimted enum Signed-off-by: jeremyhi <fengjiachun@gmail.com> * chore: by review comment * chore: comment on recursion_limit Signed-off-by: jeremyhi <fengjiachun@gmail.com> * feat: refactor and put permit into RegionScanExec Signed-off-by: jeremyhi <fengjiachun@gmail.com> * chore: multiple lazy static blocks * chore: minor change Signed-off-by: jeremyhi <fengjiachun@gmail.com> --------- Signed-off-by: jeremyhi <fengjiachun@gmail.com>
2026-01-06 13:22:57 +00:00 · 2025-11-11 15:47:55 +08:00
parent afa8684ebd
commit c7fded29ee
24 changed files with 1118 additions and 22 deletions
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -18,6 +18,9 @@ init_regions_in_background = false
 init_regions_parallelism = 16

 ## The maximum number of concurrent queries allowed to be executed. Zero means unlimited.
+## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
+## When set, 70% of queries get privileged memory access (full scan_memory_limit).
+## The remaining 30% get standard tier access (70% of scan_memory_limit).
 max_concurrent_queries = 0

 ## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -261,6 +264,13 @@ overwrite_entry_start_id = false
 ## Default to 0, which means the number of CPU cores.
 parallelism = 0

+## Memory pool size for query execution operators (aggregation, sorting, join).
+## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
+## Setting it to 0 disables the limit (unbounded, default behavior).
+## When this limit is reached, queries will fail with ResourceExhausted error.
+## NOTE: This does NOT limit memory used by table scans.
+memory_pool_size = "50%"
+
 ## The data storage options.
 [storage]
 ## The working home directory.
@@ -501,6 +511,14 @@ max_concurrent_scan_files = 384
 ## Whether to allow stale WAL entries read during replay.
 allow_stale_entries = false

+## Memory limit for table scans across all queries.
+## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
+## Setting it to 0 disables the limit.
+## NOTE: Works with max_concurrent_queries for tiered memory allocation.
+## - If max_concurrent_queries is set: 70% of queries get access to the full limit; the remaining 30% are capped at 70% of the limit.
+## - If max_concurrent_queries is 0 (unlimited): the first 20 queries get access to the full limit; the rest are capped at 70% of the limit.
+scan_memory_limit = "50%"
+
 ## Minimum time interval between two compactions.
 ## To align with the old behavior, the default value is 0 (no restrictions).
 min_compaction_interval = "0m"