mirror of https://github.com/GreptimeTeam/greptimedb.git
synced 2025-12-26 16:10:02 +00:00

Compare commits (10): `feat/impl-...` to `fix/part_s`

| SHA1 |
|---|
| 1d3cfdc0e5 |
| 088401c3e9 |
| 4419e0254f |
| 709ccd3e31 |
| 5b50b4824d |
| 1ef5c2e024 |
| d20727f335 |
| 2391ab1941 |
| ec77a5d53a |
| dbad96eb80 |
@@ -102,30 +102,6 @@ like `feat`/`fix`/`docs`, with a concise summary of code change following. AVOID

All commit messages SHOULD adhere to the [Conventional Commits specification](https://conventionalcommits.org/).

## AI-Assisted contributions

We have the following policy for AI-assisted PRs:

- The PR author should **understand the core ideas** behind the implementation **end-to-end**, and be able to justify the design and code during review.
- **Call out unknowns and assumptions.** It's okay not to fully understand some bits of AI-generated code. You should comment on these cases and point them out to reviewers so that they can use their knowledge of the codebase to clear up any concerns. For example, you might comment "calling this function here seems to work, but I'm not familiar with how it works internally; I wonder if there's a race condition if it is called concurrently".

### Why fully AI-generated PRs without understanding are not helpful

Today, AI tools cannot reliably make complex changes to GreptimeDB on their own, which is why we rely on pull requests and code review.

The purposes of code review are:

1. To finish the intended task.
2. To share knowledge between authors and reviewers, as a long-term investment in the project. For this reason, even if someone familiar with the codebase could finish a task quickly, we're still happy to help a new contributor work on it, even if it takes longer.

An AI dump for an issue doesn't meet these purposes. Maintainers could finish the task faster by using AI directly, and submitters gain little knowledge if they act only as a pass-through AI proxy without understanding.

Please understand that reviewing capacity for the project is **very limited**, so large PRs that appear to lack the requisite understanding might not get reviewed and may eventually be closed or redirected.

### Better ways to contribute than an "AI dump"

It's recommended to write a high-quality issue with a clear problem statement and a minimal, reproducible example. This can make it easier for others to contribute.

## Getting Help

There are many ways to get help when you're stuck. It is recommended to ask for help by opening an issue, with a detailed description
Cargo.lock (generated, 45 lines changed)
@@ -2190,7 +2190,6 @@ dependencies = [
|
||||
"approx 0.5.1",
|
||||
"arc-swap",
|
||||
"arrow",
|
||||
"arrow-cast",
|
||||
"arrow-schema",
|
||||
"async-trait",
|
||||
"bincode",
|
||||
@@ -2221,7 +2220,6 @@ dependencies = [
|
||||
"h3o",
|
||||
"hyperloglogplus",
|
||||
"jsonb",
|
||||
"jsonpath-rust 0.7.5",
|
||||
"memchr",
|
||||
"mito-codec",
|
||||
"nalgebra",
|
||||
@@ -2582,12 +2580,10 @@ dependencies = [
|
||||
name = "common-sql"
|
||||
version = "1.0.0-beta.3"
|
||||
dependencies = [
|
||||
"arrow-schema",
|
||||
"common-base",
|
||||
"common-decimal",
|
||||
"common-error",
|
||||
"common-macro",
|
||||
"common-telemetry",
|
||||
"common-time",
|
||||
"datafusion-sql",
|
||||
"datatypes",
|
||||
@@ -5040,7 +5036,6 @@ dependencies = [
|
||||
"common-function",
|
||||
"common-grpc",
|
||||
"common-macro",
|
||||
"common-memory-manager",
|
||||
"common-meta",
|
||||
"common-options",
|
||||
"common-procedure",
|
||||
@@ -9325,9 +9320,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pgwire"
|
||||
version = "0.37.0"
|
||||
version = "0.36.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02d86d57e732d40382ceb9bfea80901d839bae8571aa11c06af9177aed9dfb6c"
|
||||
checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.22.1",
|
||||
@@ -9346,7 +9341,6 @@ dependencies = [
|
||||
"ryu",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"smol_str",
|
||||
"stringprep",
|
||||
"thiserror 2.0.17",
|
||||
"tokio",
|
||||
@@ -11511,11 +11505,10 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
version = "1.0.219"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
@@ -11529,20 +11522,11 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.228"
|
||||
version = "1.0.219"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -11695,7 +11679,6 @@ dependencies = [
|
||||
"common-grpc",
|
||||
"common-macro",
|
||||
"common-mem-prof",
|
||||
"common-memory-manager",
|
||||
"common-meta",
|
||||
"common-plugins",
|
||||
"common-pprof",
|
||||
@@ -12018,16 +12001,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smol_str"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3498b0a27f93ef1402f20eefacfaa1691272ac4eca1cdc8c596cb0a245d6cbf5"
|
||||
dependencies = [
|
||||
"borsh",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "snafu"
|
||||
version = "0.7.5"
|
||||
@@ -12233,7 +12206,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "sqlparser"
|
||||
version = "0.58.0"
|
||||
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=a0ce2bc6eb3e804532932f39833c32432f5c9a39#a0ce2bc6eb3e804532932f39833c32432f5c9a39"
|
||||
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=4b519a5caa95472cc3988f5556813a583dd35af1#4b519a5caa95472cc3988f5556813a583dd35af1"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
@@ -12257,7 +12230,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "sqlparser_derive"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=a0ce2bc6eb3e804532932f39833c32432f5c9a39#a0ce2bc6eb3e804532932f39833c32432f5c9a39"
|
||||
source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=4b519a5caa95472cc3988f5556813a583dd35af1#4b519a5caa95472cc3988f5556813a583dd35af1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -12488,7 +12461,6 @@ dependencies = [
|
||||
"common-config",
|
||||
"common-error",
|
||||
"common-macro",
|
||||
"common-memory-manager",
|
||||
"common-meta",
|
||||
"common-options",
|
||||
"common-procedure",
|
||||
@@ -13191,7 +13163,6 @@ dependencies = [
|
||||
"common-event-recorder",
|
||||
"common-frontend",
|
||||
"common-grpc",
|
||||
"common-memory-manager",
|
||||
"common-meta",
|
||||
"common-procedure",
|
||||
"common-query",
|
||||
|
||||
@@ -103,7 +103,6 @@ aquamarine = "0.6"
|
||||
arrow = { version = "56.2", features = ["prettyprint"] }
|
||||
arrow-array = { version = "56.2", default-features = false, features = ["chrono-tz"] }
|
||||
arrow-buffer = "56.2"
|
||||
arrow-cast = "56.2"
|
||||
arrow-flight = "56.2"
|
||||
arrow-ipc = { version = "56.2", default-features = false, features = ["lz4", "zstd"] }
|
||||
arrow-schema = { version = "56.2", features = ["serde"] }
|
||||
@@ -333,7 +332,7 @@ datafusion-physical-plan = { git = "https://github.com/GreptimeTeam/datafusion.g
|
||||
datafusion-datasource = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-sql = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
datafusion-substrait = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "fd4b2abcf3c3e43e94951bda452c9fd35243aab0" }
|
||||
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "a0ce2bc6eb3e804532932f39833c32432f5c9a39" } # branch = "v0.58.x"
|
||||
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "4b519a5caa95472cc3988f5556813a583dd35af1" } # branch = "v0.58.x"
|
||||
|
||||
[profile.release]
|
||||
debug = 1
|
||||
|
||||
@@ -14,12 +14,11 @@
|
||||
| --- | -----| ------- | ----------- |
|
||||
| `default_timezone` | String | Unset | The default timezone of the server. |
|
||||
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
|
||||
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
|
||||
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
|
||||
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
|
||||
| `max_concurrent_queries` | Integer | `0` | The maximum number of concurrent queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
|
||||
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
|
||||
| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
|
||||
| `runtime` | -- | -- | The runtime options. |
|
||||
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
|
||||
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
|
||||
@@ -27,12 +26,14 @@
|
||||
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
|
||||
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
|
||||
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
|
||||
| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `http.enable_cors` | Bool | `true` | HTTP CORS support; it's turned on by default.<br/>This allows browsers to access HTTP APIs without CORS restrictions. |
|
||||
| `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
|
||||
| `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not validate strings. |
|
||||
| `grpc` | -- | -- | The gRPC server options. |
|
||||
| `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
|
||||
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
|
||||
| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
|
||||
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
|
||||
| `grpc.tls.mode` | String | `disable` | TLS mode. |
|
||||
@@ -226,8 +227,7 @@
|
||||
| --- | -----| ------- | ----------- |
|
||||
| `default_timezone` | String | Unset | The default timezone of the server. |
|
||||
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
|
||||
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
|
||||
| `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. |
|
||||
| `runtime` | -- | -- | The runtime options. |
|
||||
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
|
||||
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
|
||||
@@ -238,6 +238,7 @@
|
||||
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
|
||||
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
|
||||
| `http.body_limit` | String | `64MB` | HTTP request body limit.<br/>The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.<br/>Set to 0 to disable limit. |
|
||||
| `http.max_total_body_memory` | String | Unset | Maximum total memory for all concurrent HTTP request bodies.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `http.enable_cors` | Bool | `true` | HTTP CORS support; it's turned on by default.<br/>This allows browsers to access HTTP APIs without CORS restrictions. |
|
||||
| `http.cors_allowed_origins` | Array | Unset | Customize allowed origins for HTTP CORS. |
|
||||
| `http.prom_validation_mode` | String | `strict` | Whether to enable validation for Prometheus remote write requests.<br/>Available options:<br/>- strict: deny invalid UTF-8 strings (default).<br/>- lossy: allow invalid UTF-8 strings, replace invalid characters with REPLACEMENT_CHARACTER(U+FFFD).<br/>- unchecked: do not validate strings. |
|
||||
@@ -245,6 +246,7 @@
|
||||
| `grpc.bind_addr` | String | `127.0.0.1:4001` | The address to bind the gRPC server. |
|
||||
| `grpc.server_addr` | String | `127.0.0.1:4001` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
|
||||
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
|
||||
| `grpc.max_total_message_memory` | String | Unset | Maximum total memory for all concurrent gRPC request messages.<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `grpc.flight_compression` | String | `arrow_ipc` | Compression mode for frontend side Arrow IPC service. Available options:<br/>- `none`: disable all compression<br/>- `transport`: only enable gRPC transport compression (zstd)<br/>- `arrow_ipc`: only enable Arrow IPC compression (lz4)<br/>- `all`: enable all compression.<br/>Default to `none` |
|
||||
| `grpc.max_connection_age` | String | Unset | The maximum connection age for gRPC connection.<br/>The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.<br/>Refer to https://grpc.io/docs/guides/keepalive/ for more details. |
|
||||
| `grpc.tls` | -- | -- | gRPC server TLS options, see `mysql.tls` section. |
|
||||
@@ -344,10 +346,10 @@
|
||||
| `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. |
|
||||
| `backend` | String | `etcd_store` | The datastore for meta server.<br/>Available values:<br/>- `etcd_store` (default value)<br/>- `memory_store`<br/>- `postgres_store`<br/>- `mysql_store` |
|
||||
| `meta_table_name` | String | `greptime_metakv` | Table name in RDS to store metadata. Effect when using a RDS kvbackend.<br/>**Only used when backend is `postgres_store`.** |
|
||||
| `meta_schema_name` | String | `greptime_schema` | Optional PostgreSQL schema for metadata table and election table name qualification.<br/>When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),<br/>set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.<br/>**Only used when backend is `postgres_store`.** |
|
||||
| `auto_create_schema` | Bool | `true` | Automatically create PostgreSQL schema if it doesn't exist.<br/>When enabled, the system will execute `CREATE SCHEMA IF NOT EXISTS <schema_name>`<br/>before creating metadata tables. This is useful in production environments where<br/>manual schema creation may be restricted.<br/>Default is true.<br/>Note: The PostgreSQL user must have CREATE SCHEMA permission for this to work.<br/>**Only used when backend is `postgres_store`.** |
|
||||
| `meta_schema_name` | String | `greptime_schema` | Optional PostgreSQL schema for metadata table and election table name qualification.<br/>When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),<br/>set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.<br/>GreptimeDB will NOT create the schema automatically; please ensure it exists or the user has permission.<br/>**Only used when backend is `postgres_store`.** |
|
||||
| `meta_election_lock_id` | Integer | `1` | Advisory lock id in PostgreSQL for election. Effect when using PostgreSQL as kvbackend<br/>Only used when backend is `postgres_store`. |
|
||||
| `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
|
||||
| `use_memory_store` | Bool | `false` | Store data in memory. |
|
||||
| `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running on cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
|
||||
| `region_failure_detector_initialization_delay` | String | `10m` | The delay before starting region failure detection.<br/>This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.<br/>Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled. |
|
||||
| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.<br/>**This option is not recommended to be set to true, because it may lead to data loss during failover.** |
|
||||
|
||||
@@ -6,15 +6,9 @@ default_timezone = "UTC"
|
||||
## @toml2docs:none-default
|
||||
default_column_prefix = "greptime"
|
||||
|
||||
## Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## The maximum in-flight write bytes.
|
||||
## @toml2docs:none-default
|
||||
#+ max_in_flight_write_bytes = "1GB"
|
||||
|
||||
## Policy when write bytes quota is exhausted.
|
||||
## Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail"
|
||||
## @toml2docs:none-default
|
||||
#+ write_bytes_exhausted_policy = "wait"
|
||||
#+ max_in_flight_write_bytes = "500MB"
|
||||
|
||||
## The runtime options.
|
||||
#+ [runtime]
|
||||
@@ -41,6 +35,10 @@ timeout = "0s"
|
||||
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
|
||||
## Set to 0 to disable limit.
|
||||
body_limit = "64MB"
|
||||
## Maximum total memory for all concurrent HTTP request bodies.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_body_memory = "1GB"
|
||||
## HTTP CORS support; it's turned on by default.
## This allows browsers to access HTTP APIs without CORS restrictions.
|
||||
enable_cors = true
|
||||
@@ -64,6 +62,10 @@ bind_addr = "127.0.0.1:4001"
|
||||
server_addr = "127.0.0.1:4001"
|
||||
## The number of server worker threads.
|
||||
runtime_size = 8
|
||||
## Maximum total memory for all concurrent gRPC request messages.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_message_memory = "1GB"
|
||||
## Compression mode for frontend side Arrow IPC service. Available options:
|
||||
## - `none`: disable all compression
|
||||
## - `transport`: only enable gRPC transport compression (zstd)
|
||||
|
||||
@@ -34,17 +34,10 @@ meta_table_name = "greptime_metakv"
|
||||
## Optional PostgreSQL schema for metadata table and election table name qualification.
|
||||
## When PostgreSQL public schema is not writable (e.g., PostgreSQL 15+ with restricted public),
|
||||
## set this to a writable schema. GreptimeDB will use `meta_schema_name`.`meta_table_name`.
|
||||
## GreptimeDB will NOT create the schema automatically; please ensure it exists or the user has permission.
|
||||
## **Only used when backend is `postgres_store`.**
|
||||
meta_schema_name = "greptime_schema"
|
||||
|
||||
## Automatically create PostgreSQL schema if it doesn't exist.
|
||||
## When enabled, the system will execute `CREATE SCHEMA IF NOT EXISTS <schema_name>`
|
||||
## before creating metadata tables. This is useful in production environments where
|
||||
## manual schema creation may be restricted.
|
||||
## Default is true.
|
||||
## Note: The PostgreSQL user must have CREATE SCHEMA permission for this to work.
|
||||
## **Only used when backend is `postgres_store`.**
|
||||
auto_create_schema = true
|
||||
meta_schema_name = "greptime_schema"
|
||||
|
||||
## Advisory lock id in PostgreSQL for election. Effect when using PostgreSQL as kvbackend
|
||||
## Only used when backend is `postgres_store`.
|
||||
@@ -57,6 +50,9 @@ meta_election_lock_id = 1
|
||||
## For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector".
|
||||
selector = "round_robin"
|
||||
|
||||
## Store data in memory.
|
||||
use_memory_store = false
|
||||
|
||||
## Whether to enable region failover.
|
||||
## This feature is only available on GreptimeDB running on cluster mode and
|
||||
## - Using Remote WAL
|
||||
|
||||
@@ -6,16 +6,6 @@ default_timezone = "UTC"
|
||||
## @toml2docs:none-default
|
||||
default_column_prefix = "greptime"
|
||||
|
||||
## Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_in_flight_write_bytes = "1GB"
|
||||
|
||||
## Policy when write bytes quota is exhausted.
|
||||
## Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail"
|
||||
## @toml2docs:none-default
|
||||
#+ write_bytes_exhausted_policy = "wait"
|
||||
|
||||
## Initialize all regions in the background during the startup.
|
||||
## By default, it provides services after all regions have been initialized.
|
||||
init_regions_in_background = false
|
||||
@@ -32,6 +22,10 @@ max_concurrent_queries = 0
|
||||
## Enable telemetry to collect anonymous usage data. Enabled by default.
|
||||
#+ enable_telemetry = true
|
||||
|
||||
## The maximum in-flight write bytes.
|
||||
## @toml2docs:none-default
|
||||
#+ max_in_flight_write_bytes = "500MB"
|
||||
|
||||
## The runtime options.
|
||||
#+ [runtime]
|
||||
## The number of threads to execute the runtime for global read operations.
|
||||
@@ -49,6 +43,10 @@ timeout = "0s"
|
||||
## The following units are supported: `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`.
|
||||
## Set to 0 to disable limit.
|
||||
body_limit = "64MB"
|
||||
## Maximum total memory for all concurrent HTTP request bodies.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_body_memory = "1GB"
|
||||
## HTTP CORS support; it's turned on by default.
## This allows browsers to access HTTP APIs without CORS restrictions.
|
||||
enable_cors = true
|
||||
@@ -69,6 +67,10 @@ prom_validation_mode = "strict"
|
||||
bind_addr = "127.0.0.1:4001"
|
||||
## The number of server worker threads.
|
||||
runtime_size = 8
|
||||
## Maximum total memory for all concurrent gRPC request messages.
|
||||
## Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
## @toml2docs:none-default
|
||||
#+ max_total_message_memory = "1GB"
|
||||
## The maximum connection age for gRPC connection.
|
||||
## The value can be a human-readable time string. For example: `10m` for ten minutes or `1h` for one hour.
|
||||
## Refer to https://grpc.io/docs/guides/keepalive/ for more details.

@@ -1,94 +0,0 @@
---
Feature Name: Vector Index
Tracking Issue: TBD
Date: 2025-12-04
Author: "TBD"
---

# Summary

Introduce a per-SST approximate nearest neighbor (ANN) index for `VECTOR(dim)` columns with a pluggable engine. USearch HNSW is the initial engine, while the design keeps VSAG (default when linked) and future engines selectable at DDL or alter time and encoded in the index metadata. The index is built alongside SST creation and accelerates `ORDER BY vec_*_distance(column, <literal vector>) LIMIT k` queries, falling back to the existing brute-force path when an index is unavailable or ineligible.

# Motivation

Vector distances are currently computed with nalgebra across all rows (O(N)) before sorting, which does not scale to millions of vectors. An on-disk ANN index with sub-linear search reduces latency and compute cost for common RAG, semantic search, and recommendation workloads without changing SQL.

# Details

## Current Behavior

`VECTOR(dim)` values are stored as binary blobs. Queries call `vec_cos_distance`/`vec_l2sq_distance`/`vec_dot_product` via nalgebra for every row and then sort; there is no indexing or caching.

## Index Eligibility and Configuration

Only `VECTOR(dim)` columns can be indexed. A column metadata flag follows the existing column-option pattern with an intentionally small surface area:

- `engine`: `vsag` (default when the binding is built) or `usearch`. If a configured engine is unavailable at runtime, the builder logs and falls back to `usearch` while leaving the option intact for future rebuilds.
- `metric`: `cosine` (default), `l2sq`, or `dot`; mismatches with query functions force brute-force execution.
- `m`: HNSW graph connectivity (higher = denser graph, more memory, better recall), default `16`.
- `ef_construct`: build-time expansion, default `128`.
- `ef_search`: query-time expansion, default `64`; engines may clamp values.

Option semantics mirror HNSW defaults so both USearch and VSAG can honor them; engine-specific tunables stay in reserved key-value pairs inside the blob header for forward compatibility.
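
As a rough illustration of how this small option surface could be modeled, the sketch below mirrors the keys and defaults listed above; the type and field names are assumptions for this RFC, not existing GreptimeDB definitions.

```rust
/// Illustrative representation of the vector index column options above.
/// Names are hypothetical; only the option keys and defaults come from this RFC.
#[derive(Debug, Clone, PartialEq)]
pub struct VectorIndexOptions {
    pub engine: VectorIndexEngineKind,
    pub metric: VectorDistanceMetric,
    pub m: u32,            // HNSW graph connectivity
    pub ef_construct: u32, // build-time expansion
    pub ef_search: u32,    // query-time expansion
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VectorIndexEngineKind {
    Usearch,
    Vsag,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum VectorDistanceMetric {
    Cosine,
    L2Sq,
    Dot,
}

impl Default for VectorIndexOptions {
    fn default() -> Self {
        Self {
            // `vsag` is the default only when the binding is compiled in;
            // otherwise the builder falls back to `usearch`.
            engine: VectorIndexEngineKind::Vsag,
            metric: VectorDistanceMetric::Cosine,
            m: 16,
            ef_construct: 128,
            ef_search: 64,
        }
    }
}
```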

DDL reuses column extensions similar to inverted/fulltext indexes:

```sql
CREATE TABLE embeddings (
  ts TIMESTAMP TIME INDEX,
  id STRING PRIMARY KEY,
  vec VECTOR(384) VECTOR INDEX WITH (engine = 'vsag', metric = 'cosine', ef_search = 64)
);
```

Altering column options toggles the flag, can switch engines (for example `usearch` -> `vsag`), and triggers rebuilds through the existing alter/compaction flow. Engine choice stays in table metadata and each blob header; new SSTs use the configured engine while older SSTs remain readable under their recorded engine until compaction or a manual rebuild rewrites them.

## Storage and Format

- One vector index per indexed column per SST, stored as a Puffin blob with type `greptime-vector-index-v1`.
- Each blob records the engine (`usearch`, `vsag`, future values) and engine parameters in the header so readers can select the matching decoder. Mixed-engine SSTs remain readable because the engine id travels with the blob.
- USearch uses `f32` vectors and SST row offsets (`u64`) as keys; nulls and `OpType::Delete` rows are skipped. Row ids are the absolute SST ordinal so readers can derive `RowSelection` directly from parquet row group lengths without extra side tables.
- Blob layout (a sketch of the header follows this list):
  - Header: version, column id, dimension, engine id, metric, `m`, `ef_construct`, `ef_search`, and reserved engine-specific key-value pairs.
  - Counts: total rows written and indexed rows.
  - Payload: USearch binary produced by `save_to_buffer`.
- An empty index (no eligible vectors) results in no available index entry for that column.
- `puffin_manager` registers the blob type so caches and readers discover it alongside inverted/fulltext/bloom blobs in the same index file.
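
The blob header implied by this layout could look roughly like the following; the struct name, field widths, and the length-prefixed encoding are illustrative assumptions, not the actual on-disk format.

```rust
/// Hypothetical in-memory form of the `greptime-vector-index-v1` blob header.
/// Field order and widths are illustrative only.
#[derive(Debug, Clone)]
pub struct VectorIndexBlobHeader {
    pub format_version: u16,
    pub column_id: u32,
    pub dimension: u32,
    pub engine_id: u8, // e.g. 0 = usearch, 1 = vsag (assumed encoding)
    pub metric: u8,    // e.g. 0 = cosine, 1 = l2sq, 2 = dot (assumed encoding)
    pub m: u32,
    pub ef_construct: u32,
    pub ef_search: u32,
    /// Reserved engine-specific tunables kept for forward compatibility.
    pub extra: Vec<(String, String)>,
    /// Counts section: rows seen by the writer vs. rows actually indexed.
    pub total_rows: u64,
    pub indexed_rows: u64,
}

impl VectorIndexBlobHeader {
    /// Simple length-prefixed little-endian encoding (illustrative).
    pub fn encode(&self, buf: &mut Vec<u8>) {
        buf.extend_from_slice(&self.format_version.to_le_bytes());
        buf.extend_from_slice(&self.column_id.to_le_bytes());
        buf.extend_from_slice(&self.dimension.to_le_bytes());
        buf.push(self.engine_id);
        buf.push(self.metric);
        buf.extend_from_slice(&self.m.to_le_bytes());
        buf.extend_from_slice(&self.ef_construct.to_le_bytes());
        buf.extend_from_slice(&self.ef_search.to_le_bytes());
        buf.extend_from_slice(&(self.extra.len() as u32).to_le_bytes());
        for (k, v) in &self.extra {
            buf.extend_from_slice(&(k.len() as u32).to_le_bytes());
            buf.extend_from_slice(k.as_bytes());
            buf.extend_from_slice(&(v.len() as u32).to_le_bytes());
            buf.extend_from_slice(v.as_bytes());
        }
        buf.extend_from_slice(&self.total_rows.to_le_bytes());
        buf.extend_from_slice(&self.indexed_rows.to_le_bytes());
    }
}
```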

## Row Visibility and Duplicates

- The indexer increments `row_offset` for every incoming row (including skipped/null/delete rows) so offsets stay aligned with parquet ordering across row groups.
- Only `OpType::Put` rows with the expected dimension are inserted; `OpType::Delete` and malformed rows are skipped but still advance `row_offset`, matching the data plane's visibility rules.
- Multiple versions of the same primary key remain in the graph; the read path intersects search hits with the standard mito2 deduplication/visibility pipeline (sequence-aware dedup, delete filtering, projection) before returning results.
- Searches overfetch beyond `k` to compensate for rows discarded by visibility checks and to avoid reissuing index reads (see the sketch after this list).
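
A minimal sketch of the overfetch-then-filter step, assuming a hypothetical `is_visible` callback standing in for the mito2 dedup/visibility pipeline; the 2x overfetch factor matches the example used later in the read path.

```rust
/// One ANN hit: the absolute SST row offset plus the engine-reported distance.
#[derive(Debug, Clone, Copy)]
pub struct AnnHit {
    pub row_offset: u64,
    pub distance: f32,
}

/// Overfetch from the index, drop rows the visibility pipeline rejects, then
/// trim back to `k`. `is_visible` is a stand-in for sequence-aware dedup,
/// delete filtering, and projection checks performed by the read path.
pub fn visible_top_k(
    mut search: impl FnMut(usize) -> Vec<AnnHit>,
    mut is_visible: impl FnMut(u64) -> bool,
    k: usize,
) -> Vec<AnnHit> {
    // Overfetch (2x here) so that rows filtered out later rarely force a
    // second index read; callers may retry with a larger factor if needed.
    let mut hits = search(k.saturating_mul(2));
    hits.retain(|hit| is_visible(hit.row_offset));
    // Engines usually return hits ordered by distance, but sort defensively
    // before trimming so the final top-k is deterministic.
    hits.sort_by(|a, b| a.distance.total_cmp(&b.distance));
    hits.truncate(k);
    hits
}
```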

## Build Path (mito2 write)

Extend `sst::index::Indexer` to optionally create a `VectorIndexer` when region metadata marks a column as vector-indexed, mirroring how inverted/fulltext/bloom filters attach to `IndexerBuilderImpl` in `mito2`.

The indexer consumes `Batch`/`RecordBatch` data and shares memory tracking and abort semantics with existing indexers:

- Maintain a running `row_offset` that follows SST write order and spans row groups so the search result can be turned into `RowSelection`.
- For each `OpType::Put`, if the vector is non-null and matches the declared dimension, insert into USearch with `row_offset` as the key; otherwise skip.
- Track memory with existing index build metrics; on failure, abort only the vector index while keeping SST writing unaffected.

Engine selection is table-driven: the builder picks the configured engine (default `vsag`, fallback `usearch` if `vsag` is not compiled in) and dispatches to the matching implementation. Unknown engines skip index build with a warning.

On `finish`, serialize the engine-tagged index into the Puffin writer and record `IndexType::Vector` metadata for the column. `IndexOutput` and `FileMeta::indexes/available_indexes` gain a vector entry so manifest updates and `RegionVersion` surface per-column presence, following patterns used by inverted/fulltext/bloom indexes. Planner/metadata validation ensures that mismatched dimensions only reduce the indexed-row count and do not break reads.
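
The sketch below illustrates the pluggable-engine boundary and the update loop described in this section. The trait, the `VectorIndexer` shape, and the row accessors are hypothetical stand-ins; the real interfaces live in `mito2`/`index` and the engine bindings.

```rust
/// Hypothetical engine abstraction; the RFC only requires that USearch, VSAG,
/// and future engines can be dispatched behind one interface.
pub trait VectorIndexEngine: Send {
    /// Insert one vector keyed by its absolute SST row offset.
    fn add(&mut self, row_offset: u64, vector: &[f32]) -> Result<(), String>;
    /// Serialize the engine-specific payload (e.g. USearch `save_to_buffer`).
    fn finish(self: Box<Self>) -> Result<Vec<u8>, String>;
}

/// Stand-ins for what real `Batch` iteration yields: an op type plus an
/// optional vector value.
pub enum OpType {
    Put,
    Delete,
}

pub struct VectorRow<'a> {
    pub op: OpType,
    pub vector: Option<&'a [f32]>,
}

/// Illustrative per-SST vector indexer.
pub struct VectorIndexer {
    engine: Box<dyn VectorIndexEngine>,
    dimension: usize,
    row_offset: u64, // advances for every row, indexed or not
    total_rows: u64,
    indexed_rows: u64,
}

impl VectorIndexer {
    pub fn update<'a>(
        &mut self,
        rows: impl Iterator<Item = VectorRow<'a>>,
    ) -> Result<(), String> {
        for row in rows {
            // Only puts with the declared dimension are indexed, but the
            // offset always advances so keys stay aligned with parquet order.
            if let (OpType::Put, Some(vector)) = (&row.op, row.vector) {
                if vector.len() == self.dimension {
                    self.engine.add(self.row_offset, vector)?;
                    self.indexed_rows += 1;
                }
            }
            self.row_offset += 1;
            self.total_rows += 1;
        }
        Ok(())
    }
}
```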

## Read Path (mito2 query)

A planner rule in `query` identifies eligible plans on mito2 tables: a single `ORDER BY vec_cos_distance|vec_l2sq_distance|vec_dot_product(<vector column>, <literal vector>)` in ascending order plus a `LIMIT`/`TopK`. The rule rejects plans with multiple sort keys, non-literal query vectors, or additional projections that would change the distance expression and falls back to brute-force in those cases.

For eligible scans, build a `VectorIndexScan` execution node that:

- Consults SST metadata for `IndexType::Vector`, loads the index via Puffin using the existing `mito2::cache::index` infrastructure, and dispatches to the engine declared in the blob header (USearch/VSAG/etc.).
- Runs the engine's `search` with an overfetch (for example 2×k) to tolerate rows filtered by deletes, dimension mismatches, or late-stage dedup; keys already match SST row offsets produced by the writer.
- Converts hits to `RowSelection` using parquet row group lengths and reuses the parquet reader so visibility, projection, and deduplication logic stay unchanged; distances are recomputed with `vec_*_distance` before the final trim to k to guarantee ordering and to merge distributed partial results deterministically.

Any unsupported shape, load error, or cache miss falls back to the current brute-force execution path.
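
To make the offset-to-`RowSelection` conversion concrete, here is a minimal sketch that turns sorted, deduplicated row offsets into per-row-group skip/select runs. It uses simple local types rather than the actual parquet `RowSelection` API, which the real implementation would target.

```rust
/// A run of rows to skip or select inside one row group, analogous to the
/// selectors the parquet reader consumes.
#[derive(Debug, PartialEq)]
pub enum RowRun {
    Skip(usize),
    Select(usize),
}

/// Convert sorted, deduplicated absolute SST row offsets into per-row-group
/// runs. `row_group_lengths` comes from the parquet metadata; hits beyond the
/// file are ignored. Adjacent `Select(1)` runs are left unmerged for clarity.
pub fn offsets_to_row_runs(hits: &[u64], row_group_lengths: &[usize]) -> Vec<Vec<RowRun>> {
    let mut runs = Vec::with_capacity(row_group_lengths.len());
    let mut group_start = 0u64;
    let mut idx = 0usize;
    for &len in row_group_lengths {
        let group_end = group_start + len as u64;
        let mut group_runs = Vec::new();
        let mut cursor = group_start;
        while idx < hits.len() && hits[idx] < group_end {
            let hit = hits[idx];
            if hit > cursor {
                group_runs.push(RowRun::Skip((hit - cursor) as usize));
            }
            group_runs.push(RowRun::Select(1));
            cursor = hit + 1;
            idx += 1;
        }
        if cursor < group_end {
            group_runs.push(RowRun::Skip((group_end - cursor) as usize));
        }
        runs.push(group_runs);
        group_start = group_end;
    }
    runs
}
```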

## Lifecycle and Maintenance

Lifecycle piggybacks on the existing SST/index flow: rebuilds run where other secondary indexes do, graphs are always rebuilt from source rows (no HNSW merge), and cleanup/versioning/caching reuse the existing Puffin and index cache paths.

# Implementation Plan

1. Add the `usearch` dependency (wrapper module in `index` or `mito2`) and map minimal HNSW options; keep an engine trait that allows plugging VSAG without changing the rest of the pipeline.
2. Introduce `IndexType::Vector` and a column metadata key for vector index options (including `engine`); add SQL parser and `SHOW CREATE TABLE` support for `VECTOR INDEX WITH (...)`.
3. Implement `vector_index` build/read modules under `mito2` (and `index` if shared), including Puffin serialization that records engine id, blob-type registration with `puffin_manager`, and integration with the `Indexer` builder, `IndexOutput`, manifest updates, and compaction rebuild.
4. Extend the query planner/execution to detect eligible plans and drive a `RowSelection`-based ANN scan with a fallback path, dispatching by engine at read time and using existing Puffin and index caches.
5. Add unit tests for serialization/search correctness and an end-to-end test covering plan rewrite, cache usage, engine selection, and fallback; add a mixed-engine test to confirm old USearch blobs still serve after a VSAG switch.
6. Follow up with an optional VSAG engine binding (feature flag), validate parity with USearch on dense vectors, exercise alternative algorithms (for example PQ), and flip the default `engine` to `vsag` when the binding is present.

# Alternatives

- **VSAG (follow-up engine):** C++ library with HNSW and additional algorithms (for example SINDI for sparse vectors and PQ) targeting in-memory and disk-friendly search. Provides parameter generators and a roadmap for GPU-assisted build and graph compression. Compared to FAISS it is newer with fewer integrations but bundles sparse/dense coverage and out-of-core focus in one engine. Fits the pluggable-engine design and would become the default `engine = 'vsag'` when linked; USearch remains available for lighter dependencies.
- **FAISS:** Broad feature set (IVF/IVFPQ/PQ/HNSW, GPU acceleration, scalar filtering, pre/post filters) and battle-tested performance across datasets, but it requires a heavier C++/GPU toolchain, has no official Rust binding, and is less disk-centric than VSAG; integrating it would add more build/distribution burden than USearch/VSAG.
- **Do nothing:** Keep brute-force evaluation, which remains O(N) and unacceptable at scale.

@@ -61,12 +61,6 @@ pub struct StoreConfig {
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
#[clap(long)]
|
||||
pub meta_schema_name: Option<String>,
|
||||
|
||||
/// Automatically create PostgreSQL schema if it doesn't exist (default: true).
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
#[clap(long, default_value_t = true)]
|
||||
pub auto_create_schema: bool,
|
||||
|
||||
/// TLS mode for backend store connections (etcd, PostgreSQL, MySQL)
|
||||
#[clap(long = "backend-tls-mode", value_enum, default_value = "disable")]
|
||||
pub backend_tls_mode: TlsMode,
|
||||
@@ -144,7 +138,6 @@ impl StoreConfig {
|
||||
schema_name,
|
||||
table_name,
|
||||
max_txn_ops,
|
||||
self.auto_create_schema,
|
||||
)
|
||||
.await
|
||||
.map_err(BoxedError::new)?)
|
||||
|
||||
@@ -155,6 +155,8 @@ pub struct StartCommand {
|
||||
#[clap(short, long)]
|
||||
selector: Option<String>,
|
||||
#[clap(long)]
|
||||
use_memory_store: Option<bool>,
|
||||
#[clap(long)]
|
||||
enable_region_failover: Option<bool>,
|
||||
#[clap(long)]
|
||||
http_addr: Option<String>,
|
||||
@@ -184,6 +186,7 @@ impl Debug for StartCommand {
|
||||
.field("store_addrs", &self.sanitize_store_addrs())
|
||||
.field("config_file", &self.config_file)
|
||||
.field("selector", &self.selector)
|
||||
.field("use_memory_store", &self.use_memory_store)
|
||||
.field("enable_region_failover", &self.enable_region_failover)
|
||||
.field("http_addr", &self.http_addr)
|
||||
.field("http_timeout", &self.http_timeout)
|
||||
@@ -265,6 +268,10 @@ impl StartCommand {
|
||||
.context(error::UnsupportedSelectorTypeSnafu { selector_type })?;
|
||||
}
|
||||
|
||||
if let Some(use_memory_store) = self.use_memory_store {
|
||||
opts.use_memory_store = use_memory_store;
|
||||
}
|
||||
|
||||
if let Some(enable_region_failover) = self.enable_region_failover {
|
||||
opts.enable_region_failover = enable_region_failover;
|
||||
}
|
||||
@@ -384,6 +391,7 @@ mod tests {
|
||||
server_addr = "127.0.0.1:3002"
|
||||
store_addr = "127.0.0.1:2379"
|
||||
selector = "LeaseBased"
|
||||
use_memory_store = false
|
||||
|
||||
[logging]
|
||||
level = "debug"
|
||||
@@ -462,6 +470,7 @@ mod tests {
|
||||
server_addr = "127.0.0.1:3002"
|
||||
datanode_lease_secs = 15
|
||||
selector = "LeaseBased"
|
||||
use_memory_store = false
|
||||
|
||||
[http]
|
||||
addr = "127.0.0.1:4000"
|
||||
|
||||
@@ -17,7 +17,6 @@ ahash.workspace = true
|
||||
api.workspace = true
|
||||
arc-swap = "1.0"
|
||||
arrow.workspace = true
|
||||
arrow-cast.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
async-trait.workspace = true
|
||||
bincode = "=1.3.3"
|
||||
@@ -47,7 +46,6 @@ geohash = { version = "0.13", optional = true }
|
||||
h3o = { version = "0.6", optional = true }
|
||||
hyperloglogplus = "0.4"
|
||||
jsonb.workspace = true
|
||||
jsonpath-rust = "0.7.5"
|
||||
memchr = "2.7"
|
||||
mito-codec.workspace = true
|
||||
nalgebra.workspace = true
|
||||
|
||||
@@ -13,24 +13,17 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::fmt::{self, Display};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{ArrayRef, BinaryViewArray, StringViewArray, StructArray};
|
||||
use arrow::compute;
|
||||
use arrow::datatypes::{Float64Type, Int64Type, UInt64Type};
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_common::arrow::array::{
|
||||
Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder,
|
||||
StringViewBuilder,
|
||||
};
|
||||
use datafusion_common::arrow::datatypes::DataType;
|
||||
use datafusion_common::{DataFusionError, Result};
|
||||
use datafusion_expr::type_coercion::aggregates::STRINGS;
|
||||
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
|
||||
use datatypes::arrow_array::string_array_value_at_index;
|
||||
use datatypes::json::JsonStructureSettings;
|
||||
use jsonpath_rust::JsonPath;
|
||||
use serde_json::Value;
|
||||
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
|
||||
|
||||
use crate::function::{Function, extract_args};
|
||||
use crate::helper;
|
||||
@@ -165,7 +158,11 @@ impl JsonGetString {
|
||||
impl Default for JsonGetString {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
signature: Signature::any(2, Volatility::Immutable),
|
||||
// TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type.
|
||||
signature: helper::one_of_sigs2(
|
||||
vec![DataType::Binary, DataType::BinaryView],
|
||||
vec![DataType::Utf8, DataType::Utf8View],
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -175,7 +172,7 @@ impl Function for JsonGetString {
|
||||
Self::NAME
|
||||
}
|
||||
|
||||
fn return_type(&self, _: &[DataType]) -> Result<DataType> {
|
||||
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
|
||||
Ok(DataType::Utf8View)
|
||||
}
|
||||
|
||||
@@ -183,203 +180,33 @@ impl Function for JsonGetString {
|
||||
&self.signature
|
||||
}
|
||||
|
||||
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
|
||||
fn invoke_with_args(
|
||||
&self,
|
||||
args: ScalarFunctionArgs,
|
||||
) -> datafusion_common::Result<ColumnarValue> {
|
||||
let [arg0, arg1] = extract_args(self.name(), &args)?;
|
||||
|
||||
let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
|
||||
let jsons = arg0.as_binary_view();
|
||||
let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
|
||||
let paths = arg1.as_string_view();
|
||||
|
||||
let result = match arg0.data_type() {
|
||||
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
|
||||
let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
|
||||
let jsons = arg0.as_binary_view();
|
||||
jsonb_get_string(jsons, paths)?
|
||||
}
|
||||
DataType::Struct(_) => {
|
||||
let jsons = arg0.as_struct();
|
||||
json_struct_get_string(jsons, paths)?
|
||||
}
|
||||
_ => {
|
||||
return Err(DataFusionError::Execution(format!(
|
||||
"{} not supported argument type {}",
|
||||
Self::NAME,
|
||||
arg0.data_type(),
|
||||
)));
|
||||
}
|
||||
};
|
||||
let size = jsons.len();
|
||||
let mut builder = StringViewBuilder::with_capacity(size);
|
||||
|
||||
Ok(ColumnarValue::Array(result))
|
||||
}
|
||||
}
|
||||
|
||||
fn jsonb_get_string(jsons: &BinaryViewArray, paths: &StringViewArray) -> Result<ArrayRef> {
|
||||
let size = jsons.len();
|
||||
let mut builder = StringViewBuilder::with_capacity(size);
|
||||
|
||||
for i in 0..size {
|
||||
let json = jsons.is_valid(i).then(|| jsons.value(i));
|
||||
let path = paths.is_valid(i).then(|| paths.value(i));
|
||||
let result = match (json, path) {
|
||||
(Some(json), Some(path)) => {
|
||||
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
builder.append_option(result);
|
||||
}
|
||||
|
||||
Ok(Arc::new(builder.finish()))
|
||||
}
|
||||
|
||||
fn json_struct_get_string(jsons: &StructArray, paths: &StringViewArray) -> Result<ArrayRef> {
|
||||
let size = jsons.len();
|
||||
let mut builder = StringViewBuilder::with_capacity(size);
|
||||
|
||||
for i in 0..size {
|
||||
if jsons.is_null(i) || paths.is_null(i) {
|
||||
builder.append_null();
|
||||
continue;
|
||||
}
|
||||
let path = paths.value(i);
|
||||
|
||||
// naively assume the JSON path is our kind of indexing to the field, by removing its "root"
|
||||
let field_path = path.replace("$.", "");
|
||||
let column = jsons.column_by_name(&field_path);
|
||||
|
||||
if let Some(column) = column {
|
||||
if let Some(v) = string_array_value_at_index(column, i) {
|
||||
builder.append_value(v);
|
||||
} else {
|
||||
builder.append_value(arrow_cast::display::array_value_to_string(column, i)?);
|
||||
}
|
||||
} else {
|
||||
let Some(raw) = jsons
|
||||
.column_by_name(JsonStructureSettings::RAW_FIELD)
|
||||
.and_then(|x| string_array_value_at_index(x, i))
|
||||
else {
|
||||
builder.append_null();
|
||||
continue;
|
||||
};
|
||||
|
||||
let path: JsonPath<Value> = JsonPath::try_from(path).map_err(|e| {
|
||||
DataFusionError::Execution(format!("{path} is not a valid JSON path: {e}"))
|
||||
})?;
|
||||
// the wanted field is not retrievable from the JSON struct columns directly, we have
|
||||
// to combine everything (columns and the "_raw") into a complete JSON value to find it
|
||||
let value = json_struct_to_value(raw, jsons, i)?;
|
||||
|
||||
match path.find(&value) {
|
||||
Value::Null => builder.append_null(),
|
||||
Value::Array(values) => match values.as_slice() {
|
||||
[] => builder.append_null(),
|
||||
[x] => {
|
||||
if let Some(s) = x.as_str() {
|
||||
builder.append_value(s)
|
||||
} else {
|
||||
builder.append_value(x.to_string())
|
||||
}
|
||||
}
|
||||
x => builder.append_value(
|
||||
x.iter()
|
||||
.map(|v| v.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", "),
|
||||
),
|
||||
},
|
||||
// Safety: guarded by the returns of `path.find` as documented
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Arc::new(builder.finish()))
|
||||
}
|
||||
|
||||
fn json_struct_to_value(raw: &str, jsons: &StructArray, i: usize) -> Result<Value> {
|
||||
let Ok(mut json) = Value::from_str(raw) else {
|
||||
return Err(DataFusionError::Internal(format!(
|
||||
"inner field '{}' is not a valid JSON string",
|
||||
JsonStructureSettings::RAW_FIELD
|
||||
)));
|
||||
};
|
||||
|
||||
for (column_name, column) in jsons.column_names().into_iter().zip(jsons.columns()) {
|
||||
if column_name == JsonStructureSettings::RAW_FIELD {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (json_pointer, field) = if let Some((json_object, field)) = column_name.rsplit_once(".")
|
||||
{
|
||||
let json_pointer = format!("/{}", json_object.replace(".", "/"));
|
||||
(json_pointer, field)
|
||||
} else {
|
||||
("".to_string(), column_name)
|
||||
};
|
||||
let Some(json_object) = json
|
||||
.pointer_mut(&json_pointer)
|
||||
.and_then(|x| x.as_object_mut())
|
||||
else {
|
||||
return Err(DataFusionError::Internal(format!(
|
||||
"value at JSON pointer '{}' is not an object",
|
||||
json_pointer
|
||||
)));
|
||||
};
|
||||
|
||||
macro_rules! insert {
|
||||
($column: ident, $i: ident, $json_object: ident, $field: ident) => {{
|
||||
if let Some(value) = $column
|
||||
.is_valid($i)
|
||||
.then(|| serde_json::Value::from($column.value($i)))
|
||||
{
|
||||
$json_object.insert($field.to_string(), value);
|
||||
for i in 0..size {
|
||||
let json = jsons.is_valid(i).then(|| jsons.value(i));
|
||||
let path = paths.is_valid(i).then(|| paths.value(i));
|
||||
let result = match (json, path) {
|
||||
(Some(json), Some(path)) => {
|
||||
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
|
||||
}
|
||||
}};
|
||||
_ => None,
|
||||
};
|
||||
builder.append_option(result);
|
||||
}
|
||||
|
||||
match column.data_type() {
|
||||
// boolean => Value::Bool
|
||||
DataType::Boolean => {
|
||||
let column = column.as_boolean();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
// int => Value::Number
|
||||
DataType::Int64 => {
|
||||
let column = column.as_primitive::<Int64Type>();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
DataType::UInt64 => {
|
||||
let column = column.as_primitive::<UInt64Type>();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
DataType::Float64 => {
|
||||
let column = column.as_primitive::<Float64Type>();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
// string => Value::String
|
||||
DataType::Utf8 => {
|
||||
let column = column.as_string::<i32>();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
DataType::LargeUtf8 => {
|
||||
let column = column.as_string::<i64>();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
DataType::Utf8View => {
|
||||
let column = column.as_string_view();
|
||||
insert!(column, i, json_object, field);
|
||||
}
|
||||
// other => Value::Array and Value::Object
|
||||
_ => {
|
||||
return Err(DataFusionError::NotImplemented(format!(
|
||||
"{} is not yet supported to be executed with field {} of datatype {}",
|
||||
JsonGetString::NAME,
|
||||
column_name,
|
||||
column.data_type()
|
||||
)));
|
||||
}
|
||||
}
|
||||
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
|
||||
}
|
||||
Ok(json)
|
||||
}
|
||||
|
||||
impl Display for JsonGetString {
|
||||
@@ -469,13 +296,11 @@ impl Display for JsonGetObject {
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{Float64Array, Int64Array, StructArray};
|
||||
use arrow_schema::Field;
|
||||
use datafusion_common::ScalarValue;
|
||||
use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray};
|
||||
use datafusion_common::arrow::datatypes::{Float64Type, Int64Type};
|
||||
use datatypes::types::parse_string_to_jsonb;
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -649,123 +474,42 @@ mod tests {
|
||||
r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#,
|
||||
r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#,
|
||||
];
|
||||
let paths = vec!["$.a.b", "$.a", ""];
|
||||
let results = [Some("a"), Some("d"), None];
|
||||
|
||||
// complete JSON is:
|
||||
// {
|
||||
// "kind": "foo",
|
||||
// "payload": {
|
||||
// "code": 404,
|
||||
// "success": false,
|
||||
// "result": {
|
||||
// "error": "not found",
|
||||
// "time_cost": 1.234
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
let json_struct: ArrayRef = Arc::new(StructArray::new(
|
||||
vec![
|
||||
Field::new("kind", DataType::Utf8, true),
|
||||
Field::new("payload.code", DataType::Int64, true),
|
||||
Field::new("payload.result.time_cost", DataType::Float64, true),
|
||||
Field::new(JsonStructureSettings::RAW_FIELD, DataType::Utf8View, true),
|
||||
]
|
||||
.into(),
|
||||
vec![
|
||||
Arc::new(StringArray::from_iter([Some("foo")])) as ArrayRef,
|
||||
Arc::new(Int64Array::from_iter([Some(404)])),
|
||||
Arc::new(Float64Array::from_iter([Some(1.234)])),
|
||||
Arc::new(StringViewArray::from_iter([Some(
|
||||
json! ({
|
||||
"payload": {
|
||||
"success": false,
|
||||
"result": {
|
||||
"error": "not found"
|
||||
}
|
||||
}
|
||||
})
|
||||
.to_string(),
|
||||
)])),
|
||||
],
|
||||
None,
|
||||
));
|
||||
|
||||
let paths = vec![
|
||||
"$.a.b",
|
||||
"$.a",
|
||||
"",
|
||||
"$.kind",
|
||||
"$.payload.code",
|
||||
"$.payload.result.time_cost",
|
||||
"$.payload",
|
||||
"$.payload.success",
|
||||
"$.payload.result",
|
||||
"$.payload.result.error",
|
||||
"$.payload.result.not-exists",
|
||||
"$.payload.not-exists",
|
||||
"$.not-exists",
|
||||
"$",
|
||||
];
|
||||
let expects = [
|
||||
Some("a"),
|
||||
Some("d"),
|
||||
None,
|
||||
Some("foo"),
|
||||
Some("404"),
|
||||
Some("1.234"),
|
||||
Some(
|
||||
r#"{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}"#,
|
||||
),
|
||||
Some("false"),
|
||||
Some(r#"{"error":"not found","time_cost":1.234}"#),
|
||||
Some("not found"),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
Some(
|
||||
r#"{"kind":"foo","payload":{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}}"#,
|
||||
),
|
||||
];
|
||||
|
||||
let mut jsons = json_strings
|
||||
let jsonbs = json_strings
|
||||
.iter()
|
||||
.map(|s| {
|
||||
let value = jsonb::parse_value(s.as_bytes()).unwrap();
|
||||
Arc::new(BinaryArray::from_iter_values([value.to_vec()])) as ArrayRef
|
||||
value.to_vec()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let json_struct_arrays =
|
||||
std::iter::repeat_n(json_struct, expects.len() - jsons.len()).collect::<Vec<_>>();
|
||||
jsons.extend(json_struct_arrays);
|
||||
|
||||
for i in 0..jsons.len() {
|
||||
let json = &jsons[i];
|
||||
let path = paths[i];
|
||||
let expect = expects[i];
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(Arc::new(BinaryArray::from_iter_values(jsonbs))),
|
||||
ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
|
||||
],
|
||||
arg_fields: vec![],
|
||||
number_rows: 3,
|
||||
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
|
||||
config_options: Arc::new(Default::default()),
|
||||
};
|
||||
let result = json_get_string
|
||||
.invoke_with_args(args)
|
||||
.and_then(|x| x.to_array(3))
|
||||
.unwrap();
|
||||
let vector = result.as_string_view();
|
||||
|
||||
let args = ScalarFunctionArgs {
|
||||
args: vec![
|
||||
ColumnarValue::Array(json.clone()),
|
||||
ColumnarValue::Scalar(path.into()),
|
||||
],
|
||||
arg_fields: vec![],
|
||||
number_rows: 1,
|
||||
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
|
||||
config_options: Arc::new(Default::default()),
|
||||
};
|
||||
let result = json_get_string
|
||||
.invoke_with_args(args)
|
||||
.and_then(|x| x.to_array(1))
|
||||
.unwrap();
|
||||
|
||||
let result = result.as_string_view();
|
||||
assert_eq!(1, result.len());
|
||||
let actual = result.is_valid(0).then(|| result.value(0));
|
||||
assert_eq!(actual, expect);
|
||||
assert_eq!(3, vector.len());
|
||||
for (i, gt) in results.iter().enumerate() {
|
||||
let result = vector.is_valid(i).then(|| vector.value(i));
|
||||
assert_eq!(*gt, result);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_get_object() -> Result<()> {
|
||||
fn test_json_get_object() -> datafusion_common::Result<()> {
|
||||
let udf = JsonGetObject::default();
|
||||
assert_eq!("json_get_object", udf.name());
|
||||
assert_eq!(
|
||||
|
||||
@@ -37,12 +37,6 @@ pub struct MemoryManager<M: MemoryMetrics> {
|
||||
quota: Option<MemoryQuota<M>>,
|
||||
}
|
||||
|
||||
impl<M: MemoryMetrics + Default> Default for MemoryManager<M> {
|
||||
fn default() -> Self {
|
||||
Self::new(0, M::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct MemoryQuota<M: MemoryMetrics> {
|
||||
pub(crate) semaphore: Arc<Semaphore>,
|
||||
|
||||
@@ -514,22 +514,6 @@ impl Display for GcRegionsReply {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct EnterStagingRegion {
|
||||
pub region_id: RegionId,
|
||||
pub partition_expr: String,
|
||||
}
|
||||
|
||||
impl Display for EnterStagingRegion {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"EnterStagingRegion(region_id={}, partition_expr={})",
|
||||
self.region_id, self.partition_expr
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Display, PartialEq)]
|
||||
pub enum Instruction {
|
||||
/// Opens regions.
|
||||
@@ -557,8 +541,6 @@ pub enum Instruction {
|
||||
GcRegions(GcRegions),
|
||||
/// Temporary suspend serving reads or writes
|
||||
Suspend,
|
||||
/// Makes regions enter staging state.
|
||||
EnterStagingRegions(Vec<EnterStagingRegion>),
|
||||
}
|
||||
|
||||
impl Instruction {
|
||||
@@ -615,13 +597,6 @@ impl Instruction {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_enter_staging_regions(self) -> Option<Vec<EnterStagingRegion>> {
|
||||
match self {
|
||||
Self::EnterStagingRegions(enter_staging) => Some(enter_staging),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The reply of [UpgradeRegion].
|
||||
@@ -715,28 +690,6 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
pub struct EnterStagingRegionReply {
|
||||
pub region_id: RegionId,
|
||||
/// Returns true if the region is under the new region rule.
|
||||
pub ready: bool,
|
||||
/// Indicates whether the region exists.
|
||||
pub exists: bool,
|
||||
/// Return error if any during the operation.
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
pub struct EnterStagingRegionsReply {
|
||||
pub replies: Vec<EnterStagingRegionReply>,
|
||||
}
|
||||
|
||||
impl EnterStagingRegionsReply {
|
||||
pub fn new(replies: Vec<EnterStagingRegionReply>) -> Self {
|
||||
Self { replies }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum InstructionReply {
|
||||
@@ -757,7 +710,6 @@ pub enum InstructionReply {
|
||||
FlushRegions(FlushRegionReply),
|
||||
GetFileRefs(GetFileRefsReply),
|
||||
GcRegions(GcRegionsReply),
|
||||
EnterStagingRegions(EnterStagingRegionsReply),
|
||||
}
|
||||
|
||||
impl Display for InstructionReply {
|
||||
@@ -774,13 +726,6 @@ impl Display for InstructionReply {
|
||||
Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply),
|
||||
Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply),
|
||||
Self::GcRegions(reply) => write!(f, "InstructionReply::GcRegion({})", reply),
|
||||
Self::EnterStagingRegions(reply) => {
|
||||
write!(
|
||||
f,
|
||||
"InstructionReply::EnterStagingRegions({:?})",
|
||||
reply.replies
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -821,20 +766,13 @@ impl InstructionReply {
|
||||
_ => panic!("Expected FlushRegions reply"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn expect_enter_staging_regions_reply(self) -> Vec<EnterStagingRegionReply> {
|
||||
match self {
|
||||
Self::EnterStagingRegions(reply) => reply.replies,
|
||||
_ => panic!("Expected EnterStagingRegion reply"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use store_api::storage::{FileId, FileRef};
|
||||
use store_api::storage::FileId;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1209,14 +1147,12 @@ mod tests {
|
||||
let mut manifest = FileRefsManifest::default();
|
||||
let r0 = RegionId::new(1024, 1);
|
||||
let r1 = RegionId::new(1024, 2);
|
||||
manifest.file_refs.insert(
|
||||
r0,
|
||||
HashSet::from([FileRef::new(r0, FileId::random(), None)]),
|
||||
);
|
||||
manifest.file_refs.insert(
|
||||
r1,
|
||||
HashSet::from([FileRef::new(r1, FileId::random(), None)]),
|
||||
);
|
||||
manifest
|
||||
.file_refs
|
||||
.insert(r0, HashSet::from([FileId::random()]));
|
||||
manifest
|
||||
.file_refs
|
||||
.insert(r1, HashSet::from([FileId::random()]));
|
||||
manifest.manifest_version.insert(r0, 10);
|
||||
manifest.manifest_version.insert(r1, 20);
|
||||
|
||||
|
||||
@@ -848,7 +848,7 @@ impl PgStore {
|
||||
.context(CreatePostgresPoolSnafu)?,
|
||||
};
|
||||
|
||||
Self::with_pg_pool(pool, None, table_name, max_txn_ops, false).await
|
||||
Self::with_pg_pool(pool, None, table_name, max_txn_ops).await
|
||||
}
|
||||
|
||||
/// Create [PgStore] impl of [KvBackendRef] from url (backward compatibility).
|
||||
@@ -862,7 +862,6 @@ impl PgStore {
|
||||
schema_name: Option<&str>,
|
||||
table_name: &str,
|
||||
max_txn_ops: usize,
|
||||
auto_create_schema: bool,
|
||||
) -> Result<KvBackendRef> {
|
||||
// Ensure the postgres metadata backend is ready to use.
|
||||
let client = match pool.get().await {
|
||||
@@ -874,23 +873,9 @@ impl PgStore {
|
||||
.fail();
|
||||
}
|
||||
};
|
||||
|
||||
// Automatically create schema if enabled and schema_name is provided.
|
||||
if auto_create_schema
|
||||
&& let Some(schema) = schema_name
|
||||
&& !schema.is_empty()
|
||||
{
|
||||
let create_schema_sql = format!("CREATE SCHEMA IF NOT EXISTS \"{}\"", schema);
|
||||
client
|
||||
.execute(&create_schema_sql, &[])
|
||||
.await
|
||||
.with_context(|_| PostgresExecutionSnafu {
|
||||
sql: create_schema_sql.clone(),
|
||||
})?;
|
||||
}
|
||||
|
||||
let template_factory = PgSqlTemplateFactory::new(schema_name, table_name);
|
||||
let sql_template_set = template_factory.build();
|
||||
// Do not attempt to create schema implicitly.
|
||||
client
|
||||
.execute(&sql_template_set.create_table_statement, &[])
|
||||
.await
|
||||
@@ -974,7 +959,7 @@ mod tests {
|
||||
let Some(pool) = build_pg15_pool().await else {
|
||||
return;
|
||||
};
|
||||
let res = PgStore::with_pg_pool(pool, None, "pg15_public_should_fail", 128, false).await;
|
||||
let res = PgStore::with_pg_pool(pool, None, "pg15_public_should_fail", 128).await;
|
||||
assert!(
|
||||
res.is_err(),
|
||||
"creating table in public should fail for test_user"
|
||||
@@ -1229,249 +1214,4 @@ mod tests {
|
||||
let t = PgSqlTemplateFactory::format_table_ident(Some(""), "test_table");
|
||||
assert_eq!(t, "\"test_table\"");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_auto_create_schema_enabled() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
maybe_skip_postgres_integration_test!();
|
||||
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap();
|
||||
let mut cfg = Config::new();
|
||||
cfg.url = Some(endpoints);
|
||||
let pool = cfg
|
||||
.create_pool(Some(Runtime::Tokio1), NoTls)
|
||||
.context(CreatePostgresPoolSnafu)
|
||||
.unwrap();
|
||||
|
||||
let schema_name = "test_auto_create_enabled";
|
||||
let table_name = "test_table";
|
||||
|
||||
// Drop the schema if it exists to start clean
|
||||
let client = pool.get().await.unwrap();
|
||||
let _ = client
|
||||
.execute(
|
||||
&format!("DROP SCHEMA IF EXISTS \"{}\" CASCADE", schema_name),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
|
||||
// Create store with auto_create_schema enabled
|
||||
let _ = PgStore::with_pg_pool(pool.clone(), Some(schema_name), table_name, 128, true)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify schema was created
|
||||
let row = client
|
||||
.query_one(
|
||||
"SELECT schema_name FROM information_schema.schemata WHERE schema_name = $1",
|
||||
&[&schema_name],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let created_schema: String = row.get(0);
|
||||
assert_eq!(created_schema, schema_name);
|
||||
|
||||
// Verify table was created in the schema
|
||||
let row = client
|
||||
.query_one(
|
||||
"SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2",
|
||||
&[&schema_name, &table_name],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let created_table_schema: String = row.get(0);
|
||||
let created_table_name: String = row.get(1);
|
||||
assert_eq!(created_table_schema, schema_name);
|
||||
assert_eq!(created_table_name, table_name);
|
||||
|
||||
// Cleanup
|
||||
let _ = client
|
||||
.execute(
|
||||
&format!("DROP SCHEMA IF EXISTS \"{}\" CASCADE", schema_name),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_auto_create_schema_disabled() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
maybe_skip_postgres_integration_test!();
|
||||
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap();
|
||||
let mut cfg = Config::new();
|
||||
cfg.url = Some(endpoints);
|
||||
let pool = cfg
|
||||
.create_pool(Some(Runtime::Tokio1), NoTls)
|
||||
.context(CreatePostgresPoolSnafu)
|
||||
.unwrap();
|
||||
|
||||
let schema_name = "test_auto_create_disabled";
|
||||
let table_name = "test_table";
|
||||
|
||||
// Drop the schema if it exists to start clean
|
||||
let client = pool.get().await.unwrap();
|
||||
let _ = client
|
||||
.execute(
|
||||
&format!("DROP SCHEMA IF EXISTS \"{}\" CASCADE", schema_name),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
|
||||
// Try to create store with auto_create_schema disabled (should fail)
|
||||
let result =
|
||||
PgStore::with_pg_pool(pool.clone(), Some(schema_name), table_name, 128, false).await;
|
||||
|
||||
// Verify it failed because schema doesn't exist
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"Expected error when schema doesn't exist and auto_create_schema is disabled"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_auto_create_schema_already_exists() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
maybe_skip_postgres_integration_test!();
|
||||
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap();
|
||||
let mut cfg = Config::new();
|
||||
cfg.url = Some(endpoints);
|
||||
let pool = cfg
|
||||
.create_pool(Some(Runtime::Tokio1), NoTls)
|
||||
.context(CreatePostgresPoolSnafu)
|
||||
.unwrap();
|
||||
|
||||
let schema_name = "test_auto_create_existing";
|
||||
let table_name = "test_table";
|
||||
|
||||
// Manually create the schema first
|
||||
let client = pool.get().await.unwrap();
|
||||
let _ = client
|
||||
.execute(
|
||||
&format!("DROP SCHEMA IF EXISTS \"{}\" CASCADE", schema_name),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
client
|
||||
.execute(&format!("CREATE SCHEMA \"{}\"", schema_name), &[])
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create store with auto_create_schema enabled (should succeed idempotently)
|
||||
let _ = PgStore::with_pg_pool(pool.clone(), Some(schema_name), table_name, 128, true)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify schema still exists
|
||||
let row = client
|
||||
.query_one(
|
||||
"SELECT schema_name FROM information_schema.schemata WHERE schema_name = $1",
|
||||
&[&schema_name],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let created_schema: String = row.get(0);
|
||||
assert_eq!(created_schema, schema_name);
|
||||
|
||||
// Verify table was created in the schema
|
||||
let row = client
|
||||
.query_one(
|
||||
"SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2",
|
||||
&[&schema_name, &table_name],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let created_table_schema: String = row.get(0);
|
||||
let created_table_name: String = row.get(1);
|
||||
assert_eq!(created_table_schema, schema_name);
|
||||
assert_eq!(created_table_name, table_name);
|
||||
|
||||
// Cleanup
|
||||
let _ = client
|
||||
.execute(
|
||||
&format!("DROP SCHEMA IF EXISTS \"{}\" CASCADE", schema_name),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_auto_create_schema_no_schema_name() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
maybe_skip_postgres_integration_test!();
|
||||
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap();
|
||||
let mut cfg = Config::new();
|
||||
cfg.url = Some(endpoints);
|
||||
let pool = cfg
|
||||
.create_pool(Some(Runtime::Tokio1), NoTls)
|
||||
.context(CreatePostgresPoolSnafu)
|
||||
.unwrap();
|
||||
|
||||
let table_name = "test_table_no_schema";
|
||||
|
||||
// Create store with auto_create_schema enabled but no schema name (should succeed)
|
||||
// This should create the table in the default schema (public)
|
||||
let _ = PgStore::with_pg_pool(pool.clone(), None, table_name, 128, true)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify table was created in public schema
|
||||
let client = pool.get().await.unwrap();
|
||||
let row = client
|
||||
.query_one(
|
||||
"SELECT table_schema, table_name FROM information_schema.tables WHERE table_name = $1",
|
||||
&[&table_name],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let created_table_schema: String = row.get(0);
|
||||
let created_table_name: String = row.get(1);
|
||||
assert_eq!(created_table_name, table_name);
|
||||
// Verify it's in public schema (or whichever is the default)
|
||||
assert!(created_table_schema == "public" || !created_table_schema.is_empty());
|
||||
|
||||
// Cleanup
|
||||
let _ = client
|
||||
.execute(&format!("DROP TABLE IF EXISTS \"{}\"", table_name), &[])
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_auto_create_schema_with_empty_schema_name() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
maybe_skip_postgres_integration_test!();
|
||||
let endpoints = std::env::var("GT_POSTGRES_ENDPOINTS").unwrap();
|
||||
let mut cfg = Config::new();
|
||||
cfg.url = Some(endpoints);
|
||||
let pool = cfg
|
||||
.create_pool(Some(Runtime::Tokio1), NoTls)
|
||||
.context(CreatePostgresPoolSnafu)
|
||||
.unwrap();
|
||||
|
||||
let table_name = "test_table_empty_schema";
|
||||
|
||||
// Create store with auto_create_schema enabled but empty schema name (should succeed)
|
||||
// This should create the table in the default schema (public)
|
||||
let _ = PgStore::with_pg_pool(pool.clone(), Some(""), table_name, 128, true)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify table was created in public schema
|
||||
let client = pool.get().await.unwrap();
|
||||
let row = client
|
||||
.query_one(
|
||||
"SELECT table_schema, table_name FROM information_schema.tables WHERE table_name = $1",
|
||||
&[&table_name],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let created_table_schema: String = row.get(0);
|
||||
let created_table_name: String = row.get(1);
|
||||
assert_eq!(created_table_name, table_name);
|
||||
// Verify it's in public schema (or whichever is the default)
|
||||
assert!(created_table_schema == "public" || !created_table_schema.is_empty());
|
||||
|
||||
// Cleanup
|
||||
let _ = client
|
||||
.execute(&format!("DROP TABLE IF EXISTS \"{}\"", table_name), &[])
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,12 +5,10 @@ edition.workspace = true
license.workspace = true

[dependencies]
arrow-schema.workspace = true
common-base.workspace = true
common-decimal.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datafusion-sql.workspace = true
datatypes.workspace = true

@@ -14,12 +14,11 @@
|
||||
|
||||
use std::str::FromStr;
|
||||
|
||||
use arrow_schema::extension::ExtensionType;
|
||||
use common_time::Timestamp;
|
||||
use common_time::timezone::Timezone;
|
||||
use datatypes::extension::json::JsonExtensionType;
|
||||
use datatypes::json::JsonStructureSettings;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema};
|
||||
use datatypes::schema::ColumnDefaultConstraint;
|
||||
use datatypes::types::{JsonFormat, parse_string_to_jsonb, parse_string_to_vector_type_value};
|
||||
use datatypes::value::{OrderedF32, OrderedF64, Value};
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
@@ -125,14 +124,13 @@ pub(crate) fn sql_number_to_value(data_type: &ConcreteDataType, n: &str) -> Resu
|
||||
/// If `auto_string_to_numeric` is true, tries to cast the string value to numeric values,
|
||||
/// and returns error if the cast fails.
|
||||
pub fn sql_value_to_value(
|
||||
column_schema: &ColumnSchema,
|
||||
column_name: &str,
|
||||
data_type: &ConcreteDataType,
|
||||
sql_val: &SqlValue,
|
||||
timezone: Option<&Timezone>,
|
||||
unary_op: Option<UnaryOperator>,
|
||||
auto_string_to_numeric: bool,
|
||||
) -> Result<Value> {
|
||||
let column_name = &column_schema.name;
|
||||
let data_type = &column_schema.data_type;
|
||||
let mut value = match sql_val {
|
||||
SqlValue::Number(n, _) => sql_number_to_value(data_type, n)?,
|
||||
SqlValue::Null => Value::Null,
|
||||
@@ -148,9 +146,13 @@ pub fn sql_value_to_value(
|
||||
|
||||
(*b).into()
|
||||
}
|
||||
SqlValue::DoubleQuotedString(s) | SqlValue::SingleQuotedString(s) => {
|
||||
parse_string_to_value(column_schema, s.clone(), timezone, auto_string_to_numeric)?
|
||||
}
|
||||
SqlValue::DoubleQuotedString(s) | SqlValue::SingleQuotedString(s) => parse_string_to_value(
|
||||
column_name,
|
||||
s.clone(),
|
||||
data_type,
|
||||
timezone,
|
||||
auto_string_to_numeric,
|
||||
)?,
|
||||
SqlValue::HexStringLiteral(s) => {
|
||||
// Should not directly write binary into json column
|
||||
ensure!(
|
||||
@@ -242,12 +244,12 @@ pub fn sql_value_to_value(
|
||||
}
|
||||
|
||||
pub(crate) fn parse_string_to_value(
|
||||
column_schema: &ColumnSchema,
|
||||
column_name: &str,
|
||||
s: String,
|
||||
data_type: &ConcreteDataType,
|
||||
timezone: Option<&Timezone>,
|
||||
auto_string_to_numeric: bool,
|
||||
) -> Result<Value> {
|
||||
let data_type = &column_schema.data_type;
|
||||
if auto_string_to_numeric && let Some(value) = auto_cast_to_numeric(&s, data_type)? {
|
||||
return Ok(value);
|
||||
}
|
||||
@@ -255,7 +257,7 @@ pub(crate) fn parse_string_to_value(
|
||||
ensure!(
|
||||
data_type.is_stringifiable(),
|
||||
ColumnTypeMismatchSnafu {
|
||||
column_name: column_schema.name.clone(),
|
||||
column_name,
|
||||
expect: data_type.clone(),
|
||||
actual: ConcreteDataType::string_datatype(),
|
||||
}
|
||||
@@ -301,21 +303,23 @@ pub(crate) fn parse_string_to_value(
|
||||
}
|
||||
}
|
||||
ConcreteDataType::Binary(_) => Ok(Value::Binary(s.as_bytes().into())),
|
||||
ConcreteDataType::Json(j) => match &j.format {
|
||||
JsonFormat::Jsonb => {
|
||||
let v = parse_string_to_jsonb(&s).context(DatatypeSnafu)?;
|
||||
Ok(Value::Binary(v.into()))
|
||||
ConcreteDataType::Json(j) => {
|
||||
match &j.format {
|
||||
JsonFormat::Jsonb => {
|
||||
let v = parse_string_to_jsonb(&s).context(DatatypeSnafu)?;
|
||||
Ok(Value::Binary(v.into()))
|
||||
}
|
||||
JsonFormat::Native(_inner) => {
|
||||
// Always use the structured version at this level.
|
||||
let serde_json_value =
|
||||
serde_json::from_str(&s).context(DeserializeSnafu { json: s })?;
|
||||
let json_structure_settings = JsonStructureSettings::Structured(None);
|
||||
json_structure_settings
|
||||
.encode(serde_json_value)
|
||||
.context(DatatypeSnafu)
|
||||
}
|
||||
}
|
||||
JsonFormat::Native(_) => {
|
||||
let extension_type: Option<JsonExtensionType> =
|
||||
column_schema.extension_type().context(DatatypeSnafu)?;
|
||||
let json_structure_settings = extension_type
|
||||
.and_then(|x| x.metadata().json_structure_settings.clone())
|
||||
.unwrap_or_default();
|
||||
let v = serde_json::from_str(&s).context(DeserializeSnafu { json: s })?;
|
||||
json_structure_settings.encode(v).context(DatatypeSnafu)
|
||||
}
|
||||
},
|
||||
}
|
||||
ConcreteDataType::Vector(d) => {
|
||||
let v = parse_string_to_vector_type_value(&s, Some(d.dim)).context(DatatypeSnafu)?;
|
||||
Ok(Value::Binary(v.into()))
|
||||
@@ -413,265 +417,305 @@ mod test {
|
||||
|
||||
use super::*;
|
||||
|
||||
macro_rules! call_parse_string_to_value {
|
||||
($column_name: expr, $input: expr, $data_type: expr) => {
|
||||
call_parse_string_to_value!($column_name, $input, $data_type, None)
|
||||
};
|
||||
($column_name: expr, $input: expr, $data_type: expr, timezone = $timezone: expr) => {
|
||||
call_parse_string_to_value!($column_name, $input, $data_type, Some($timezone))
|
||||
};
|
||||
($column_name: expr, $input: expr, $data_type: expr, $timezone: expr) => {{
|
||||
let column_schema = ColumnSchema::new($column_name, $data_type, true);
|
||||
parse_string_to_value(&column_schema, $input, $timezone, true)
|
||||
}};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_to_value_auto_numeric() -> Result<()> {
|
||||
fn test_string_to_value_auto_numeric() {
|
||||
// Test string to boolean with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"true".to_string(),
|
||||
ConcreteDataType::boolean_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::boolean_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Boolean(true), result);
|
||||
|
||||
// Test invalid string to boolean with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_boolean".to_string(),
|
||||
ConcreteDataType::boolean_datatype()
|
||||
&ConcreteDataType::boolean_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to int8
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"42".to_string(),
|
||||
ConcreteDataType::int8_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::int8_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Int8(42), result);
|
||||
|
||||
// Test invalid string to int8 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_an_int8".to_string(),
|
||||
ConcreteDataType::int8_datatype()
|
||||
&ConcreteDataType::int8_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to int16
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"1000".to_string(),
|
||||
ConcreteDataType::int16_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::int16_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Int16(1000), result);
|
||||
|
||||
// Test invalid string to int16 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_an_int16".to_string(),
|
||||
ConcreteDataType::int16_datatype()
|
||||
&ConcreteDataType::int16_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to int32
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"100000".to_string(),
|
||||
ConcreteDataType::int32_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::int32_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Int32(100000), result);
|
||||
|
||||
// Test invalid string to int32 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_an_int32".to_string(),
|
||||
ConcreteDataType::int32_datatype()
|
||||
&ConcreteDataType::int32_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to int64
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"1000000".to_string(),
|
||||
ConcreteDataType::int64_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::int64_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Int64(1000000), result);
|
||||
|
||||
// Test invalid string to int64 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_an_int64".to_string(),
|
||||
ConcreteDataType::int64_datatype()
|
||||
&ConcreteDataType::int64_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to uint8
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"200".to_string(),
|
||||
ConcreteDataType::uint8_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::uint8_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::UInt8(200), result);
|
||||
|
||||
// Test invalid string to uint8 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_uint8".to_string(),
|
||||
ConcreteDataType::uint8_datatype()
|
||||
&ConcreteDataType::uint8_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to uint16
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"60000".to_string(),
|
||||
ConcreteDataType::uint16_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::uint16_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::UInt16(60000), result);
|
||||
|
||||
// Test invalid string to uint16 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_uint16".to_string(),
|
||||
ConcreteDataType::uint16_datatype()
|
||||
&ConcreteDataType::uint16_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to uint32
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"4000000000".to_string(),
|
||||
ConcreteDataType::uint32_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::uint32_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::UInt32(4000000000), result);
|
||||
|
||||
// Test invalid string to uint32 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_uint32".to_string(),
|
||||
ConcreteDataType::uint32_datatype()
|
||||
&ConcreteDataType::uint32_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to uint64
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"18446744073709551615".to_string(),
|
||||
ConcreteDataType::uint64_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::uint64_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::UInt64(18446744073709551615), result);
|
||||
|
||||
// Test invalid string to uint64 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_uint64".to_string(),
|
||||
ConcreteDataType::uint64_datatype()
|
||||
&ConcreteDataType::uint64_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to float32
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"3.5".to_string(),
|
||||
ConcreteDataType::float32_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::float32_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Float32(OrderedF32::from(3.5)), result);
|
||||
|
||||
// Test invalid string to float32 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_float32".to_string(),
|
||||
ConcreteDataType::float32_datatype()
|
||||
&ConcreteDataType::float32_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
|
||||
// Test string to float64
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"3.5".to_string(),
|
||||
ConcreteDataType::float64_datatype()
|
||||
)?;
|
||||
&ConcreteDataType::float64_datatype(),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Float64(OrderedF64::from(3.5)), result);
|
||||
|
||||
// Test invalid string to float64 with auto cast
|
||||
let result = call_parse_string_to_value!(
|
||||
let result = parse_string_to_value(
|
||||
"col",
|
||||
"not_a_float64".to_string(),
|
||||
ConcreteDataType::float64_datatype()
|
||||
&ConcreteDataType::float64_datatype(),
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(result.is_err());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
macro_rules! call_sql_value_to_value {
|
||||
($column_name: expr, $data_type: expr, $sql_value: expr) => {
|
||||
call_sql_value_to_value!($column_name, $data_type, $sql_value, None, None, false)
|
||||
};
|
||||
($column_name: expr, $data_type: expr, $sql_value: expr, timezone = $timezone: expr) => {
|
||||
call_sql_value_to_value!(
|
||||
$column_name,
|
||||
$data_type,
|
||||
$sql_value,
|
||||
Some($timezone),
|
||||
None,
|
||||
false
|
||||
)
|
||||
};
|
||||
($column_name: expr, $data_type: expr, $sql_value: expr, unary_op = $unary_op: expr) => {
|
||||
call_sql_value_to_value!(
|
||||
$column_name,
|
||||
$data_type,
|
||||
$sql_value,
|
||||
None,
|
||||
Some($unary_op),
|
||||
false
|
||||
)
|
||||
};
|
||||
($column_name: expr, $data_type: expr, $sql_value: expr, auto_string_to_numeric) => {
|
||||
call_sql_value_to_value!($column_name, $data_type, $sql_value, None, None, true)
|
||||
};
|
||||
($column_name: expr, $data_type: expr, $sql_value: expr, $timezone: expr, $unary_op: expr, $auto_string_to_numeric: expr) => {{
|
||||
let column_schema = ColumnSchema::new($column_name, $data_type, true);
|
||||
sql_value_to_value(
|
||||
&column_schema,
|
||||
$sql_value,
|
||||
$timezone,
|
||||
$unary_op,
|
||||
$auto_string_to_numeric,
|
||||
)
|
||||
}};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sql_value_to_value() -> Result<()> {
|
||||
fn test_sql_value_to_value() {
|
||||
let sql_val = SqlValue::Null;
|
||||
assert_eq!(
|
||||
Value::Null,
|
||||
call_sql_value_to_value!("a", ConcreteDataType::float64_datatype(), &sql_val)?
|
||||
sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::float64_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false
|
||||
)
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
let sql_val = SqlValue::Boolean(true);
|
||||
assert_eq!(
|
||||
Value::Boolean(true),
|
||||
call_sql_value_to_value!("a", ConcreteDataType::boolean_datatype(), &sql_val)?
|
||||
sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::boolean_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false
|
||||
)
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
let sql_val = SqlValue::Number("3.0".to_string(), false);
|
||||
assert_eq!(
|
||||
Value::Float64(OrderedFloat(3.0)),
|
||||
call_sql_value_to_value!("a", ConcreteDataType::float64_datatype(), &sql_val)?
|
||||
sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::float64_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false
|
||||
)
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
let sql_val = SqlValue::Number("3.0".to_string(), false);
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::boolean_datatype(), &sql_val);
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::boolean_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
assert!(format!("{v:?}").contains("Failed to parse number '3.0' to boolean column type"));
|
||||
|
||||
let sql_val = SqlValue::Boolean(true);
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::float64_datatype(), &sql_val);
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::float64_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
assert!(
|
||||
format!("{v:?}").contains(
|
||||
@@ -681,18 +725,41 @@ mod test {
|
||||
);
|
||||
|
||||
let sql_val = SqlValue::HexStringLiteral("48656c6c6f20776f726c6421".to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val)?;
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::binary_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Binary(Bytes::from(b"Hello world!".as_slice())), v);
|
||||
|
||||
let sql_val = SqlValue::DoubleQuotedString("MorningMyFriends".to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val)?;
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::binary_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
Value::Binary(Bytes::from(b"MorningMyFriends".as_slice())),
|
||||
v
|
||||
);
|
||||
|
||||
let sql_val = SqlValue::HexStringLiteral("9AF".to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val);
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::binary_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
assert!(
|
||||
format!("{v:?}").contains("odd number of digits"),
|
||||
@@ -700,16 +767,38 @@ mod test {
|
||||
);
|
||||
|
||||
let sql_val = SqlValue::HexStringLiteral("AG".to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::binary_datatype(), &sql_val);
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::binary_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
assert!(format!("{v:?}").contains("invalid character"), "v is {v:?}",);
|
||||
|
||||
let sql_val = SqlValue::DoubleQuotedString("MorningMyFriends".to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::json_datatype(), &sql_val);
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::json_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
|
||||
let sql_val = SqlValue::DoubleQuotedString(r#"{"a":"b"}"#.to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::json_datatype(), &sql_val)?;
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::json_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
Value::Binary(Bytes::from(
|
||||
jsonb::parse_value(r#"{"a":"b"}"#.as_bytes())
|
||||
@@ -719,15 +808,16 @@ mod test {
|
||||
)),
|
||||
v
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_json_to_jsonb() {
|
||||
match call_parse_string_to_value!(
|
||||
match parse_string_to_value(
|
||||
"json_col",
|
||||
r#"{"a": "b"}"#.to_string(),
|
||||
ConcreteDataType::json_datatype()
|
||||
&ConcreteDataType::json_datatype(),
|
||||
None,
|
||||
false,
|
||||
) {
|
||||
Ok(Value::Binary(b)) => {
|
||||
assert_eq!(
|
||||
@@ -743,10 +833,12 @@ mod test {
|
||||
}
|
||||
|
||||
assert!(
|
||||
call_parse_string_to_value!(
|
||||
parse_string_to_value(
|
||||
"json_col",
|
||||
r#"Nicola Kovac is the best rifler in the world"#.to_string(),
|
||||
ConcreteDataType::json_datatype()
|
||||
&ConcreteDataType::json_datatype(),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.is_err()
|
||||
)
|
||||
@@ -786,10 +878,13 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_parse_date_literal() {
|
||||
let value = call_sql_value_to_value!(
|
||||
let value = sql_value_to_value(
|
||||
"date",
|
||||
ConcreteDataType::date_datatype(),
|
||||
&SqlValue::DoubleQuotedString("2022-02-22".to_string())
|
||||
&ConcreteDataType::date_datatype(),
|
||||
&SqlValue::DoubleQuotedString("2022-02-22".to_string()),
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(ConcreteDataType::date_datatype(), value.data_type());
|
||||
@@ -800,11 +895,13 @@ mod test {
|
||||
}
|
||||
|
||||
// with timezone
|
||||
let value = call_sql_value_to_value!(
|
||||
let value = sql_value_to_value(
|
||||
"date",
|
||||
ConcreteDataType::date_datatype(),
|
||||
&ConcreteDataType::date_datatype(),
|
||||
&SqlValue::DoubleQuotedString("2022-02-22".to_string()),
|
||||
timezone = &Timezone::from_tz_string("+07:00").unwrap()
|
||||
Some(&Timezone::from_tz_string("+07:00").unwrap()),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(ConcreteDataType::date_datatype(), value.data_type());
|
||||
@@ -816,12 +913,16 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_timestamp_literal() -> Result<()> {
|
||||
match call_parse_string_to_value!(
|
||||
fn test_parse_timestamp_literal() {
|
||||
match parse_string_to_value(
|
||||
"timestamp_col",
|
||||
"2022-02-22T00:01:01+08:00".to_string(),
|
||||
ConcreteDataType::timestamp_millisecond_datatype()
|
||||
)? {
|
||||
&ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap()
|
||||
{
|
||||
Value::Timestamp(ts) => {
|
||||
assert_eq!(1645459261000, ts.value());
|
||||
assert_eq!(TimeUnit::Millisecond, ts.unit());
|
||||
@@ -831,11 +932,15 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
match call_parse_string_to_value!(
|
||||
match parse_string_to_value(
|
||||
"timestamp_col",
|
||||
"2022-02-22T00:01:01+08:00".to_string(),
|
||||
ConcreteDataType::timestamp_datatype(TimeUnit::Second)
|
||||
)? {
|
||||
&ConcreteDataType::timestamp_datatype(TimeUnit::Second),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap()
|
||||
{
|
||||
Value::Timestamp(ts) => {
|
||||
assert_eq!(1645459261, ts.value());
|
||||
assert_eq!(TimeUnit::Second, ts.unit());
|
||||
@@ -845,11 +950,15 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
match call_parse_string_to_value!(
|
||||
match parse_string_to_value(
|
||||
"timestamp_col",
|
||||
"2022-02-22T00:01:01+08:00".to_string(),
|
||||
ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond)
|
||||
)? {
|
||||
&ConcreteDataType::timestamp_datatype(TimeUnit::Microsecond),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap()
|
||||
{
|
||||
Value::Timestamp(ts) => {
|
||||
assert_eq!(1645459261000000, ts.value());
|
||||
assert_eq!(TimeUnit::Microsecond, ts.unit());
|
||||
@@ -859,11 +968,15 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
match call_parse_string_to_value!(
|
||||
match parse_string_to_value(
|
||||
"timestamp_col",
|
||||
"2022-02-22T00:01:01+08:00".to_string(),
|
||||
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond)
|
||||
)? {
|
||||
&ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap()
|
||||
{
|
||||
Value::Timestamp(ts) => {
|
||||
assert_eq!(1645459261000000000, ts.value());
|
||||
assert_eq!(TimeUnit::Nanosecond, ts.unit());
|
||||
@@ -874,21 +987,26 @@ mod test {
|
||||
}
|
||||
|
||||
assert!(
|
||||
call_parse_string_to_value!(
|
||||
parse_string_to_value(
|
||||
"timestamp_col",
|
||||
"2022-02-22T00:01:01+08".to_string(),
|
||||
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond)
|
||||
&ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.is_err()
|
||||
);
|
||||
|
||||
// with timezone
|
||||
match call_parse_string_to_value!(
|
||||
match parse_string_to_value(
|
||||
"timestamp_col",
|
||||
"2022-02-22T00:01:01".to_string(),
|
||||
ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
|
||||
timezone = &Timezone::from_tz_string("Asia/Shanghai").unwrap()
|
||||
)? {
|
||||
&ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond),
|
||||
Some(&Timezone::from_tz_string("Asia/Shanghai").unwrap()),
|
||||
false,
|
||||
)
|
||||
.unwrap()
|
||||
{
|
||||
Value::Timestamp(ts) => {
|
||||
assert_eq!(1645459261000000000, ts.value());
|
||||
assert_eq!("2022-02-21 16:01:01+0000", ts.to_iso8601_string());
|
||||
@@ -898,42 +1016,51 @@ mod test {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_placeholder_value() {
|
||||
assert!(
|
||||
call_sql_value_to_value!(
|
||||
sql_value_to_value(
|
||||
"test",
|
||||
ConcreteDataType::string_datatype(),
|
||||
&SqlValue::Placeholder("default".into())
|
||||
)
|
||||
.is_err()
|
||||
);
|
||||
assert!(
|
||||
call_sql_value_to_value!(
|
||||
"test",
|
||||
ConcreteDataType::string_datatype(),
|
||||
&ConcreteDataType::string_datatype(),
|
||||
&SqlValue::Placeholder("default".into()),
|
||||
unary_op = UnaryOperator::Minus
|
||||
None,
|
||||
None,
|
||||
false
|
||||
)
|
||||
.is_err()
|
||||
);
|
||||
assert!(
|
||||
call_sql_value_to_value!(
|
||||
sql_value_to_value(
|
||||
"test",
|
||||
ConcreteDataType::uint16_datatype(),
|
||||
&ConcreteDataType::string_datatype(),
|
||||
&SqlValue::Placeholder("default".into()),
|
||||
None,
|
||||
Some(UnaryOperator::Minus),
|
||||
false
|
||||
)
|
||||
.is_err()
|
||||
);
|
||||
assert!(
|
||||
sql_value_to_value(
|
||||
"test",
|
||||
&ConcreteDataType::uint16_datatype(),
|
||||
&SqlValue::Number("3".into(), false),
|
||||
unary_op = UnaryOperator::Minus
|
||||
None,
|
||||
Some(UnaryOperator::Minus),
|
||||
false
|
||||
)
|
||||
.is_err()
|
||||
);
|
||||
assert!(
|
||||
call_sql_value_to_value!(
|
||||
sql_value_to_value(
|
||||
"test",
|
||||
ConcreteDataType::uint16_datatype(),
|
||||
&SqlValue::Number("3".into(), false)
|
||||
&ConcreteDataType::uint16_datatype(),
|
||||
&SqlValue::Number("3".into(), false),
|
||||
None,
|
||||
None,
|
||||
false
|
||||
)
|
||||
.is_ok()
|
||||
);
|
||||
@@ -943,60 +1070,77 @@ mod test {
|
||||
fn test_auto_string_to_numeric() {
|
||||
// Test with auto_string_to_numeric=true
|
||||
let sql_val = SqlValue::SingleQuotedString("123".to_string());
|
||||
let v = call_sql_value_to_value!(
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
ConcreteDataType::int32_datatype(),
|
||||
&ConcreteDataType::int32_datatype(),
|
||||
&sql_val,
|
||||
auto_string_to_numeric
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Int32(123), v);
|
||||
|
||||
// Test with a float string
|
||||
let sql_val = SqlValue::SingleQuotedString("3.5".to_string());
|
||||
let v = call_sql_value_to_value!(
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
ConcreteDataType::float64_datatype(),
|
||||
&ConcreteDataType::float64_datatype(),
|
||||
&sql_val,
|
||||
auto_string_to_numeric
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Float64(OrderedFloat(3.5)), v);
|
||||
|
||||
// Test with auto_string_to_numeric=false
|
||||
let sql_val = SqlValue::SingleQuotedString("123".to_string());
|
||||
let v = call_sql_value_to_value!("a", ConcreteDataType::int32_datatype(), &sql_val);
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
&ConcreteDataType::int32_datatype(),
|
||||
&sql_val,
|
||||
None,
|
||||
None,
|
||||
false,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
|
||||
// Test with an invalid numeric string but auto_string_to_numeric=true
|
||||
// Should return an error now with the new auto_cast_to_numeric behavior
|
||||
let sql_val = SqlValue::SingleQuotedString("not_a_number".to_string());
|
||||
let v = call_sql_value_to_value!(
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
ConcreteDataType::int32_datatype(),
|
||||
&ConcreteDataType::int32_datatype(),
|
||||
&sql_val,
|
||||
auto_string_to_numeric
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(v.is_err());
|
||||
|
||||
// Test with boolean type
|
||||
let sql_val = SqlValue::SingleQuotedString("true".to_string());
|
||||
let v = call_sql_value_to_value!(
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
ConcreteDataType::boolean_datatype(),
|
||||
&ConcreteDataType::boolean_datatype(),
|
||||
&sql_val,
|
||||
auto_string_to_numeric
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(Value::Boolean(true), v);
|
||||
|
||||
// Non-numeric types should still be handled normally
|
||||
let sql_val = SqlValue::SingleQuotedString("hello".to_string());
|
||||
let v = call_sql_value_to_value!(
|
||||
let v = sql_value_to_value(
|
||||
"a",
|
||||
ConcreteDataType::string_datatype(),
|
||||
&ConcreteDataType::string_datatype(),
|
||||
&sql_val,
|
||||
auto_string_to_numeric
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
);
|
||||
assert!(v.is_ok());
|
||||
}
|
||||
|
||||
@@ -14,8 +14,8 @@

use common_time::timezone::Timezone;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnDefaultConstraint;
use datatypes::schema::constraint::{CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_FN};
use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema};
use snafu::ensure;
use sqlparser::ast::ValueWithSpan;
pub use sqlparser::ast::{
@@ -47,12 +47,9 @@ pub fn parse_column_default_constraint(
);

let default_constraint = match &opt.option {
ColumnOption::Default(Expr::Value(v)) => {
let schema = ColumnSchema::new(column_name, data_type.clone(), true);
ColumnDefaultConstraint::Value(sql_value_to_value(
&schema, &v.value, timezone, None, false,
)?)
}
ColumnOption::Default(Expr::Value(v)) => ColumnDefaultConstraint::Value(
sql_value_to_value(column_name, data_type, &v.value, timezone, None, false)?,
),
ColumnOption::Default(Expr::Function(func)) => {
let mut func = format!("{func}").to_lowercase();
// normalize CURRENT_TIMESTAMP to CURRENT_TIMESTAMP()
@@ -83,7 +80,8 @@ pub fn parse_column_default_constraint(

if let Expr::Value(v) = &**expr {
let value = sql_value_to_value(
&ColumnSchema::new(column_name, data_type.clone(), true),
column_name,
data_type,
&v.value,
timezone,
Some(*op),

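As context for the call sites above, here is a minimal sketch of the `ColumnSchema`-based variant of `sql_value_to_value` that appears on one side of this diff. The wrapper function, column name, and import paths are hypothetical illustrations, not part of the change; only the argument order (`&ColumnSchema`, `&SqlValue`, timezone, unary operator, `auto_string_to_numeric`) is taken from the diff, with `SqlValue`, `Value`, and `Result` assumed to be the surrounding module's types.

```rust
use common_time::timezone::Timezone;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;

// Hypothetical helper (not part of the diff) showing how a caller might build a
// ColumnSchema and delegate to the five-argument form of sql_value_to_value.
fn default_value_for(
    column_name: &str,
    literal: &SqlValue,
    tz: Option<&Timezone>,
) -> Result<Value> {
    let column_schema = ColumnSchema::new(column_name, ConcreteDataType::string_datatype(), true);
    // (&ColumnSchema, &SqlValue, timezone, unary_op, auto_string_to_numeric)
    sql_value_to_value(&column_schema, literal, tz, None, false)
}
```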
@@ -24,7 +24,6 @@ use store_api::storage::GcReport;

mod close_region;
mod downgrade_region;
mod enter_staging;
mod file_ref;
mod flush_region;
mod gc_worker;
@@ -33,7 +32,6 @@ mod upgrade_region;

use crate::heartbeat::handler::close_region::CloseRegionsHandler;
use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler;
use crate::heartbeat::handler::enter_staging::EnterStagingRegionsHandler;
use crate::heartbeat::handler::file_ref::GetFileRefsHandler;
use crate::heartbeat::handler::flush_region::FlushRegionsHandler;
use crate::heartbeat::handler::gc_worker::GcRegionsHandler;
@@ -125,9 +123,6 @@ impl RegionHeartbeatResponseHandler {
Instruction::GcRegions(_) => Ok(Some(Box::new(GcRegionsHandler.into()))),
Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(),
Instruction::Suspend => Ok(None),
Instruction::EnterStagingRegions(_) => {
Ok(Some(Box::new(EnterStagingRegionsHandler.into())))
}
}
}
}
@@ -141,7 +136,6 @@ pub enum InstructionHandlers {
UpgradeRegions(UpgradeRegionsHandler),
GetFileRefs(GetFileRefsHandler),
GcRegions(GcRegionsHandler),
EnterStagingRegions(EnterStagingRegionsHandler),
}

macro_rules! impl_from_handler {
@@ -163,8 +157,7 @@ impl_from_handler!(
DowngradeRegionsHandler => DowngradeRegions,
UpgradeRegionsHandler => UpgradeRegions,
GetFileRefsHandler => GetFileRefs,
GcRegionsHandler => GcRegions,
EnterStagingRegionsHandler => EnterStagingRegions
GcRegionsHandler => GcRegions
);

macro_rules! dispatch_instr {
@@ -209,7 +202,6 @@ dispatch_instr!(
UpgradeRegions => UpgradeRegions,
GetFileRefs => GetFileRefs,
GcRegions => GcRegions,
EnterStagingRegions => EnterStagingRegions
);

#[async_trait]
@@ -262,9 +254,7 @@ mod tests {
use common_meta::heartbeat::mailbox::{
HeartbeatMailbox, IncomingMessage, MailboxRef, MessageMeta,
};
use common_meta::instruction::{
DowngradeRegion, EnterStagingRegion, OpenRegion, UpgradeRegion,
};
use common_meta::instruction::{DowngradeRegion, OpenRegion, UpgradeRegion};
use mito2::config::MitoConfig;
use mito2::engine::MITO_ENGINE_NAME;
use mito2::test_util::{CreateRequestBuilder, TestEnv};
@@ -345,16 +335,6 @@ mod tests {
region_id,
..Default::default()
}]);
assert!(
heartbeat_handler
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction)))
);

// Enter staging region
let instruction = Instruction::EnterStagingRegions(vec![EnterStagingRegion {
region_id,
partition_expr: "".to_string(),
}]);
assert!(
heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction)))
);

@@ -1,243 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_meta::instruction::{
|
||||
EnterStagingRegion, EnterStagingRegionReply, EnterStagingRegionsReply, InstructionReply,
|
||||
};
|
||||
use common_telemetry::{error, warn};
|
||||
use futures::future::join_all;
|
||||
use store_api::region_request::{EnterStagingRequest, RegionRequest};
|
||||
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct EnterStagingRegionsHandler;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl InstructionHandler for EnterStagingRegionsHandler {
|
||||
type Instruction = Vec<EnterStagingRegion>;
|
||||
|
||||
async fn handle(
|
||||
&self,
|
||||
ctx: &HandlerContext,
|
||||
enter_staging: Self::Instruction,
|
||||
) -> Option<InstructionReply> {
|
||||
let futures = enter_staging.into_iter().map(|enter_staging_region| {
|
||||
Self::handle_enter_staging_region(ctx, enter_staging_region)
|
||||
});
|
||||
let results = join_all(futures).await;
|
||||
Some(InstructionReply::EnterStagingRegions(
|
||||
EnterStagingRegionsReply::new(results),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl EnterStagingRegionsHandler {
|
||||
async fn handle_enter_staging_region(
|
||||
ctx: &HandlerContext,
|
||||
EnterStagingRegion {
|
||||
region_id,
|
||||
partition_expr,
|
||||
}: EnterStagingRegion,
|
||||
) -> EnterStagingRegionReply {
|
||||
let Some(writable) = ctx.region_server.is_region_leader(region_id) else {
|
||||
warn!("Region: {} is not found", region_id);
|
||||
return EnterStagingRegionReply {
|
||||
region_id,
|
||||
ready: false,
|
||||
exists: false,
|
||||
error: None,
|
||||
};
|
||||
};
|
||||
if !writable {
|
||||
warn!("Region: {} is not writable", region_id);
|
||||
return EnterStagingRegionReply {
|
||||
region_id,
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: Some("Region is not writable".into()),
|
||||
};
|
||||
}
|
||||
|
||||
match ctx
|
||||
.region_server
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::EnterStaging(EnterStagingRequest { partition_expr }),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(_) => EnterStagingRegionReply {
|
||||
region_id,
|
||||
ready: true,
|
||||
exists: true,
|
||||
error: None,
|
||||
},
|
||||
Err(err) => {
|
||||
error!(err; "Failed to enter staging region");
|
||||
EnterStagingRegionReply {
|
||||
region_id,
|
||||
ready: false,
|
||||
exists: true,
|
||||
error: Some(format!("{err:?}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_meta::instruction::EnterStagingRegion;
|
||||
use mito2::config::MitoConfig;
|
||||
use mito2::engine::MITO_ENGINE_NAME;
|
||||
use mito2::test_util::{CreateRequestBuilder, TestEnv};
|
||||
use store_api::path_utils::table_dir;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::region_request::RegionRequest;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::heartbeat::handler::enter_staging::EnterStagingRegionsHandler;
|
||||
use crate::heartbeat::handler::{HandlerContext, InstructionHandler};
|
||||
use crate::region_server::RegionServer;
|
||||
use crate::tests::{MockRegionEngine, mock_region_server};
|
||||
|
||||
const PARTITION_EXPR: &str = "partition_expr";
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_region_not_exist() {
|
||||
let mut mock_region_server = mock_region_server();
|
||||
let (mock_engine, _) = MockRegionEngine::new(MITO_ENGINE_NAME);
|
||||
mock_region_server.register_engine(mock_engine);
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
let region_id = RegionId::new(1024, 1);
|
||||
let replies = EnterStagingRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
vec![EnterStagingRegion {
|
||||
region_id,
|
||||
partition_expr: "".to_string(),
|
||||
}],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let replies = replies.expect_enter_staging_regions_reply();
|
||||
let reply = &replies[0];
|
||||
assert!(!reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(!reply.ready);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_region_not_writable() {
|
||||
let mock_region_server = mock_region_server();
|
||||
let region_id = RegionId::new(1024, 1);
|
||||
let (mock_engine, _) =
|
||||
MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| {
|
||||
region_engine.mock_role = Some(Some(RegionRole::Follower));
|
||||
region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0)));
|
||||
});
|
||||
mock_region_server.register_test_region(region_id, mock_engine);
|
||||
let handler_context = HandlerContext::new_for_test(mock_region_server);
|
||||
let replies = EnterStagingRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
vec![EnterStagingRegion {
|
||||
region_id,
|
||||
partition_expr: "".to_string(),
|
||||
}],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let replies = replies.expect_enter_staging_regions_reply();
|
||||
let reply = &replies[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_some());
|
||||
assert!(!reply.ready);
|
||||
}
|
||||
|
||||
async fn prepare_region(region_server: &RegionServer) {
|
||||
let builder = CreateRequestBuilder::new();
|
||||
let mut create_req = builder.build();
|
||||
create_req.table_dir = table_dir("test", 1024);
|
||||
let region_id = RegionId::new(1024, 1);
|
||||
region_server
|
||||
.handle_request(region_id, RegionRequest::Create(create_req))
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_enter_staging() {
|
||||
let mut region_server = mock_region_server();
|
||||
let region_id = RegionId::new(1024, 1);
|
||||
let mut engine_env = TestEnv::new().await;
|
||||
let engine = engine_env.create_engine(MitoConfig::default()).await;
|
||||
region_server.register_engine(Arc::new(engine.clone()));
|
||||
prepare_region(®ion_server).await;
|
||||
|
||||
let handler_context = HandlerContext::new_for_test(region_server);
|
||||
let replies = EnterStagingRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
vec![EnterStagingRegion {
|
||||
region_id,
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let replies = replies.expect_enter_staging_regions_reply();
|
||||
let reply = &replies[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.ready);
|
||||
|
||||
// Should be ok to enter staging mode again with the same partition expr
|
||||
let replies = EnterStagingRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
vec![EnterStagingRegion {
|
||||
region_id,
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let replies = replies.expect_enter_staging_regions_reply();
|
||||
let reply = &replies[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_none());
|
||||
assert!(reply.ready);
|
||||
|
||||
// Should throw error if try to enter staging mode again with a different partition expr
|
||||
let replies = EnterStagingRegionsHandler
|
||||
.handle(
|
||||
&handler_context,
|
||||
vec![EnterStagingRegion {
|
||||
region_id,
|
||||
partition_expr: "".to_string(),
|
||||
}],
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let replies = replies.expect_enter_staging_regions_reply();
|
||||
let reply = &replies[0];
|
||||
assert!(reply.exists);
|
||||
assert!(reply.error.is_some());
|
||||
assert!(!reply.ready);
|
||||
}
|
||||
}
|
||||
@@ -19,7 +19,6 @@ use arrow::datatypes::{
Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
TimestampNanosecondType, TimestampSecondType,
};
use arrow_array::Array;
use common_time::time::Time;
use common_time::{Duration, Timestamp};

@@ -127,28 +126,3 @@ pub fn duration_array_value(array: &ArrayRef, i: usize) -> Duration {
};
Duration::new(v, time_unit.into())
}

/// Get the string value at index `i` for `Utf8`, `LargeUtf8`, or `Utf8View` arrays.
///
/// Returns `None` when the array type is not a string type or the value is null.
///
/// # Panics
///
/// If index `i` is out of bounds.
pub fn string_array_value_at_index(array: &ArrayRef, i: usize) -> Option<&str> {
match array.data_type() {
DataType::Utf8 => {
let array = array.as_string::<i32>();
array.is_valid(i).then(|| array.value(i))
}
DataType::LargeUtf8 => {
let array = array.as_string::<i64>();
array.is_valid(i).then(|| array.value(i))
}
DataType::Utf8View => {
let array = array.as_string_view();
array.is_valid(i).then(|| array.value(i))
}
_ => None,
}
}

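A minimal usage sketch for the helper above, assuming the function and the usual `arrow_array` types are in scope; the sample strings are made up for illustration.

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, StringArray};

#[test]
fn string_array_value_at_index_usage() {
    // Illustrative data only: a Utf8 array with one valid slot and one null.
    let array: ArrayRef = Arc::new(StringArray::from(vec![Some("greptime"), None]));
    assert_eq!(string_array_value_at_index(&array, 0), Some("greptime"));
    // Null slots come back as None instead of panicking.
    assert_eq!(string_array_value_at_index(&array, 1), None);
}
```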
@@ -26,9 +26,9 @@ use std::sync::Arc;

use serde::{Deserialize, Serialize};
use serde_json::{Map, Value as Json};
use snafu::{OptionExt, ResultExt, ensure};
use snafu::{ResultExt, ensure};

use crate::error::{self, InvalidJsonSnafu, Result, SerializeSnafu};
use crate::error::{self, Error};
use crate::json::value::{JsonValue, JsonVariant};
use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType};
use crate::types::{StructField, StructType};
@@ -71,7 +71,7 @@ impl JsonStructureSettings {
    pub const RAW_FIELD: &'static str = "_raw";

    /// Decode an encoded StructValue back into a serde_json::Value.
    pub fn decode(&self, value: Value) -> Result<Json> {
    pub fn decode(&self, value: Value) -> Result<Json, Error> {
        let context = JsonContext {
            key_path: String::new(),
            settings: self,
@@ -82,7 +82,7 @@ impl JsonStructureSettings {
    /// Decode a StructValue that was encoded with current settings back into a fully structured StructValue.
    /// This is useful for reconstructing the original structure from encoded data, especially when
    /// unstructured encoding was used for some fields.
    pub fn decode_struct(&self, struct_value: StructValue) -> Result<StructValue> {
    pub fn decode_struct(&self, struct_value: StructValue) -> Result<StructValue, Error> {
        let context = JsonContext {
            key_path: String::new(),
            settings: self,
@@ -91,11 +91,7 @@ impl JsonStructureSettings {
    }

    /// Encode a serde_json::Value into a Value::Json using current settings.
    pub fn encode(&self, json: Json) -> Result<Value> {
        if let Some(json_struct) = self.json_struct() {
            return encode_by_struct(json_struct, json);
        }

    pub fn encode(&self, json: Json) -> Result<Value, Error> {
        let context = JsonContext {
            key_path: String::new(),
            settings: self,
@@ -108,21 +104,13 @@ impl JsonStructureSettings {
        &self,
        json: Json,
        data_type: Option<&JsonNativeType>,
    ) -> Result<Value> {
    ) -> Result<Value, Error> {
        let context = JsonContext {
            key_path: String::new(),
            settings: self,
        };
        encode_json_with_context(json, data_type, &context).map(|v| Value::Json(Box::new(v)))
    }

    fn json_struct(&self) -> Option<&StructType> {
        match &self {
            JsonStructureSettings::Structured(fields) => fields.as_ref(),
            JsonStructureSettings::PartialUnstructuredByKey { fields, .. } => fields.as_ref(),
            _ => None,
        }
    }
}
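For orientation, the signatures above describe a JSON round trip through the settings object. A rough usage sketch, assuming `JsonStructureSettings`, `Value`, and `Error` are in scope and that `UnstructuredRaw` is the variant referenced later in this diff (this is not code taken from the change itself):

use serde_json::json;

fn roundtrip_sketch() -> Result<(), Error> {
    let settings = JsonStructureSettings::UnstructuredRaw;
    // Encode a serde_json::Value into a Value::Json using current settings ...
    let encoded: Value = settings.encode(json!({"a": 1, "b": "x"}))?;
    // ... and decode it back; the raw-string encoding is expected to round-trip.
    let decoded = settings.decode(encoded)?;
    assert_eq!(decoded, json!({"a": 1, "b": "x"}));
    Ok(())
}
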
|
||||
impl Default for JsonStructureSettings {
|
||||
@@ -156,54 +144,12 @@ impl<'a> JsonContext<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn encode_by_struct(json_struct: &StructType, mut json: Json) -> Result<Value> {
|
||||
let Some(json_object) = json.as_object_mut() else {
|
||||
return InvalidJsonSnafu {
|
||||
value: "expect JSON object when struct is provided",
|
||||
}
|
||||
.fail();
|
||||
};
|
||||
let mut encoded = BTreeMap::new();
|
||||
|
||||
fn extract_field(json_object: &mut Map<String, Json>, field: &str) -> Result<Option<Json>> {
|
||||
let (first, rest) = field.split_once('.').unwrap_or((field, ""));
|
||||
|
||||
if rest.is_empty() {
|
||||
Ok(json_object.remove(first))
|
||||
} else {
|
||||
let Some(value) = json_object.get_mut(first) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let json_object = value.as_object_mut().with_context(|| InvalidJsonSnafu {
|
||||
value: format!(r#"expect "{}" an object"#, first),
|
||||
})?;
|
||||
extract_field(json_object, rest)
|
||||
}
|
||||
}
|
||||
|
||||
let fields = json_struct.fields();
|
||||
for field in fields.iter() {
|
||||
let Some(field_value) = extract_field(json_object, field.name())? else {
|
||||
continue;
|
||||
};
|
||||
let field_type: JsonNativeType = field.data_type().into();
|
||||
let field_value = try_convert_to_expected_type(field_value, &field_type)?;
|
||||
encoded.insert(field.name().to_string(), field_value);
|
||||
}
|
||||
|
||||
let rest = serde_json::to_string(json_object).context(SerializeSnafu)?;
|
||||
encoded.insert(JsonStructureSettings::RAW_FIELD.to_string(), rest.into());
|
||||
|
||||
let value: JsonValue = encoded.into();
|
||||
Ok(Value::Json(Box::new(value)))
|
||||
}
|
||||
|
||||
/// Main encoding function with key path tracking
|
||||
pub fn encode_json_with_context<'a>(
|
||||
json: Json,
|
||||
data_type: Option<&JsonNativeType>,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<JsonValue> {
|
||||
) -> Result<JsonValue, Error> {
|
||||
// Check if the entire encoding should be unstructured
|
||||
if matches!(context.settings, JsonStructureSettings::UnstructuredRaw) {
|
||||
let json_string = json.to_string();
|
||||
@@ -269,7 +215,7 @@ fn encode_json_object_with_context<'a>(
|
||||
mut json_object: Map<String, Json>,
|
||||
fields: Option<&JsonObjectType>,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<JsonValue> {
|
||||
) -> Result<JsonValue, Error> {
|
||||
let mut object = BTreeMap::new();
|
||||
// First, process fields from the provided schema in their original order
|
||||
if let Some(fields) = fields {
|
||||
@@ -302,7 +248,7 @@ fn encode_json_array_with_context<'a>(
|
||||
json_array: Vec<Json>,
|
||||
item_type: Option<&JsonNativeType>,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<JsonValue> {
|
||||
) -> Result<JsonValue, Error> {
|
||||
let json_array_len = json_array.len();
|
||||
let mut items = Vec::with_capacity(json_array_len);
|
||||
let mut element_type = item_type.cloned();
|
||||
@@ -340,7 +286,7 @@ fn encode_json_value_with_context<'a>(
|
||||
json: Json,
|
||||
expected_type: Option<&JsonNativeType>,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<JsonValue> {
|
||||
) -> Result<JsonValue, Error> {
|
||||
// Check if current key should be treated as unstructured
|
||||
if context.is_unstructured_key() {
|
||||
return Ok(json.to_string().into());
|
||||
@@ -355,7 +301,7 @@ fn encode_json_value_with_context<'a>(
|
||||
if let Some(expected) = expected_type
|
||||
&& let Ok(value) = try_convert_to_expected_type(i, expected)
|
||||
{
|
||||
return Ok(value.into());
|
||||
return Ok(value);
|
||||
}
|
||||
Ok(i.into())
|
||||
} else if let Some(u) = n.as_u64() {
|
||||
@@ -363,7 +309,7 @@ fn encode_json_value_with_context<'a>(
|
||||
if let Some(expected) = expected_type
|
||||
&& let Ok(value) = try_convert_to_expected_type(u, expected)
|
||||
{
|
||||
return Ok(value.into());
|
||||
return Ok(value);
|
||||
}
|
||||
if u <= i64::MAX as u64 {
|
||||
Ok((u as i64).into())
|
||||
@@ -375,7 +321,7 @@ fn encode_json_value_with_context<'a>(
|
||||
if let Some(expected) = expected_type
|
||||
&& let Ok(value) = try_convert_to_expected_type(f, expected)
|
||||
{
|
||||
return Ok(value.into());
|
||||
return Ok(value);
|
||||
}
|
||||
|
||||
// Default to f64 for floating point numbers
|
||||
@@ -389,7 +335,7 @@ fn encode_json_value_with_context<'a>(
|
||||
if let Some(expected) = expected_type
|
||||
&& let Ok(value) = try_convert_to_expected_type(s.as_str(), expected)
|
||||
{
|
||||
return Ok(value.into());
|
||||
return Ok(value);
|
||||
}
|
||||
Ok(s.into())
|
||||
}
|
||||
@@ -399,7 +345,10 @@ fn encode_json_value_with_context<'a>(
|
||||
}
|
||||
|
||||
/// Main decoding function with key path tracking
|
||||
pub fn decode_value_with_context(value: Value, context: &JsonContext) -> Result<Json> {
|
||||
pub fn decode_value_with_context<'a>(
|
||||
value: Value,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<Json, Error> {
|
||||
// Check if the entire decoding should be unstructured
|
||||
if matches!(context.settings, JsonStructureSettings::UnstructuredRaw) {
|
||||
return decode_unstructured_value(value);
|
||||
@@ -421,7 +370,7 @@ pub fn decode_value_with_context(value: Value, context: &JsonContext) -> Result<
|
||||
fn decode_struct_with_context<'a>(
|
||||
struct_value: StructValue,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<Json> {
|
||||
) -> Result<Json, Error> {
|
||||
let mut json_object = Map::with_capacity(struct_value.len());
|
||||
|
||||
let (items, fields) = struct_value.into_parts();
|
||||
@@ -436,7 +385,10 @@ fn decode_struct_with_context<'a>(
|
||||
}
|
||||
|
||||
/// Decode a list value to JSON array
|
||||
fn decode_list_with_context(list_value: ListValue, context: &JsonContext) -> Result<Json> {
|
||||
fn decode_list_with_context<'a>(
|
||||
list_value: ListValue,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<Json, Error> {
|
||||
let mut json_array = Vec::with_capacity(list_value.len());
|
||||
|
||||
let data_items = list_value.take_items();
|
||||
@@ -451,7 +403,7 @@ fn decode_list_with_context(list_value: ListValue, context: &JsonContext) -> Res
|
||||
}
|
||||
|
||||
/// Decode unstructured value (stored as string)
|
||||
fn decode_unstructured_value(value: Value) -> Result<Json> {
|
||||
fn decode_unstructured_value(value: Value) -> Result<Json, Error> {
|
||||
match value {
|
||||
// Handle expected format: StructValue with single _raw field
|
||||
Value::Struct(struct_value) => {
|
||||
@@ -491,7 +443,7 @@ fn decode_unstructured_value(value: Value) -> Result<Json> {
|
||||
}
|
||||
|
||||
/// Decode primitive value to JSON
|
||||
fn decode_primitive_value(value: Value) -> Result<Json> {
|
||||
fn decode_primitive_value(value: Value) -> Result<Json, Error> {
|
||||
match value {
|
||||
Value::Null => Ok(Json::Null),
|
||||
Value::Boolean(b) => Ok(Json::Bool(b)),
|
||||
@@ -535,7 +487,7 @@ fn decode_primitive_value(value: Value) -> Result<Json> {
|
||||
fn decode_struct_with_settings<'a>(
|
||||
struct_value: StructValue,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<StructValue> {
|
||||
) -> Result<StructValue, Error> {
|
||||
// Check if we can return the struct directly (Structured case)
|
||||
if matches!(context.settings, JsonStructureSettings::Structured(_)) {
|
||||
return Ok(struct_value);
|
||||
@@ -615,7 +567,7 @@ fn decode_struct_with_settings<'a>(
|
||||
fn decode_list_with_settings<'a>(
|
||||
list_value: ListValue,
|
||||
context: &JsonContext<'a>,
|
||||
) -> Result<ListValue> {
|
||||
) -> Result<ListValue, Error> {
|
||||
let mut items = Vec::with_capacity(list_value.len());
|
||||
|
||||
let (data_items, datatype) = list_value.into_parts();
|
||||
@@ -640,7 +592,7 @@ fn decode_list_with_settings<'a>(
|
||||
}
|
||||
|
||||
/// Helper function to decode a struct that was encoded with UnstructuredRaw settings
|
||||
fn decode_unstructured_raw_struct(struct_value: StructValue) -> Result<StructValue> {
|
||||
fn decode_unstructured_raw_struct(struct_value: StructValue) -> Result<StructValue, Error> {
|
||||
// For UnstructuredRaw, the struct must have exactly one field named "_raw"
|
||||
if struct_value.struct_type().fields().len() == 1 {
|
||||
let field = &struct_value.struct_type().fields()[0];
|
||||
@@ -684,9 +636,12 @@ fn decode_unstructured_raw_struct(struct_value: StructValue) -> Result<StructVal
|
||||
}
|
||||
|
||||
/// Helper function to try converting a value to an expected type
|
||||
fn try_convert_to_expected_type<T>(value: T, expected_type: &JsonNativeType) -> Result<JsonVariant>
|
||||
fn try_convert_to_expected_type<T>(
|
||||
value: T,
|
||||
expected_type: &JsonNativeType,
|
||||
) -> Result<JsonValue, Error>
|
||||
where
|
||||
T: Into<JsonVariant>,
|
||||
T: Into<JsonValue>,
|
||||
{
|
||||
let value = value.into();
|
||||
let cast_error = || {
|
||||
@@ -695,7 +650,7 @@ where
|
||||
}
|
||||
.fail()
|
||||
};
|
||||
let actual_type = &value.native_type();
|
||||
let actual_type = value.json_type().native_type();
|
||||
match (actual_type, expected_type) {
|
||||
(x, y) if x == y => Ok(value),
|
||||
(JsonNativeType::Number(x), JsonNativeType::Number(y)) => match (x, y) {
|
||||
@@ -736,107 +691,6 @@ mod tests {
|
||||
use crate::data_type::ConcreteDataType;
|
||||
use crate::types::ListType;
|
||||
|
||||
#[test]
|
||||
fn test_encode_by_struct() {
|
||||
let json_struct: StructType = [
|
||||
StructField::new("s", ConcreteDataType::string_datatype(), true),
|
||||
StructField::new("foo.i", ConcreteDataType::int64_datatype(), true),
|
||||
StructField::new("x.y.z", ConcreteDataType::boolean_datatype(), true),
|
||||
]
|
||||
.into();
|
||||
|
||||
let json = json!({
|
||||
"s": "hello",
|
||||
"t": "world",
|
||||
"foo": {
|
||||
"i": 1,
|
||||
"j": 2
|
||||
},
|
||||
"x": {
|
||||
"y": {
|
||||
"z": true
|
||||
}
|
||||
}
|
||||
});
|
||||
let value = encode_by_struct(&json_struct, json).unwrap();
|
||||
assert_eq!(
|
||||
value.to_string(),
|
||||
r#"Json({ _raw: {"foo":{"j":2},"t":"world","x":{"y":{}}}, foo.i: 1, s: hello, x.y.z: true })"#
|
||||
);
|
||||
|
||||
let json = json!({
|
||||
"t": "world",
|
||||
"foo": {
|
||||
"i": 1,
|
||||
"j": 2
|
||||
},
|
||||
"x": {
|
||||
"y": {
|
||||
"z": true
|
||||
}
|
||||
}
|
||||
});
|
||||
let value = encode_by_struct(&json_struct, json).unwrap();
|
||||
assert_eq!(
|
||||
value.to_string(),
|
||||
r#"Json({ _raw: {"foo":{"j":2},"t":"world","x":{"y":{}}}, foo.i: 1, x.y.z: true })"#
|
||||
);
|
||||
|
||||
let json = json!({
|
||||
"s": 1234,
|
||||
"foo": {
|
||||
"i": 1,
|
||||
"j": 2
|
||||
},
|
||||
"x": {
|
||||
"y": {
|
||||
"z": true
|
||||
}
|
||||
}
|
||||
});
|
||||
let value = encode_by_struct(&json_struct, json).unwrap();
|
||||
assert_eq!(
|
||||
value.to_string(),
|
||||
r#"Json({ _raw: {"foo":{"j":2},"x":{"y":{}}}, foo.i: 1, s: 1234, x.y.z: true })"#
|
||||
);
|
||||
|
||||
let json = json!({
|
||||
"s": "hello",
|
||||
"t": "world",
|
||||
"foo": {
|
||||
"i": "bar",
|
||||
"j": 2
|
||||
},
|
||||
"x": {
|
||||
"y": {
|
||||
"z": true
|
||||
}
|
||||
}
|
||||
});
|
||||
let result = encode_by_struct(&json_struct, json);
|
||||
assert_eq!(
|
||||
result.unwrap_err().to_string(),
|
||||
"Cannot cast value bar to Number(I64)"
|
||||
);
|
||||
|
||||
let json = json!({
|
||||
"s": "hello",
|
||||
"t": "world",
|
||||
"foo": {
|
||||
"i": 1,
|
||||
"j": 2
|
||||
},
|
||||
"x": {
|
||||
"y": "z"
|
||||
}
|
||||
});
|
||||
let result = encode_by_struct(&json_struct, json);
|
||||
assert_eq!(
|
||||
result.unwrap_err().to_string(),
|
||||
r#"Invalid JSON: expect "y" an object"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_json_null() {
|
||||
let json = Json::Null;
|
||||
|
||||
@@ -82,18 +82,6 @@ impl From<f64> for JsonNumber {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Number> for JsonNumber {
|
||||
fn from(n: Number) -> Self {
|
||||
if let Some(i) = n.as_i64() {
|
||||
i.into()
|
||||
} else if let Some(i) = n.as_u64() {
|
||||
i.into()
|
||||
} else {
|
||||
n.as_f64().unwrap_or(f64::NAN).into()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for JsonNumber {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
@@ -121,28 +109,7 @@ pub enum JsonVariant {
|
||||
}
|
||||
|
||||
impl JsonVariant {
|
||||
pub(crate) fn as_i64(&self) -> Option<i64> {
|
||||
match self {
|
||||
JsonVariant::Number(n) => n.as_i64(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_u64(&self) -> Option<u64> {
|
||||
match self {
|
||||
JsonVariant::Number(n) => n.as_u64(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_f64(&self) -> Option<f64> {
|
||||
match self {
|
||||
JsonVariant::Number(n) => Some(n.as_f64()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn native_type(&self) -> JsonNativeType {
|
||||
fn native_type(&self) -> JsonNativeType {
|
||||
match self {
|
||||
JsonVariant::Null => JsonNativeType::Null,
|
||||
JsonVariant::Bool(_) => JsonNativeType::Bool,
|
||||
@@ -238,32 +205,6 @@ impl<K: Into<String>, V: Into<JsonVariant>, const N: usize> From<[(K, V); N]> fo
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Value> for JsonVariant {
|
||||
fn from(v: serde_json::Value) -> Self {
|
||||
fn helper(v: serde_json::Value) -> JsonVariant {
|
||||
match v {
|
||||
serde_json::Value::Null => JsonVariant::Null,
|
||||
serde_json::Value::Bool(b) => b.into(),
|
||||
serde_json::Value::Number(n) => n.into(),
|
||||
serde_json::Value::String(s) => s.into(),
|
||||
serde_json::Value::Array(array) => {
|
||||
JsonVariant::Array(array.into_iter().map(helper).collect())
|
||||
}
|
||||
serde_json::Value::Object(object) => {
|
||||
JsonVariant::Object(object.into_iter().map(|(k, v)| (k, helper(v))).collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
helper(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BTreeMap<String, JsonVariant>> for JsonVariant {
|
||||
fn from(v: BTreeMap<String, JsonVariant>) -> Self {
|
||||
Self::Object(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for JsonVariant {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
@@ -336,11 +277,24 @@ impl JsonValue {
|
||||
}
|
||||
|
||||
pub(crate) fn as_i64(&self) -> Option<i64> {
|
||||
self.json_variant.as_i64()
|
||||
match self.json_variant {
|
||||
JsonVariant::Number(n) => n.as_i64(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_u64(&self) -> Option<u64> {
|
||||
self.json_variant.as_u64()
|
||||
match self.json_variant {
|
||||
JsonVariant::Number(n) => n.as_u64(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_f64(&self) -> Option<f64> {
|
||||
match self.json_variant {
|
||||
JsonVariant::Number(n) => Some(n.as_f64()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_f64_lossy(&self) -> Option<f64> {
|
||||
|
||||
@@ -122,9 +122,9 @@ pub struct StructField {
}

impl StructField {
    pub fn new<T: Into<String>>(name: T, data_type: ConcreteDataType, nullable: bool) -> Self {
    pub fn new(name: String, data_type: ConcreteDataType, nullable: bool) -> Self {
        StructField {
            name: name.into(),
            name,
            data_type,
            nullable,
            metadata: BTreeMap::new(),

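With the generic `Into<String>` parameter dropped, call sites now pass an owned `String` explicitly. A small illustrative fragment, assuming `StructField` and `ConcreteDataType` are in scope (the field name here is made up):

let field = StructField::new(
    "host".to_string(), // previously: StructField::new("host", ...)
    ConcreteDataType::string_datatype(),
    true,
);
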
@@ -490,6 +490,7 @@ impl<'a> FlownodeServiceBuilder<'a> {
|
||||
let config = GrpcServerConfig {
|
||||
max_recv_message_size: opts.grpc.max_recv_message_size.as_bytes() as usize,
|
||||
max_send_message_size: opts.grpc.max_send_message_size.as_bytes() as usize,
|
||||
max_total_message_memory: opts.grpc.max_total_message_memory.as_bytes() as usize,
|
||||
tls: opts.grpc.tls.clone(),
|
||||
max_connection_age: opts.grpc.max_connection_age,
|
||||
};
|
||||
|
||||
@@ -32,7 +32,6 @@ common-frontend.workspace = true
|
||||
common-function.workspace = true
|
||||
common-grpc.workspace = true
|
||||
common-macro.workspace = true
|
||||
common-memory-manager.workspace = true
|
||||
common-meta.workspace = true
|
||||
common-options.workspace = true
|
||||
common-procedure.workspace = true
|
||||
|
||||
@@ -357,6 +357,14 @@ pub enum Error {
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to acquire more permits from limiter"))]
|
||||
AcquireLimiter {
|
||||
#[snafu(source)]
|
||||
error: tokio::sync::AcquireError,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Service suspended"))]
|
||||
Suspended {
|
||||
#[snafu(implicit)]
|
||||
@@ -441,6 +449,8 @@ impl ErrorExt for Error {
|
||||
|
||||
Error::StatementTimeout { .. } => StatusCode::Cancelled,
|
||||
|
||||
Error::AcquireLimiter { .. } => StatusCode::Internal,
|
||||
|
||||
Error::Suspended { .. } => StatusCode::Suspended,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ use std::sync::Arc;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_config::config::Configurable;
|
||||
use common_event_recorder::EventRecorderOptions;
|
||||
use common_memory_manager::OnExhaustedPolicy;
|
||||
use common_options::datanode::DatanodeClientOptions;
|
||||
use common_options::memory::MemoryOptions;
|
||||
use common_telemetry::logging::{LoggingOptions, SlowQueryOptions, TracingOptions};
|
||||
@@ -46,12 +45,6 @@ pub struct FrontendOptions {
|
||||
pub default_timezone: Option<String>,
|
||||
pub default_column_prefix: Option<String>,
|
||||
pub heartbeat: HeartbeatOptions,
|
||||
/// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
|
||||
/// Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
pub max_in_flight_write_bytes: ReadableSize,
|
||||
/// Policy when write bytes quota is exhausted.
|
||||
/// Options: "wait" (default, 10s), "wait(<duration>)", "fail"
|
||||
pub write_bytes_exhausted_policy: OnExhaustedPolicy,
|
||||
pub http: HttpOptions,
|
||||
pub grpc: GrpcOptions,
|
||||
/// The internal gRPC options for the frontend service.
|
||||
@@ -70,6 +63,7 @@ pub struct FrontendOptions {
|
||||
pub user_provider: Option<String>,
|
||||
pub tracing: TracingOptions,
|
||||
pub query: QueryOptions,
|
||||
pub max_in_flight_write_bytes: Option<ReadableSize>,
|
||||
pub slow_query: SlowQueryOptions,
|
||||
pub memory: MemoryOptions,
|
||||
/// The event recorder options.
|
||||
@@ -83,8 +77,6 @@ impl Default for FrontendOptions {
|
||||
default_timezone: None,
|
||||
default_column_prefix: None,
|
||||
heartbeat: HeartbeatOptions::frontend_default(),
|
||||
max_in_flight_write_bytes: ReadableSize(0),
|
||||
write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
|
||||
http: HttpOptions::default(),
|
||||
grpc: GrpcOptions::default(),
|
||||
internal_grpc: None,
|
||||
@@ -101,6 +93,7 @@ impl Default for FrontendOptions {
|
||||
user_provider: None,
|
||||
tracing: TracingOptions::default(),
|
||||
query: QueryOptions::default(),
|
||||
max_in_flight_write_bytes: None,
|
||||
slow_query: SlowQueryOptions::default(),
|
||||
memory: MemoryOptions::default(),
|
||||
event_recorder: EventRecorderOptions::default(),
|
||||
|
||||
@@ -97,6 +97,7 @@ use crate::error::{
|
||||
ParseSqlSnafu, PermissionSnafu, PlanStatementSnafu, Result, SqlExecInterceptedSnafu,
|
||||
StatementTimeoutSnafu, TableOperationSnafu,
|
||||
};
|
||||
use crate::limiter::LimiterRef;
|
||||
use crate::stream_wrapper::CancellableStreamWrapper;
|
||||
|
||||
lazy_static! {
|
||||
@@ -117,6 +118,7 @@ pub struct Instance {
|
||||
deleter: DeleterRef,
|
||||
table_metadata_manager: TableMetadataManagerRef,
|
||||
event_recorder: Option<EventRecorderRef>,
|
||||
limiter: Option<LimiterRef>,
|
||||
process_manager: ProcessManagerRef,
|
||||
slow_query_options: SlowQueryOptions,
|
||||
suspend: Arc<AtomicBool>,
|
||||
|
||||
@@ -49,6 +49,7 @@ use crate::events::EventHandlerImpl;
|
||||
use crate::frontend::FrontendOptions;
|
||||
use crate::instance::Instance;
|
||||
use crate::instance::region_query::FrontendRegionQueryHandler;
|
||||
use crate::limiter::Limiter;
|
||||
|
||||
/// The frontend [`Instance`] builder.
|
||||
pub struct FrontendBuilder {
|
||||
@@ -247,6 +248,14 @@ impl FrontendBuilder {
|
||||
self.options.event_recorder.ttl,
|
||||
))));
|
||||
|
||||
// Create the limiter if the max_in_flight_write_bytes is set.
|
||||
let limiter = self
|
||||
.options
|
||||
.max_in_flight_write_bytes
|
||||
.map(|max_in_flight_write_bytes| {
|
||||
Arc::new(Limiter::new(max_in_flight_write_bytes.as_bytes() as usize))
|
||||
});
|
||||
|
||||
Ok(Instance {
|
||||
catalog_manager: self.catalog_manager,
|
||||
pipeline_operator,
|
||||
@@ -257,6 +266,7 @@ impl FrontendBuilder {
|
||||
deleter,
|
||||
table_metadata_manager: Arc::new(TableMetadataManager::new(kv_backend)),
|
||||
event_recorder: Some(event_recorder),
|
||||
limiter,
|
||||
process_manager,
|
||||
otlp_metrics_table_legacy_cache: DashMap::new(),
|
||||
slow_query_options: self.options.slow_query.clone(),
|
||||
|
||||
@@ -71,6 +71,12 @@ impl GrpcQueryHandler for Instance {
|
||||
.check_permission(ctx.current_user(), PermissionReq::GrpcRequest(&request))
|
||||
.context(PermissionSnafu)?;
|
||||
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(limiter.limit_request(&request).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let output = match request {
|
||||
Request::Inserts(requests) => self.handle_inserts(requests, ctx.clone()).await?,
|
||||
Request::RowInserts(requests) => {
|
||||
|
||||
@@ -22,7 +22,7 @@ use common_error::ext::BoxedError;
|
||||
use common_time::Timestamp;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use servers::error::{
|
||||
AuthSnafu, CatalogSnafu, Error, TimestampOverflowSnafu, UnexpectedResultSnafu,
|
||||
AuthSnafu, CatalogSnafu, Error, OtherSnafu, TimestampOverflowSnafu, UnexpectedResultSnafu,
|
||||
};
|
||||
use servers::influxdb::InfluxdbRequest;
|
||||
use servers::interceptor::{LineProtocolInterceptor, LineProtocolInterceptorRef};
|
||||
@@ -59,6 +59,18 @@ impl InfluxdbLineProtocolHandler for Instance {
|
||||
.post_lines_conversion(requests, ctx.clone())
|
||||
.await?;
|
||||
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_row_inserts(&requests)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.handle_influx_row_inserts(requests, ctx)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
|
||||
@@ -23,7 +23,8 @@ use datatypes::timestamp::TimestampNanosecond;
|
||||
use pipeline::pipeline_operator::PipelineOperator;
|
||||
use pipeline::{Pipeline, PipelineInfo, PipelineVersion};
|
||||
use servers::error::{
|
||||
AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, PipelineSnafu, Result as ServerResult,
|
||||
AuthSnafu, Error as ServerError, ExecuteGrpcRequestSnafu, OtherSnafu, PipelineSnafu,
|
||||
Result as ServerResult,
|
||||
};
|
||||
use servers::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef};
|
||||
use servers::query_handler::PipelineHandler;
|
||||
@@ -123,6 +124,18 @@ impl Instance {
|
||||
log: RowInsertRequests,
|
||||
ctx: QueryContextRef,
|
||||
) -> ServerResult<Output> {
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_row_inserts(&log)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.inserter
|
||||
.handle_log_inserts(log, ctx, self.statement_executor.as_ref())
|
||||
.await
|
||||
@@ -135,6 +148,18 @@ impl Instance {
|
||||
rows: RowInsertRequests,
|
||||
ctx: QueryContextRef,
|
||||
) -> ServerResult<Output> {
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_row_inserts(&rows)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.inserter
|
||||
.handle_trace_inserts(rows, ctx, self.statement_executor.as_ref())
|
||||
.await
|
||||
|
||||
@@ -16,7 +16,7 @@ use async_trait::async_trait;
|
||||
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
|
||||
use common_error::ext::BoxedError;
|
||||
use common_telemetry::tracing;
|
||||
use servers::error::{self as server_error, AuthSnafu, ExecuteGrpcQuerySnafu};
|
||||
use servers::error::{self as server_error, AuthSnafu, ExecuteGrpcQuerySnafu, OtherSnafu};
|
||||
use servers::opentsdb::codec::DataPoint;
|
||||
use servers::opentsdb::data_point_to_grpc_row_insert_requests;
|
||||
use servers::query_handler::OpentsdbProtocolHandler;
|
||||
@@ -41,6 +41,18 @@ impl OpentsdbProtocolHandler for Instance {
|
||||
|
||||
let (requests, _) = data_point_to_grpc_row_insert_requests(data_points)?;
|
||||
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_row_inserts(&requests)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// OpenTSDB is single value.
|
||||
let output = self
|
||||
.handle_row_inserts(requests, ctx, true, true)
|
||||
|
||||
@@ -24,7 +24,7 @@ use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
|
||||
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
|
||||
use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
|
||||
use pipeline::{GreptimePipelineParams, PipelineWay};
|
||||
use servers::error::{self, AuthSnafu, Result as ServerResult};
|
||||
use servers::error::{self, AuthSnafu, OtherSnafu, Result as ServerResult};
|
||||
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
|
||||
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
|
||||
use servers::otlp;
|
||||
@@ -83,6 +83,18 @@ impl OpenTelemetryProtocolHandler for Instance {
|
||||
ctx
|
||||
};
|
||||
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_row_inserts(&requests)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// If the user uses the legacy path, it is by default without metric engine.
|
||||
if metric_ctx.is_legacy || !metric_ctx.with_metric_engine {
|
||||
self.handle_row_inserts(requests, ctx, false, false)
|
||||
@@ -179,6 +191,18 @@ impl OpenTelemetryProtocolHandler for Instance {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_ctx_req(&opt_req)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let mut outputs = vec![];
|
||||
|
||||
for (temp_ctx, requests) in opt_req.as_req_iter(ctx) {
|
||||
|
||||
@@ -175,6 +175,18 @@ impl PromStoreProtocolHandler for Instance {
|
||||
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
|
||||
interceptor_ref.pre_write(&request, ctx.clone())?;
|
||||
|
||||
let _guard = if let Some(limiter) = &self.limiter {
|
||||
Some(
|
||||
limiter
|
||||
.limit_row_inserts(&request)
|
||||
.await
|
||||
.map_err(BoxedError::new)
|
||||
.context(error::OtherSnafu)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let output = if with_metric_engine {
|
||||
let physical_table = ctx
|
||||
.extension(PHYSICAL_TABLE_PARAM)
|
||||
|
||||
@@ -19,6 +19,7 @@ pub mod events;
|
||||
pub mod frontend;
|
||||
pub mod heartbeat;
|
||||
pub mod instance;
|
||||
pub(crate) mod limiter;
|
||||
pub(crate) mod metrics;
|
||||
pub mod server;
|
||||
pub mod service_config;
|
||||
|
332
src/frontend/src/limiter.rs
Normal file
@@ -0,0 +1,332 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use api::v1::column::Values;
use api::v1::greptime_request::Request;
use api::v1::value::ValueData;
use api::v1::{
    Decimal128, InsertRequests, IntervalMonthDayNano, JsonValue, RowInsertRequest,
    RowInsertRequests, json_value,
};
use pipeline::ContextReq;
use snafu::ResultExt;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

use crate::error::{AcquireLimiterSnafu, Result};

pub(crate) type LimiterRef = Arc<Limiter>;

/// A frontend request limiter that controls the total size of in-flight write
/// requests.
pub(crate) struct Limiter {
    max_in_flight_write_bytes: usize,
    byte_counter: Arc<Semaphore>,
}

impl Limiter {
    pub fn new(max_in_flight_write_bytes: usize) -> Self {
        Self {
            byte_counter: Arc::new(Semaphore::new(max_in_flight_write_bytes)),
            max_in_flight_write_bytes,
        }
    }

    pub async fn limit_request(&self, request: &Request) -> Result<OwnedSemaphorePermit> {
        let size = match request {
            Request::Inserts(requests) => self.insert_requests_data_size(requests),
            Request::RowInserts(requests) => {
                self.rows_insert_requests_data_size(requests.inserts.iter())
            }
            _ => 0,
        };
        self.limit_in_flight_write_bytes(size).await
    }

    pub async fn limit_row_inserts(
        &self,
        requests: &RowInsertRequests,
    ) -> Result<OwnedSemaphorePermit> {
        let size = self.rows_insert_requests_data_size(requests.inserts.iter());
        self.limit_in_flight_write_bytes(size).await
    }

    pub async fn limit_ctx_req(&self, opt_req: &ContextReq) -> Result<OwnedSemaphorePermit> {
        let size = self.rows_insert_requests_data_size(opt_req.ref_all_req());
        self.limit_in_flight_write_bytes(size).await
    }

    /// Await until more inflight bytes are available
    pub async fn limit_in_flight_write_bytes(&self, bytes: usize) -> Result<OwnedSemaphorePermit> {
        self.byte_counter
            .clone()
            .acquire_many_owned(bytes as u32)
            .await
            .context(AcquireLimiterSnafu)
    }

    /// Returns the current in-flight write bytes.
    #[allow(dead_code)]
    pub fn in_flight_write_bytes(&self) -> usize {
        self.max_in_flight_write_bytes - self.byte_counter.available_permits()
    }

fn insert_requests_data_size(&self, request: &InsertRequests) -> usize {
|
||||
let mut size: usize = 0;
|
||||
for insert in &request.inserts {
|
||||
for column in &insert.columns {
|
||||
if let Some(values) = &column.values {
|
||||
size += Self::size_of_column_values(values);
|
||||
}
|
||||
}
|
||||
}
|
||||
size
|
||||
}
|
||||
|
||||
fn rows_insert_requests_data_size<'a>(
|
||||
&self,
|
||||
inserts: impl Iterator<Item = &'a RowInsertRequest>,
|
||||
) -> usize {
|
||||
let mut size: usize = 0;
|
||||
for insert in inserts {
|
||||
if let Some(rows) = &insert.rows {
|
||||
for row in &rows.rows {
|
||||
for value in &row.values {
|
||||
if let Some(value) = &value.value_data {
|
||||
size += Self::size_of_value_data(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
size
|
||||
}
|
||||
|
||||
fn size_of_column_values(values: &Values) -> usize {
|
||||
let mut size: usize = 0;
|
||||
size += values.i8_values.len() * size_of::<i32>();
|
||||
size += values.i16_values.len() * size_of::<i32>();
|
||||
size += values.i32_values.len() * size_of::<i32>();
|
||||
size += values.i64_values.len() * size_of::<i64>();
|
||||
size += values.u8_values.len() * size_of::<u32>();
|
||||
size += values.u16_values.len() * size_of::<u32>();
|
||||
size += values.u32_values.len() * size_of::<u32>();
|
||||
size += values.u64_values.len() * size_of::<u64>();
|
||||
size += values.f32_values.len() * size_of::<f32>();
|
||||
size += values.f64_values.len() * size_of::<f64>();
|
||||
size += values.bool_values.len() * size_of::<bool>();
|
||||
size += values
|
||||
.binary_values
|
||||
.iter()
|
||||
.map(|v| v.len() * size_of::<u8>())
|
||||
.sum::<usize>();
|
||||
size += values.string_values.iter().map(|v| v.len()).sum::<usize>();
|
||||
size += values.date_values.len() * size_of::<i32>();
|
||||
size += values.datetime_values.len() * size_of::<i64>();
|
||||
size += values.timestamp_second_values.len() * size_of::<i64>();
|
||||
size += values.timestamp_millisecond_values.len() * size_of::<i64>();
|
||||
size += values.timestamp_microsecond_values.len() * size_of::<i64>();
|
||||
size += values.timestamp_nanosecond_values.len() * size_of::<i64>();
|
||||
size += values.time_second_values.len() * size_of::<i64>();
|
||||
size += values.time_millisecond_values.len() * size_of::<i64>();
|
||||
size += values.time_microsecond_values.len() * size_of::<i64>();
|
||||
size += values.time_nanosecond_values.len() * size_of::<i64>();
|
||||
size += values.interval_year_month_values.len() * size_of::<i64>();
|
||||
size += values.interval_day_time_values.len() * size_of::<i64>();
|
||||
size += values.interval_month_day_nano_values.len() * size_of::<IntervalMonthDayNano>();
|
||||
size += values.decimal128_values.len() * size_of::<Decimal128>();
|
||||
size += values
|
||||
.list_values
|
||||
.iter()
|
||||
.map(|v| {
|
||||
v.items
|
||||
.iter()
|
||||
.map(|item| {
|
||||
item.value_data
|
||||
.as_ref()
|
||||
.map(Self::size_of_value_data)
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum::<usize>()
|
||||
})
|
||||
.sum::<usize>();
|
||||
size += values
|
||||
.struct_values
|
||||
.iter()
|
||||
.map(|v| {
|
||||
v.items
|
||||
.iter()
|
||||
.map(|item| {
|
||||
item.value_data
|
||||
.as_ref()
|
||||
.map(Self::size_of_value_data)
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum::<usize>()
|
||||
})
|
||||
.sum::<usize>();
|
||||
|
||||
size
|
||||
}
|
||||
|
||||
fn size_of_value_data(value: &ValueData) -> usize {
|
||||
match value {
|
||||
ValueData::I8Value(_) => size_of::<i32>(),
|
||||
ValueData::I16Value(_) => size_of::<i32>(),
|
||||
ValueData::I32Value(_) => size_of::<i32>(),
|
||||
ValueData::I64Value(_) => size_of::<i64>(),
|
||||
ValueData::U8Value(_) => size_of::<u32>(),
|
||||
ValueData::U16Value(_) => size_of::<u32>(),
|
||||
ValueData::U32Value(_) => size_of::<u32>(),
|
||||
ValueData::U64Value(_) => size_of::<u64>(),
|
||||
ValueData::F32Value(_) => size_of::<f32>(),
|
||||
ValueData::F64Value(_) => size_of::<f64>(),
|
||||
ValueData::BoolValue(_) => size_of::<bool>(),
|
||||
ValueData::BinaryValue(v) => v.len() * size_of::<u8>(),
|
||||
ValueData::StringValue(v) => v.len(),
|
||||
ValueData::DateValue(_) => size_of::<i32>(),
|
||||
ValueData::DatetimeValue(_) => size_of::<i64>(),
|
||||
ValueData::TimestampSecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimestampMillisecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimestampMicrosecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimestampNanosecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimeSecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimeMillisecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimeMicrosecondValue(_) => size_of::<i64>(),
|
||||
ValueData::TimeNanosecondValue(_) => size_of::<i64>(),
|
||||
ValueData::IntervalYearMonthValue(_) => size_of::<i32>(),
|
||||
ValueData::IntervalDayTimeValue(_) => size_of::<i64>(),
|
||||
ValueData::IntervalMonthDayNanoValue(_) => size_of::<IntervalMonthDayNano>(),
|
||||
ValueData::Decimal128Value(_) => size_of::<Decimal128>(),
|
||||
ValueData::ListValue(list_values) => list_values
|
||||
.items
|
||||
.iter()
|
||||
.map(|item| {
|
||||
item.value_data
|
||||
.as_ref()
|
||||
.map(Self::size_of_value_data)
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum(),
|
||||
ValueData::StructValue(struct_values) => struct_values
|
||||
.items
|
||||
.iter()
|
||||
.map(|item| {
|
||||
item.value_data
|
||||
.as_ref()
|
||||
.map(Self::size_of_value_data)
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum(),
|
||||
ValueData::JsonValue(v) => {
|
||||
fn calc(v: &JsonValue) -> usize {
|
||||
let Some(value) = v.value.as_ref() else {
|
||||
return 0;
|
||||
};
|
||||
match value {
|
||||
json_value::Value::Boolean(_) => size_of::<bool>(),
|
||||
json_value::Value::Int(_) => size_of::<i64>(),
|
||||
json_value::Value::Uint(_) => size_of::<u64>(),
|
||||
json_value::Value::Float(_) => size_of::<f64>(),
|
||||
json_value::Value::Str(s) => s.len(),
|
||||
json_value::Value::Array(array) => array.items.iter().map(calc).sum(),
|
||||
json_value::Value::Object(object) => object
|
||||
.entries
|
||||
.iter()
|
||||
.flat_map(|entry| {
|
||||
entry.value.as_ref().map(|v| entry.key.len() + calc(v))
|
||||
})
|
||||
.sum(),
|
||||
}
|
||||
}
|
||||
calc(v)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use api::v1::column::Values;
|
||||
use api::v1::greptime_request::Request;
|
||||
use api::v1::{Column, InsertRequest};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn generate_request(size: usize) -> Request {
|
||||
let i8_values = vec![0; size / 4];
|
||||
Request::Inserts(InsertRequests {
|
||||
inserts: vec![InsertRequest {
|
||||
columns: vec![Column {
|
||||
values: Some(Values {
|
||||
i8_values,
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}],
|
||||
..Default::default()
|
||||
}],
|
||||
})
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_limiter() {
|
||||
let limiter_ref: LimiterRef = Arc::new(Limiter::new(1024));
|
||||
let tasks_count = 10;
|
||||
let request_data_size = 100;
|
||||
let mut handles = vec![];
|
||||
|
||||
// Generate multiple requests to test the limiter.
|
||||
for _ in 0..tasks_count {
|
||||
let limiter = limiter_ref.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
let result = limiter
|
||||
.limit_request(&generate_request(request_data_size))
|
||||
.await;
|
||||
assert!(result.is_ok());
|
||||
});
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Wait for all threads to complete.
|
||||
for handle in handles {
|
||||
handle.await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_in_flight_write_bytes() {
|
||||
let limiter_ref: LimiterRef = Arc::new(Limiter::new(1024));
|
||||
let req1 = generate_request(100);
|
||||
let result1 = limiter_ref
|
||||
.limit_request(&req1)
|
||||
.await
|
||||
.expect("failed to acquire permits");
|
||||
assert_eq!(limiter_ref.in_flight_write_bytes(), 100);
|
||||
|
||||
let req2 = generate_request(200);
|
||||
let result2 = limiter_ref
|
||||
.limit_request(&req2)
|
||||
.await
|
||||
.expect("failed to acquire permits");
|
||||
assert_eq!(limiter_ref.in_flight_write_bytes(), 300);
|
||||
|
||||
drop(result1);
|
||||
assert_eq!(limiter_ref.in_flight_write_bytes(), 200);
|
||||
|
||||
drop(result2);
|
||||
assert_eq!(limiter_ref.in_flight_write_bytes(), 0);
|
||||
}
|
||||
}
|
||||
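At its core the limiter above is a byte-budget semaphore: each write acquires permits equal to its estimated payload size and releases them when the returned guard is dropped. A minimal sketch of that idea using `tokio` directly (illustrative names only, not the frontend's API):

use std::sync::Arc;

use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // Budget of 1024 "bytes"; each request holds permits equal to its size.
    let budget = Arc::new(Semaphore::new(1024));

    // Acquiring blocks once the budget is exhausted, which is what bounds
    // the total size of in-flight writes.
    let guard = budget
        .clone()
        .acquire_many_owned(100)
        .await
        .expect("semaphore closed");
    assert_eq!(1024 - budget.available_permits(), 100);

    // Dropping the permit returns the bytes to the budget.
    drop(guard);
    assert_eq!(budget.available_permits(), 1024);
}
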
@@ -40,7 +40,6 @@ use servers::otel_arrow::OtelArrowServiceHandler;
|
||||
use servers::postgres::PostgresServer;
|
||||
use servers::query_handler::grpc::ServerGrpcQueryHandlerAdapter;
|
||||
use servers::query_handler::sql::ServerSqlQueryHandlerAdapter;
|
||||
use servers::request_memory_limiter::ServerMemoryLimiter;
|
||||
use servers::server::{Server, ServerHandlers};
|
||||
use servers::tls::{ReloadableTlsServerConfig, maybe_watch_server_tls_config};
|
||||
use snafu::ResultExt;
|
||||
@@ -77,25 +76,15 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn grpc_server_builder(
|
||||
&self,
|
||||
opts: &GrpcOptions,
|
||||
request_memory_limiter: ServerMemoryLimiter,
|
||||
) -> Result<GrpcServerBuilder> {
|
||||
pub fn grpc_server_builder(&self, opts: &GrpcOptions) -> Result<GrpcServerBuilder> {
|
||||
let builder = GrpcServerBuilder::new(opts.as_config(), common_runtime::global_runtime())
|
||||
.with_memory_limiter(request_memory_limiter)
|
||||
.with_tls_config(opts.tls.clone())
|
||||
.context(error::InvalidTlsConfigSnafu)?;
|
||||
Ok(builder)
|
||||
}
|
||||
|
||||
pub fn http_server_builder(
|
||||
&self,
|
||||
opts: &FrontendOptions,
|
||||
request_memory_limiter: ServerMemoryLimiter,
|
||||
) -> HttpServerBuilder {
|
||||
pub fn http_server_builder(&self, opts: &FrontendOptions) -> HttpServerBuilder {
|
||||
let mut builder = HttpServerBuilder::new(opts.http.clone())
|
||||
.with_memory_limiter(request_memory_limiter)
|
||||
.with_sql_handler(ServerSqlQueryHandlerAdapter::arc(self.instance.clone()));
|
||||
|
||||
let validator = self.plugins.get::<LogValidatorRef>();
|
||||
@@ -180,12 +169,11 @@ where
|
||||
meta_client: &Option<MetaClientOptions>,
|
||||
name: Option<String>,
|
||||
external: bool,
|
||||
request_memory_limiter: ServerMemoryLimiter,
|
||||
) -> Result<GrpcServer> {
|
||||
let builder = if let Some(builder) = self.grpc_server_builder.take() {
|
||||
builder
|
||||
} else {
|
||||
self.grpc_server_builder(grpc, request_memory_limiter)?
|
||||
self.grpc_server_builder(grpc)?
|
||||
};
|
||||
|
||||
let user_provider = if external {
|
||||
@@ -247,16 +235,11 @@ where
|
||||
Ok(grpc_server)
|
||||
}
|
||||
|
||||
fn build_http_server(
|
||||
&mut self,
|
||||
opts: &FrontendOptions,
|
||||
toml: String,
|
||||
request_memory_limiter: ServerMemoryLimiter,
|
||||
) -> Result<HttpServer> {
|
||||
fn build_http_server(&mut self, opts: &FrontendOptions, toml: String) -> Result<HttpServer> {
|
||||
let builder = if let Some(builder) = self.http_server_builder.take() {
|
||||
builder
|
||||
} else {
|
||||
self.http_server_builder(opts, request_memory_limiter)
|
||||
self.http_server_builder(opts)
|
||||
};
|
||||
|
||||
let http_server = builder
|
||||
@@ -274,12 +257,6 @@ where
|
||||
let toml = opts.to_toml().context(TomlFormatSnafu)?;
|
||||
let opts: FrontendOptions = opts.into();
|
||||
|
||||
// Create request memory limiter for all server protocols
|
||||
let request_memory_limiter = ServerMemoryLimiter::new(
|
||||
opts.max_in_flight_write_bytes.as_bytes(),
|
||||
opts.write_bytes_exhausted_policy,
|
||||
);
|
||||
|
||||
let handlers = ServerHandlers::default();
|
||||
|
||||
let user_provider = self.plugins.get::<UserProviderRef>();
|
||||
@@ -287,13 +264,7 @@ where
|
||||
{
|
||||
// Always init GRPC server
|
||||
let grpc_addr = parse_addr(&opts.grpc.bind_addr)?;
|
||||
let grpc_server = self.build_grpc_server(
|
||||
&opts.grpc,
|
||||
&opts.meta_client,
|
||||
None,
|
||||
true,
|
||||
request_memory_limiter.clone(),
|
||||
)?;
|
||||
let grpc_server = self.build_grpc_server(&opts.grpc, &opts.meta_client, None, true)?;
|
||||
handlers.insert((Box::new(grpc_server), grpc_addr));
|
||||
}
|
||||
|
||||
@@ -305,7 +276,6 @@ where
|
||||
&opts.meta_client,
|
||||
Some("INTERNAL_GRPC_SERVER".to_string()),
|
||||
false,
|
||||
request_memory_limiter.clone(),
|
||||
)?;
|
||||
handlers.insert((Box::new(grpc_server), grpc_addr));
|
||||
}
|
||||
@@ -314,8 +284,7 @@ where
|
||||
// Always init HTTP server
|
||||
let http_options = &opts.http;
|
||||
let http_addr = parse_addr(&http_options.addr)?;
|
||||
let http_server =
|
||||
self.build_http_server(&opts, toml, request_memory_limiter.clone())?;
|
||||
let http_server = self.build_http_server(&opts, toml)?;
|
||||
handlers.insert((Box::new(http_server), http_addr));
|
||||
}
|
||||
|
||||
|
||||
@@ -339,7 +339,6 @@ pub async fn metasrv_builder(
|
||||
opts.meta_schema_name.as_deref(),
|
||||
&opts.meta_table_name,
|
||||
opts.max_txn_ops,
|
||||
opts.auto_create_schema,
|
||||
)
|
||||
.await
|
||||
.context(error::KvBackendSnafu)?;
|
||||
|
||||
@@ -194,7 +194,7 @@ impl SchedulerCtx for DefaultGcSchedulerCtx {
|
||||
}
|
||||
|
||||
// Send GetFileRefs instructions to each datanode
|
||||
let mut all_file_refs: HashMap<RegionId, HashSet<_>> = HashMap::new();
|
||||
let mut all_file_refs: HashMap<RegionId, HashSet<FileId>> = HashMap::new();
|
||||
let mut all_manifest_versions = HashMap::new();
|
||||
|
||||
for (peer, regions) in datanode2query_regions {
|
||||
|
||||
@@ -53,7 +53,6 @@ pub fn new_empty_report_with(region_ids: impl IntoIterator<Item = RegionId>) ->
|
||||
}
|
||||
GcReport {
|
||||
deleted_files,
|
||||
deleted_indexes: HashMap::new(),
|
||||
need_retry_regions: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -454,11 +454,7 @@ async fn test_region_gc_concurrency_with_retryable_errors() {
|
||||
(
|
||||
region_id,
|
||||
// mock the actual gc report with deleted files when succeeded(even no files to delete)
|
||||
GcReport::new(
|
||||
HashMap::from([(region_id, vec![])]),
|
||||
Default::default(),
|
||||
HashSet::new(),
|
||||
),
|
||||
GcReport::new(HashMap::from([(region_id, vec![])]), HashSet::new()),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -20,7 +20,7 @@ use common_meta::datanode::RegionManifestInfo;
|
||||
use common_meta::peer::Peer;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::storage::{FileId, FileRef, FileRefsManifest, GcReport, RegionId};
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
|
||||
use crate::gc::mock::{
|
||||
MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with,
|
||||
@@ -60,10 +60,7 @@ async fn test_gc_regions_failure_handling() {
|
||||
|
||||
let file_refs = FileRefsManifest {
|
||||
manifest_version: HashMap::from([(region_id, 1)]),
|
||||
file_refs: HashMap::from([(
|
||||
region_id,
|
||||
HashSet::from([FileRef::new(region_id, FileId::random(), None)]),
|
||||
)]),
|
||||
file_refs: HashMap::from([(region_id, HashSet::from([FileId::random()]))]),
|
||||
};
|
||||
|
||||
let ctx = Arc::new(
|
||||
|
||||
@@ -356,7 +356,8 @@ impl BatchGcProcedure {
|
||||
}
|
||||
|
||||
// Send GetFileRefs instructions to each datanode
|
||||
let mut all_file_refs: HashMap<RegionId, HashSet<_>> = HashMap::new();
|
||||
let mut all_file_refs: HashMap<RegionId, HashSet<store_api::storage::FileId>> =
|
||||
HashMap::new();
|
||||
let mut all_manifest_versions = HashMap::new();
|
||||
|
||||
for (peer, regions) in datanode2query_regions {
|
||||
|
||||
@@ -163,6 +163,8 @@ pub struct MetasrvOptions {
|
||||
pub backend_client: BackendClientOptions,
|
||||
/// The type of selector.
|
||||
pub selector: SelectorType,
|
||||
/// Whether to use the memory store.
|
||||
pub use_memory_store: bool,
|
||||
/// Whether to enable region failover.
|
||||
pub enable_region_failover: bool,
|
||||
/// The base heartbeat interval.
|
||||
@@ -231,9 +233,6 @@ pub struct MetasrvOptions {
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
/// Optional PostgreSQL schema for metadata table (defaults to current search_path if empty).
|
||||
pub meta_schema_name: Option<String>,
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
/// Automatically create PostgreSQL schema if it doesn't exist (default: true).
|
||||
pub auto_create_schema: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub node_max_idle_time: Duration,
|
||||
/// The event recorder options.
|
||||
@@ -251,6 +250,7 @@ impl fmt::Debug for MetasrvOptions {
|
||||
.field("store_addrs", &self.sanitize_store_addrs())
|
||||
.field("backend_tls", &self.backend_tls)
|
||||
.field("selector", &self.selector)
|
||||
.field("use_memory_store", &self.use_memory_store)
|
||||
.field("enable_region_failover", &self.enable_region_failover)
|
||||
.field(
|
||||
"allow_region_failover_on_local_wal",
|
||||
@@ -301,6 +301,7 @@ impl Default for MetasrvOptions {
|
||||
store_addrs: vec!["127.0.0.1:2379".to_string()],
|
||||
backend_tls: None,
|
||||
selector: SelectorType::default(),
|
||||
use_memory_store: false,
|
||||
enable_region_failover: false,
|
||||
heartbeat_interval: distributed_time_constants::BASE_HEARTBEAT_INTERVAL,
|
||||
region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
|
||||
@@ -336,8 +337,6 @@ impl Default for MetasrvOptions {
|
||||
meta_election_lock_id: common_meta::kv_backend::DEFAULT_META_ELECTION_LOCK_ID,
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
meta_schema_name: None,
|
||||
#[cfg(feature = "pg_kvbackend")]
|
||||
auto_create_schema: true,
|
||||
node_max_idle_time: Duration::from_secs(24 * 60 * 60),
|
||||
event_recorder: EventRecorderOptions::default(),
|
||||
stats_persistence: StatsPersistenceOptions::default(),
|
||||
|
||||
@@ -12,14 +12,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
pub(crate) mod enter_staging_region;
|
||||
pub(crate) mod repartition_start;
|
||||
pub(crate) mod update_metadata;
|
||||
pub(crate) mod utils;
|
||||
|
||||
use std::any::Any;
|
||||
use std::fmt::Debug;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::DatanodeId;
|
||||
@@ -37,7 +34,6 @@ use uuid::Uuid;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::procedure::repartition::plan::RegionDescriptor;
|
||||
use crate::service::mailbox::MailboxRef;
|
||||
|
||||
pub type GroupId = Uuid;
|
||||
|
||||
@@ -49,10 +45,6 @@ pub struct Context {
|
||||
pub cache_invalidator: CacheInvalidatorRef,
|
||||
|
||||
pub table_metadata_manager: TableMetadataManagerRef,
|
||||
|
||||
pub mailbox: MailboxRef,
|
||||
|
||||
pub server_addr: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
@@ -192,13 +184,6 @@ impl Context {
|
||||
.await
|
||||
.context(error::TableMetadataManagerSnafu)
|
||||
}
|
||||
|
||||
/// Returns the next operation timeout.
|
||||
///
|
||||
/// If the next operation timeout is not set, it will return `None`.
|
||||
pub fn next_operation_timeout(&self) -> Option<Duration> {
|
||||
Some(Duration::from_secs(10))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the region routes of the given table route value.
|
||||
|
||||
@@ -1,717 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::any::Any;
|
||||
use std::collections::HashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use api::v1::meta::MailboxMessage;
|
||||
use common_meta::instruction::{
|
||||
EnterStagingRegionReply, EnterStagingRegionsReply, Instruction, InstructionReply,
|
||||
};
|
||||
use common_meta::peer::Peer;
|
||||
use common_procedure::{Context as ProcedureContext, Status};
|
||||
use common_telemetry::info;
|
||||
use futures::future::join_all;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
|
||||
use crate::error::{self, Error, Result};
|
||||
use crate::handler::HeartbeatMailbox;
|
||||
use crate::procedure::repartition::group::utils::{
|
||||
HandleMultipleResult, group_region_routes_by_peer, handle_multiple_results,
|
||||
};
|
||||
use crate::procedure::repartition::group::{Context, GroupPrepareResult, State};
|
||||
use crate::procedure::repartition::plan::RegionDescriptor;
|
||||
use crate::service::mailbox::{Channel, MailboxRef};
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct EnterStagingRegion;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
#[typetag::serde]
|
||||
impl State for EnterStagingRegion {
|
||||
async fn next(
|
||||
&mut self,
|
||||
ctx: &mut Context,
|
||||
_procedure_ctx: &ProcedureContext,
|
||||
) -> Result<(Box<dyn State>, Status)> {
|
||||
self.enter_staging_regions(ctx).await?;
|
||||
|
||||
Ok(Self::next_state())
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn Any {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl EnterStagingRegion {
|
||||
#[allow(dead_code)]
|
||||
fn next_state() -> (Box<dyn State>, Status) {
|
||||
// TODO(weny): change it later.
|
||||
(Box::new(EnterStagingRegion), Status::executing(true))
|
||||
}
|
||||
|
||||
fn build_enter_staging_instructions(
|
||||
prepare_result: &GroupPrepareResult,
|
||||
targets: &[RegionDescriptor],
|
||||
) -> Result<HashMap<Peer, Instruction>> {
|
||||
let target_partition_expr_by_region = targets
|
||||
.iter()
|
||||
.map(|target| {
|
||||
Ok((
|
||||
target.region_id,
|
||||
target
|
||||
.partition_expr
|
||||
.as_json_str()
|
||||
.context(error::SerializePartitionExprSnafu)?,
|
||||
))
|
||||
})
|
||||
.collect::<Result<HashMap<_, _>>>()?;
|
||||
// Safety: `leader_peer` is set for all region routes, checked in `repartition_start`.
|
||||
let target_region_routes_by_peer =
|
||||
group_region_routes_by_peer(&prepare_result.target_routes);
|
||||
let mut instructions = HashMap::with_capacity(target_region_routes_by_peer.len());
|
||||
for (peer, region_ids) in target_region_routes_by_peer {
|
||||
let enter_staging_regions = region_ids
|
||||
.into_iter()
|
||||
.map(|region_id| common_meta::instruction::EnterStagingRegion {
|
||||
region_id,
|
||||
// Safety: the target_routes is constructed from the targets, so the region_id is always present in the map.
|
||||
partition_expr: target_partition_expr_by_region[®ion_id].clone(),
|
||||
})
|
||||
.collect();
|
||||
instructions.insert(
|
||||
peer.clone(),
|
||||
Instruction::EnterStagingRegions(enter_staging_regions),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(instructions)
|
||||
}
|
||||

    #[allow(dead_code)]
    async fn enter_staging_regions(&self, ctx: &mut Context) -> Result<()> {
        let table_id = ctx.persistent_ctx.table_id;
        let group_id = ctx.persistent_ctx.group_id;
        // Safety: the group prepare result is set in the RepartitionStart state.
        let prepare_result = ctx.persistent_ctx.group_prepare_result.as_ref().unwrap();
        let targets = &ctx.persistent_ctx.targets;
        let instructions = Self::build_enter_staging_instructions(prepare_result, targets)?;
        let operation_timeout =
            ctx.next_operation_timeout()
                .context(error::ExceededDeadlineSnafu {
                    operation: "Enter staging regions",
                })?;
        let (peers, tasks): (Vec<_>, Vec<_>) = instructions
            .iter()
            .map(|(peer, instruction)| {
                (
                    peer,
                    Self::enter_staging_region(
                        &ctx.mailbox,
                        &ctx.server_addr,
                        peer,
                        instruction,
                        operation_timeout,
                    ),
                )
            })
            .unzip();
        info!(
            "Sent enter staging regions instructions to peers: {:?} for repartition table {}, group id {}",
            peers, table_id, group_id
        );

        let format_err_msg = |idx: usize, error: &Error| {
            let peer = peers[idx];
            format!(
                "Failed to enter staging regions on datanode {:?}, error: {:?}",
                peer, error
            )
        };
        // Waits for all tasks to complete.
        let results = join_all(tasks).await;
        let result = handle_multiple_results(&results);
        match result {
            HandleMultipleResult::AllSuccessful => Ok(()),
            HandleMultipleResult::AllRetryable(retryable_errors) => error::RetryLaterSnafu {
                reason: format!(
                    "All retryable errors during entering staging regions for repartition table {}, group id {}: {:?}",
                    table_id, group_id,
                    retryable_errors
                        .iter()
                        .map(|(idx, error)| format_err_msg(*idx, error))
                        .collect::<Vec<_>>()
                        .join(",")
                ),
            }
            .fail(),
            HandleMultipleResult::AllNonRetryable(non_retryable_errors) => error::UnexpectedSnafu {
                violated: format!(
                    "All non retryable errors during entering staging regions for repartition table {}, group id {}: {:?}",
                    table_id, group_id,
                    non_retryable_errors
                        .iter()
                        .map(|(idx, error)| format_err_msg(*idx, error))
                        .collect::<Vec<_>>()
                        .join(",")
                ),
            }
            .fail(),
            HandleMultipleResult::PartialRetryable {
                retryable_errors,
                non_retryable_errors,
            } => error::UnexpectedSnafu {
                violated: format!(
                    "Partial retryable errors during entering staging regions for repartition table {}, group id {}: {:?}, non retryable errors: {:?}",
                    table_id, group_id,
                    retryable_errors
                        .iter()
                        .map(|(idx, error)| format_err_msg(*idx, error))
                        .collect::<Vec<_>>()
                        .join(","),
                    non_retryable_errors
                        .iter()
                        .map(|(idx, error)| format_err_msg(*idx, error))
                        .collect::<Vec<_>>()
                        .join(","),
                ),
            }
            .fail(),
        }
    }

    /// Enter staging region on a datanode.
    ///
    /// Retry:
    /// - Pusher is not found.
    /// - Mailbox timeout.
    ///
    /// Abort(non-retry):
    /// - Unexpected instruction reply.
    /// - Exceeded deadline of enter staging regions instruction.
    /// - Target region doesn't exist on the datanode.
    async fn enter_staging_region(
        mailbox: &MailboxRef,
        server_addr: &str,
        peer: &Peer,
        instruction: &Instruction,
        timeout: Duration,
    ) -> Result<()> {
        let ch = Channel::Datanode(peer.id);
        let message = MailboxMessage::json_message(
            &format!("Enter staging regions: {:?}", instruction),
            &format!("Metasrv@{}", server_addr),
            &format!("Datanode-{}@{}", peer.id, peer.addr),
            common_time::util::current_time_millis(),
            &instruction,
        )
        .with_context(|_| error::SerializeToJsonSnafu {
            input: instruction.to_string(),
        })?;
        let now = Instant::now();
        let receiver = mailbox.send(&ch, message, timeout).await;

        let receiver = match receiver {
            Ok(receiver) => receiver,
            Err(error::Error::PusherNotFound { .. }) => error::RetryLaterSnafu {
                reason: format!(
                    "Pusher not found for enter staging regions on datanode {:?}, elapsed: {:?}",
                    peer,
                    now.elapsed()
                ),
            }
            .fail()?,
            Err(err) => {
                return Err(err);
            }
        };

        match receiver.await {
            Ok(msg) => {
                let reply = HeartbeatMailbox::json_reply(&msg)?;
                info!(
                    "Received enter staging regions reply: {:?}, elapsed: {:?}",
                    reply,
                    now.elapsed()
                );
                let InstructionReply::EnterStagingRegions(EnterStagingRegionsReply { replies }) =
                    reply
                else {
                    return error::UnexpectedInstructionReplySnafu {
                        mailbox_message: msg.to_string(),
                        reason: "expect enter staging regions reply",
                    }
                    .fail();
                };
                for reply in replies {
                    Self::handle_enter_staging_region_reply(&reply, &now, peer)?;
                }

                Ok(())
            }
            Err(error::Error::MailboxTimeout { .. }) => {
                let reason = format!(
                    "Mailbox received timeout for enter staging regions on datanode {:?}, elapsed: {:?}",
                    peer,
                    now.elapsed()
                );
                error::RetryLaterSnafu { reason }.fail()
            }
            Err(err) => Err(err),
        }
    }

    fn handle_enter_staging_region_reply(
        EnterStagingRegionReply {
            region_id,
            ready,
            exists,
            error,
        }: &EnterStagingRegionReply,
        now: &Instant,
        peer: &Peer,
    ) -> Result<()> {
        ensure!(
            exists,
            error::UnexpectedSnafu {
                violated: format!(
                    "Region {} doesn't exist on datanode {:?}, elapsed: {:?}",
                    region_id,
                    peer,
                    now.elapsed()
                )
            }
        );

        if error.is_some() {
            return error::RetryLaterSnafu {
                reason: format!(
                    "Failed to enter staging region {} on datanode {:?}, error: {:?}, elapsed: {:?}",
                    region_id, peer, error, now.elapsed()
                ),
            }
            .fail();
        }

        ensure!(
            ready,
            error::RetryLaterSnafu {
                reason: format!(
                    "Region {} is still entering staging state on datanode {:?}, elapsed: {:?}",
                    region_id,
                    peer,
                    now.elapsed()
                ),
            }
        );

        Ok(())
    }
}
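
// The block below is an illustrative sketch added for clarity; it is not part of the
// original change set. It restates the reply-handling policy of
// `handle_enter_staging_region_reply` with plain booleans so the retry/abort rules are
// easy to scan. `ReplyOutcome` and `classify_reply` are made-up names for this example.
#[allow(dead_code)]
#[derive(Debug, PartialEq, Eq)]
enum ReplyOutcome {
    Ok,
    RetryLater,
    Abort,
}

#[allow(dead_code)]
fn classify_reply(exists: bool, has_error: bool, ready: bool) -> ReplyOutcome {
    if !exists {
        // The target region is missing on the datanode; retrying cannot fix this.
        return ReplyOutcome::Abort;
    }
    if has_error || !ready {
        // The datanode reported an error or is still entering the staging state.
        return ReplyOutcome::RetryLater;
    }
    ReplyOutcome::Ok
}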
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::assert_matches::assert_matches;
|
||||
use std::time::Duration;
|
||||
|
||||
use common_meta::instruction::Instruction;
|
||||
use common_meta::peer::Peer;
|
||||
use common_meta::rpc::router::{Region, RegionRoute};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::error::{self, Error};
|
||||
use crate::procedure::repartition::group::GroupPrepareResult;
|
||||
use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion;
|
||||
use crate::procedure::repartition::plan::RegionDescriptor;
|
||||
use crate::procedure::repartition::test_util::{
|
||||
TestingEnv, new_persistent_context, range_expr,
|
||||
};
|
||||
use crate::procedure::test_util::{
|
||||
new_close_region_reply, new_enter_staging_region_reply, send_mock_reply,
|
||||
};
|
||||
use crate::service::mailbox::Channel;
|
||||
|
||||
#[test]
|
||||
fn test_build_enter_staging_instructions() {
|
||||
let table_id = 1024;
|
||||
let prepare_result = GroupPrepareResult {
|
||||
source_routes: vec![RegionRoute {
|
||||
region: Region {
|
||||
id: RegionId::new(table_id, 1),
|
||||
..Default::default()
|
||||
},
|
||||
leader_peer: Some(Peer::empty(1)),
|
||||
..Default::default()
|
||||
}],
|
||||
target_routes: vec![
|
||||
RegionRoute {
|
||||
region: Region {
|
||||
id: RegionId::new(table_id, 1),
|
||||
..Default::default()
|
||||
},
|
||||
leader_peer: Some(Peer::empty(1)),
|
||||
..Default::default()
|
||||
},
|
||||
RegionRoute {
|
||||
region: Region {
|
||||
id: RegionId::new(table_id, 2),
|
||||
..Default::default()
|
||||
},
|
||||
leader_peer: Some(Peer::empty(2)),
|
||||
..Default::default()
|
||||
},
|
||||
],
|
||||
central_region: RegionId::new(table_id, 1),
|
||||
central_region_datanode_id: 1,
|
||||
};
|
||||
let targets = test_targets();
|
||||
let instructions =
|
||||
EnterStagingRegion::build_enter_staging_instructions(&prepare_result, &targets)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(instructions.len(), 2);
|
||||
let instruction_1 = instructions
|
||||
.get(&Peer::empty(1))
|
||||
.unwrap()
|
||||
.clone()
|
||||
.into_enter_staging_regions()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
instruction_1,
|
||||
vec![common_meta::instruction::EnterStagingRegion {
|
||||
region_id: RegionId::new(table_id, 1),
|
||||
partition_expr: range_expr("x", 0, 10).as_json_str().unwrap(),
|
||||
}]
|
||||
);
|
||||
let instruction_2 = instructions
|
||||
.get(&Peer::empty(2))
|
||||
.unwrap()
|
||||
.clone()
|
||||
.into_enter_staging_regions()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
instruction_2,
|
||||
vec![common_meta::instruction::EnterStagingRegion {
|
||||
region_id: RegionId::new(table_id, 2),
|
||||
partition_expr: range_expr("x", 10, 20).as_json_str().unwrap(),
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_datanode_is_unreachable() {
|
||||
let env = TestingEnv::new();
|
||||
let server_addr = "localhost";
|
||||
let peer = Peer::empty(1);
|
||||
let instruction =
|
||||
Instruction::EnterStagingRegions(vec![common_meta::instruction::EnterStagingRegion {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
partition_expr: range_expr("x", 0, 10).as_json_str().unwrap(),
|
||||
}]);
|
||||
let timeout = Duration::from_secs(10);
|
||||
|
||||
let err = EnterStagingRegion::enter_staging_region(
|
||||
env.mailbox_ctx.mailbox(),
|
||||
server_addr,
|
||||
&peer,
|
||||
&instruction,
|
||||
timeout,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_enter_staging_region_exceeded_deadline() {
|
||||
let mut env = TestingEnv::new();
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(1), tx)
|
||||
.await;
|
||||
let server_addr = "localhost";
|
||||
let peer = Peer::empty(1);
|
||||
let instruction =
|
||||
Instruction::EnterStagingRegions(vec![common_meta::instruction::EnterStagingRegion {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
partition_expr: range_expr("x", 0, 10).as_json_str().unwrap(),
|
||||
}]);
|
||||
let timeout = Duration::from_secs(10);
|
||||
|
||||
// Sends a timeout error.
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Err(error::MailboxTimeoutSnafu { id }.build())
|
||||
});
|
||||
|
||||
let err = EnterStagingRegion::enter_staging_region(
|
||||
env.mailbox_ctx.mailbox(),
|
||||
server_addr,
|
||||
&peer,
|
||||
&instruction,
|
||||
timeout,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_unexpected_instruction_reply() {
|
||||
let mut env = TestingEnv::new();
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
|
||||
let server_addr = "localhost";
|
||||
let peer = Peer::empty(1);
|
||||
let instruction =
|
||||
Instruction::EnterStagingRegions(vec![common_meta::instruction::EnterStagingRegion {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
partition_expr: range_expr("x", 0, 10).as_json_str().unwrap(),
|
||||
}]);
|
||||
let timeout = Duration::from_secs(10);
|
||||
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(1), tx)
|
||||
.await;
|
||||
// Sends an incorrect reply.
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_close_region_reply(id))
|
||||
});
|
||||
|
||||
let err = EnterStagingRegion::enter_staging_region(
|
||||
env.mailbox_ctx.mailbox(),
|
||||
server_addr,
|
||||
&peer,
|
||||
&instruction,
|
||||
timeout,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::UnexpectedInstructionReply { .. });
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_enter_staging_region_failed_to_enter_staging_state() {
|
||||
let mut env = TestingEnv::new();
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(1), tx)
|
||||
.await;
|
||||
let server_addr = "localhost";
|
||||
let peer = Peer::empty(1);
|
||||
let instruction =
|
||||
Instruction::EnterStagingRegions(vec![common_meta::instruction::EnterStagingRegion {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
partition_expr: range_expr("x", 0, 10).as_json_str().unwrap(),
|
||||
}]);
|
||||
let timeout = Duration::from_secs(10);
|
||||
|
||||
// Sends a failed reply.
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_enter_staging_region_reply(
|
||||
id,
|
||||
RegionId::new(1024, 1),
|
||||
false,
|
||||
true,
|
||||
Some("test mocked".to_string()),
|
||||
))
|
||||
});
|
||||
|
||||
let err = EnterStagingRegion::enter_staging_region(
|
||||
env.mailbox_ctx.mailbox(),
|
||||
server_addr,
|
||||
&peer,
|
||||
&instruction,
|
||||
timeout,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(1), tx)
|
||||
.await;
|
||||
// Region doesn't exist on the datanode.
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_enter_staging_region_reply(
|
||||
id,
|
||||
RegionId::new(1024, 1),
|
||||
false,
|
||||
false,
|
||||
None,
|
||||
))
|
||||
});
|
||||
|
||||
let err = EnterStagingRegion::enter_staging_region(
|
||||
env.mailbox_ctx.mailbox(),
|
||||
server_addr,
|
||||
&peer,
|
||||
&instruction,
|
||||
timeout,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::Unexpected { .. });
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
|
||||
fn test_prepare_result(table_id: u32) -> GroupPrepareResult {
|
||||
GroupPrepareResult {
|
||||
source_routes: vec![],
|
||||
target_routes: vec![
|
||||
RegionRoute {
|
||||
region: Region {
|
||||
id: RegionId::new(table_id, 1),
|
||||
..Default::default()
|
||||
},
|
||||
leader_peer: Some(Peer::empty(1)),
|
||||
..Default::default()
|
||||
},
|
||||
RegionRoute {
|
||||
region: Region {
|
||||
id: RegionId::new(table_id, 2),
|
||||
..Default::default()
|
||||
},
|
||||
leader_peer: Some(Peer::empty(2)),
|
||||
..Default::default()
|
||||
},
|
||||
],
|
||||
central_region: RegionId::new(table_id, 1),
|
||||
central_region_datanode_id: 1,
|
||||
}
|
||||
}
|
||||
|
||||
fn test_targets() -> Vec<RegionDescriptor> {
|
||||
vec![
|
||||
RegionDescriptor {
|
||||
region_id: RegionId::new(1024, 1),
|
||||
partition_expr: range_expr("x", 0, 10),
|
||||
},
|
||||
RegionDescriptor {
|
||||
region_id: RegionId::new(1024, 2),
|
||||
partition_expr: range_expr("x", 10, 20),
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_enter_staging_regions_all_successful() {
|
||||
let mut env = TestingEnv::new();
|
||||
let table_id = 1024;
|
||||
let targets = test_targets();
|
||||
let mut persistent_context = new_persistent_context(table_id, vec![], targets);
|
||||
persistent_context.group_prepare_result = Some(test_prepare_result(table_id));
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(1), tx)
|
||||
.await;
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_enter_staging_region_reply(
|
||||
id,
|
||||
RegionId::new(1024, 1),
|
||||
true,
|
||||
true,
|
||||
None,
|
||||
))
|
||||
});
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(2), tx)
|
||||
.await;
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_enter_staging_region_reply(
|
||||
id,
|
||||
RegionId::new(1024, 2),
|
||||
true,
|
||||
true,
|
||||
None,
|
||||
))
|
||||
});
|
||||
let mut ctx = env.create_context(persistent_context);
|
||||
EnterStagingRegion
|
||||
.enter_staging_regions(&mut ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_enter_staging_region_retryable() {
|
||||
let env = TestingEnv::new();
|
||||
let table_id = 1024;
|
||||
let targets = test_targets();
|
||||
let mut persistent_context = new_persistent_context(table_id, vec![], targets);
|
||||
persistent_context.group_prepare_result = Some(test_prepare_result(table_id));
|
||||
let mut ctx = env.create_context(persistent_context);
|
||||
let err = EnterStagingRegion
|
||||
.enter_staging_regions(&mut ctx)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::RetryLater { .. });
|
||||
assert!(err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_enter_staging_regions_non_retryable() {
|
||||
let mut env = TestingEnv::new();
|
||||
let table_id = 1024;
|
||||
let targets = test_targets();
|
||||
let mut persistent_context = new_persistent_context(table_id, vec![], targets);
|
||||
persistent_context.group_prepare_result = Some(test_prepare_result(table_id));
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(1), tx)
|
||||
.await;
|
||||
// Sends an incorrect reply.
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_close_region_reply(id))
|
||||
});
|
||||
|
||||
let mut ctx = env.create_context(persistent_context.clone());
|
||||
// Datanode 1 returns unexpected reply.
|
||||
// Datanode 2 is unreachable.
|
||||
let err = EnterStagingRegion
|
||||
.enter_staging_regions(&mut ctx)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::Unexpected { .. });
|
||||
assert!(!err.is_retryable());
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(1);
|
||||
env.mailbox_ctx
|
||||
.insert_heartbeat_response_receiver(Channel::Datanode(2), tx)
|
||||
.await;
|
||||
// Sends an incorrect reply.
|
||||
send_mock_reply(env.mailbox_ctx.mailbox().clone(), rx, |id| {
|
||||
Ok(new_close_region_reply(id))
|
||||
});
|
||||
let mut ctx = env.create_context(persistent_context);
|
||||
// Datanode 1 returns unexpected reply.
|
||||
// Datanode 2 returns unexpected reply.
|
||||
let err = EnterStagingRegion
|
||||
.enter_staging_regions(&mut ctx)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_matches!(err, Error::Unexpected { .. });
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
}
|
||||
@@ -97,17 +97,6 @@ impl RepartitionStart {
                    .map(|r| (*r).clone())
            })
            .collect::<Result<Vec<_>>>()?;
        for target_region_route in &target_region_routes {
            ensure!(
                target_region_route.leader_peer.is_some(),
                error::UnexpectedSnafu {
                    violated: format!(
                        "Leader peer is not set for region: {}",
                        target_region_route.region.id
                    ),
                }
            );
        }
        let central_region = sources[0].region_id;
        let central_region_datanode_id = source_region_routes[0]
            .leader_peer

@@ -1,88 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;

use common_meta::peer::Peer;
use common_meta::rpc::router::RegionRoute;
use store_api::storage::RegionId;

use crate::error::{Error, Result};

/// Groups the region routes by the leader peer.
///
/// # Panics
///
/// Panics if the leader peer is not set for any of the region routes.
pub(crate) fn group_region_routes_by_peer(
    region_routes: &[RegionRoute],
) -> HashMap<&Peer, Vec<RegionId>> {
    let mut map: HashMap<&Peer, Vec<RegionId>> = HashMap::new();
    for region_route in region_routes {
        map.entry(region_route.leader_peer.as_ref().unwrap())
            .or_default()
            .push(region_route.region.id);
    }
    map
}
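
// Hedged example added for illustration (not part of the original diff): it mirrors the
// route-building style used in the repartition unit tests and shows that two routes led
// by datanodes 1 and 2 produce one entry per peer. The module name is made up.
#[cfg(test)]
mod group_by_peer_example {
    use common_meta::peer::Peer;
    use common_meta::rpc::router::{Region, RegionRoute};
    use store_api::storage::RegionId;

    use super::group_region_routes_by_peer;

    #[test]
    fn groups_routes_by_leader_peer() {
        let route = |region_number: u32, peer_id: u64| RegionRoute {
            region: Region {
                id: RegionId::new(1024, region_number),
                ..Default::default()
            },
            leader_peer: Some(Peer::empty(peer_id)),
            ..Default::default()
        };
        let routes = vec![route(1, 1), route(2, 2)];

        let grouped = group_region_routes_by_peer(&routes);
        assert_eq!(grouped.len(), 2);
        assert_eq!(grouped[&Peer::empty(1)], vec![RegionId::new(1024, 1)]);
        assert_eq!(grouped[&Peer::empty(2)], vec![RegionId::new(1024, 2)]);
    }
}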

/// Returns `true` if all results are successful.
fn all_successful(results: &[Result<()>]) -> bool {
    results.iter().all(Result::is_ok)
}

pub enum HandleMultipleResult<'a> {
    AllSuccessful,
    AllRetryable(Vec<(usize, &'a Error)>),
    PartialRetryable {
        retryable_errors: Vec<(usize, &'a Error)>,
        non_retryable_errors: Vec<(usize, &'a Error)>,
    },
    AllNonRetryable(Vec<(usize, &'a Error)>),
}

/// Evaluates results from multiple operations and categorizes errors by retryability.
///
/// If all operations succeed, returns `AllSuccessful`.
/// If all errors are retryable, returns `AllRetryable`.
/// If all errors are non-retryable, returns `AllNonRetryable`.
/// Otherwise, returns `PartialRetryable` with separate collections for retryable and non-retryable errors.
pub(crate) fn handle_multiple_results<'a>(results: &'a [Result<()>]) -> HandleMultipleResult<'a> {
    if all_successful(results) {
        return HandleMultipleResult::AllSuccessful;
    }

    let mut retryable_errors = Vec::new();
    let mut non_retryable_errors = Vec::new();
    for (index, result) in results.iter().enumerate() {
        if let Err(error) = result {
            if error.is_retryable() {
                retryable_errors.push((index, error));
            } else {
                non_retryable_errors.push((index, error));
            }
        }
    }

    match (retryable_errors.is_empty(), non_retryable_errors.is_empty()) {
        (true, false) => HandleMultipleResult::AllNonRetryable(non_retryable_errors),
        (false, true) => HandleMultipleResult::AllRetryable(retryable_errors),
        (false, false) => HandleMultipleResult::PartialRetryable {
            retryable_errors,
            non_retryable_errors,
        },
        // Should not happen, but include for completeness
        (true, true) => HandleMultipleResult::AllSuccessful,
    }
}
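
// Minimal usage sketch (added for illustration; `describe` is a made-up helper): callers
// such as the repartition procedure branch on the returned category to decide whether to
// retry the whole step later or abort the procedure.
#[allow(dead_code)]
fn describe(results: &[Result<()>]) -> &'static str {
    match handle_multiple_results(results) {
        HandleMultipleResult::AllSuccessful => "all datanodes succeeded",
        HandleMultipleResult::AllRetryable(_) => "retry the step later",
        HandleMultipleResult::AllNonRetryable(_) => "abort: nothing is retryable",
        HandleMultipleResult::PartialRetryable { .. } => "abort: mixed retryable and non-retryable errors",
    }
}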
@@ -32,7 +32,6 @@ use crate::procedure::test_util::MailboxContext;
|
||||
pub struct TestingEnv {
|
||||
pub table_metadata_manager: TableMetadataManagerRef,
|
||||
pub mailbox_ctx: MailboxContext,
|
||||
pub server_addr: String,
|
||||
}
|
||||
|
||||
impl Default for TestingEnv {
|
||||
@@ -52,11 +51,10 @@ impl TestingEnv {
|
||||
Self {
|
||||
table_metadata_manager,
|
||||
mailbox_ctx,
|
||||
server_addr: "localhost".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_context(&self, persistent_context: PersistentContext) -> Context {
|
||||
pub fn create_context(self, persistent_context: PersistentContext) -> Context {
|
||||
let cache_invalidator = Arc::new(MetasrvCacheInvalidator::new(
|
||||
self.mailbox_ctx.mailbox().clone(),
|
||||
MetasrvInfo {
|
||||
@@ -68,8 +66,6 @@ impl TestingEnv {
|
||||
persistent_ctx: persistent_context,
|
||||
table_metadata_manager: self.table_metadata_manager.clone(),
|
||||
cache_invalidator,
|
||||
mailbox: self.mailbox_ctx.mailbox().clone(),
|
||||
server_addr: self.server_addr.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,8 @@ use std::collections::HashMap;
|
||||
use api::v1::meta::mailbox_message::Payload;
|
||||
use api::v1::meta::{HeartbeatResponse, MailboxMessage};
|
||||
use common_meta::instruction::{
|
||||
DowngradeRegionReply, DowngradeRegionsReply, EnterStagingRegionReply, EnterStagingRegionsReply,
|
||||
FlushRegionReply, InstructionReply, SimpleReply, UpgradeRegionReply, UpgradeRegionsReply,
|
||||
DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply,
|
||||
UpgradeRegionReply, UpgradeRegionsReply,
|
||||
};
|
||||
use common_meta::key::TableMetadataManagerRef;
|
||||
use common_meta::key::table_route::TableRouteValue;
|
||||
@@ -198,7 +198,7 @@ pub fn new_downgrade_region_reply(
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates a [InstructionReply::UpgradeRegions] reply.
|
||||
/// Generates a [InstructionReply::UpgradeRegion] reply.
|
||||
pub fn new_upgrade_region_reply(
|
||||
id: u64,
|
||||
ready: bool,
|
||||
@@ -225,34 +225,6 @@ pub fn new_upgrade_region_reply(
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates a [InstructionReply::EnterStagingRegions] reply.
|
||||
pub fn new_enter_staging_region_reply(
|
||||
id: u64,
|
||||
region_id: RegionId,
|
||||
ready: bool,
|
||||
exists: bool,
|
||||
error: Option<String>,
|
||||
) -> MailboxMessage {
|
||||
MailboxMessage {
|
||||
id,
|
||||
subject: "mock".to_string(),
|
||||
from: "datanode".to_string(),
|
||||
to: "meta".to_string(),
|
||||
timestamp_millis: current_time_millis(),
|
||||
payload: Some(Payload::Json(
|
||||
serde_json::to_string(&InstructionReply::EnterStagingRegions(
|
||||
EnterStagingRegionsReply::new(vec![EnterStagingRegionReply {
|
||||
region_id,
|
||||
ready,
|
||||
exists,
|
||||
error,
|
||||
}]),
|
||||
))
|
||||
.unwrap(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Mock the test data for WAL pruning.
|
||||
pub async fn new_wal_prune_metadata(
|
||||
table_metadata_manager: TableMetadataManagerRef,
|
||||
|
||||
src/mito2/src/cache/file_cache.rs
@@ -727,7 +727,7 @@ impl fmt::Display for FileType {
|
||||
|
||||
impl FileType {
|
||||
/// Parses the file type from string.
|
||||
pub(crate) fn parse(s: &str) -> Option<FileType> {
|
||||
fn parse(s: &str) -> Option<FileType> {
|
||||
match s {
|
||||
"parquet" => Some(FileType::Parquet),
|
||||
"puffin" => Some(FileType::Puffin(0)),
|
||||
|
||||
@@ -23,13 +23,11 @@ use api::v1::Rows;
|
||||
use common_error::ext::ErrorExt;
|
||||
use common_error::status_code::StatusCode;
|
||||
use common_recordbatch::RecordBatches;
|
||||
use datatypes::value::Value;
|
||||
use object_store::Buffer;
|
||||
use object_store::layers::mock::{
|
||||
Entry, Error as MockError, ErrorKind, List, Lister, Metadata, MockLayerBuilder,
|
||||
Result as MockResult, Write, Writer,
|
||||
};
|
||||
use partition::expr::{PartitionExpr, col};
|
||||
use store_api::region_engine::{RegionEngine, SettableRegionRoleState};
|
||||
use store_api::region_request::{
|
||||
EnterStagingRequest, RegionAlterRequest, RegionFlushRequest, RegionRequest,
|
||||
@@ -40,16 +38,10 @@ use store_api::storage::{RegionId, ScanRequest};
|
||||
use crate::config::MitoConfig;
|
||||
use crate::engine::listener::NotifyEnterStagingResultListener;
|
||||
use crate::error::Error;
|
||||
use crate::region::{RegionLeaderState, RegionRoleState, parse_partition_expr};
|
||||
use crate::region::{RegionLeaderState, RegionRoleState};
|
||||
use crate::request::WorkerRequest;
|
||||
use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema};
|
||||
|
||||
fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr {
|
||||
col(col_name)
|
||||
.gt_eq(Value::Int64(start))
|
||||
.and(col(col_name).lt(Value::Int64(end)))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_staging_state_integration() {
|
||||
test_staging_state_integration_with_format(false).await;
|
||||
@@ -235,9 +227,7 @@ async fn test_staging_state_validation_patterns() {
|
||||
);
|
||||
}
|
||||
|
||||
fn default_partition_expr() -> String {
|
||||
range_expr("a", 0, 100).as_json_str().unwrap()
|
||||
}
|
||||
const PARTITION_EXPR: &str = "partition_expr";
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_staging_manifest_directory() {
|
||||
@@ -247,7 +237,6 @@ async fn test_staging_manifest_directory() {
|
||||
|
||||
async fn test_staging_manifest_directory_with_format(flat_format: bool) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let partition_expr = default_partition_expr();
|
||||
let mut env = TestEnv::new().await;
|
||||
let engine = env
|
||||
.create_engine(MitoConfig {
|
||||
@@ -285,14 +274,14 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) {
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::EnterStaging(EnterStagingRequest {
|
||||
partition_expr: partition_expr.clone(),
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone();
|
||||
assert_eq!(staging_partition_expr.unwrap(), partition_expr);
|
||||
assert_eq!(staging_partition_expr.unwrap(), PARTITION_EXPR);
|
||||
{
|
||||
let manager = region.manifest_ctx.manifest_manager.read().await;
|
||||
assert_eq!(
|
||||
@@ -303,7 +292,7 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) {
|
||||
.partition_expr
|
||||
.as_deref()
|
||||
.unwrap(),
|
||||
&partition_expr,
|
||||
PARTITION_EXPR
|
||||
);
|
||||
assert!(manager.manifest().metadata.partition_expr.is_none());
|
||||
}
|
||||
@@ -313,7 +302,7 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) {
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::EnterStaging(EnterStagingRequest {
|
||||
partition_expr: partition_expr.clone(),
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -388,7 +377,6 @@ async fn test_staging_exit_success_with_manifests() {
|
||||
|
||||
async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let partition_expr = default_partition_expr();
|
||||
let mut env = TestEnv::new().await;
|
||||
let engine = env
|
||||
.create_engine(MitoConfig {
|
||||
@@ -419,7 +407,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool)
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::EnterStaging(EnterStagingRequest {
|
||||
partition_expr: partition_expr.clone(),
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -477,25 +465,6 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool)
|
||||
"Staging manifest directory should contain 3 files before exit, got: {:?}",
|
||||
staging_files_before
|
||||
);
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
{
|
||||
let manager = region.manifest_ctx.manifest_manager.read().await;
|
||||
let staging_manifest = manager.staging_manifest().unwrap();
|
||||
assert_eq!(staging_manifest.files.len(), 3);
|
||||
assert_eq!(
|
||||
staging_manifest.metadata.partition_expr.as_ref().unwrap(),
|
||||
&partition_expr
|
||||
);
|
||||
let expr = parse_partition_expr(Some(partition_expr.as_str()))
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
for file in staging_manifest.files.values() {
|
||||
let Some(file_expr) = file.partition_expr.as_ref() else {
|
||||
continue;
|
||||
};
|
||||
assert_eq!(*file_expr, expr);
|
||||
}
|
||||
}
|
||||
|
||||
// Count normal manifest files before exit
|
||||
let normal_manifest_dir = format!("{}/manifest", region_dir);
|
||||
@@ -614,7 +583,6 @@ async fn test_write_stall_on_enter_staging() {
|
||||
|
||||
async fn test_write_stall_on_enter_staging_with_format(flat_format: bool) {
|
||||
let mut env = TestEnv::new().await;
|
||||
let partition_expr = default_partition_expr();
|
||||
let listener = Arc::new(NotifyEnterStagingResultListener::default());
|
||||
let engine = env
|
||||
.create_engine_with(
|
||||
@@ -654,7 +622,7 @@ async fn test_write_stall_on_enter_staging_with_format(flat_format: bool) {
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::EnterStaging(EnterStagingRequest {
|
||||
partition_expr: partition_expr.clone(),
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -738,7 +706,6 @@ impl Write for MockWriter {
|
||||
}
|
||||
|
||||
async fn test_enter_staging_error(env: &mut TestEnv, flat_format: bool) {
|
||||
let partition_expr = default_partition_expr();
|
||||
let engine = env
|
||||
.create_engine(MitoConfig {
|
||||
default_experimental_flat_format: flat_format,
|
||||
@@ -756,7 +723,7 @@ async fn test_enter_staging_error(env: &mut TestEnv, flat_format: bool) {
|
||||
.handle_request(
|
||||
region_id,
|
||||
RegionRequest::EnterStaging(EnterStagingRequest {
|
||||
partition_expr: partition_expr.clone(),
|
||||
partition_expr: PARTITION_EXPR.to_string(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -36,8 +36,8 @@ use crate::access_layer::{
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error::{
|
||||
Error, FlushRegionSnafu, JoinSnafu, RegionClosedSnafu, RegionDroppedSnafu,
|
||||
RegionTruncatedSnafu, Result,
|
||||
Error, FlushRegionSnafu, InvalidPartitionExprSnafu, JoinSnafu, RegionClosedSnafu,
|
||||
RegionDroppedSnafu, RegionTruncatedSnafu, Result,
|
||||
};
|
||||
use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
|
||||
use crate::memtable::{
|
||||
@@ -54,7 +54,7 @@ use crate::read::merge::MergeReaderBuilder;
|
||||
use crate::read::{FlatSource, Source};
|
||||
use crate::region::options::{IndexOptions, MergeMode, RegionOptions};
|
||||
use crate::region::version::{VersionControlData, VersionControlRef, VersionRef};
|
||||
use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState, parse_partition_expr};
|
||||
use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState};
|
||||
use crate::request::{
|
||||
BackgroundNotify, FlushFailed, FlushFinished, OptionOutputTx, OutputTx, SenderBulkRequest,
|
||||
SenderDdlRequest, SenderWriteRequest, WorkerRequest, WorkerRequestWithTime,
|
||||
@@ -252,10 +252,6 @@ pub(crate) struct RegionFlushTask {
|
||||
pub(crate) flush_semaphore: Arc<Semaphore>,
|
||||
/// Whether the region is in staging mode.
|
||||
pub(crate) is_staging: bool,
|
||||
/// Partition expression of the region.
|
||||
///
|
||||
/// This is used to generate the file meta.
|
||||
pub(crate) partition_expr: Option<String>,
|
||||
}
|
||||
|
||||
impl RegionFlushTask {
|
||||
@@ -445,8 +441,14 @@ impl RegionFlushTask {
|
||||
let mut file_metas = Vec::with_capacity(memtables.len());
|
||||
let mut flushed_bytes = 0;
|
||||
let mut series_count = 0;
|
||||
// Convert partition expression once outside the map
|
||||
let partition_expr = match &version.metadata.partition_expr {
|
||||
None => None,
|
||||
Some(json_expr) if json_expr.is_empty() => None,
|
||||
Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str)
|
||||
.with_context(|_| InvalidPartitionExprSnafu { expr: json_str })?,
|
||||
};
|
||||
let mut flush_metrics = Metrics::new(WriteType::Flush);
|
||||
let partition_expr = parse_partition_expr(self.partition_expr.as_deref())?;
|
||||
for mem in memtables {
|
||||
if mem.is_empty() {
|
||||
// Skip empty memtables.
|
||||
@@ -799,8 +801,7 @@ fn memtable_flat_sources(
|
||||
if last_iter_rows > min_flush_rows {
|
||||
let maybe_dedup = merge_and_dedup(
|
||||
&schema,
|
||||
options.append_mode,
|
||||
options.merge_mode(),
|
||||
options,
|
||||
field_column_start,
|
||||
std::mem::replace(&mut input_iters, Vec::with_capacity(num_ranges)),
|
||||
)?;
|
||||
@@ -812,13 +813,7 @@ fn memtable_flat_sources(
|
||||
|
||||
// Handle remaining iters.
|
||||
if !input_iters.is_empty() {
|
||||
let maybe_dedup = merge_and_dedup(
|
||||
&schema,
|
||||
options.append_mode,
|
||||
options.merge_mode(),
|
||||
field_column_start,
|
||||
input_iters,
|
||||
)?;
|
||||
let maybe_dedup = merge_and_dedup(&schema, options, field_column_start, input_iters)?;
|
||||
|
||||
flat_sources.sources.push(FlatSource::Iter(maybe_dedup));
|
||||
}
|
||||
@@ -827,64 +822,19 @@ fn memtable_flat_sources(
|
||||
Ok(flat_sources)
|
||||
}
|
||||
|
||||
/// Merges multiple record batch iterators and applies deduplication based on the specified mode.
|
||||
///
|
||||
/// This function is used during the flush process to combine data from multiple memtable ranges
|
||||
/// into a single stream while handling duplicate records according to the configured merge strategy.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `schema` - The Arrow schema reference that defines the structure of the record batches
|
||||
/// * `append_mode` - When true, no deduplication is performed and all records are preserved.
|
||||
/// This is used for append-only workloads where duplicate handling is not required.
|
||||
/// * `merge_mode` - The strategy used for deduplication when not in append mode:
|
||||
/// - `MergeMode::LastRow`: Keeps the last record for each primary key
|
||||
/// - `MergeMode::LastNonNull`: Keeps the last non-null values for each field
|
||||
/// * `field_column_start` - The starting column index for fields in the record batch.
|
||||
/// Used when `MergeMode::LastNonNull` to identify which columns
|
||||
/// contain field values versus primary key columns.
|
||||
/// * `input_iters` - A vector of record batch iterators to be merged and deduplicated
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a boxed record batch iterator that yields the merged and potentially deduplicated
|
||||
/// record batches.
|
||||
///
|
||||
/// # Behavior
|
||||
///
|
||||
/// 1. Creates a `FlatMergeIterator` to merge all input iterators in sorted order based on
|
||||
/// primary key and timestamp
|
||||
/// 2. If `append_mode` is true, returns the merge iterator directly without deduplication
|
||||
/// 3. If `append_mode` is false, wraps the merge iterator with a `FlatDedupIterator` that
|
||||
/// applies the specified merge mode:
|
||||
/// - `LastRow`: Removes duplicate rows, keeping only the last one
|
||||
/// - `LastNonNull`: Removes duplicates but preserves the last non-null value for each field
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```ignore
|
||||
/// let merged_iter = merge_and_dedup(
|
||||
/// &schema,
|
||||
/// false, // not append mode, apply dedup
|
||||
/// MergeMode::LastRow,
|
||||
/// 2, // fields start at column 2 after primary key columns
|
||||
/// vec![iter1, iter2, iter3],
|
||||
/// )?;
|
||||
/// ```
|
||||
pub fn merge_and_dedup(
|
||||
fn merge_and_dedup(
|
||||
schema: &SchemaRef,
|
||||
append_mode: bool,
|
||||
merge_mode: MergeMode,
|
||||
options: &RegionOptions,
|
||||
field_column_start: usize,
|
||||
input_iters: Vec<BoxedRecordBatchIterator>,
|
||||
) -> Result<BoxedRecordBatchIterator> {
|
||||
let merge_iter = FlatMergeIterator::new(schema.clone(), input_iters, DEFAULT_READ_BATCH_SIZE)?;
|
||||
let maybe_dedup = if append_mode {
|
||||
let maybe_dedup = if options.append_mode {
|
||||
// No dedup in append mode
|
||||
Box::new(merge_iter) as _
|
||||
} else {
|
||||
// Dedup according to merge mode.
|
||||
match merge_mode {
|
||||
match options.merge_mode() {
|
||||
MergeMode::LastRow => {
|
||||
Box::new(FlatDedupIterator::new(merge_iter, FlatLastRow::new(false))) as _
|
||||
}
|
||||
@@ -1331,7 +1281,6 @@ mod tests {
|
||||
index_options: IndexOptions::default(),
|
||||
flush_semaphore: Arc::new(Semaphore::new(2)),
|
||||
is_staging: false,
|
||||
partition_expr: None,
|
||||
};
|
||||
task.push_sender(OptionOutputTx::from(output_tx));
|
||||
scheduler
|
||||
@@ -1375,7 +1324,6 @@ mod tests {
|
||||
index_options: IndexOptions::default(),
|
||||
flush_semaphore: Arc::new(Semaphore::new(2)),
|
||||
is_staging: false,
|
||||
partition_expr: None,
|
||||
})
|
||||
.collect();
|
||||
// Schedule first task.
|
||||
@@ -1568,7 +1516,6 @@ mod tests {
|
||||
index_options: IndexOptions::default(),
|
||||
flush_semaphore: Arc::new(Semaphore::new(2)),
|
||||
is_staging: false,
|
||||
partition_expr: None,
|
||||
})
|
||||
.collect();
|
||||
// Schedule first task.
|
||||
|
||||
@@ -28,62 +28,28 @@ use std::time::Duration;
|
||||
use common_meta::datanode::GcStat;
|
||||
use common_telemetry::{debug, error, info, warn};
|
||||
use common_time::Timestamp;
|
||||
use itertools::Itertools;
|
||||
use object_store::{Entry, Lister};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::ResultExt as _;
|
||||
use store_api::storage::{FileId, FileRef, FileRefsManifest, GcReport, IndexVersion, RegionId};
|
||||
use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId};
|
||||
use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::cache::CacheManagerRef;
|
||||
use crate::cache::file_cache::FileType;
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error::{
|
||||
DurationOutOfRangeSnafu, JoinSnafu, OpenDalSnafu, Result, TooManyGcJobsSnafu, UnexpectedSnafu,
|
||||
};
|
||||
use crate::manifest::action::{RegionManifest, RemovedFile};
|
||||
use crate::metrics::{GC_DELETE_FILE_CNT, GC_ORPHANED_INDEX_FILES, GC_SKIPPED_UNPARSABLE_FILES};
|
||||
use crate::manifest::action::RegionManifest;
|
||||
use crate::metrics::GC_DELETE_FILE_CNT;
|
||||
use crate::region::{MitoRegionRef, RegionRoleState};
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId, delete_files, delete_index};
|
||||
use crate::sst::file::delete_files;
|
||||
use crate::sst::location::{self};
|
||||
|
||||
#[cfg(test)]
|
||||
mod worker_test;
|
||||
|
||||
/// Helper function to determine if a file should be deleted based on common logic
|
||||
/// shared between Parquet and Puffin file types.
|
||||
fn should_delete_file(
|
||||
is_in_manifest: bool,
|
||||
is_in_tmp_ref: bool,
|
||||
is_linger: bool,
|
||||
is_eligible_for_delete: bool,
|
||||
entry: &Entry,
|
||||
unknown_file_may_linger_until: chrono::DateTime<chrono::Utc>,
|
||||
) -> bool {
|
||||
let is_known = is_linger || is_eligible_for_delete;
|
||||
|
||||
let is_unknown_linger_time_exceeded = || {
|
||||
// if the file's expel time is unknown(because not appear in delta manifest), we keep it for a while
|
||||
// using it's last modified time
|
||||
// notice unknown files use a different lingering time
|
||||
entry
|
||||
.metadata()
|
||||
.last_modified()
|
||||
.map(|t| t < unknown_file_may_linger_until)
|
||||
.unwrap_or(false)
|
||||
};
|
||||
|
||||
!is_in_manifest
|
||||
&& !is_in_tmp_ref
|
||||
&& if is_known {
|
||||
is_eligible_for_delete
|
||||
} else {
|
||||
is_unknown_linger_time_exceeded()
|
||||
}
|
||||
}
|
||||
|
||||
/// Limit the amount of concurrent GC jobs on the datanode
|
||||
pub struct GcLimiter {
|
||||
pub gc_job_limit: Arc<tokio::sync::Semaphore>,
|
||||
@@ -242,7 +208,7 @@ impl LocalGcWorker {
|
||||
}
|
||||
|
||||
/// Get tmp ref files for all current regions
|
||||
pub async fn read_tmp_ref_files(&self) -> Result<HashMap<RegionId, HashSet<FileRef>>> {
|
||||
pub async fn read_tmp_ref_files(&self) -> Result<HashMap<RegionId, HashSet<FileId>>> {
|
||||
let mut tmp_ref_files = HashMap::new();
|
||||
for (region_id, file_refs) in &self.file_ref_manifest.file_refs {
|
||||
tmp_ref_files
|
||||
@@ -264,7 +230,6 @@ impl LocalGcWorker {
|
||||
let now = std::time::Instant::now();
|
||||
|
||||
let mut deleted_files = HashMap::new();
|
||||
let mut deleted_indexes = HashMap::new();
|
||||
let tmp_ref_files = self.read_tmp_ref_files().await?;
|
||||
for (region_id, region) in &self.regions {
|
||||
let per_region_time = std::time::Instant::now();
|
||||
@@ -282,12 +247,7 @@ impl LocalGcWorker {
|
||||
.cloned()
|
||||
.unwrap_or_else(HashSet::new);
|
||||
let files = self.do_region_gc(region.clone(), &tmp_ref_files).await?;
|
||||
let index_files = files
|
||||
.iter()
|
||||
.filter_map(|f| f.index_version().map(|v| (f.file_id(), v)))
|
||||
.collect_vec();
|
||||
deleted_files.insert(*region_id, files.into_iter().map(|f| f.file_id()).collect());
|
||||
deleted_indexes.insert(*region_id, index_files);
|
||||
deleted_files.insert(*region_id, files);
|
||||
debug!(
|
||||
"GC for region {} took {} secs.",
|
||||
region_id,
|
||||
@@ -300,7 +260,6 @@ impl LocalGcWorker {
|
||||
);
|
||||
let report = GcReport {
|
||||
deleted_files,
|
||||
deleted_indexes,
|
||||
need_retry_regions: HashSet::new(),
|
||||
};
|
||||
Ok(report)
|
||||
@@ -323,8 +282,8 @@ impl LocalGcWorker {
|
||||
pub async fn do_region_gc(
|
||||
&self,
|
||||
region: MitoRegionRef,
|
||||
tmp_ref_files: &HashSet<FileRef>,
|
||||
) -> Result<Vec<RemovedFile>> {
|
||||
tmp_ref_files: &HashSet<FileId>,
|
||||
) -> Result<Vec<FileId>> {
|
||||
let region_id = region.region_id();
|
||||
|
||||
debug!("Doing gc for region {}", region_id);
|
||||
@@ -352,83 +311,64 @@ impl LocalGcWorker {
|
||||
.map(|s| s.len())
|
||||
.sum::<usize>();
|
||||
|
||||
let in_manifest = current_files
|
||||
.iter()
|
||||
.map(|(file_id, meta)| (*file_id, meta.index_version()))
|
||||
.collect::<HashMap<_, _>>();
|
||||
let in_used: HashSet<FileId> = current_files
|
||||
.keys()
|
||||
.cloned()
|
||||
.chain(tmp_ref_files.clone().into_iter())
|
||||
.collect();
|
||||
|
||||
let in_tmp_ref = tmp_ref_files
|
||||
.iter()
|
||||
.map(|file_ref| (file_ref.file_id, file_ref.index_version))
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let deletable_files = self
|
||||
.list_to_be_deleted_files(
|
||||
region_id,
|
||||
&in_manifest,
|
||||
&in_tmp_ref,
|
||||
recently_removed_files,
|
||||
all_entries,
|
||||
)
|
||||
let unused_files = self
|
||||
.list_to_be_deleted_files(region_id, &in_used, recently_removed_files, all_entries)
|
||||
.await?;
|
||||
|
||||
let unused_file_cnt = deletable_files.len();
|
||||
let unused_file_cnt = unused_files.len();
|
||||
|
||||
debug!(
|
||||
"gc: for region {region_id}: In manifest files: {}, Tmp ref file cnt: {}, recently removed files: {}, Unused files to delete: {} ",
|
||||
"gc: for region {region_id}: In manifest files: {}, Tmp ref file cnt: {}, In-used files: {}, recently removed files: {}, Unused files to delete: {} ",
|
||||
current_files.len(),
|
||||
tmp_ref_files.len(),
|
||||
in_used.len(),
|
||||
removed_file_cnt,
|
||||
deletable_files.len()
|
||||
unused_files.len()
|
||||
);
|
||||
|
||||
// TODO(discord9): for now, ignore async index file as it's design is not stable, need to be improved once
|
||||
// index file design is stable
|
||||
let file_pairs: Vec<(FileId, u64)> =
|
||||
unused_files.iter().map(|file_id| (*file_id, 0)).collect();
|
||||
// TODO(discord9): gc worker need another major refactor to support versioned index files
|
||||
|
||||
debug!(
|
||||
"Found {} unused index files to delete for region {}",
|
||||
deletable_files.len(),
|
||||
file_pairs.len(),
|
||||
region_id
|
||||
);
|
||||
|
||||
self.delete_files(region_id, &deletable_files).await?;
|
||||
self.delete_files(region_id, &file_pairs).await?;
|
||||
|
||||
debug!(
|
||||
"Successfully deleted {} unused files for region {}",
|
||||
unused_file_cnt, region_id
|
||||
);
|
||||
self.update_manifest_removed_files(®ion, deletable_files.clone())
|
||||
// TODO(discord9): update region manifest about deleted files
|
||||
self.update_manifest_removed_files(®ion, unused_files.clone())
|
||||
.await?;
|
||||
|
||||
Ok(deletable_files)
|
||||
Ok(unused_files)
|
||||
}
|
||||
|
||||
async fn delete_files(&self, region_id: RegionId, removed_files: &[RemovedFile]) -> Result<()> {
|
||||
let mut index_ids = vec![];
|
||||
let file_pairs = removed_files
|
||||
.iter()
|
||||
.filter_map(|f| match f {
|
||||
RemovedFile::File(file_id, v) => Some((*file_id, v.unwrap_or(0))),
|
||||
RemovedFile::Index(file_id, index_version) => {
|
||||
let region_index_id =
|
||||
RegionIndexId::new(RegionFileId::new(region_id, *file_id), *index_version);
|
||||
index_ids.push(region_index_id);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect_vec();
|
||||
async fn delete_files(&self, region_id: RegionId, file_ids: &[(FileId, u64)]) -> Result<()> {
|
||||
delete_files(
|
||||
region_id,
|
||||
&file_pairs,
|
||||
file_ids,
|
||||
true,
|
||||
&self.access_layer,
|
||||
&self.cache_manager,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for index_id in index_ids {
|
||||
delete_index(index_id, &self.access_layer, &self.cache_manager).await?;
|
||||
}
|
||||
|
||||
// FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now
|
||||
GC_DELETE_FILE_CNT.add(removed_files.len() as i64);
|
||||
GC_DELETE_FILE_CNT.add(file_ids.len() as i64);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -437,7 +377,7 @@ impl LocalGcWorker {
|
||||
async fn update_manifest_removed_files(
|
||||
&self,
|
||||
region: &MitoRegionRef,
|
||||
deleted_files: Vec<RemovedFile>,
|
||||
deleted_files: Vec<FileId>,
|
||||
) -> Result<()> {
|
||||
let deleted_file_cnt = deleted_files.len();
|
||||
debug!(
|
||||
@@ -463,12 +403,12 @@ impl LocalGcWorker {
|
||||
pub async fn get_removed_files_expel_times(
|
||||
&self,
|
||||
region_manifest: &Arc<RegionManifest>,
|
||||
) -> Result<BTreeMap<Timestamp, HashSet<RemovedFile>>> {
|
||||
) -> Result<BTreeMap<Timestamp, HashSet<FileId>>> {
|
||||
let mut ret = BTreeMap::new();
|
||||
for files in ®ion_manifest.removed_files.removed_files {
|
||||
let expel_time = Timestamp::new_millisecond(files.removed_at);
|
||||
let set = ret.entry(expel_time).or_insert_with(HashSet::new);
|
||||
set.extend(files.files.iter().cloned());
|
||||
set.extend(files.file_ids.iter().cloned());
|
||||
}
|
||||
|
||||
Ok(ret)
|
||||
@@ -595,136 +535,75 @@ impl LocalGcWorker {
|
||||
Ok(all_entries)
|
||||
}
|
||||
|
||||
/// Filter files to determine which ones can be deleted based on usage status and lingering time.
|
||||
/// Returns a vector of file IDs that are safe to delete.
|
||||
fn filter_deletable_files(
|
||||
&self,
|
||||
entries: Vec<Entry>,
|
||||
in_manifest: &HashMap<FileId, Option<IndexVersion>>,
|
||||
in_tmp_ref: &HashSet<(FileId, Option<IndexVersion>)>,
|
||||
may_linger_files: &HashSet<&RemovedFile>,
|
||||
eligible_for_delete: &HashSet<&RemovedFile>,
|
||||
in_use_filenames: &HashSet<FileId>,
|
||||
may_linger_filenames: &HashSet<&FileId>,
|
||||
eligible_for_removal: &HashSet<&FileId>,
|
||||
unknown_file_may_linger_until: chrono::DateTime<chrono::Utc>,
|
||||
) -> Vec<RemovedFile> {
|
||||
let mut ready_for_delete = vec![];
|
||||
// all group by file id for easier checking
|
||||
let in_tmp_ref: HashMap<FileId, HashSet<IndexVersion>> =
|
||||
in_tmp_ref
|
||||
.iter()
|
||||
.fold(HashMap::new(), |mut acc, (file, version)| {
|
||||
let indices = acc.entry(*file).or_default();
|
||||
if let Some(version) = version {
|
||||
indices.insert(*version);
|
||||
}
|
||||
acc
|
||||
});
|
||||
|
||||
let may_linger_files: HashMap<FileId, HashSet<&RemovedFile>> = may_linger_files
|
||||
.iter()
|
||||
.fold(HashMap::new(), |mut acc, file| {
|
||||
let indices = acc.entry(file.file_id()).or_default();
|
||||
indices.insert(file);
|
||||
acc
|
||||
});
|
||||
|
||||
let eligible_for_delete: HashMap<FileId, HashSet<&RemovedFile>> = eligible_for_delete
|
||||
.iter()
|
||||
.fold(HashMap::new(), |mut acc, file| {
|
||||
let indices = acc.entry(file.file_id()).or_default();
|
||||
indices.insert(file);
|
||||
acc
|
||||
});
|
||||
) -> (Vec<FileId>, HashSet<FileId>) {
|
||||
let mut all_unused_files_ready_for_delete = vec![];
|
||||
let mut all_in_exist_linger_files = HashSet::new();
|
||||
|
||||
for entry in entries {
|
||||
let (file_id, file_type) = match location::parse_file_id_type_from_path(entry.name()) {
|
||||
Ok((file_id, file_type)) => (file_id, file_type),
|
||||
let file_id = match location::parse_file_id_from_path(entry.name()) {
|
||||
Ok(file_id) => file_id,
|
||||
Err(err) => {
|
||||
error!(err; "Failed to parse file id from path: {}", entry.name());
|
||||
// if we can't parse the file id, it means it's not a sst or index file
|
||||
// shouldn't delete it because we don't know what it is
|
||||
GC_SKIPPED_UNPARSABLE_FILES.inc();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let should_delete = match file_type {
|
||||
FileType::Parquet => {
|
||||
let is_in_manifest = in_manifest.contains_key(&file_id);
|
||||
let is_in_tmp_ref = in_tmp_ref.contains_key(&file_id);
|
||||
let is_linger = may_linger_files.contains_key(&file_id);
|
||||
let is_eligible_for_delete = eligible_for_delete.contains_key(&file_id);
|
||||
if may_linger_filenames.contains(&file_id) {
|
||||
all_in_exist_linger_files.insert(file_id);
|
||||
}
|
||||
|
||||
should_delete_file(
|
||||
is_in_manifest,
|
is_in_tmp_ref,
is_linger,
is_eligible_for_delete,
&entry,
unknown_file_may_linger_until,
)
}
FileType::Puffin(version) => {
// notice need to check both file id and version
let is_in_manifest = in_manifest
.get(&file_id)
.map(|opt_ver| *opt_ver == Some(version))
.unwrap_or(false);
let is_in_tmp_ref = in_tmp_ref
.get(&file_id)
.map(|versions| versions.contains(&version))
.unwrap_or(false);
let is_linger = may_linger_files
.get(&file_id)
.map(|files| files.contains(&&RemovedFile::Index(file_id, version)))
.unwrap_or(false);
let is_eligible_for_delete = eligible_for_delete
.get(&file_id)
.map(|files| files.contains(&&RemovedFile::Index(file_id, version)))
.unwrap_or(false);

should_delete_file(
is_in_manifest,
is_in_tmp_ref,
is_linger,
is_eligible_for_delete,
&entry,
unknown_file_may_linger_until,
)
}
};

if should_delete {
let removed_file = match file_type {
FileType::Parquet => {
// notice this cause we don't track index version for parquet files
// since entries comes from listing, we can't get index version from path
RemovedFile::File(file_id, None)
}
FileType::Puffin(version) => {
GC_ORPHANED_INDEX_FILES.inc();
RemovedFile::Index(file_id, version)
let should_delete = !in_use_filenames.contains(&file_id)
&& !may_linger_filenames.contains(&file_id)
&& {
if !eligible_for_removal.contains(&file_id) {
// if the file's expel time is unknown(because not appear in delta manifest), we keep it for a while
// using it's last modified time
// notice unknown files use a different lingering time
entry
.metadata()
.last_modified()
.map(|t| t < unknown_file_may_linger_until)
.unwrap_or(false)
} else {
// if the file did appear in manifest delta(and passes previous predicate), we can delete it immediately
true
}
};
ready_for_delete.push(removed_file);

if should_delete {
all_unused_files_ready_for_delete.push(file_id);
}
}
ready_for_delete

(all_unused_files_ready_for_delete, all_in_exist_linger_files)
}

/// List files to be deleted based on their presence in the manifest, temporary references, and recently removed files.
/// Returns a vector of `RemovedFile` that are eligible for deletion.
///
/// When `full_file_listing` is false, this method will only delete (subset of) files tracked in
/// `recently_removed_files`, which significantly
/// improves performance. When `full_file_listing` is true, it read from `all_entries` to find
/// and delete orphan files (files not tracked in the manifest).
/// Concurrently list unused files in the region dir
/// because there may be a lot of files in the region dir
/// and listing them may take a long time.
///
/// When `full_file_listing` is false, this method will only delete files tracked in
/// `recently_removed_files` without performing expensive list operations, which significantly
/// improves performance. When `full_file_listing` is true, it performs a full listing to
/// find and delete orphan files.
pub async fn list_to_be_deleted_files(
&self,
region_id: RegionId,
in_manifest: &HashMap<FileId, Option<IndexVersion>>,
in_tmp_ref: &HashSet<(FileId, Option<IndexVersion>)>,
recently_removed_files: BTreeMap<Timestamp, HashSet<RemovedFile>>,
in_used: &HashSet<FileId>,
recently_removed_files: BTreeMap<Timestamp, HashSet<FileId>>,
all_entries: Vec<Entry>,
) -> Result<Vec<RemovedFile>> {
) -> Result<Vec<FileId>> {
let now = chrono::Utc::now();
||||
let may_linger_until = self
|
||||
.opt
|
||||
@@ -755,10 +634,8 @@ impl LocalGcWorker {
|
||||
};
|
||||
debug!("may_linger_files: {:?}", may_linger_files);
|
||||
|
||||
let all_may_linger_files = may_linger_files.values().flatten().collect::<HashSet<_>>();
|
||||
let may_linger_filenames = may_linger_files.values().flatten().collect::<HashSet<_>>();
|
||||
|
||||
// known files(tracked in removed files field) that are eligible for removal
|
||||
// (passed lingering time)
|
||||
let eligible_for_removal = recently_removed_files
|
||||
.values()
|
||||
.flatten()
|
||||
@@ -769,24 +646,12 @@ impl LocalGcWorker {
|
||||
if !self.full_file_listing {
|
||||
// Only delete files that:
|
||||
// 1. Are in recently_removed_files (tracked in manifest)
|
||||
// 2. Are not in use(in manifest or tmp ref)
|
||||
// 2. Are not in use
|
||||
// 3. Have passed the lingering time
|
||||
let files_to_delete: Vec<RemovedFile> = eligible_for_removal
|
||||
let files_to_delete: Vec<FileId> = eligible_for_removal
|
||||
.iter()
|
||||
.filter(|file_id| {
|
||||
let in_use = match file_id {
|
||||
RemovedFile::File(file_id, index_version) => {
|
||||
in_manifest.get(file_id) == Some(index_version)
|
||||
|| in_tmp_ref.contains(&(*file_id, *index_version))
|
||||
}
|
||||
RemovedFile::Index(file_id, index_version) => {
|
||||
in_manifest.get(file_id) == Some(&Some(*index_version))
|
||||
|| in_tmp_ref.contains(&(*file_id, Some(*index_version)))
|
||||
}
|
||||
};
|
||||
!in_use
|
||||
})
|
||||
.map(|&f| f.clone())
|
||||
.filter(|file_id| !in_used.contains(*file_id))
|
||||
.map(|&f| *f)
|
||||
.collect();
|
||||
|
||||
info!(
|
||||
@@ -801,14 +666,16 @@ impl LocalGcWorker {
|
||||
// Full file listing mode: get the full list of files from object store
|
||||
|
||||
// Step 3: Filter files to determine which ones can be deleted
|
||||
let all_unused_files_ready_for_delete = self.filter_deletable_files(
|
||||
all_entries,
|
||||
in_manifest,
|
||||
in_tmp_ref,
|
||||
&all_may_linger_files,
|
||||
&eligible_for_removal,
|
||||
unknown_file_may_linger_until,
|
||||
);
|
||||
let (all_unused_files_ready_for_delete, all_in_exist_linger_files) = self
|
||||
.filter_deletable_files(
|
||||
all_entries,
|
||||
in_used,
|
||||
&may_linger_filenames,
|
||||
&eligible_for_removal,
|
||||
unknown_file_may_linger_until,
|
||||
);
|
||||
|
||||
debug!("All in exist linger files: {:?}", all_in_exist_linger_files);
|
||||
|
||||
Ok(all_unused_files_ready_for_delete)
|
||||
}
|
||||
|
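The GC hunks above all feed the same per-file decision: a listed file may only be deleted when it is neither in the manifest nor temporarily referenced by a running query, and when its lingering window has passed (files tracked in `recently_removed_files` use the recorded removal time, files unknown to the manifest fall back to their last-modified time). The sketch below illustrates only that shape of predicate; the function name, parameters, and exact rule are assumptions for illustration and do not reproduce the repository's `should_delete_file`.

```rust
/// Illustrative only: a simplified stand-in for the per-file check the GC
/// worker performs. Every parameter name here is invented for the sketch.
fn sketch_should_delete(
    is_in_manifest: bool,
    is_in_tmp_ref: bool,
    is_linger: bool,
    is_eligible_for_delete: bool,
    last_modified_before_unknown_deadline: bool,
) -> bool {
    // Files still referenced by the manifest or by in-flight queries are
    // never deleted.
    if is_in_manifest || is_in_tmp_ref {
        return false;
    }
    if is_eligible_for_delete {
        // Tracked in the manifest's removed-files record and past its
        // lingering window, unless it is still marked as lingering.
        !is_linger
    } else {
        // Unknown to the manifest delta: fall back to the (longer) deadline
        // derived from the file's last-modified time.
        last_modified_before_unknown_deadline
    }
}
```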
||||
@@ -19,13 +19,12 @@ use api::v1::Rows;
|
||||
use common_telemetry::init_default_ut_logging;
|
||||
use store_api::region_engine::RegionEngine as _;
|
||||
use store_api::region_request::{RegionCompactRequest, RegionRequest};
|
||||
use store_api::storage::{FileRef, FileRefsManifest, RegionId};
|
||||
use store_api::storage::{FileRefsManifest, RegionId};
|
||||
|
||||
use crate::config::MitoConfig;
|
||||
use crate::engine::MitoEngine;
|
||||
use crate::engine::compaction_test::{delete_and_flush, put_and_flush};
|
||||
use crate::gc::{GcConfig, LocalGcWorker};
|
||||
use crate::manifest::action::RemovedFile;
|
||||
use crate::region::MitoRegionRef;
|
||||
use crate::test_util::{
|
||||
CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, rows_schema,
|
||||
@@ -121,9 +120,9 @@ async fn test_gc_worker_basic_truncate() {
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
assert!(
|
||||
manifest.removed_files.removed_files[0]
|
||||
.files
|
||||
.contains(&RemovedFile::File(to_be_deleted_file_id, None))
|
||||
&& manifest.removed_files.removed_files[0].files.len() == 1
|
||||
.file_ids
|
||||
.contains(&to_be_deleted_file_id)
|
||||
&& manifest.removed_files.removed_files[0].file_ids.len() == 1
|
||||
&& manifest.files.is_empty(),
|
||||
"Manifest after truncate: {:?}",
|
||||
manifest
|
||||
@@ -215,9 +214,9 @@ async fn test_gc_worker_truncate_with_ref() {
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
assert!(
|
||||
manifest.removed_files.removed_files[0]
|
||||
.files
|
||||
.contains(&RemovedFile::File(to_be_deleted_file_id, None))
|
||||
&& manifest.removed_files.removed_files[0].files.len() == 1
|
||||
.file_ids
|
||||
.contains(&to_be_deleted_file_id)
|
||||
&& manifest.removed_files.removed_files[0].file_ids.len() == 1
|
||||
&& manifest.files.is_empty(),
|
||||
"Manifest after truncate: {:?}",
|
||||
manifest
|
||||
@@ -226,11 +225,7 @@ async fn test_gc_worker_truncate_with_ref() {
|
||||
|
||||
let regions = BTreeMap::from([(region_id, region.clone())]);
|
||||
let file_ref_manifest = FileRefsManifest {
|
||||
file_refs: [(
|
||||
region_id,
|
||||
HashSet::from([FileRef::new(region_id, to_be_deleted_file_id, None)]),
|
||||
)]
|
||||
.into(),
|
||||
file_refs: [(region_id, HashSet::from([to_be_deleted_file_id]))].into(),
|
||||
manifest_version: [(region_id, version)].into(),
|
||||
};
|
||||
let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await;
|
||||
@@ -240,7 +235,7 @@ async fn test_gc_worker_truncate_with_ref() {
|
||||
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
assert!(
|
||||
manifest.removed_files.removed_files[0].files.len() == 1 && manifest.files.is_empty(),
|
||||
manifest.removed_files.removed_files[0].file_ids.len() == 1 && manifest.files.is_empty(),
|
||||
"Manifest: {:?}",
|
||||
manifest
|
||||
);
|
||||
@@ -305,7 +300,7 @@ async fn test_gc_worker_basic_compact() {
|
||||
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
assert_eq!(manifest.removed_files.removed_files[0].files.len(), 3);
|
||||
assert_eq!(manifest.removed_files.removed_files[0].file_ids.len(), 3);
|
||||
|
||||
let version = manifest.manifest_version;
|
||||
|
||||
@@ -381,7 +376,7 @@ async fn test_gc_worker_compact_with_ref() {
|
||||
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let manifest = region.manifest_ctx.manifest().await;
|
||||
assert_eq!(manifest.removed_files.removed_files[0].files.len(), 3);
|
||||
assert_eq!(manifest.removed_files.removed_files[0].file_ids.len(), 3);
|
||||
|
||||
let version = manifest.manifest_version;
|
||||
|
||||
@@ -390,12 +385,9 @@ async fn test_gc_worker_compact_with_ref() {
|
||||
file_refs: HashMap::from([(
|
||||
region_id,
|
||||
manifest.removed_files.removed_files[0]
|
||||
.files
|
||||
.file_ids
|
||||
.iter()
|
||||
.map(|removed_file| match removed_file {
|
||||
RemovedFile::File(file_id, v) => FileRef::new(region_id, *file_id, *v),
|
||||
RemovedFile::Index(file_id, v) => FileRef::new(region_id, *file_id, Some(*v)),
|
||||
})
|
||||
.cloned()
|
||||
.collect(),
|
||||
)]),
|
||||
manifest_version: [(region_id, version)].into(),
|
||||
|
||||
@@ -22,7 +22,7 @@ use serde::{Deserialize, Serialize};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::ManifestVersion;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::{FileId, IndexVersion, RegionId, SequenceNumber};
|
||||
use store_api::storage::{FileId, RegionId, SequenceNumber};
|
||||
use strum::Display;
|
||||
|
||||
use crate::error::{RegionMetadataNotFoundSnafu, Result, SerdeJsonSnafu, Utf8Snafu};
|
||||
@@ -193,27 +193,17 @@ impl RegionManifestBuilder {
|
||||
|
||||
pub fn apply_edit(&mut self, manifest_version: ManifestVersion, edit: RegionEdit) {
|
||||
self.manifest_version = manifest_version;
|
||||
|
||||
let mut removed_files = vec![];
|
||||
for file in edit.files_to_add {
|
||||
if let Some(old_file) = self.files.insert(file.file_id, file.clone())
|
||||
&& let Some(old_index) = old_file.index_version()
|
||||
&& !old_file.is_index_up_to_date(&file)
|
||||
{
|
||||
// The old file has an index that is now outdated.
|
||||
removed_files.push(RemovedFile::Index(old_file.file_id, old_index));
|
||||
}
|
||||
self.files.insert(file.file_id, file);
|
||||
}
|
||||
removed_files.extend(
|
||||
self.removed_files.add_removed_files(
|
||||
edit.files_to_remove
|
||||
.iter()
|
||||
.map(|f| RemovedFile::File(f.file_id, f.index_version())),
|
||||
.map(|meta| meta.file_id)
|
||||
.collect(),
|
||||
edit.timestamp_ms
|
||||
.unwrap_or_else(|| Utc::now().timestamp_millis()),
|
||||
);
|
||||
let at = edit
|
||||
.timestamp_ms
|
||||
.unwrap_or_else(|| Utc::now().timestamp_millis());
|
||||
self.removed_files.add_removed_files(removed_files, at);
|
||||
|
||||
for file in edit.files_to_remove {
|
||||
self.files.remove(&file.file_id);
|
||||
}
|
||||
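The interesting part of the `apply_edit` hunk above is the outdated-index rule: when a data file is re-added with a changed index, the previous puffin file becomes garbage even though the data file itself stays in the manifest. A tiny sketch of that comparison, using plain `u64` in place of `IndexVersion` and written only for illustration (the real check is `is_index_up_to_date`, shown later in this diff):

```rust
/// Sketch, not the crate's code: returns the index version left behind when a
/// file meta with index version `old` is replaced by one with `new`.
fn outdated_index_version(old: Option<u64>, new: Option<u64>) -> Option<u64> {
    match (old, new) {
        // The old file carried an index and the new one either dropped it or
        // bumped the version: the old puffin file can be recorded as a
        // `RemovedFile::Index`.
        (Some(old_v), None) => Some(old_v),
        (Some(old_v), Some(new_v)) if old_v < new_v => Some(old_v),
        _ => None,
    }
}

// outdated_index_version(Some(1), Some(2)) == Some(1)  -> v1 puffin is garbage
// outdated_index_version(Some(2), Some(2)) == None     -> index still current
```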
@@ -246,10 +236,7 @@ impl RegionManifestBuilder {
|
||||
self.flushed_sequence = truncated_sequence;
|
||||
self.truncated_entry_id = Some(truncated_entry_id);
|
||||
self.removed_files.add_removed_files(
|
||||
self.files
|
||||
.values()
|
||||
.map(|f| RemovedFile::File(f.file_id, f.index_version()))
|
||||
.collect(),
|
||||
self.files.values().map(|meta| meta.file_id).collect(),
|
||||
truncate
|
||||
.timestamp_ms
|
||||
.unwrap_or_else(|| Utc::now().timestamp_millis()),
|
||||
@@ -258,10 +245,7 @@ impl RegionManifestBuilder {
|
||||
}
|
||||
TruncateKind::Partial { files_to_remove } => {
|
||||
self.removed_files.add_removed_files(
|
||||
files_to_remove
|
||||
.iter()
|
||||
.map(|f| RemovedFile::File(f.file_id, f.index_version()))
|
||||
.collect(),
|
||||
files_to_remove.iter().map(|meta| meta.file_id).collect(),
|
||||
truncate
|
||||
.timestamp_ms
|
||||
.unwrap_or_else(|| Utc::now().timestamp_millis()),
|
||||
@@ -311,22 +295,20 @@ pub struct RemovedFilesRecord {
|
||||
|
||||
impl RemovedFilesRecord {
|
||||
/// Clear the actually deleted files from the list of removed files
|
||||
pub fn clear_deleted_files(&mut self, deleted_files: Vec<RemovedFile>) {
|
||||
pub fn clear_deleted_files(&mut self, deleted_files: Vec<FileId>) {
|
||||
let deleted_file_set: HashSet<_> = HashSet::from_iter(deleted_files);
|
||||
for files in self.removed_files.iter_mut() {
|
||||
files
|
||||
.files
|
||||
.retain(|removed| !deleted_file_set.contains(removed));
|
||||
files.file_ids.retain(|fid| !deleted_file_set.contains(fid));
|
||||
}
|
||||
|
||||
self.removed_files.retain(|fs| !fs.files.is_empty());
|
||||
self.removed_files.retain(|fs| !fs.file_ids.is_empty());
|
||||
}
|
||||
|
||||
pub fn update_file_removed_cnt_to_stats(&self, stats: &ManifestStats) {
|
||||
let cnt = self
|
||||
.removed_files
|
||||
.iter()
|
||||
.map(|r| r.files.len() as u64)
|
||||
.map(|r| r.file_ids.len() as u64)
|
||||
.sum();
|
||||
stats
|
||||
.file_removed_cnt
|
@@ -340,42 +322,18 @@ pub struct RemovedFiles {
/// the files are removed from manifest. The timestamp is in milliseconds since unix epoch.
pub removed_at: i64,
/// The set of file ids that are removed.
pub files: HashSet<RemovedFile>,
}

/// A removed file, which can be a data file(optional paired with a index file) or an outdated index file.
#[derive(Serialize, Deserialize, Hash, Clone, Debug, PartialEq, Eq)]
pub enum RemovedFile {
File(FileId, Option<IndexVersion>),
Index(FileId, IndexVersion),
}

impl RemovedFile {
pub fn file_id(&self) -> FileId {
match self {
RemovedFile::File(file_id, _) => *file_id,
RemovedFile::Index(file_id, _) => *file_id,
}
}

pub fn index_version(&self) -> Option<IndexVersion> {
match self {
RemovedFile::File(_, index_version) => *index_version,
RemovedFile::Index(_, index_version) => Some(*index_version),
}
}
pub file_ids: HashSet<FileId>,
}

impl RemovedFilesRecord {
/// Add a record of removed files with the current timestamp.
pub fn add_removed_files(&mut self, removed: Vec<RemovedFile>, at: i64) {
if removed.is_empty() {
pub fn add_removed_files(&mut self, file_ids: HashSet<FileId>, at: i64) {
if file_ids.is_empty() {
return;
}
let files = removed.into_iter().collect();
self.removed_files.push(RemovedFiles {
removed_at: at,
files,
file_ids,
});
}
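The two accessors on `RemovedFile` above flatten both variants into the same view: `file_id()` always returns the id the file was written under, and `index_version()` returns `Some(..)` whenever an index exists (unconditionally for the `Index` variant). A small usage sketch with placeholder values, assuming the `RemovedFile` and `FileId` items from this diff are in scope:

```rust
// Sketch using the enum and accessors shown above; `FileId::random()` appears
// in the crate's own tests, and the version number 3 is a placeholder.
fn removed_file_accessors_sketch() {
    let data_file = RemovedFile::File(FileId::random(), Some(3));
    let orphan_index = RemovedFile::Index(FileId::random(), 3);

    assert_eq!(data_file.index_version(), Some(3));
    assert_eq!(orphan_index.index_version(), Some(3));

    // Either way, `file_id()` yields the id used in the on-disk file name.
    let _ids = (data_file.file_id(), orphan_index.file_id());
}
```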
||||
@@ -780,10 +738,10 @@ mod tests {
|
||||
removed_files: RemovedFilesRecord {
|
||||
removed_files: vec![RemovedFiles {
|
||||
removed_at: 0,
|
||||
files: HashSet::from([RemovedFile::File(
|
||||
FileId::parse_str("4b220a70-2b03-4641-9687-b65d94641208").unwrap(),
|
||||
None,
|
||||
)]),
|
||||
file_ids: HashSet::from([FileId::parse_str(
|
||||
"4b220a70-2b03-4641-9687-b65d94641208",
|
||||
)
|
||||
.unwrap()]),
|
||||
}],
|
||||
},
|
||||
sst_format: FormatType::PrimaryKey,
|
||||
|
||||
@@ -21,6 +21,7 @@ use futures::TryStreamExt;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::FileId;
|
||||
use store_api::{MAX_VERSION, MIN_VERSION, ManifestVersion};
|
||||
|
||||
use crate::cache::manifest_cache::ManifestCache;
|
||||
@@ -30,7 +31,7 @@ use crate::error::{
|
||||
};
|
||||
use crate::manifest::action::{
|
||||
RegionChange, RegionCheckpoint, RegionEdit, RegionManifest, RegionManifestBuilder,
|
||||
RegionMetaAction, RegionMetaActionList, RemovedFile,
|
||||
RegionMetaAction, RegionMetaActionList,
|
||||
};
|
||||
use crate::manifest::checkpointer::Checkpointer;
|
||||
use crate::manifest::storage::{
|
||||
@@ -588,7 +589,7 @@ impl RegionManifestManager {
|
||||
}
|
||||
|
||||
/// Clear deleted files from manifest's `removed_files` field without update version. Notice if datanode exit before checkpoint then new manifest by open region may still contain these deleted files, which is acceptable for gc process.
|
||||
pub fn clear_deleted_files(&mut self, deleted_files: Vec<RemovedFile>) {
|
||||
pub fn clear_deleted_files(&mut self, deleted_files: Vec<FileId>) {
|
||||
let mut manifest = (*self.manifest()).clone();
|
||||
manifest.removed_files.clear_deleted_files(deleted_files);
|
||||
self.set_manifest(Arc::new(manifest));
|
||||
|
||||
@@ -74,7 +74,7 @@ impl BulkIterContext {
|
||||
.collect();
|
||||
|
||||
let read_format = ReadFormat::new(
|
||||
region_metadata.clone(),
|
||||
region_metadata,
|
||||
projection,
|
||||
true,
|
||||
None,
|
||||
@@ -82,18 +82,10 @@ impl BulkIterContext {
|
||||
skip_auto_convert,
|
||||
)?;
|
||||
|
||||
let dyn_filters = predicate
|
||||
.as_ref()
|
||||
.map(|pred| pred.dyn_filters().clone())
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(Self {
|
||||
base: RangeBase {
|
||||
filters: simple_filters,
|
||||
dyn_filters,
|
||||
read_format,
|
||||
prune_schema: region_metadata.schema.clone(),
|
||||
expected_metadata: Some(region_metadata),
|
||||
codec,
|
||||
// we don't need to compat batch since all batch in memtable have the same schema.
|
||||
compat_batch: None,
|
||||
|
||||
@@ -509,20 +509,6 @@ lazy_static! {
|
||||
"mito gc deleted file count",
|
||||
).unwrap();
|
||||
|
||||
/// Counter for the number of unparsable files skipped by GC.
|
||||
pub static ref GC_SKIPPED_UNPARSABLE_FILES: IntCounter =
|
||||
register_int_counter!(
|
||||
"greptime_mito_gc_skipped_unparsable_files",
|
||||
"mito gc skipped unparsable files count",
|
||||
).unwrap();
|
||||
|
||||
/// Counter for the number of orphaned index files found by GC.
|
||||
pub static ref GC_ORPHANED_INDEX_FILES: IntCounter =
|
||||
register_int_counter!(
|
||||
"greptime_mito_gc_orphaned_index_files",
|
||||
"mito gc orphaned index files count",
|
||||
).unwrap();
|
||||
|
||||
/// Total number of files downloaded during cache fill on region open.
|
||||
pub static ref CACHE_FILL_DOWNLOADED_FILES: IntCounter = register_int_counter!(
|
||||
"mito_cache_fill_downloaded_files",
|
||||
|
||||
@@ -1181,9 +1181,7 @@ pub fn build_file_range_scan_stream(
|
||||
};
|
||||
for range in ranges {
|
||||
let build_reader_start = Instant::now();
|
||||
let Some(reader) = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else {
|
||||
continue;
|
||||
};
|
||||
let reader = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await?;
|
||||
let build_cost = build_reader_start.elapsed();
|
||||
part_metrics.inc_build_reader_cost(build_cost);
|
||||
let compat_batch = range.compat_batch();
|
||||
@@ -1241,7 +1239,7 @@ pub fn build_flat_file_range_scan_stream(
|
||||
};
|
||||
for range in ranges {
|
||||
let build_reader_start = Instant::now();
|
||||
let Some(mut reader) = range.flat_reader(fetch_metrics.as_deref()).await? else{continue};
|
||||
let mut reader = range.flat_reader(fetch_metrics.as_deref()).await?;
|
||||
let build_cost = build_reader_start.elapsed();
|
||||
part_metrics.inc_build_reader_cost(build_cost);
|
||||
|
||||
|
||||
@@ -27,8 +27,7 @@ use std::sync::{Arc, Mutex, RwLock};
|
||||
|
||||
use common_telemetry::{error, info, warn};
|
||||
use crossbeam_utils::atomic::AtomicCell;
|
||||
use partition::expr::PartitionExpr;
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
use snafu::{OptionExt, ensure};
|
||||
use store_api::ManifestVersion;
|
||||
use store_api::codec::PrimaryKeyEncoding;
|
||||
use store_api::logstore::provider::Provider;
|
||||
@@ -44,8 +43,8 @@ pub use utils::*;
|
||||
|
||||
use crate::access_layer::AccessLayerRef;
|
||||
use crate::error::{
|
||||
FlushableRegionStateSnafu, InvalidPartitionExprSnafu, RegionNotFoundSnafu, RegionStateSnafu,
|
||||
RegionTruncatedSnafu, Result, UpdateManifestSnafu,
|
||||
FlushableRegionStateSnafu, RegionNotFoundSnafu, RegionStateSnafu, RegionTruncatedSnafu, Result,
|
||||
UpdateManifestSnafu,
|
||||
};
|
||||
use crate::manifest::action::{
|
||||
RegionChange, RegionManifest, RegionMetaAction, RegionMetaActionList,
|
||||
@@ -726,28 +725,6 @@ impl MitoRegion {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the partition expression string for this region.
|
||||
///
|
||||
/// If the region is currently in staging state, this returns the partition expression held in
|
||||
/// the staging partition field. Otherwise, it returns the partition expression from the primary
|
||||
/// region metadata (current committed version).
|
||||
pub fn maybe_staging_partition_expr_str(&self) -> Option<String> {
|
||||
let is_staging = self.is_staging();
|
||||
if is_staging {
|
||||
let staging_partition_expr = self.staging_partition_expr.lock().unwrap();
|
||||
if staging_partition_expr.is_none() {
|
||||
warn!(
|
||||
"Staging partition expr is none for region {} in staging state",
|
||||
self.region_id
|
||||
);
|
||||
}
|
||||
staging_partition_expr.clone()
|
||||
} else {
|
||||
let version = self.version();
|
||||
version.metadata.partition_expr.clone()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Context to update the region manifest.
|
||||
@@ -1294,19 +1271,6 @@ impl ManifestStats {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses the partition expression from a JSON string.
|
||||
pub fn parse_partition_expr(partition_expr_str: Option<&str>) -> Result<Option<PartitionExpr>> {
|
||||
match partition_expr_str {
|
||||
None => Ok(None),
|
||||
Some("") => Ok(None),
|
||||
Some(json_str) => {
|
||||
let expr = partition::expr::PartitionExpr::from_json_str(json_str)
|
||||
.with_context(|_| InvalidPartitionExprSnafu { expr: json_str })?;
|
||||
Ok(expr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::atomic::AtomicU64;
|
||||
|
||||
@@ -312,14 +312,6 @@ impl FileMeta {
|
||||
!self.available_indexes.is_empty()
|
||||
}
|
||||
|
||||
pub fn index_version(&self) -> Option<IndexVersion> {
|
||||
if self.exists_index() {
|
||||
Some(self.index_version)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether the index file is up-to-date comparing to another file meta.
|
||||
pub fn is_index_up_to_date(&self, other: &FileMeta) -> bool {
|
||||
self.exists_index() && other.exists_index() && self.index_version >= other.index_version
|
||||
|
||||
@@ -178,8 +178,8 @@ impl FilePurger for ObjectStoreFilePurger {
|
||||
// if not on local file system, instead inform the global file purger to remove the file reference.
|
||||
// notice that no matter whether the file is deleted or not, we need to remove the reference
|
||||
// because the file is no longer in use nonetheless.
|
||||
// for same reason, we don't care about index_outdated here.
|
||||
self.file_ref_manager.remove_file(&file_meta);
|
||||
// TODO(discord9): consider impl a .tombstone file to reduce files needed to list
|
||||
}
|
||||
|
||||
fn new_file(&self, file_meta: &FileMeta) {
|
||||
|
||||
@@ -91,7 +91,7 @@ impl FileReferenceManager {
|
||||
// get from in memory file handles
|
||||
for region_id in query_regions.iter().map(|r| r.region_id()) {
|
||||
if let Some(files) = self.ref_file_set(region_id) {
|
||||
ref_files.insert(region_id, files);
|
||||
ref_files.insert(region_id, files.into_iter().map(|f| f.file_id).collect());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,17 +108,10 @@ impl FileReferenceManager {
|
||||
let manifest = related_region.manifest_ctx.manifest().await;
|
||||
for meta in manifest.files.values() {
|
||||
if queries.contains(&meta.region_id) {
|
||||
// since gc couldn't happen together with repartition
|
||||
// (both the queries and related_region acquire region read lock), no need to worry about
|
||||
// staging manifest in repartition here.
|
||||
ref_files
|
||||
.entry(meta.region_id)
|
||||
.or_insert_with(HashSet::new)
|
||||
.insert(FileRef::new(
|
||||
meta.region_id,
|
||||
meta.file_id,
|
||||
meta.index_version(),
|
||||
));
|
||||
.insert(meta.file_id);
|
||||
}
|
||||
}
|
||||
// not sure if related region's manifest version is needed, but record it for now.
|
||||
@@ -139,11 +132,7 @@ impl FileReferenceManager {
|
||||
let region_id = file_meta.region_id;
|
||||
let mut is_new = false;
|
||||
{
|
||||
let file_ref = FileRef::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
file_meta.index_version(),
|
||||
);
|
||||
let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id);
|
||||
self.files_per_region
|
||||
.entry(region_id)
|
||||
.and_modify(|refs| {
|
||||
@@ -168,7 +157,7 @@ impl FileReferenceManager {
|
||||
/// If the reference count reaches zero, the file reference will be removed from the manager.
|
||||
pub fn remove_file(&self, file_meta: &FileMeta) {
|
||||
let region_id = file_meta.region_id;
|
||||
let file_ref = FileRef::new(region_id, file_meta.file_id, file_meta.index_version());
|
||||
let file_ref = FileRef::new(region_id, file_meta.file_id);
|
||||
|
||||
let mut remove_table_entry = false;
|
||||
let mut remove_file_ref = false;
|
||||
@@ -258,23 +247,13 @@ mod tests {
|
||||
.get(&file_meta.region_id)
|
||||
.unwrap()
|
||||
.files,
|
||||
HashMap::from_iter([(
|
||||
FileRef::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
file_meta.index_version()
|
||||
),
|
||||
1
|
||||
)])
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)])
|
||||
);
|
||||
|
||||
file_ref_mgr.add_file(&file_meta);
|
||||
|
||||
let expected_region_ref_manifest = HashSet::from_iter([FileRef::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
file_meta.index_version(),
|
||||
)]);
|
||||
let expected_region_ref_manifest =
|
||||
HashSet::from_iter([FileRef::new(file_meta.region_id, file_meta.file_id)]);
|
||||
|
||||
assert_eq!(
|
||||
file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(),
|
||||
@@ -287,14 +266,7 @@ mod tests {
|
||||
.get(&file_meta.region_id)
|
||||
.unwrap()
|
||||
.files,
|
||||
HashMap::from_iter([(
|
||||
FileRef::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
file_meta.index_version()
|
||||
),
|
||||
2
|
||||
)])
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 2)])
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
@@ -310,14 +282,7 @@ mod tests {
|
||||
.get(&file_meta.region_id)
|
||||
.unwrap()
|
||||
.files,
|
||||
HashMap::from_iter([(
|
||||
FileRef::new(
|
||||
file_meta.region_id,
|
||||
file_meta.file_id,
|
||||
file_meta.index_version()
|
||||
),
|
||||
1
|
||||
)])
|
||||
HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)])
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
|
||||
@@ -19,7 +19,6 @@ use store_api::path_utils::region_name;
|
||||
use store_api::region_request::PathType;
|
||||
use store_api::storage::{FileId, RegionId};
|
||||
|
||||
use crate::cache::file_cache::FileType;
|
||||
use crate::error::UnexpectedSnafu;
|
||||
use crate::sst::file::{RegionFileId, RegionIndexId};
|
||||
|
@@ -111,29 +110,31 @@ pub fn parse_index_file_info(filepath: &str) -> crate::error::Result<(FileId, u6
}
}

pub fn parse_file_id_type_from_path(filepath: &str) -> crate::error::Result<(FileId, FileType)> {
/// Get RegionFileId from sst or index filename
pub fn parse_file_id_from_path(filepath: &str) -> crate::error::Result<FileId> {
let filename = filepath.rsplit('/').next().context(UnexpectedSnafu {
reason: format!("invalid file path: {}", filepath),
})?;
// get part before first '.'
let parts: Vec<&str> = filename.split('.').collect();
if parts.len() < 2 {
if parts.len() != 2 {
return UnexpectedSnafu {
reason: format!("invalid file name: {}", filename),
}
.fail();
}
if parts[1] != "parquet" && parts[1] != "puffin" {
return UnexpectedSnafu {
reason: format!("invalid file extension: {}", parts[1]),
}
.fail();
}
let file_id = parts[0];
let file_id = FileId::parse_str(file_id).map_err(|e| {
FileId::parse_str(file_id).map_err(|e| {
UnexpectedSnafu {
reason: format!("invalid file id: {}, err: {}", file_id, e),
}
.build()
})?;
let file_type = FileType::parse(parts[1..].join(".").as_str()).context(UnexpectedSnafu {
reason: format!("invalid file type in file name: {}", filename),
})?;
Ok((file_id, file_type))
})
}
||||
#[cfg(test)]
|
||||
@@ -219,62 +220,4 @@ mod tests {
|
||||
assert_eq!(result.0.to_string(), file_id.to_string());
|
||||
assert_eq!(result.1, 42);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_file_id_type_from_path() {
|
||||
use crate::cache::file_cache::FileType;
|
||||
|
||||
// Test parquet file
|
||||
let file_id = FileId::random();
|
||||
let path = format!("table_dir/1_0000000002/data/{}.parquet", file_id);
|
||||
let result = parse_file_id_type_from_path(&path).unwrap();
|
||||
assert_eq!(result.0.to_string(), file_id.to_string());
|
||||
assert_eq!(result.1, FileType::Parquet);
|
||||
|
||||
// Test puffin file (legacy format, version 0)
|
||||
let file_id = FileId::random();
|
||||
let path = format!("table_dir/1_0000000002/index/{}.puffin", file_id);
|
||||
let result = parse_file_id_type_from_path(&path).unwrap();
|
||||
assert_eq!(result.0.to_string(), file_id.to_string());
|
||||
assert_eq!(result.1, FileType::Puffin(0));
|
||||
|
||||
// Test versioned puffin file
|
||||
let file_id = FileId::random();
|
||||
let path = format!("table_dir/1_0000000002/index/{}.1.puffin", file_id);
|
||||
let result = parse_file_id_type_from_path(&path).unwrap();
|
||||
assert_eq!(result.0.to_string(), file_id.to_string());
|
||||
assert_eq!(result.1, FileType::Puffin(1));
|
||||
|
||||
// Test with different path types
|
||||
let file_id = FileId::random();
|
||||
let path = format!("table_dir/1_0000000002/metadata/{}.parquet", file_id);
|
||||
let result = parse_file_id_type_from_path(&path).unwrap();
|
||||
assert_eq!(result.0.to_string(), file_id.to_string());
|
||||
assert_eq!(result.1, FileType::Parquet);
|
||||
|
||||
// Test with bare path type
|
||||
let file_id = FileId::random();
|
||||
let path = format!("table_dir/1_0000000002/{}.parquet", file_id);
|
||||
let result = parse_file_id_type_from_path(&path).unwrap();
|
||||
assert_eq!(result.0.to_string(), file_id.to_string());
|
||||
assert_eq!(result.1, FileType::Parquet);
|
||||
|
||||
// Test error cases
|
||||
// Invalid file extension
|
||||
let result = parse_file_id_type_from_path("table_dir/1_0000000002/data/test.invalid");
|
||||
assert!(result.is_err());
|
||||
|
||||
// Invalid file ID
|
||||
let result =
|
||||
parse_file_id_type_from_path("table_dir/1_0000000002/data/invalid-file-id.parquet");
|
||||
assert!(result.is_err());
|
||||
|
||||
// No file extension
|
||||
let result = parse_file_id_type_from_path("table_dir/1_0000000002/data/test");
|
||||
assert!(result.is_err());
|
||||
|
||||
// Empty filename
|
||||
let result = parse_file_id_type_from_path("table_dir/1_0000000002/data/");
|
||||
assert!(result.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,19 +21,15 @@ use std::sync::Arc;
|
||||
|
||||
use api::v1::{OpType, SemanticType};
|
||||
use common_telemetry::error;
|
||||
use datafusion::physical_plan::expressions::DynamicFilterPhysicalExpr;
|
||||
use datatypes::arrow::array::{ArrayRef, BooleanArray};
|
||||
use datatypes::arrow::buffer::BooleanBuffer;
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::schema::Schema;
|
||||
use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec};
|
||||
use parquet::arrow::arrow_reader::RowSelection;
|
||||
use parquet::file::metadata::ParquetMetaData;
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::codec::PrimaryKeyEncoding;
|
||||
use store_api::metadata::RegionMetadataRef;
|
||||
use store_api::storage::{ColumnId, TimeSeriesRowSelector};
|
||||
use table::predicate::Predicate;
|
||||
|
||||
use crate::error::{
|
||||
ComputeArrowSnafu, DataTypeMismatchSnafu, DecodeSnafu, DecodeStatsSnafu, RecordBatchSnafu,
|
||||
@@ -50,7 +46,6 @@ use crate::sst::parquet::reader::{
|
||||
FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext,
|
||||
};
|
||||
use crate::sst::parquet::row_group::ParquetFetchMetrics;
|
||||
use crate::sst::parquet::stats::RowGroupPruningStats;
|
||||
|
||||
/// Checks if a row group contains delete operations by examining the min value of op_type column.
|
||||
///
|
||||
@@ -119,62 +114,12 @@ impl FileRange {
|
||||
row_selection.row_count() == rows_in_group as usize
|
||||
}
|
||||
|
||||
/// Performs pruning before reading the [FileRange].
|
||||
/// It use latest dynamic filters with row group statistics to prune the range.
|
||||
///
|
||||
/// Returns false if the entire range is pruned and can be skipped.
|
||||
fn in_dynamic_filter_range(&self) -> bool {
|
||||
if self.context.base.dyn_filters.is_empty() {
|
||||
return true;
|
||||
}
|
||||
let curr_row_group = self
|
||||
.context
|
||||
.reader_builder
|
||||
.parquet_metadata()
|
||||
.row_group(self.row_group_idx);
|
||||
let read_format = self.context.read_format();
|
||||
let prune_schema = &self.context.base.prune_schema;
|
||||
let stats = RowGroupPruningStats::new(
|
||||
std::slice::from_ref(curr_row_group),
|
||||
read_format,
|
||||
self.context.base.expected_metadata.clone(),
|
||||
self.compute_skip_fields(),
|
||||
);
|
||||
|
||||
// not costly to create a predicate here since dynamic filters are wrapped in Arc
|
||||
let pred = Predicate::new(vec![]).with_dyn_filters(self.context.base.dyn_filters.clone());
|
||||
|
||||
pred.prune_with_stats(&stats, prune_schema.arrow_schema())
|
||||
.first()
|
||||
.cloned()
|
||||
.unwrap_or(true) // unexpected, not skip just in case
|
||||
}
|
||||
|
||||
fn compute_skip_fields(&self) -> bool {
|
||||
match self.context.base.pre_filter_mode {
|
||||
PreFilterMode::All => false,
|
||||
PreFilterMode::SkipFields => true,
|
||||
PreFilterMode::SkipFieldsOnDelete => {
|
||||
// Check if this specific row group contains delete op
|
||||
row_group_contains_delete(
|
||||
self.context.reader_builder.parquet_metadata(),
|
||||
self.row_group_idx,
|
||||
self.context.reader_builder.file_path(),
|
||||
)
|
||||
.unwrap_or(true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a reader to read the [FileRange].
|
||||
pub(crate) async fn reader(
|
||||
&self,
|
||||
selector: Option<TimeSeriesRowSelector>,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<Option<PruneReader>> {
|
||||
if !self.in_dynamic_filter_range() {
|
||||
return Ok(None);
|
||||
}
|
||||
) -> Result<PruneReader> {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder
|
||||
@@ -225,17 +170,14 @@ impl FileRange {
|
||||
)
|
||||
};
|
||||
|
||||
Ok(Some(prune_reader))
|
||||
Ok(prune_reader)
|
||||
}
|
||||
|
||||
/// Creates a flat reader that returns RecordBatch.
|
||||
pub(crate) async fn flat_reader(
|
||||
&self,
|
||||
fetch_metrics: Option<&ParquetFetchMetrics>,
|
||||
) -> Result<Option<FlatPruneReader>> {
|
||||
if !self.in_dynamic_filter_range() {
|
||||
return Ok(None);
|
||||
}
|
||||
) -> Result<FlatPruneReader> {
|
||||
let parquet_reader = self
|
||||
.context
|
||||
.reader_builder
|
||||
@@ -256,7 +198,7 @@ impl FileRange {
|
||||
skip_fields,
|
||||
);
|
||||
|
||||
Ok(Some(flat_prune_reader))
|
||||
Ok(flat_prune_reader)
|
||||
}
|
||||
|
||||
/// Returns the helper to compat batches.
|
||||
@@ -282,10 +224,22 @@ pub(crate) type FileRangeContextRef = Arc<FileRangeContext>;
|
||||
|
||||
impl FileRangeContext {
|
||||
/// Creates a new [FileRangeContext].
|
||||
pub(crate) fn new(reader_builder: RowGroupReaderBuilder, base: RangeBase) -> Self {
|
||||
pub(crate) fn new(
|
||||
reader_builder: RowGroupReaderBuilder,
|
||||
filters: Vec<SimpleFilterContext>,
|
||||
read_format: ReadFormat,
|
||||
codec: Arc<dyn PrimaryKeyCodec>,
|
||||
pre_filter_mode: PreFilterMode,
|
||||
) -> Self {
|
||||
Self {
|
||||
reader_builder,
|
||||
base,
|
||||
base: RangeBase {
|
||||
filters,
|
||||
read_format,
|
||||
codec,
|
||||
compat_batch: None,
|
||||
pre_filter_mode,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -369,13 +323,8 @@ pub enum PreFilterMode {
|
||||
pub(crate) struct RangeBase {
|
||||
/// Filters pushed down.
|
||||
pub(crate) filters: Vec<SimpleFilterContext>,
|
||||
/// Dynamic filter physical exprs.
|
||||
pub(crate) dyn_filters: Arc<Vec<DynamicFilterPhysicalExpr>>,
|
||||
/// Helper to read the SST.
|
||||
pub(crate) read_format: ReadFormat,
|
||||
pub(crate) expected_metadata: Option<RegionMetadataRef>,
|
||||
/// Schema used for pruning with dynamic filters.
|
||||
pub(crate) prune_schema: Arc<Schema>,
|
||||
/// Decoder for primary keys
|
||||
pub(crate) codec: Arc<dyn PrimaryKeyCodec>,
|
||||
/// Optional helper to compat batches.
|
||||
|
||||
@@ -62,7 +62,7 @@ use crate::sst::index::inverted_index::applier::{
|
||||
InvertedIndexApplierRef, InvertedIndexApplyMetrics,
|
||||
};
|
||||
use crate::sst::parquet::file_range::{
|
||||
FileRangeContext, FileRangeContextRef, PreFilterMode, RangeBase, row_group_contains_delete,
|
||||
FileRangeContext, FileRangeContextRef, PreFilterMode, row_group_contains_delete,
|
||||
};
|
||||
use crate::sst::parquet::format::{ReadFormat, need_override_sequence};
|
||||
use crate::sst::parquet::metadata::MetadataLoader;
|
||||
@@ -342,12 +342,6 @@ impl ParquetReaderBuilder {
|
||||
);
|
||||
}
|
||||
|
||||
let prune_schema = self
|
||||
.expected_metadata
|
||||
.as_ref()
|
||||
.map(|meta| meta.schema.clone())
|
||||
.unwrap_or_else(|| region_meta.schema.clone());
|
||||
|
||||
let reader_builder = RowGroupReaderBuilder {
|
||||
file_handle: self.file_handle.clone(),
|
||||
file_path,
|
||||
@@ -374,26 +368,14 @@ impl ParquetReaderBuilder {
|
||||
vec![]
|
||||
};
|
||||
|
||||
let dyn_filters = if let Some(predicate) = &self.predicate {
|
||||
predicate.dyn_filters().clone()
|
||||
} else {
|
||||
Arc::new(vec![])
|
||||
};
|
||||
|
||||
let codec = build_primary_key_codec(read_format.metadata());
|
||||
|
||||
let context = FileRangeContext::new(
|
||||
reader_builder,
|
||||
RangeBase {
|
||||
filters,
|
||||
dyn_filters,
|
||||
read_format,
|
||||
expected_metadata: self.expected_metadata.clone(),
|
||||
prune_schema,
|
||||
codec,
|
||||
compat_batch: None,
|
||||
pre_filter_mode: self.pre_filter_mode,
|
||||
},
|
||||
filters,
|
||||
read_format,
|
||||
codec,
|
||||
self.pre_filter_mode,
|
||||
);
|
||||
|
||||
metrics.build_cost += start.elapsed();
|
||||
|
||||
@@ -114,7 +114,13 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
info!("Flush region: {} before alteration", region_id);
|
||||
|
||||
// Try to submit a flush task.
|
||||
let task = self.new_flush_task(®ion, FlushReason::Alter, None, self.config.clone());
|
||||
let task = self.new_flush_task(
|
||||
®ion,
|
||||
FlushReason::Alter,
|
||||
None,
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
if let Err(e) =
|
||||
self.flush_scheduler
|
||||
.schedule_flush(region.region_id, ®ion.version_control, task)
|
||||
|
||||
@@ -70,6 +70,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
FlushReason::EnterStaging,
|
||||
None,
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
if let Err(e) =
|
||||
self.flush_scheduler
|
||||
|
||||
@@ -80,8 +80,13 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
|
||||
if region.last_flush_millis() < min_last_flush_time {
|
||||
// If flush time of this region is earlier than `min_last_flush_time`, we can flush this region.
|
||||
let task =
|
||||
self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone());
|
||||
let task = self.new_flush_task(
|
||||
region,
|
||||
FlushReason::EngineFull,
|
||||
None,
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
self.flush_scheduler.schedule_flush(
|
||||
region.region_id,
|
||||
®ion.version_control,
|
||||
@@ -119,8 +124,13 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
// Stop flushing regions if memory usage is already below the flush limit
|
||||
break;
|
||||
}
|
||||
let task =
|
||||
self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone());
|
||||
let task = self.new_flush_task(
|
||||
region,
|
||||
FlushReason::EngineFull,
|
||||
None,
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
debug!("Scheduling flush task for region {}", region.region_id);
|
||||
// Schedule a flush task for the current region
|
||||
self.flush_scheduler
|
||||
@@ -139,6 +149,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
reason: FlushReason,
|
||||
row_group_size: Option<usize>,
|
||||
engine_config: Arc<MitoConfig>,
|
||||
is_staging: bool,
|
||||
) -> RegionFlushTask {
|
||||
RegionFlushTask {
|
||||
region_id: region.region_id,
|
||||
@@ -153,8 +164,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
manifest_ctx: region.manifest_ctx.clone(),
|
||||
index_options: region.version().options.index_options.clone(),
|
||||
flush_semaphore: self.flush_semaphore.clone(),
|
||||
is_staging: region.is_staging(),
|
||||
partition_expr: region.maybe_staging_partition_expr_str(),
|
||||
is_staging,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -180,8 +190,14 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
} else {
|
||||
FlushReason::Manual
|
||||
};
|
||||
let mut task =
|
||||
self.new_flush_task(®ion, reason, request.row_group_size, self.config.clone());
|
||||
|
||||
let mut task = self.new_flush_task(
|
||||
®ion,
|
||||
reason,
|
||||
request.row_group_size,
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
task.push_sender(sender);
|
||||
if let Err(e) =
|
||||
self.flush_scheduler
|
||||
@@ -211,6 +227,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
FlushReason::Periodically,
|
||||
None,
|
||||
self.config.clone(),
|
||||
region.is_staging(),
|
||||
);
|
||||
self.flush_scheduler.schedule_flush(
|
||||
region.region_id,
|
||||
|
||||
@@ -410,7 +410,8 @@ fn sql_value_to_value(
|
||||
})?
|
||||
} else {
|
||||
common_sql::convert::sql_value_to_value(
|
||||
column_schema,
|
||||
column,
|
||||
&column_schema.data_type,
|
||||
sql_val,
|
||||
timezone,
|
||||
None,
|
||||
|
||||
@@ -52,7 +52,6 @@ use common_time::Timestamp;
|
||||
use common_time::range::TimestampRange;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::ColumnSchema;
|
||||
use humantime::format_duration;
|
||||
use itertools::Itertools;
|
||||
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
|
||||
@@ -645,20 +644,11 @@ impl StatementExecutor {
|
||||
})?
|
||||
.unit();
|
||||
|
||||
let start_column = ColumnSchema::new(
|
||||
"range_start",
|
||||
ConcreteDataType::timestamp_datatype(time_unit),
|
||||
false,
|
||||
);
|
||||
let end_column = ColumnSchema::new(
|
||||
"range_end",
|
||||
ConcreteDataType::timestamp_datatype(time_unit),
|
||||
false,
|
||||
);
|
||||
let mut time_ranges = Vec::with_capacity(sql_values_time_range.len());
|
||||
for (start, end) in sql_values_time_range {
|
||||
let start = common_sql::convert::sql_value_to_value(
|
||||
&start_column,
|
||||
"range_start",
|
||||
&ConcreteDataType::timestamp_datatype(time_unit),
|
||||
start,
|
||||
Some(&query_ctx.timezone()),
|
||||
None,
|
||||
@@ -677,7 +667,8 @@ impl StatementExecutor {
|
||||
})?;
|
||||
|
||||
let end = common_sql::convert::sql_value_to_value(
|
||||
&end_column,
|
||||
"range_end",
|
||||
&ConcreteDataType::timestamp_datatype(time_unit),
|
||||
end,
|
||||
Some(&query_ctx.timezone()),
|
||||
None,
|
||||
|
||||
@@ -242,12 +242,8 @@ fn values_to_vectors_by_exact_types(
|
||||
args.iter()
|
||||
.zip(exact_types.iter())
|
||||
.map(|(value, data_type)| {
|
||||
let schema = ColumnSchema::new(
|
||||
DUMMY_COLUMN,
|
||||
ConcreteDataType::from_arrow_type(data_type),
|
||||
true,
|
||||
);
|
||||
let value = sql_value_to_value(&schema, value, tz, None, false)
|
||||
let data_type = &ConcreteDataType::from_arrow_type(data_type);
|
||||
let value = sql_value_to_value(DUMMY_COLUMN, data_type, value, tz, None, false)
|
||||
.context(error::SqlCommonSnafu)?;
|
||||
|
||||
Ok(value_to_vector(value))
|
||||
@@ -264,12 +260,10 @@ fn values_to_vectors_by_valid_types(
|
||||
args.iter()
|
||||
.map(|value| {
|
||||
for data_type in valid_types {
|
||||
let schema = ColumnSchema::new(
|
||||
DUMMY_COLUMN,
|
||||
ConcreteDataType::from_arrow_type(data_type),
|
||||
true,
|
||||
);
|
||||
if let Ok(value) = sql_value_to_value(&schema, value, tz, None, false) {
|
||||
let data_type = &ConcreteDataType::from_arrow_type(data_type);
|
||||
if let Ok(value) =
|
||||
sql_value_to_value(DUMMY_COLUMN, data_type, value, tz, None, false)
|
||||
{
|
||||
return Ok(value_to_vector(value));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ use common_time::{Timestamp, Timezone};
|
||||
use datafusion_common::tree_node::TreeNodeVisitor;
|
||||
use datafusion_expr::LogicalPlan;
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, RawSchema, Schema};
|
||||
use datatypes::schema::{RawSchema, Schema};
|
||||
use datatypes::value::Value;
|
||||
use partition::expr::{Operand, PartitionExpr, RestrictedOp};
|
||||
use partition::multi_dim::MultiDimPartitionRule;
|
||||
@@ -2001,7 +2001,8 @@ fn convert_value(
|
||||
unary_op: Option<UnaryOperator>,
|
||||
) -> Result<Value> {
|
||||
sql_value_to_value(
|
||||
&ColumnSchema::new("<NONAME>", data_type, true),
|
||||
"<NONAME>",
|
||||
&data_type,
|
||||
value,
|
||||
Some(timezone),
|
||||
unary_op,
|
||||
|
||||
File diff suppressed because it is too large
@@ -41,7 +41,6 @@ common-frontend.workspace = true
|
||||
common-grpc.workspace = true
|
||||
common-macro.workspace = true
|
||||
common-mem-prof = { workspace = true, optional = true }
|
||||
common-memory-manager.workspace = true
|
||||
common-meta.workspace = true
|
||||
common-plugins.workspace = true
|
||||
common-pprof = { workspace = true, optional = true }
|
||||
@@ -88,7 +87,7 @@ operator.workspace = true
|
||||
otel-arrow-rust.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pg_interval = "0.4"
|
||||
pgwire = { version = "0.37", default-features = false, features = [
|
||||
pgwire = { version = "0.36.3", default-features = false, features = [
|
||||
"server-api-ring",
|
||||
"pg-ext-types",
|
||||
] }
|
||||
|
||||
@@ -95,13 +95,6 @@ pub enum Error {
|
||||
error: tonic::transport::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("Request memory limit exceeded"))]
|
||||
MemoryLimitExceeded {
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
source: common_memory_manager::Error,
|
||||
},
|
||||
|
||||
#[snafu(display("{} server is already started", server))]
|
||||
AlreadyStarted {
|
||||
server: String,
|
||||
@@ -792,8 +785,6 @@ impl ErrorExt for Error {
|
||||
Cancelled { .. } => StatusCode::Cancelled,
|
||||
|
||||
Suspended { .. } => StatusCode::Suspended,
|
||||
|
||||
MemoryLimitExceeded { .. } => StatusCode::RateLimited,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -52,6 +52,7 @@ use crate::error::{AlreadyStartedSnafu, InternalSnafu, Result, StartGrpcSnafu, T
|
||||
use crate::metrics::MetricsMiddlewareLayer;
|
||||
use crate::otel_arrow::{HeaderInterceptor, OtelArrowServiceHandler};
|
||||
use crate::query_handler::OpenTelemetryProtocolHandlerRef;
|
||||
use crate::request_limiter::RequestMemoryLimiter;
|
||||
use crate::server::Server;
|
||||
use crate::tls::TlsOption;
|
||||
|
||||
@@ -68,6 +69,8 @@ pub struct GrpcOptions {
|
||||
pub max_recv_message_size: ReadableSize,
|
||||
/// Max gRPC sending(encoding) message size
|
||||
pub max_send_message_size: ReadableSize,
|
||||
/// Maximum total memory for all concurrent gRPC request messages. 0 disables the limit.
|
||||
pub max_total_message_memory: ReadableSize,
|
||||
/// Compression mode in Arrow Flight service.
|
||||
pub flight_compression: FlightCompression,
|
||||
pub runtime_size: usize,
|
||||
@@ -123,6 +126,7 @@ impl GrpcOptions {
|
||||
GrpcServerConfig {
|
||||
max_recv_message_size: self.max_recv_message_size.as_bytes() as usize,
|
||||
max_send_message_size: self.max_send_message_size.as_bytes() as usize,
|
||||
max_total_message_memory: self.max_total_message_memory.as_bytes() as usize,
|
||||
tls: self.tls.clone(),
|
||||
max_connection_age: self.max_connection_age,
|
||||
}
|
||||
@@ -141,6 +145,7 @@ impl Default for GrpcOptions {
|
||||
server_addr: String::new(),
|
||||
max_recv_message_size: DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE,
|
||||
max_send_message_size: DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE,
|
||||
max_total_message_memory: ReadableSize(0),
|
||||
flight_compression: FlightCompression::ArrowIpc,
|
||||
runtime_size: 8,
|
||||
tls: TlsOption::default(),
|
||||
@@ -162,6 +167,7 @@ impl GrpcOptions {
|
||||
server_addr: format!("127.0.0.1:{}", DEFAULT_INTERNAL_GRPC_ADDR_PORT),
|
||||
max_recv_message_size: DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE,
|
||||
max_send_message_size: DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE,
|
||||
max_total_message_memory: ReadableSize(0),
|
||||
flight_compression: FlightCompression::ArrowIpc,
|
||||
runtime_size: 8,
|
||||
tls: TlsOption::default(),
|
||||
@@ -228,6 +234,7 @@ pub struct GrpcServer {
|
||||
bind_addr: Option<SocketAddr>,
|
||||
name: Option<String>,
|
||||
config: GrpcServerConfig,
|
||||
memory_limiter: RequestMemoryLimiter,
|
||||
}
|
||||
|
||||
/// Grpc Server configuration
|
||||
@@ -237,6 +244,8 @@ pub struct GrpcServerConfig {
|
||||
pub max_recv_message_size: usize,
|
||||
// Max gRPC sending(encoding) message size
|
||||
pub max_send_message_size: usize,
|
||||
/// Maximum total memory for all concurrent gRPC request messages. 0 disables the limit.
|
||||
pub max_total_message_memory: usize,
|
||||
pub tls: TlsOption,
|
||||
/// Maximum time that a channel may exist.
|
||||
/// Useful when the server wants to control the reconnection of its clients.
|
||||
@@ -249,6 +258,7 @@ impl Default for GrpcServerConfig {
|
||||
Self {
|
||||
max_recv_message_size: DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE.as_bytes() as usize,
|
||||
max_send_message_size: DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE.as_bytes() as usize,
|
||||
max_total_message_memory: 0,
|
||||
tls: TlsOption::default(),
|
||||
max_connection_age: None,
|
||||
}
|
||||
@@ -288,6 +298,11 @@ impl GrpcServer {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the memory limiter for monitoring current memory usage
|
||||
pub fn memory_limiter(&self) -> &RequestMemoryLimiter {
|
||||
&self.memory_limiter
|
||||
}
|
||||
}
|
||||
|
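Up to this point the hunks add the knob (`max_total_message_memory`, with `0` meaning disabled), an error variant mapped to `StatusCode::RateLimited`, and a `RequestMemoryLimiter` held by the gRPC server; the handler hunks further down acquire each message's encoded size from this limiter and release it when the returned guard drops. The toy types below sketch only that RAII ownership pattern under those assumptions; they are not the crate's `RequestMemoryLimiter` API.

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};

/// Toy limiter tracking total in-flight bytes against a fixed cap.
#[derive(Clone)]
struct ToyLimiter {
    limit: usize,
    used: Arc<AtomicUsize>,
}

struct ToyGuard {
    size: usize,
    used: Arc<AtomicUsize>,
}

impl ToyLimiter {
    fn try_acquire(&self, size: usize) -> Option<ToyGuard> {
        let prev = self.used.fetch_add(size, Ordering::SeqCst);
        if prev + size > self.limit {
            // Over the cap: roll back the reservation and let the caller
            // reject the request (the real server reports a rate-limited
            // style error and bumps a rejection counter).
            self.used.fetch_sub(size, Ordering::SeqCst);
            None
        } else {
            Some(ToyGuard { size, used: self.used.clone() })
        }
    }
}

impl Drop for ToyGuard {
    fn drop(&mut self) {
        // Releasing happens automatically once the request future completes.
        self.used.fetch_sub(self.size, Ordering::SeqCst);
    }
}
```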
||||
pub struct HealthCheckHandler;
|
||||
|
||||
@@ -46,7 +46,7 @@ use crate::grpc::{GrpcServer, GrpcServerConfig};
|
||||
use crate::otel_arrow::{HeaderInterceptor, OtelArrowServiceHandler};
|
||||
use crate::prometheus_handler::PrometheusHandlerRef;
|
||||
use crate::query_handler::OpenTelemetryProtocolHandlerRef;
|
||||
use crate::request_memory_limiter::ServerMemoryLimiter;
|
||||
use crate::request_limiter::RequestMemoryLimiter;
|
||||
use crate::tls::TlsOption;
|
||||
|
||||
/// Add a gRPC service (`service`) to a `builder`([RoutesBuilder]).
|
||||
@@ -92,14 +92,12 @@ pub struct GrpcServerBuilder {
|
||||
HeaderInterceptor,
|
||||
>,
|
||||
>,
|
||||
memory_limiter: ServerMemoryLimiter,
|
||||
memory_limiter: RequestMemoryLimiter,
|
||||
}
|
||||
|
||||
impl GrpcServerBuilder {
|
||||
pub fn new(config: GrpcServerConfig, runtime: Runtime) -> Self {
|
||||
// Create a default unlimited limiter (can be overridden with with_memory_limiter)
|
||||
let memory_limiter = ServerMemoryLimiter::default();
|
||||
|
||||
let memory_limiter = RequestMemoryLimiter::new(config.max_total_message_memory);
|
||||
Self {
|
||||
name: None,
|
||||
config,
|
||||
@@ -111,12 +109,6 @@ impl GrpcServerBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
/// Set a global memory limiter for all server protocols.
|
||||
pub fn with_memory_limiter(mut self, limiter: ServerMemoryLimiter) -> Self {
|
||||
self.memory_limiter = limiter;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn config(&self) -> &GrpcServerConfig {
|
||||
&self.config
|
||||
}
|
||||
@@ -125,7 +117,7 @@ impl GrpcServerBuilder {
|
||||
&self.runtime
|
||||
}
|
||||
|
||||
pub fn memory_limiter(&self) -> &ServerMemoryLimiter {
|
||||
pub fn memory_limiter(&self) -> &RequestMemoryLimiter {
|
||||
&self.memory_limiter
|
||||
}
|
||||
|
||||
@@ -246,6 +238,7 @@ impl GrpcServerBuilder {
|
||||
bind_addr: None,
|
||||
name: self.name,
|
||||
config: self.config,
|
||||
memory_limiter: self.memory_limiter,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,7 +26,8 @@ use tonic::{Request, Response, Status, Streaming};
|
||||
use crate::grpc::greptime_handler::GreptimeRequestHandler;
|
||||
use crate::grpc::{TonicResult, cancellation};
|
||||
use crate::hint_headers;
|
||||
use crate::request_memory_limiter::ServerMemoryLimiter;
|
||||
use crate::metrics::{METRIC_GRPC_MEMORY_USAGE_BYTES, METRIC_GRPC_REQUESTS_REJECTED_TOTAL};
|
||||
use crate::request_limiter::RequestMemoryLimiter;
|
||||
|
||||
pub(crate) struct DatabaseService {
|
||||
handler: GreptimeRequestHandler,
|
||||
@@ -51,12 +52,25 @@ impl GreptimeDatabase for DatabaseService {
|
||||
remote_addr, hints
|
||||
);
|
||||
|
||||
let _guard = if let Some(limiter) = request.extensions().get::<ServerMemoryLimiter>() {
|
||||
let message_size = request.get_ref().encoded_len() as u64;
|
||||
Some(limiter.acquire(message_size).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let _guard = request
|
||||
.extensions()
|
||||
.get::<RequestMemoryLimiter>()
|
||||
.filter(|limiter| limiter.is_enabled())
|
||||
.and_then(|limiter| {
|
||||
let message_size = request.get_ref().encoded_len();
|
||||
limiter
|
||||
.try_acquire(message_size)
|
||||
.map(|guard| {
|
||||
guard.inspect(|g| {
|
||||
METRIC_GRPC_MEMORY_USAGE_BYTES.set(g.current_usage() as i64);
|
||||
})
|
||||
})
|
||||
.inspect_err(|_| {
|
||||
METRIC_GRPC_REQUESTS_REJECTED_TOTAL.inc();
|
||||
})
|
||||
.transpose()
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
let handler = self.handler.clone();
|
||||
let request_future = async move {
|
||||
@@ -105,7 +119,7 @@ impl GreptimeDatabase for DatabaseService {
|
||||
remote_addr, hints
|
||||
);
|
||||
|
||||
let limiter = request.extensions().get::<ServerMemoryLimiter>().cloned();
|
||||
let limiter = request.extensions().get::<RequestMemoryLimiter>().cloned();
|
||||
|
||||
let handler = self.handler.clone();
|
||||
let request_future = async move {
|
||||
@@ -115,12 +129,24 @@ impl GreptimeDatabase for DatabaseService {
|
||||
while let Some(request) = stream.next().await {
|
||||
let request = request?;
|
||||
|
||||
let _guard = if let Some(limiter_ref) = &limiter {
|
||||
let message_size = request.encoded_len() as u64;
|
||||
Some(limiter_ref.acquire(message_size).await?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let _guard = limiter
|
||||
.as_ref()
|
||||
.filter(|limiter| limiter.is_enabled())
|
||||
.and_then(|limiter| {
|
||||
let message_size = request.encoded_len();
|
||||
limiter
|
||||
.try_acquire(message_size)
|
||||
.map(|guard| {
|
||||
guard.inspect(|g| {
|
||||
METRIC_GRPC_MEMORY_USAGE_BYTES.set(g.current_usage() as i64);
})
})
.inspect_err(|_| {
METRIC_GRPC_REQUESTS_REJECTED_TOTAL.inc();
})
.transpose()
})
.transpose()?;
let output = handler.handle_request(request, hints.clone()).await?;
match output.data {
OutputData::AffectedRows(rows) => affected_rows += rows,

@@ -29,7 +29,6 @@ use bytes;
use bytes::Bytes;
use common_grpc::flight::do_put::{DoPutMetadata, DoPutResponse};
use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage};
use common_memory_manager::MemoryGuard;
use common_query::{Output, OutputData};
use common_recordbatch::DfRecordBatch;
use common_telemetry::debug;
@@ -40,7 +39,7 @@ use futures::{Stream, future, ready};
use futures_util::{StreamExt, TryStreamExt};
use prost::Message;
use session::context::{QueryContext, QueryContextRef};
use snafu::{IntoError, ResultExt, ensure};
use snafu::{ResultExt, ensure};
use table::table_name::TableName;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
@@ -50,8 +49,8 @@ use crate::error::{InvalidParameterSnafu, Result, ToJsonSnafu};
pub use crate::grpc::flight::stream::FlightRecordBatchStream;
use crate::grpc::greptime_handler::{GreptimeRequestHandler, get_request_type};
use crate::grpc::{FlightCompression, TonicResult, context_auth};
use crate::request_memory_limiter::ServerMemoryLimiter;
use crate::request_memory_metrics::RequestMemoryMetrics;
use crate::metrics::{METRIC_GRPC_MEMORY_USAGE_BYTES, METRIC_GRPC_REQUESTS_REJECTED_TOTAL};
use crate::request_limiter::{RequestMemoryGuard, RequestMemoryLimiter};
use crate::{error, hint_headers};

pub type TonicStream<T> = Pin<Box<dyn Stream<Item = TonicResult<T>> + Send + 'static>>;
@@ -220,7 +219,7 @@ impl FlightCraft for GreptimeRequestHandler {
) -> TonicResult<Response<TonicStream<PutResult>>> {
let (headers, extensions, stream) = request.into_parts();

let limiter = extensions.get::<ServerMemoryLimiter>().cloned();
let limiter = extensions.get::<RequestMemoryLimiter>().cloned();

let query_ctx = context_auth::create_query_context_from_grpc_metadata(&headers)?;
context_auth::check_auth(self.user_provider.clone(), &headers, query_ctx.clone()).await?;
@@ -261,7 +260,7 @@ pub struct PutRecordBatchRequest {
pub record_batch: DfRecordBatch,
pub schema_bytes: Bytes,
pub flight_data: FlightData,
pub(crate) _guard: Option<MemoryGuard<RequestMemoryMetrics>>,
pub(crate) _guard: Option<RequestMemoryGuard>,
}

impl PutRecordBatchRequest {
@@ -271,24 +270,28 @@ impl PutRecordBatchRequest {
request_id: i64,
schema_bytes: Bytes,
flight_data: FlightData,
limiter: Option<&ServerMemoryLimiter>,
limiter: Option<&RequestMemoryLimiter>,
) -> Result<Self> {
let memory_usage = flight_data.data_body.len()
+ flight_data.app_metadata.len()
+ flight_data.data_header.len();

let _guard = if let Some(limiter) = limiter {
let guard = limiter.try_acquire(memory_usage as u64).ok_or_else(|| {
let inner_err = common_memory_manager::Error::MemoryLimitExceeded {
requested_bytes: memory_usage as u64,
limit_bytes: limiter.limit_bytes(),
};
error::MemoryLimitExceededSnafu.into_error(inner_err)
})?;
Some(guard)
} else {
None
};
let _guard = limiter
.filter(|limiter| limiter.is_enabled())
.map(|limiter| {
limiter
.try_acquire(memory_usage)
.map(|guard| {
guard.inspect(|g| {
METRIC_GRPC_MEMORY_USAGE_BYTES.set(g.current_usage() as i64);
})
})
.inspect_err(|_| {
METRIC_GRPC_REQUESTS_REJECTED_TOTAL.inc();
})
})
.transpose()?
.flatten();

Ok(Self {
table_name,
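The replacement above threads an optional limiter through an `Option`/`Result` chain instead of an explicit `if let`. A minimal standalone sketch of that chaining, with a toy `Limiter` type standing in for `RequestMemoryLimiter` (names and the `String` error here are illustrative only):

```rust
struct Limiter {
    limit: usize,
}
struct Guard;

impl Limiter {
    fn is_enabled(&self) -> bool {
        self.limit > 0
    }

    fn try_acquire(&self, size: usize) -> Result<Option<Guard>, String> {
        if !self.is_enabled() {
            Ok(None) // limiter disabled: nothing to hold
        } else if size <= self.limit {
            Ok(Some(Guard))
        } else {
            Err(format!("{size} bytes exceeds limit {}", self.limit))
        }
    }
}

// Option<&Limiter> -> Option<Result<Option<Guard>>> -> Result<Option<Option<Guard>>> -> Option<Guard>
fn acquire(limiter: Option<&Limiter>, size: usize) -> Result<Option<Guard>, String> {
    Ok(limiter
        .filter(|l| l.is_enabled())
        .map(|l| l.try_acquire(size))
        .transpose()? // propagate a rejection, keep "no limiter" as None
        .flatten())
}

fn main() {
    let limiter = Limiter { limit: 1024 };
    assert!(acquire(Some(&limiter), 512).unwrap().is_some());
    assert!(acquire(Some(&limiter), 4096).is_err());
    assert!(acquire(None, 4096).unwrap().is_none());
}
```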
@@ -305,7 +308,7 @@ pub struct PutRecordBatchRequestStream {
flight_data_stream: Streaming<FlightData>,
catalog: String,
schema_name: String,
limiter: Option<ServerMemoryLimiter>,
limiter: Option<RequestMemoryLimiter>,
// Client now lazily sends schema data so we cannot eagerly wait for it.
// Instead, we need to decode while receiving record batches.
state: StreamState,
@@ -328,7 +331,7 @@ impl PutRecordBatchRequestStream {
flight_data_stream: Streaming<FlightData>,
catalog: String,
schema: String,
limiter: Option<ServerMemoryLimiter>,
limiter: Option<RequestMemoryLimiter>,
) -> TonicResult<Self> {
Ok(Self {
flight_data_stream,
@@ -392,6 +395,7 @@ impl Stream for PutRecordBatchRequestStream {

match poll {
Some(Ok(flight_data)) => {
// Clone limiter once to avoid borrowing issues
let limiter = self.limiter.clone();

match &mut self.state {

@@ -18,15 +18,15 @@ use futures::future::BoxFuture;
use tonic::server::NamedService;
use tower::{Layer, Service};

use crate::request_memory_limiter::ServerMemoryLimiter;
use crate::request_limiter::RequestMemoryLimiter;

#[derive(Clone)]
pub struct MemoryLimiterExtensionLayer {
limiter: ServerMemoryLimiter,
limiter: RequestMemoryLimiter,
}

impl MemoryLimiterExtensionLayer {
pub fn new(limiter: ServerMemoryLimiter) -> Self {
pub fn new(limiter: RequestMemoryLimiter) -> Self {
Self { limiter }
}
}
@@ -45,7 +45,7 @@ impl<S> Layer<S> for MemoryLimiterExtensionLayer {
#[derive(Clone)]
pub struct MemoryLimiterExtensionService<S> {
inner: S,
limiter: ServerMemoryLimiter,
limiter: RequestMemoryLimiter,
}

impl<S: NamedService> NamedService for MemoryLimiterExtensionService<S> {

@@ -83,7 +83,7 @@ use crate::query_handler::{
OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef, PipelineHandlerRef,
PromStoreProtocolHandlerRef,
};
use crate::request_memory_limiter::ServerMemoryLimiter;
use crate::request_limiter::RequestMemoryLimiter;
use crate::server::Server;

pub mod authorize;
@@ -134,7 +134,7 @@ pub struct HttpServer {
router: StdMutex<Router>,
shutdown_tx: Mutex<Option<Sender<()>>>,
user_provider: Option<UserProviderRef>,
memory_limiter: ServerMemoryLimiter,
memory_limiter: RequestMemoryLimiter,

// plugins
plugins: Plugins,
@@ -157,6 +157,9 @@ pub struct HttpOptions {

pub body_limit: ReadableSize,

/// Maximum total memory for all concurrent HTTP request bodies. 0 disables the limit.
pub max_total_body_memory: ReadableSize,

/// Validation mode while decoding Prometheus remote write requests.
pub prom_validation_mode: PromValidationMode,

@@ -201,6 +204,7 @@ impl Default for HttpOptions {
timeout: Duration::from_secs(0),
disable_dashboard: false,
body_limit: DEFAULT_BODY_LIMIT,
max_total_body_memory: ReadableSize(0),
cors_allowed_origins: Vec::new(),
enable_cors: true,
prom_validation_mode: PromValidationMode::Strict,
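For reference, a hedged example of enabling the new option when constructing `HttpOptions` programmatically; the 512 MiB figure is arbitrary, and `ReadableSize::mb` is assumed to be the available constructor in `common_base`:

```rust
// Illustrative only: cap the combined size of all in-flight HTTP request bodies
// at 512 MiB; 0 (the default) keeps the limiter disabled.
fn http_options_with_body_cap() -> HttpOptions {
    HttpOptions {
        max_total_body_memory: ReadableSize::mb(512),
        ..Default::default()
    }
}
```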
@@ -535,12 +539,12 @@ pub struct GreptimeOptionsConfigState {
pub greptime_config_options: String,
}

#[derive(Default)]
pub struct HttpServerBuilder {
options: HttpOptions,
plugins: Plugins,
user_provider: Option<UserProviderRef>,
router: Router,
memory_limiter: ServerMemoryLimiter,
}

impl HttpServerBuilder {
@@ -550,16 +554,9 @@ impl HttpServerBuilder {
plugins: Plugins::default(),
user_provider: None,
router: Router::new(),
memory_limiter: ServerMemoryLimiter::default(),
}
}

/// Set a global memory limiter for all server protocols.
pub fn with_memory_limiter(mut self, limiter: ServerMemoryLimiter) -> Self {
self.memory_limiter = limiter;
self
}

pub fn with_sql_handler(self, sql_handler: ServerSqlQueryHandlerRef) -> Self {
let sql_router = HttpServer::route_sql(ApiState { sql_handler });

@@ -753,6 +750,8 @@ impl HttpServerBuilder {
}

pub fn build(self) -> HttpServer {
let memory_limiter =
RequestMemoryLimiter::new(self.options.max_total_body_memory.as_bytes() as usize);
HttpServer {
options: self.options,
user_provider: self.user_provider,
@@ -760,7 +759,7 @@ impl HttpServerBuilder {
plugins: self.plugins,
router: StdMutex::new(self.router),
bind_addr: None,
memory_limiter: self.memory_limiter,
memory_limiter,
}
}
}

@@ -19,10 +19,11 @@ use axum::middleware::Next;
use axum::response::{IntoResponse, Response};
use http::StatusCode;

use crate::request_memory_limiter::ServerMemoryLimiter;
use crate::metrics::{METRIC_HTTP_MEMORY_USAGE_BYTES, METRIC_HTTP_REQUESTS_REJECTED_TOTAL};
use crate::request_limiter::RequestMemoryLimiter;

pub async fn memory_limit_middleware(
State(limiter): State<ServerMemoryLimiter>,
State(limiter): State<RequestMemoryLimiter>,
req: Request,
next: Next,
) -> Response {
@@ -30,12 +31,15 @@ pub async fn memory_limit_middleware(
.headers()
.get(http::header::CONTENT_LENGTH)
.and_then(|v| v.to_str().ok())
.and_then(|v| v.parse::<u64>().ok())
.and_then(|v| v.parse::<usize>().ok())
.unwrap_or(0);

let _guard = match limiter.acquire(content_length).await {
Ok(guard) => guard,
let _guard = match limiter.try_acquire(content_length) {
Ok(guard) => guard.inspect(|g| {
METRIC_HTTP_MEMORY_USAGE_BYTES.set(g.current_usage() as i64);
}),
Err(e) => {
METRIC_HTTP_REQUESTS_REJECTED_TOTAL.inc();
return (
StatusCode::TOO_MANY_REQUESTS,
format!("Request body memory limit exceeded: {}", e),
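How the middleware is attached is not shown in this hunk. A hypothetical axum-style wiring sketch; the route path and `router_with_memory_limit` are made up, the real `HttpServer` router setup may differ, and `middleware::from_fn_with_state` is assumed to match the axum version in use:

```rust
use axum::{Router, middleware, routing::get};

// Hypothetical wiring: hand the limiter to the middleware as state so every
// request's Content-Length is charged against the shared budget.
fn router_with_memory_limit(limiter: RequestMemoryLimiter) -> Router {
    Router::new()
        .route("/health", get(|| async { "ok" }))
        .layer(middleware::from_fn_with_state(
            limiter,
            memory_limit_middleware,
        ))
}
```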
@@ -50,8 +50,7 @@ pub mod prometheus_handler;
pub mod proto;
pub mod query_handler;
pub mod repeated_field;
pub mod request_memory_limiter;
pub mod request_memory_metrics;
pub mod request_limiter;
mod row_writer;
pub mod server;
pub mod tls;

@@ -299,24 +299,24 @@ lazy_static! {
"servers handle bulk insert elapsed",
).unwrap();

// Unified request memory metrics
/// Current memory in use by all concurrent requests (HTTP, gRPC, Flight).
pub static ref REQUEST_MEMORY_IN_USE: IntGauge = register_int_gauge!(
"greptime_servers_request_memory_in_use_bytes",
"bytes currently reserved for all concurrent request bodies and messages"
pub static ref METRIC_HTTP_MEMORY_USAGE_BYTES: IntGauge = register_int_gauge!(
"greptime_servers_http_memory_usage_bytes",
"current http request memory usage in bytes"
).unwrap();

/// Maximum configured memory for all concurrent requests.
pub static ref REQUEST_MEMORY_LIMIT: IntGauge = register_int_gauge!(
"greptime_servers_request_memory_limit_bytes",
"maximum bytes allowed for all concurrent request bodies and messages"
pub static ref METRIC_HTTP_REQUESTS_REJECTED_TOTAL: IntCounter = register_int_counter!(
"greptime_servers_http_requests_rejected_total",
"total number of http requests rejected due to memory limit"
).unwrap();

/// Total number of rejected requests due to memory exhaustion.
pub static ref REQUEST_MEMORY_REJECTED: IntCounterVec = register_int_counter_vec!(
"greptime_servers_request_memory_rejected_total",
"number of requests rejected due to memory limit",
&["reason"]
pub static ref METRIC_GRPC_MEMORY_USAGE_BYTES: IntGauge = register_int_gauge!(
"greptime_servers_grpc_memory_usage_bytes",
"current grpc request memory usage in bytes"
).unwrap();

pub static ref METRIC_GRPC_REQUESTS_REJECTED_TOTAL: IntCounter = register_int_counter!(
"greptime_servers_grpc_requests_rejected_total",
"total number of grpc requests rejected due to memory limit"
).unwrap();
}

@@ -22,7 +22,6 @@ use common_time::{Date, Timestamp};
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_expr::LogicalPlan;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use datatypes::types::TimestampType;
use datatypes::value::{self, Value};
use itertools::Itertools;
@@ -255,10 +254,9 @@ pub fn convert_value(param: &ParamValue, t: &ConcreteDataType) -> Result<ScalarV
/// Convert an MySQL expression to a scalar value.
/// It automatically handles the conversion of strings to numeric values.
pub fn convert_expr_to_scalar_value(param: &Expr, t: &ConcreteDataType) -> Result<ScalarValue> {
let column_schema = ColumnSchema::new("", t.clone(), true);
match param {
Expr::Value(v) => {
let v = sql_value_to_value(&column_schema, &v.value, None, None, true);
let v = sql_value_to_value("", t, &v.value, None, None, true);
match v {
Ok(v) => v
.try_to_scalar_value(t)
@@ -270,7 +268,7 @@ pub fn convert_expr_to_scalar_value(param: &Expr, t: &ConcreteDataType) -> Resul
}
}
Expr::UnaryOp { op, expr } if let Expr::Value(v) = &**expr => {
let v = sql_value_to_value(&column_schema, &v.value, None, Some(*op), true);
let v = sql_value_to_value("", t, &v.value, None, Some(*op), true);
match v {
Ok(v) => v
.try_to_scalar_value(t)

@@ -28,13 +28,13 @@ fn build_string_data_rows(
schema: Arc<Vec<FieldInfo>>,
rows: Vec<Vec<String>>,
) -> Vec<PgWireResult<DataRow>> {
let mut encoder = DataRowEncoder::new(schema.clone());
rows.iter()
.map(|row| {
let mut encoder = DataRowEncoder::new(schema.clone());
for value in row {
encoder.encode_field(&Some(value))?;
}
Ok(encoder.take_row())
encoder.finish()
})
.collect()
}

@@ -262,26 +262,6 @@ impl QueryParser for DefaultQueryParser {
})
}
}

fn get_parameter_types(&self, _stmt: &Self::Statement) -> PgWireResult<Vec<Type>> {
// we have our own implementation of describes in ExtendedQueryHandler
// so we don't use these methods
Err(PgWireError::ApiError(
"get_parameter_types is not expected to be called".into(),
))
}

fn get_result_schema(
&self,
_stmt: &Self::Statement,
_column_format: Option<&Format>,
) -> PgWireResult<Vec<FieldInfo>> {
// we have our own implementation of describes in ExtendedQueryHandler
// so we don't use these methods
Err(PgWireError::ApiError(
"get_result_schema is not expected to be called".into(),
))
}
}

#[async_trait]

@@ -395,13 +395,13 @@ impl Iterator for RecordBatchRowIterator {
type Item = PgWireResult<DataRow>;

fn next(&mut self) -> Option<Self::Item> {
let mut encoder = DataRowEncoder::new(self.pg_schema.clone());
if self.i < self.record_batch.num_rows() {
let mut encoder = DataRowEncoder::new(self.pg_schema.clone());
if let Err(e) = self.encode_row(self.i, &mut encoder) {
return Some(Err(e));
}
self.i += 1;
Some(Ok(encoder.take_row()))
Some(encoder.finish())
} else {
None
}
src/servers/src/request_limiter.rs (new file, 230 lines)
@@ -0,0 +1,230 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Request memory limiter for controlling total memory usage of concurrent requests.

use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};

use crate::error::{Result, TooManyConcurrentRequestsSnafu};

/// Limiter for total memory usage of concurrent request bodies.
///
/// Tracks the total memory used by all concurrent request bodies
/// and rejects new requests when the limit is reached.
#[derive(Clone, Default)]
pub struct RequestMemoryLimiter {
inner: Option<Arc<LimiterInner>>,
}

struct LimiterInner {
current_usage: AtomicUsize,
max_memory: usize,
}

impl RequestMemoryLimiter {
/// Create a new memory limiter.
///
/// # Arguments
/// * `max_memory` - Maximum total memory for all concurrent request bodies in bytes (0 = unlimited)
pub fn new(max_memory: usize) -> Self {
if max_memory == 0 {
return Self { inner: None };
}

Self {
inner: Some(Arc::new(LimiterInner {
current_usage: AtomicUsize::new(0),
max_memory,
})),
}
}

/// Try to acquire memory for a request of given size.
///
/// Returns `Ok(RequestMemoryGuard)` if memory was acquired successfully.
/// Returns `Err` if the memory limit would be exceeded.
pub fn try_acquire(&self, request_size: usize) -> Result<Option<RequestMemoryGuard>> {
let Some(inner) = self.inner.as_ref() else {
return Ok(None);
};

let mut new_usage = 0;
let result =
inner
.current_usage
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
new_usage = current.saturating_add(request_size);
if new_usage <= inner.max_memory {
Some(new_usage)
} else {
None
}
});

match result {
Ok(_) => Ok(Some(RequestMemoryGuard {
size: request_size,
limiter: Arc::clone(inner),
usage_snapshot: new_usage,
})),
Err(_current) => TooManyConcurrentRequestsSnafu {
limit: inner.max_memory,
request_size,
}
.fail(),
}
}

/// Check if limiter is enabled
pub fn is_enabled(&self) -> bool {
self.inner.is_some()
}

/// Get current memory usage
pub fn current_usage(&self) -> usize {
self.inner
.as_ref()
.map(|inner| inner.current_usage.load(Ordering::Relaxed))
.unwrap_or(0)
}

/// Get max memory limit
pub fn max_memory(&self) -> usize {
self.inner
.as_ref()
.map(|inner| inner.max_memory)
.unwrap_or(0)
}
}

/// RAII guard that releases memory when dropped
pub struct RequestMemoryGuard {
size: usize,
limiter: Arc<LimiterInner>,
usage_snapshot: usize,
}

impl RequestMemoryGuard {
/// Returns the total memory usage snapshot at the time this guard was acquired.
pub fn current_usage(&self) -> usize {
self.usage_snapshot
}
}

impl Drop for RequestMemoryGuard {
fn drop(&mut self) {
self.limiter
.current_usage
.fetch_sub(self.size, Ordering::Release);
}
}

#[cfg(test)]
mod tests {
use tokio::sync::Barrier;

use super::*;

#[test]
fn test_limiter_disabled() {
let limiter = RequestMemoryLimiter::new(0);
assert!(!limiter.is_enabled());
assert!(limiter.try_acquire(1000000).unwrap().is_none());
assert_eq!(limiter.current_usage(), 0);
}

#[test]
fn test_limiter_basic() {
let limiter = RequestMemoryLimiter::new(1000);
assert!(limiter.is_enabled());
assert_eq!(limiter.max_memory(), 1000);
assert_eq!(limiter.current_usage(), 0);

// Acquire 400 bytes
let _guard1 = limiter.try_acquire(400).unwrap();
assert_eq!(limiter.current_usage(), 400);

// Acquire another 500 bytes
let _guard2 = limiter.try_acquire(500).unwrap();
assert_eq!(limiter.current_usage(), 900);

// Try to acquire 200 bytes - should fail (900 + 200 > 1000)
let result = limiter.try_acquire(200);
assert!(result.is_err());
assert_eq!(limiter.current_usage(), 900);

// Drop first guard
drop(_guard1);
assert_eq!(limiter.current_usage(), 500);

// Now we can acquire 200 bytes
let _guard3 = limiter.try_acquire(200).unwrap();
assert_eq!(limiter.current_usage(), 700);
}

#[test]
fn test_limiter_exact_limit() {
let limiter = RequestMemoryLimiter::new(1000);

// Acquire exactly the limit
let _guard = limiter.try_acquire(1000).unwrap();
assert_eq!(limiter.current_usage(), 1000);

// Try to acquire 1 more byte - should fail
let result = limiter.try_acquire(1);
assert!(result.is_err());
}

#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_limiter_concurrent() {
let limiter = RequestMemoryLimiter::new(1000);
let barrier = Arc::new(Barrier::new(11)); // 10 tasks + main
let mut handles = vec![];

// Spawn 10 tasks each trying to acquire 200 bytes
for _ in 0..10 {
let limiter_clone = limiter.clone();
let barrier_clone = barrier.clone();
let handle = tokio::spawn(async move {
barrier_clone.wait().await;
limiter_clone.try_acquire(200)
});
handles.push(handle);
}

// Let all tasks start together
barrier.wait().await;

let mut success_count = 0;
let mut fail_count = 0;
let mut guards = Vec::new();

for handle in handles {
match handle.await.unwrap() {
Ok(Some(guard)) => {
success_count += 1;
guards.push(guard);
}
Err(_) => fail_count += 1,
Ok(None) => unreachable!(),
}
}

// Only 5 tasks should succeed (5 * 200 = 1000)
assert_eq!(success_count, 5);
assert_eq!(fail_count, 5);
drop(guards);
}
}
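A short usage sketch of the new limiter, restating what the tests above already exercise; `handle_body` is illustrative and not part of the file:

```rust
// Illustrative caller: the reservation lives as long as the guard, so hold it
// for the whole lifetime of the request body.
fn handle_body(limiter: &RequestMemoryLimiter, body: &[u8]) -> Result<()> {
    // Err: the shared budget would be exceeded. Ok(None): the limiter is disabled.
    let _guard = limiter.try_acquire(body.len())?;
    // ... decode and process `body` while the reservation is held ...
    Ok(())
} // `_guard` drops here and current_usage() shrinks by body.len()
```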
@@ -1,76 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Unified memory limiter for all server request protocols.

use std::sync::Arc;

use common_memory_manager::{MemoryGuard, MemoryManager, OnExhaustedPolicy, PermitGranularity};
use snafu::ResultExt;

use crate::error::{MemoryLimitExceededSnafu, Result};
use crate::request_memory_metrics::RequestMemoryMetrics;

/// Unified memory limiter for all server request protocols.
///
/// Manages a global memory pool for HTTP requests, gRPC messages, and
/// Arrow Flight batches without distinguishing between them.
#[derive(Clone)]
pub struct ServerMemoryLimiter {
manager: Arc<MemoryManager<RequestMemoryMetrics>>,
policy: OnExhaustedPolicy,
}

impl Default for ServerMemoryLimiter {
/// Creates a limiter with unlimited memory (0 bytes) and default policy.
fn default() -> Self {
Self::new(0, OnExhaustedPolicy::default())
}
}

impl ServerMemoryLimiter {
/// Creates a new unified memory limiter.
///
/// # Arguments
///
/// * `total_bytes` - Maximum total memory for all concurrent requests (0 = unlimited)
/// * `policy` - Policy when memory quota is exhausted
pub fn new(total_bytes: u64, policy: OnExhaustedPolicy) -> Self {
let manager = Arc::new(MemoryManager::with_granularity(
total_bytes,
PermitGranularity::Kilobyte,
RequestMemoryMetrics,
));

Self { manager, policy }
}

/// Acquire memory for a request.
pub async fn acquire(&self, bytes: u64) -> Result<MemoryGuard<RequestMemoryMetrics>> {
self.manager
.acquire_with_policy(bytes, self.policy)
.await
.context(MemoryLimitExceededSnafu)
}

/// Try to acquire memory without waiting.
pub fn try_acquire(&self, bytes: u64) -> Option<MemoryGuard<RequestMemoryMetrics>> {
self.manager.try_acquire(bytes)
}

/// Returns total memory limit in bytes (0 if unlimited).
pub fn limit_bytes(&self) -> u64 {
self.manager.limit_bytes()
}
}
@@ -1,40 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Unified metrics adapter for all server protocols.

use common_memory_manager::MemoryMetrics;

use crate::metrics::{REQUEST_MEMORY_IN_USE, REQUEST_MEMORY_LIMIT, REQUEST_MEMORY_REJECTED};

/// Metrics adapter for unified request memory tracking.
///
/// This adapter tracks memory usage for all server protocols (HTTP, gRPC, Arrow Flight)
/// without distinguishing between them. All requests contribute to the same set of metrics.
#[derive(Clone, Copy, Debug, Default)]
pub struct RequestMemoryMetrics;

impl MemoryMetrics for RequestMemoryMetrics {
fn set_limit(&self, bytes: i64) {
REQUEST_MEMORY_LIMIT.set(bytes);
}

fn set_in_use(&self, bytes: i64) {
REQUEST_MEMORY_IN_USE.set(bytes);
}

fn inc_rejected(&self, reason: &str) {
REQUEST_MEMORY_REJECTED.with_label_values(&[reason]).inc();
}
}
@@ -40,8 +40,4 @@ impl Dialect for GreptimeDbDialect {
fn supports_filter_during_aggregation(&self) -> bool {
true
}

fn supports_struct_literal(&self) -> bool {
true
}
}

@@ -215,13 +215,6 @@ pub enum Error {
location: Location,
},

#[snafu(display("Invalid JSON structure setting, reason: {reason}"))]
InvalidJsonStructureSetting {
reason: String,
#[snafu(implicit)]
location: Location,
},

#[snafu(display("Failed to serialize column default constraint"))]
SerializeColumnDefaultConstraint {
#[snafu(implicit)]
@@ -381,7 +374,6 @@ impl ErrorExt for Error {

InvalidColumnOption { .. }
| InvalidExprAsOptionValue { .. }
| InvalidJsonStructureSetting { .. }
| InvalidDatabaseName { .. }
| InvalidDatabaseOption { .. }
| ColumnTypeMismatch { .. }

@@ -40,17 +40,16 @@ pub(super) fn parse_json_datatype_options(parser: &mut Parser<'_>) -> Result<Opt

#[cfg(test)]
mod tests {
use sqlparser::ast::{DataType, Expr, Ident, StructField};
use sqlparser::ast::DataType;

use crate::dialect::GreptimeDbDialect;
use crate::parser::{ParseOptions, ParserContext};
use crate::statements::OptionMap;
use crate::statements::create::{
Column, JSON_FORMAT_FULL_STRUCTURED, JSON_FORMAT_PARTIAL, JSON_FORMAT_RAW, JSON_OPT_FIELDS,
JSON_OPT_FORMAT, JSON_OPT_UNSTRUCTURED_KEYS,
Column, JSON_FORMAT_FULL_STRUCTURED, JSON_FORMAT_PARTIAL, JSON_FORMAT_RAW, JSON_OPT_FORMAT,
JSON_OPT_UNSTRUCTURED_KEYS,
};
use crate::statements::statement::Statement;
use crate::util::OptionValue;

#[test]
fn test_parse_json_datatype_options() {
@@ -78,42 +77,6 @@ mod tests {

let sql = r#"
CREATE TABLE json_data (
my_json JSON(format = "partial", fields = Struct<i Int, "o.a" String, "o.b" String, `x.y.z` Float64>),
ts TIMESTAMP TIME INDEX,
)"#;
let options = parse(sql).unwrap();
assert_eq!(options.len(), 2);
let option = options.value(JSON_OPT_FIELDS);
let expected = OptionValue::try_new(Expr::Struct {
values: vec![],
fields: vec![
StructField {
field_name: Some(Ident::new("i")),
field_type: DataType::Int(None),
options: None,
},
StructField {
field_name: Some(Ident::with_quote('"', "o.a")),
field_type: DataType::String(None),
options: None,
},
StructField {
field_name: Some(Ident::with_quote('"', "o.b")),
field_type: DataType::String(None),
options: None,
},
StructField {
field_name: Some(Ident::with_quote('`', "x.y.z")),
field_type: DataType::Float64,
options: None,
},
],
})
.ok();
assert_eq!(option, expected.as_ref());

let sql = r#"
CREATE TABLE json_data (
my_json JSON(format = "partial", unstructured_keys = ["k", "foo.bar", "a.b.c"]),
ts TIMESTAMP TIME INDEX,
)"#;

@@ -40,7 +40,6 @@ use api::v1::SemanticType;
use common_sql::default_constraint::parse_column_default_constraint;
use common_time::timezone::Timezone;
use datatypes::extension::json::{JsonExtensionType, JsonMetadata};
use datatypes::json::JsonStructureSettings;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema};
use datatypes::types::json_type::JsonNativeType;
@@ -282,17 +281,8 @@ pub fn sql_data_type_to_concrete_data_type(
}
},
SqlDataType::JSON => {
let format = if let Some(x) = column_extensions.build_json_structure_settings()? {
if let Some(fields) = match x {
JsonStructureSettings::Structured(fields) => fields,
JsonStructureSettings::UnstructuredRaw => None,
JsonStructureSettings::PartialUnstructuredByKey { fields, .. } => fields,
} {
let datatype = &ConcreteDataType::Struct(fields);
JsonFormat::Native(Box::new(datatype.into()))
} else {
JsonFormat::Native(Box::new(JsonNativeType::Null))
}
let format = if column_extensions.json_datatype_options.is_some() {
JsonFormat::Native(Box::new(JsonNativeType::Null))
} else {
JsonFormat::Jsonb
};

@@ -14,30 +14,27 @@

use std::collections::{HashMap, HashSet};
use std::fmt::{Display, Formatter};
use std::sync::Arc;

use common_catalog::consts::FILE_ENGINE;
use datatypes::data_type::ConcreteDataType;
use datatypes::json::JsonStructureSettings;
use datatypes::schema::{
FulltextOptions, SkippingIndexOptions, VectorDistanceMetric, VectorIndexEngineType,
VectorIndexOptions,
};
use datatypes::types::StructType;
use itertools::Itertools;
use serde::Serialize;
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
use sqlparser::ast::{ColumnOptionDef, DataType, Expr, Query};
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::{ColumnDef, Ident, ObjectName, Value as SqlValue};
use crate::error::{
InvalidFlowQuerySnafu, InvalidJsonStructureSettingSnafu, InvalidSqlSnafu, Result,
SetFulltextOptionSnafu, SetSkippingIndexOptionSnafu,
InvalidFlowQuerySnafu, InvalidSqlSnafu, Result, SetFulltextOptionSnafu,
SetSkippingIndexOptionSnafu,
};
use crate::statements::OptionMap;
use crate::statements::statement::Statement;
use crate::statements::tql::Tql;
use crate::statements::{OptionMap, sql_data_type_to_concrete_data_type};
use crate::util::OptionValue;

const LINE_SEP: &str = ",\n";
@@ -47,7 +44,6 @@ pub const VECTOR_OPT_DIM: &str = "dim";

pub const JSON_OPT_UNSTRUCTURED_KEYS: &str = "unstructured_keys";
pub const JSON_OPT_FORMAT: &str = "format";
pub(crate) const JSON_OPT_FIELDS: &str = "fields";
pub const JSON_FORMAT_FULL_STRUCTURED: &str = "structured";
pub const JSON_FORMAT_RAW: &str = "raw";
pub const JSON_FORMAT_PARTIAL: &str = "partial";
@@ -350,51 +346,14 @@ impl ColumnExtensions {
})
.unwrap_or_default();

let fields = if let Some(value) = options.value(JSON_OPT_FIELDS) {
let fields = value
.as_struct_fields()
.context(InvalidJsonStructureSettingSnafu {
reason: format!(r#"expect "{JSON_OPT_FIELDS}" a struct, actual: "{value}""#,),
})?;
let fields = fields
.iter()
.map(|field| {
let name = field.field_name.as_ref().map(|x| x.value.clone()).context(
InvalidJsonStructureSettingSnafu {
reason: format!(r#"missing field name in "{field}""#),
},
)?;
let datatype = sql_data_type_to_concrete_data_type(
&field.field_type,
&Default::default(),
)?;
Ok(datatypes::types::StructField::new(name, datatype, true))
})
.collect::<Result<_>>()?;
Some(StructType::new(Arc::new(fields)))
} else {
None
};

options
.get(JSON_OPT_FORMAT)
.map(|format| match format {
JSON_FORMAT_FULL_STRUCTURED => Ok(JsonStructureSettings::Structured(fields)),
JSON_FORMAT_PARTIAL => {
let fields = fields.map(|fields| {
let mut fields = Arc::unwrap_or_clone(fields.fields());
fields.push(datatypes::types::StructField::new(
JsonStructureSettings::RAW_FIELD.to_string(),
ConcreteDataType::string_datatype(),
true,
));
StructType::new(Arc::new(fields))
});
Ok(JsonStructureSettings::PartialUnstructuredByKey {
fields,
unstructured_keys,
})
}
JSON_FORMAT_FULL_STRUCTURED => Ok(JsonStructureSettings::Structured(None)),
JSON_FORMAT_PARTIAL => Ok(JsonStructureSettings::PartialUnstructuredByKey {
fields: None,
unstructured_keys,
}),
JSON_FORMAT_RAW => Ok(JsonStructureSettings::UnstructuredRaw),
_ => InvalidSqlSnafu {
msg: format!("unknown JSON datatype 'format': {format}"),
Some files were not shown because too many files have changed in this diff.