diff --git a/Cargo.lock b/Cargo.lock index 695f19b072..edb8ce04d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -212,7 +212,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow-schema 57.3.0", "common-base", @@ -933,7 +933,7 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "auth" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -1523,7 +1523,7 @@ dependencies = [ [[package]] name = "cache" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "catalog", "common-error", @@ -1559,7 +1559,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow 57.3.0", @@ -1894,7 +1894,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "cli" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-stream", "async-trait", @@ -1951,7 +1951,7 @@ dependencies = [ [[package]] name = "client" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arc-swap", @@ -1983,7 +1983,7 @@ dependencies = [ "serde_json", "snafu 0.8.6", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "tokio", "tokio-stream", "tonic 0.14.2", @@ -2023,7 +2023,7 @@ dependencies = [ [[package]] name = "cmd" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2155,24 +2155,19 @@ dependencies = [ [[package]] name = "common-base" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "anymap2", "async-trait", "bitvec", "bytes", - "common-error", - "common-macro", "common-test-util", "futures", "lazy_static", - "paste", "pin-project", - "rand 0.9.1", "regex", "serde", - "snafu 0.8.6", "tokio", "toml 0.8.23", "zeroize", @@ -2180,14 +2175,14 @@ 
dependencies = [ [[package]] name = "common-catalog" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "const_format", ] [[package]] name = "common-config" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-base", "common-error", @@ -2203,7 +2198,6 @@ dependencies = [ "object-store", "serde", "serde_json", - "serde_with", "snafu 0.8.6", "temp-env", "tempfile", @@ -2212,7 +2206,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow 57.3.0", "arrow-schema 57.3.0", @@ -2248,7 +2242,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "bigdecimal 0.4.8", "common-error", @@ -2261,7 +2255,7 @@ dependencies = [ [[package]] name = "common-error" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-macro", "http 1.3.1", @@ -2272,7 +2266,7 @@ dependencies = [ [[package]] name = "common-event-recorder" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2295,7 +2289,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2316,7 +2310,7 @@ dependencies = [ [[package]] name = "common-function" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -2379,7 +2373,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "common-runtime", @@ -2396,7 +2390,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-flight", @@ -2431,7 +2425,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "common-base", @@ -2451,7 +2445,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "1.0.0-rc.2" +version = "1.0.0" 
dependencies = [ "greptime-proto", "once_cell", @@ -2462,7 +2456,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "anyhow", "common-error", @@ -2478,7 +2472,7 @@ dependencies = [ [[package]] name = "common-memory-manager" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-error", "common-macro", @@ -2490,7 +2484,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "anymap2", "api", @@ -2561,7 +2555,7 @@ dependencies = [ [[package]] name = "common-options" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-grpc", "humantime-serde", @@ -2571,11 +2565,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "1.0.0-rc.2" +version = "1.0.0" [[package]] name = "common-pprof" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-error", "common-macro", @@ -2586,7 +2580,7 @@ dependencies = [ [[package]] name = "common-procedure" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-stream", @@ -2615,7 +2609,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "common-procedure", @@ -2625,7 +2619,7 @@ dependencies = [ [[package]] name = "common-query" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -2651,7 +2645,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arc-swap", "common-base", @@ -2676,7 +2670,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "clap", @@ -2705,7 +2699,7 @@ dependencies = [ [[package]] name = "common-session" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "serde", "strum 0.27.1", @@ -2713,7 +2707,7 @@ dependencies = [ [[package]] name = "common-sql" 
-version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow-schema 57.3.0", "common-base", @@ -2733,7 +2727,7 @@ dependencies = [ [[package]] name = "common-stat" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-base", "common-runtime", @@ -2748,7 +2742,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "backtrace", "common-base", @@ -2777,7 +2771,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "client", "common-grpc", @@ -2790,7 +2784,7 @@ dependencies = [ [[package]] name = "common-time" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow 57.3.0", "chrono", @@ -2808,7 +2802,7 @@ dependencies = [ [[package]] name = "common-version" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "cargo-manifest", "const_format", @@ -2818,7 +2812,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-base", "common-error", @@ -2841,7 +2835,7 @@ dependencies = [ [[package]] name = "common-workload" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "common-telemetry", "serde", @@ -4203,7 +4197,7 @@ dependencies = [ [[package]] name = "datanode" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-flight", @@ -4271,7 +4265,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arrow 57.3.0", "arrow-array 57.3.0", @@ -4949,7 +4943,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-engine" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -5081,7 +5075,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" [[package]] name = "flow" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow 57.3.0", @@ -5150,7 
+5144,7 @@ dependencies = [ "sql", "store-api", "strum 0.27.1", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tokio", "tonic 0.14.2", @@ -5211,7 +5205,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619" [[package]] name = "frontend" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arc-swap", @@ -6459,7 +6453,7 @@ dependencies = [ [[package]] name = "index" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "asynchronous-codec", @@ -7296,7 +7290,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -7427,7 +7421,7 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "log-query" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "chrono", "common-error", @@ -7439,7 +7433,7 @@ dependencies = [ [[package]] name = "log-store" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-stream", "async-trait", @@ -7730,7 +7724,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -7761,7 +7755,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -7861,7 +7855,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "aquamarine", @@ -7962,7 +7956,7 @@ dependencies = [ [[package]] name = "mito-codec" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "bytes", @@ -7987,7 +7981,7 @@ dependencies = [ [[package]] name = "mito2" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "aquamarine", @@ -8711,7 +8705,7 @@ dependencies = [ [[package]] name = "object-store" -version = 
"1.0.0-rc.2" +version = "1.0.0" dependencies = [ "anyhow", "bytes", @@ -8724,7 +8718,6 @@ dependencies = [ "futures", "humantime-serde", "lazy_static", - "moka", "opendal", "prometheus 0.14.0", "reqwest", @@ -9039,7 +9032,7 @@ dependencies = [ [[package]] name = "operator" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -9099,7 +9092,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tokio", "tokio-util", @@ -9375,7 +9368,7 @@ checksum = "e3c406c9e2aa74554e662d2c2ee11cd3e73756988800be7e6f5eddb16fed4699" [[package]] name = "partition" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "async-trait", @@ -9593,9 +9586,9 @@ dependencies = [ [[package]] name = "pgwire" -version = "0.38.2" +version = "0.38.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1bdf05fc8231cc5024572fe056e3ce34eb6b9b755ba7aba110e1c64119cec3" +checksum = "24bd4e6b1bfddc5c6420dee6602ec80946700b4c31ddcb64ee190ad6d979c210" dependencies = [ "async-trait", "base64 0.22.1", @@ -9731,7 +9724,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -9888,7 +9881,7 @@ dependencies = [ [[package]] name = "plugins" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "auth", "catalog", @@ -10206,7 +10199,7 @@ dependencies = [ [[package]] name = "promql" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "async-trait", @@ -10558,7 +10551,7 @@ dependencies = [ [[package]] name = "puffin" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-compression", "async-trait", @@ -10620,7 +10613,7 @@ dependencies = [ [[package]] name = "query" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -10687,7 +10680,7 @@ dependencies = [ "sql", "sqlparser", 
"store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tokio", "tokio-stream", @@ -11991,7 +11984,7 @@ dependencies = [ [[package]] name = "servers" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -12125,7 +12118,7 @@ dependencies = [ [[package]] name = "session" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "ahash 0.8.12", "api", @@ -12457,7 +12450,7 @@ dependencies = [ [[package]] name = "sql" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-buffer 57.3.0", @@ -12518,7 +12511,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "clap", @@ -12798,7 +12791,7 @@ dependencies = [ [[package]] name = "standalone" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "catalog", @@ -12842,7 +12835,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "store-api" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "aquamarine", @@ -13034,7 +13027,7 @@ dependencies = [ [[package]] name = "substrait" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "async-trait", "bytes", @@ -13156,7 +13149,7 @@ dependencies = [ [[package]] name = "table" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arc-swap", @@ -13426,7 +13419,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" [[package]] name = "tests-fuzz" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "arbitrary", "async-trait", @@ -13470,7 +13463,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "1.0.0-rc.2" +version = "1.0.0" dependencies = [ "api", "arrow-flight", @@ -13547,7 +13540,7 @@ dependencies = [ "sqlx", "standalone", "store-api", - "substrait 1.0.0-rc.2", + "substrait 1.0.0", "table", "tempfile", "time", diff --git a/Cargo.toml b/Cargo.toml index 5041f167c3..227608bf64 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,7 +75,7 @@ members = [ resolver = "2" [workspace.package] -version = "1.0.0-rc.2" +version = "1.0.0" edition = "2024" license = "Apache-2.0" diff --git a/config/config.md b/config/config.md index 4861675217..82297d484e 100644 --- a/config/config.md +++ b/config/config.md @@ -69,6 +69,11 @@ | `prom_store` | -- | -- | Prometheus remote storage options | | `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. | | `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. | +| `prom_store.pending_rows_flush_interval` | String | `0s` | Interval to flush pending rows batcher.
Set to "0s" to disable batching mode in Prometheus Remote Write endpoint | +| `prom_store.max_batch_rows` | Integer | `100000` | Max rows per pending batch before triggering a flush. | +| `prom_store.max_concurrent_flushes` | Integer | `256` | Max number of concurrent batch flushes. | +| `prom_store.worker_channel_capacity` | Integer | `65526` | Capacity of the pending batch worker channel. | +| `prom_store.max_inflight_requests` | Integer | `3000` | Max inflight write requests before backpressure. | | `wal` | -- | -- | The WAL options. | | `wal.provider` | String | `raft_engine` | The provider of the WAL.
- `raft_engine`: the wal is stored in the local file system by raft-engine.
- `kafka`: it's remote wal that data is stored in Kafka. | | `wal.dir` | String | Unset | The directory to store the WAL files.
**It's only used when the provider is `raft_engine`**. | @@ -139,7 +144,7 @@ | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). | | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). | | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). | -| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. | +| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks.
Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").
Setting it to 0 or "unlimited" disables the limit. | | `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.
Options: "wait" (default, 10s), "wait(<duration>)", "fail" | | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. | | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. | @@ -157,13 +162,12 @@ | `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).
When disabled, cache refilling on read won't happen. | | `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | -| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | | `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit. | | `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.
"fail" (default) fails fast and is the recommended option for most users.
"wait" / "wait(<duration>)" waits for memory to become available. This is mainly
for advanced tuning in bursty workloads where temporary contention is common and
higher latency is acceptable.
"wait" means "wait(10s)", not unlimited waiting. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | -| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | +| `region_engine.mito.default_flat_format` | Bool | `true` | Whether to enable flat format as the default SST format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | | `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for
creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.
The default name for this directory is `index_intermediate` for backward compatibility.

This path contains two subdirectories:
- `__intm`: for storing intermediate files used during creating index.
- `staging`: for storing staging files used during searching index. | | `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. | @@ -293,6 +297,11 @@ | `prom_store` | -- | -- | Prometheus remote storage options | | `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. | | `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. | +| `prom_store.pending_rows_flush_interval` | String | `0s` | Interval to flush pending rows batcher.
Set to "0s" to disable batching mode in Prometheus Remote Write endpoint | +| `prom_store.max_batch_rows` | Integer | `100000` | Max rows per pending batch before triggering a flush. | +| `prom_store.max_concurrent_flushes` | Integer | `256` | Max number of concurrent batch flushes. | +| `prom_store.worker_channel_capacity` | Integer | `65526` | Capacity of the pending batch worker channel. | +| `prom_store.max_inflight_requests` | Integer | `3000` | Max inflight write requests before backpressure. | | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | @@ -532,7 +541,7 @@ | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). | | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). | | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). | -| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. | +| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks.
Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").
Setting it to 0 or "unlimited" disables the limit. | | `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.
Options: "wait" (default, 10s), "wait(<duration>)", "fail" | | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. | | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. | @@ -550,13 +559,12 @@ | `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).
When disabled, cache refilling on read won't happen. | | `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | -| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | | `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit. | | `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.
"fail" (default) fails fast and is the recommended option for most users.
"wait" / "wait(<duration>)" waits for memory to become available. This is mainly
for advanced tuning in bursty workloads where temporary contention is common and
higher latency is acceptable.
"wait" means "wait(10s)", not unlimited waiting. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | -| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | +| `region_engine.mito.default_flat_format` | Bool | `true` | Whether to enable flat format as the default SST format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | | `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for
creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.
The default name for this directory is `index_intermediate` for backward compatibility.

This path contains two subdirectories:
- `__intm`: for storing intermediate files used during creating index.
- `staging`: for storing staging files used during searching index. | | `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 833a567d74..6effec4c87 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -446,7 +446,9 @@ compress_manifest = false ## @toml2docs:none-default="Auto" #+ max_background_purges = 8 -## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. +## Memory budget for compaction tasks. +## Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%"). +## Setting it to 0 or "unlimited" disables the limit. ## @toml2docs:none-default="0" #+ experimental_compaction_memory_limit = "0" @@ -520,9 +522,6 @@ manifest_cache_size = "256MB" ## Buffer size for SST writing. sst_write_buffer_size = "8MB" -## Capacity of the channel to send data from parallel scan tasks to the main task. -parallel_scan_channel_size = 32 - ## Maximum number of SST files to scan concurrently. max_concurrent_scan_files = 384 @@ -545,8 +544,8 @@ scan_memory_on_exhausted = "fail" ## To align with the old behavior, the default value is 0 (no restrictions). min_compaction_interval = "0m" -## Whether to enable experimental flat format as the default format. -default_experimental_flat_format = false +## Whether to enable flat format as the default SST format. +default_flat_format = true ## The options for index in Mito engine. [region_engine.mito.index] diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 435504b122..97b5851672 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -214,6 +214,17 @@ enable = true enable = true ## Whether to store the data from Prometheus remote write in metric engine. with_metric_engine = true +## Interval to flush pending rows batcher. 
+## Set to "0s" to disable batching mode in Prometheus Remote Write endpoint +#+pending_rows_flush_interval = "0s" +## Max rows per pending batch before triggering a flush. +#+max_batch_rows = 100000 +## Max number of concurrent batch flushes. +#+max_concurrent_flushes = 256 +## Capacity of the pending batch worker channel. +#+worker_channel_capacity = 65526 +## Max inflight write requests before backpressure. +#+max_inflight_requests = 3000 ## The metasrv client options. [meta_client] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 94c5feebf1..d14bbe63d5 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -173,6 +173,17 @@ enable = true enable = true ## Whether to store the data from Prometheus remote write in metric engine. with_metric_engine = true +## Interval to flush pending rows batcher. +## Set to "0s" to disable batching mode in Prometheus Remote Write endpoint +#+pending_rows_flush_interval = "0s" +## Max rows per pending batch before triggering a flush. +#+max_batch_rows = 100000 +## Max number of concurrent batch flushes. +#+max_concurrent_flushes = 256 +## Capacity of the pending batch worker channel. +#+worker_channel_capacity = 65526 +## Max inflight write requests before backpressure. +#+max_inflight_requests = 3000 ## The WAL options. [wal] @@ -538,7 +549,9 @@ compress_manifest = false ## @toml2docs:none-default="Auto" #+ max_background_purges = 8 -## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. +## Memory budget for compaction tasks. +## Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%"). +## Setting it to 0 or "unlimited" disables the limit. ## @toml2docs:none-default="0" #+ experimental_compaction_memory_limit = "0" @@ -612,9 +625,6 @@ manifest_cache_size = "256MB" ## Buffer size for SST writing. 
sst_write_buffer_size = "8MB" -## Capacity of the channel to send data from parallel scan tasks to the main task. -parallel_scan_channel_size = 32 - ## Maximum number of SST files to scan concurrently. max_concurrent_scan_files = 384 @@ -637,8 +647,8 @@ scan_memory_on_exhausted = "fail" ## To align with the old behavior, the default value is 0 (no restrictions). min_compaction_interval = "0m" -## Whether to enable experimental flat format as the default format. -default_experimental_flat_format = false +## Whether to enable flat format as the default SST format. +default_flat_format = true ## The options for index in Mito engine. [region_engine.mito.index] diff --git a/src/cli/src/data/export_v2/data.rs b/src/cli/src/data/export_v2/data.rs index fe2ec7c051..25d70ee118 100644 --- a/src/cli/src/data/export_v2/data.rs +++ b/src/cli/src/data/export_v2/data.rs @@ -337,6 +337,7 @@ fn mask_secrets(sql: &str, secrets: &[Option]) -> String { #[cfg(test)] mod tests { use common_base::secrets::SecretString; + use common_test_util::temp_dir::create_temp_dir; use super::*; use crate::common::{PrefixedAzblobConnection, PrefixedGcsConnection, PrefixedOssConnection}; @@ -432,9 +433,21 @@ mod tests { #[test] fn test_build_copy_target_decodes_file_uri_path() { let storage = ObjectStoreConfig::default(); - let target = build_copy_target("file:///tmp/my%20backup", &storage, "public", 7) + let snapshot_root = create_temp_dir("my backup"); + let snapshot_uri = Url::from_file_path(snapshot_root.path()) + .expect("absolute platform path should convert to file:// URI") + .to_string(); + let expected = normalize_path(&format!( + "{}/{}", + snapshot_root.path().to_string_lossy(), + data_dir_for_schema_chunk("public", 7) + )); + let target = build_copy_target(&snapshot_uri, &storage, "public", 7) .expect("file:// copy target should be built"); - assert_eq!(target.location, "/tmp/my backup/data/public/7/"); + assert!(snapshot_uri.contains("%20")); + assert!(!target.location.contains("%20")); + 
assert!(target.location.contains("my backup")); + assert_eq!(target.location, expected); } } diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index d547ec6e81..34619f4f1b 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -86,7 +86,6 @@ serde.workspace = true serde_json.workspace = true servers.workspace = true session.workspace = true -similar-asserts.workspace = true snafu.workspace = true sqlparser.workspace = true standalone.workspace = true @@ -113,5 +112,6 @@ common-version.workspace = true file-engine.workspace = true mito2.workspace = true serde.workspace = true +similar-asserts.workspace = true temp-env = "0.3" tempfile.workspace = true diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs index f6d8674d4c..1f3591635f 100644 --- a/src/cmd/src/datanode/objbench.rs +++ b/src/cmd/src/datanode/objbench.rs @@ -211,7 +211,6 @@ impl ObjbenchCommand { object_store.clone(), ) .expected_metadata(Some(region_meta.clone())) - .flat_format(true) .build() .await .map_err(|e| { diff --git a/src/cmd/src/datanode/scanbench.rs b/src/cmd/src/datanode/scanbench.rs index 51064126fe..a93aca430a 100644 --- a/src/cmd/src/datanode/scanbench.rs +++ b/src/cmd/src/datanode/scanbench.rs @@ -102,10 +102,6 @@ pub struct ScanbenchCommand { #[clap(long, value_name = "FILE")] pprof_file: Option, - /// Force reading the region in flat format. - #[clap(long, default_value_t = false)] - force_flat_format: bool, - /// Enable WAL replay when opening the region. 
#[clap(long, default_value_t = false)] enable_wal: bool, @@ -580,12 +576,11 @@ impl ScanbenchCommand { }; println!( - "{} Scanner: {}, Parallelism: {}, Iterations: {}, Force flat format: {}", + "{} Scanner: {}, Parallelism: {}, Iterations: {}", "ℹ".blue(), self.scanner, self.parallelism, self.iterations, - self.force_flat_format, ); // Start profiling if pprof_file is specified (unless pprof_after_warmup is set) @@ -626,7 +621,6 @@ impl ScanbenchCommand { filters: filters.clone(), series_row_selector, distribution, - force_flat_format: self.force_flat_format, ..Default::default() }; diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index 3ec9e1fa35..44c30cd548 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -16,16 +16,11 @@ anymap2 = "0.13" async-trait.workspace = true bitvec = "1.0" bytes.workspace = true -common-error.workspace = true -common-macro.workspace = true futures.workspace = true lazy_static.workspace = true -paste.workspace = true pin-project.workspace = true -rand.workspace = true regex.workspace = true serde = { version = "1.0", features = ["derive"] } -snafu.workspace = true tokio.workspace = true zeroize = { version = "1.6", default-features = false, features = ["alloc"] } diff --git a/src/common/config/Cargo.toml b/src/common/config/Cargo.toml index 2737f82a58..27b238add7 100644 --- a/src/common/config/Cargo.toml +++ b/src/common/config/Cargo.toml @@ -18,7 +18,6 @@ notify.workspace = true object-store.workspace = true serde.workspace = true serde_json.workspace = true -serde_with.workspace = true snafu.workspace = true toml.workspace = true diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index 6ec9a14733..ae81c6ba98 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -28,7 +28,6 @@ common-runtime.workspace = true common-telemetry.workspace = true datafusion.workspace = true datafusion-datasource.workspace = true 
-datafusion-orc.workspace = true datatypes.workspace = true futures.workspace = true lazy_static.workspace = true @@ -47,3 +46,4 @@ url.workspace = true [dev-dependencies] common-test-util.workspace = true +datafusion-orc.workspace = true diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index f5ca9d2c09..8132a1fc4e 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -37,7 +37,6 @@ common-error.workspace = true common-grpc-expr.workspace = true common-macro.workspace = true common-procedure.workspace = true -common-procedure-test.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true @@ -92,6 +91,7 @@ typetag.workspace = true [dev-dependencies] chrono.workspace = true common-procedure = { workspace = true, features = ["testing"] } +common-procedure-test.workspace = true common-test-util.workspace = true common-wal = { workspace = true, features = ["testing"] } datatypes.workspace = true diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 8cda639686..75168b3b9a 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -15,14 +15,17 @@ use std::sync::Arc; use api::helper::ColumnDataTypeWrapper; -use api::v1::{ColumnDataType, RowInsertRequests}; +use api::v1::alter_table_expr::Kind; +use api::v1::{ + AlterTableExpr, ColumnDataType, ModifyColumnType, ModifyColumnTypes, RowInsertRequests, +}; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::Output; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_query::prelude::GREPTIME_PHYSICAL_TABLE; -use common_telemetry::tracing; +use common_telemetry::{tracing, warn}; use itertools::Itertools; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; 
@@ -33,23 +36,26 @@ use servers::http::prom_store::PHYSICAL_TABLE_PARAM; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; use servers::otlp::trace::TraceAuxData; -use servers::otlp::trace::coerce::{ - coerce_value_data, is_supported_trace_coercion, resolve_new_trace_column_type, - trace_value_datatype, -}; +use servers::otlp::trace::coerce::{coerce_value_data, trace_value_datatype}; use servers::otlp::trace::span::{TraceSpan, TraceSpanGroup}; use servers::query_handler::{ OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome, }; use session::context::QueryContextRef; -use snafu::ResultExt; +use snafu::{IntoError, ResultExt}; use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM}; use crate::instance::Instance; +use crate::instance::otlp::trace_types::{ + PendingTraceColumnRewrite, choose_trace_reconcile_decision, enrich_trace_reconcile_error, + is_trace_reconcile_candidate_type, push_observed_trace_type, validate_trace_column_rewrites, +}; use crate::metrics::{ OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS, }; +pub mod trace_types; + const TRACE_INGEST_CHUNK_SIZE: usize = 64; const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4; @@ -546,34 +552,72 @@ impl Instance { Some(summary) } - /// Picks the final datatype for one trace column. - /// - /// Existing table schema is authoritative when present. Otherwise we resolve the - /// request-local observed types using the shared trace coercion rules. - fn choose_trace_target_type( - observed_types: &[ColumnDataType], - existing_type: Option, - ) -> ServerResult> { - let Some(existing_type) = existing_type else { - return resolve_new_trace_column_type(observed_types.iter().copied()).map_err(|_| { - error::InvalidParameterSnafu { - reason: "unsupported trace type mix".to_string(), - } - .build() - }); + /// Widen existing trace table columns to Float64 before request rewrite. 
+ async fn alter_trace_table_columns_to_float64( + &self, + ctx: &QueryContextRef, + table_name: &str, + column_names: &[String], + ) -> ServerResult<()> { + let catalog_name = ctx.current_catalog().to_string(); + let schema_name = ctx.current_schema(); + let alter_expr = AlterTableExpr { + catalog_name: catalog_name.clone(), + schema_name: schema_name.clone(), + table_name: table_name.to_string(), + kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes { + modify_column_types: column_names + .iter() + .map(|column_name| ModifyColumnType { + column_name: column_name.clone(), + target_type: ColumnDataType::Float64 as i32, + target_type_extension: None, + }) + .collect(), + })), }; - if observed_types.iter().copied().all(|request_type| { - request_type == existing_type - || is_supported_trace_coercion(request_type, existing_type) - }) { - Ok(Some(existing_type)) - } else { - error::InvalidParameterSnafu { - reason: "unsupported trace type mix".to_string(), + if let Err(err) = self + .statement_executor + .alter_table_inner(alter_expr, ctx.clone()) + .await + { + let table = self + .catalog_manager + .table(&catalog_name, &schema_name, table_name, None) + .await + .map_err(servers::error::Error::from)?; + let alter_already_applied = table + .map(|table| { + let table_schema = table.schema(); + column_names.iter().all(|column_name| { + table_schema + .column_schema_by_name(column_name) + .and_then(|table_col| { + ColumnDataTypeWrapper::try_from(table_col.data_type.clone()) + .ok() + .map(|wrapper| wrapper.datatype()) + }) + == Some(ColumnDataType::Float64) + }) + }) + .unwrap_or(false); + + if alter_already_applied { + return Ok(()); } - .fail() + + warn!( + table_name, + columns = ?column_names, + error = %err, + "failed to widen trace columns before insert" + ); + + return Err(wrap_trace_alter_failure(err)); } + + Ok(()) } /// Coerce request column types and values to match the existing table schema @@ -598,7 +642,8 @@ impl Instance { }; let table_schema = 
table.map(|table| table.schema()); - let mut pending_coercions = Vec::new(); + let mut pending_rewrites = Vec::new(); + let mut pending_alter_columns = Vec::new(); for (col_idx, col_schema) in rows.schema.iter().enumerate() { let Some(current_type) = ColumnDataType::try_from(col_schema.datatype).ok() else { @@ -647,8 +692,8 @@ impl Instance { // Decide the final type once per column, then rewrite all affected cells // together in one row pass below. - let Some(target_type) = - Self::choose_trace_target_type(&observed_types, existing_type).map_err( + let Some(decision) = + choose_trace_reconcile_decision(&observed_types, existing_type).map_err( |_| { enrich_trace_reconcile_error( &req.table_name, @@ -661,31 +706,54 @@ impl Instance { else { continue; }; + let target_type = decision.target_type(); - if observed_types - .iter() - .all(|observed| *observed == target_type) + if !decision.requires_alter() + && observed_types + .iter() + .all(|observed| *observed == target_type) && col_schema.datatype == target_type as i32 { continue; } - pending_coercions.push((col_idx, target_type, col_schema.column_name.clone())); + if decision.requires_alter() + && !pending_alter_columns.contains(&col_schema.column_name) + { + pending_alter_columns.push(col_schema.column_name.clone()); + } + + pending_rewrites.push(PendingTraceColumnRewrite { + col_idx, + target_type, + column_name: col_schema.column_name.clone(), + }); } - if pending_coercions.is_empty() { + if pending_rewrites.is_empty() { continue; } + validate_trace_column_rewrites(&rows.rows, &pending_rewrites, &req.table_name)?; + + if !pending_alter_columns.is_empty() { + self.alter_trace_table_columns_to_float64( + ctx, + &req.table_name, + &pending_alter_columns, + ) + .await?; + } + // Update schema metadata before mutating row values so both stay in sync. - for (col_idx, target_type, ..) 
in &pending_coercions { - rows.schema[*col_idx].datatype = *target_type as i32; + for pending_rewrite in &pending_rewrites { + rows.schema[pending_rewrite.col_idx].datatype = pending_rewrite.target_type as i32; } // Apply all pending column rewrites in one row pass. for row in &mut rows.rows { - for (col_idx, target_type, column_name) in &pending_coercions { - let Some(value) = row.values.get_mut(*col_idx) else { + for pending_rewrite in &pending_rewrites { + let Some(value) = row.values.get_mut(pending_rewrite.col_idx) else { continue; }; let Some(request_type) = @@ -693,20 +761,23 @@ impl Instance { else { continue; }; - if request_type == *target_type { + if request_type == pending_rewrite.target_type { continue; } value.value_data = coerce_value_data( &value.value_data, - *target_type, + pending_rewrite.target_type, request_type, ) .map_err(|_| { error::InvalidParameterSnafu { reason: format!( "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", - column_name, req.table_name, request_type, target_type + pending_rewrite.column_name, + req.table_name, + request_type, + pending_rewrite.target_type ), } .build() @@ -719,58 +790,21 @@ impl Instance { } } -fn enrich_trace_reconcile_error( - table_name: &str, - column_name: &str, - observed_types: &[ColumnDataType], - existing_type: Option, -) -> servers::error::Error { - let observed_types = observed_types - .iter() - .map(|datatype| format!("{datatype:?}")) - .collect::>() - .join(", "); - - error::InvalidParameterSnafu { - reason: match existing_type { - Some(existing_type) => format!( - "failed to reconcile trace column '{}' in table '{}' with observed types [{}] against existing {:?}", - column_name, table_name, observed_types, existing_type - ), - None => format!( - "failed to reconcile trace column '{}' in table '{}' with observed types [{}]", - column_name, table_name, observed_types - ), - }, - } - .build() -} - -/// Only these trace scalar types participate in reconciliation. 
Other column kinds -/// such as JSON and binary keep their original write path and schema checks. -fn is_trace_reconcile_candidate_type(datatype: ColumnDataType) -> bool { - matches!( - datatype, - ColumnDataType::String - | ColumnDataType::Boolean - | ColumnDataType::Int64 - | ColumnDataType::Float64 - ) -} - -/// Keeps the observed type list small without depending on enum ordering. -fn push_observed_trace_type(observed_types: &mut Vec, datatype: ColumnDataType) { - if !observed_types.contains(&datatype) { - observed_types.push(datatype); - } +/// Preserve the original alter failure status so chunk retry behavior stays correct. +fn wrap_trace_alter_failure(err: E) -> servers::error::Error +where + E: ErrorExt + Send + Sync + 'static, +{ + error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err)) } #[cfg(test)] mod tests { + use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use servers::query_handler::TraceIngestOutcome; - use super::{ChunkFailureReaction, Instance}; + use super::{ChunkFailureReaction, Instance, wrap_trace_alter_failure}; use crate::metrics::OTLP_TRACES_FAILURE_COUNT; #[test] @@ -923,4 +957,18 @@ mod tests { ChunkFailureReaction::DiscardChunk ); } + + #[test] + fn test_wrap_trace_alter_failure_preserves_status_code() { + let err = wrap_trace_alter_failure( + servers::error::TableNotFoundSnafu { + catalog: "greptime".to_string(), + schema: "public".to_string(), + table: "trace_type_missing".to_string(), + } + .build(), + ); + + assert_eq!(err.status_code(), StatusCode::TableNotFound); + } } diff --git a/src/frontend/src/instance/otlp/trace_types.rs b/src/frontend/src/instance/otlp/trace_types.rs new file mode 100644 index 0000000000..0be3df550e --- /dev/null +++ b/src/frontend/src/instance/otlp/trace_types.rs @@ -0,0 +1,308 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::{ColumnDataType, Row}; +use servers::error::{self, Result as ServerResult}; +use servers::otlp::trace::coerce::{ + coerce_value_data, is_supported_trace_coercion, resolve_new_trace_column_type, + trace_value_datatype, +}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) enum TraceReconcileDecision { + UseExisting(ColumnDataType), + UseRequestLocal(ColumnDataType), + AlterExistingTo(ColumnDataType), +} + +impl TraceReconcileDecision { + pub(super) fn target_type(self) -> ColumnDataType { + match self { + Self::UseExisting(target_type) + | Self::UseRequestLocal(target_type) + | Self::AlterExistingTo(target_type) => target_type, + } + } + + pub(super) fn requires_alter(self) -> bool { + matches!(self, Self::AlterExistingTo(_)) + } +} + +pub(super) struct PendingTraceColumnRewrite { + pub(super) col_idx: usize, + pub(super) target_type: ColumnDataType, + pub(super) column_name: String, +} + +/// Picks the reconciliation action for one trace column. +/// +/// Existing table schema is authoritative unless the only incompatible case is +/// widening an existing Int64 column to Float64 for incoming Int64/Float64 data. 
+pub(super) fn choose_trace_reconcile_decision( + observed_types: &[ColumnDataType], + existing_type: Option, +) -> ServerResult> { + let Some(existing_type) = existing_type else { + return resolve_new_trace_column_type(observed_types.iter().copied()) + .map(|target_type| target_type.map(TraceReconcileDecision::UseRequestLocal)) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .build() + }); + }; + + if observed_types.iter().all(|&request_type| { + request_type == existing_type || is_supported_trace_coercion(request_type, existing_type) + }) { + return Ok(Some(TraceReconcileDecision::UseExisting(existing_type))); + } + + if existing_type == ColumnDataType::Int64 + && observed_types.contains(&ColumnDataType::Float64) + && observed_types.iter().all(|observed_type| { + matches!( + observed_type, + ColumnDataType::Int64 | ColumnDataType::Float64 + ) + }) + { + return Ok(Some(TraceReconcileDecision::AlterExistingTo( + ColumnDataType::Float64, + ))); + } + + error::InvalidParameterSnafu { + reason: "unsupported trace type mix".to_string(), + } + .fail() +} + +/// Validate all pending trace column rewrites before any schema mutation happens. 
+pub(super) fn validate_trace_column_rewrites( + rows: &[Row], + pending_rewrites: &[PendingTraceColumnRewrite], + table_name: &str, +) -> ServerResult<()> { + for row in rows { + for pending_rewrite in pending_rewrites { + let Some(value) = row.values.get(pending_rewrite.col_idx) else { + continue; + }; + let Some(request_type) = value.value_data.as_ref().and_then(trace_value_datatype) + else { + continue; + }; + if request_type == pending_rewrite.target_type { + continue; + } + + coerce_value_data(&value.value_data, pending_rewrite.target_type, request_type) + .map_err(|_| { + error::InvalidParameterSnafu { + reason: format!( + "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}", + pending_rewrite.column_name, + table_name, + request_type, + pending_rewrite.target_type + ), + } + .build() + })?; + } + } + + Ok(()) +} + +pub(super) fn enrich_trace_reconcile_error( + table_name: &str, + column_name: &str, + observed_types: &[ColumnDataType], + existing_type: Option, +) -> servers::error::Error { + let observed_types = observed_types + .iter() + .map(|datatype| format!("{datatype:?}")) + .collect::>() + .join(", "); + + error::InvalidParameterSnafu { + reason: match existing_type { + Some(existing_type) => format!( + "failed to reconcile trace column '{}' in table '{}' with observed types [{}] against existing {:?}", + column_name, table_name, observed_types, existing_type + ), + None => format!( + "failed to reconcile trace column '{}' in table '{}' with observed types [{}]", + column_name, table_name, observed_types + ), + }, + } + .build() +} + +/// Only these trace scalar types participate in reconciliation. Other column kinds +/// such as JSON and binary keep their original write path and schema checks. 
+pub(super) fn is_trace_reconcile_candidate_type(datatype: ColumnDataType) -> bool { + matches!( + datatype, + ColumnDataType::String + | ColumnDataType::Boolean + | ColumnDataType::Int64 + | ColumnDataType::Float64 + ) +} + +/// Keeps the observed type list small without depending on enum ordering. +pub(super) fn push_observed_trace_type( + observed_types: &mut Vec, + datatype: ColumnDataType, +) { + if !observed_types.contains(&datatype) { + observed_types.push(datatype); + } +} + +#[cfg(test)] +mod tests { + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, Row, Value}; + use common_error::ext::ErrorExt; + use common_error::status_code::StatusCode; + + use super::{ + PendingTraceColumnRewrite, TraceReconcileDecision, choose_trace_reconcile_decision, + enrich_trace_reconcile_error, is_trace_reconcile_candidate_type, push_observed_trace_type, + validate_trace_column_rewrites, + }; + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_keeps_int64() { + assert_eq!( + choose_trace_reconcile_decision(&[ColumnDataType::Int64], Some(ColumnDataType::Int64)) + .unwrap(), + Some(TraceReconcileDecision::UseExisting(ColumnDataType::Int64)) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_widens_to_float64() { + assert_eq!( + choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + Some(ColumnDataType::Int64) + ) + .unwrap(), + Some(TraceReconcileDecision::AlterExistingTo( + ColumnDataType::Float64 + )) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_float64_stays_authoritative() { + assert_eq!( + choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + Some(ColumnDataType::Float64) + ) + .unwrap(), + Some(TraceReconcileDecision::UseExisting(ColumnDataType::Float64)) + ); + } + + #[test] + fn test_choose_trace_reconcile_decision_existing_int64_with_boolean_is_error() { + let err = choose_trace_reconcile_decision( + 
&[ColumnDataType::Boolean, ColumnDataType::Int64], + Some(ColumnDataType::Int64), + ) + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + } + + #[test] + fn test_choose_trace_reconcile_decision_request_local_prefers_float64() { + assert_eq!( + choose_trace_reconcile_decision( + &[ColumnDataType::Int64, ColumnDataType::Float64], + None + ) + .unwrap(), + Some(TraceReconcileDecision::UseRequestLocal( + ColumnDataType::Float64 + )) + ); + } + + #[test] + fn test_validate_trace_column_rewrites_rejects_invalid_string_parse() { + let rows = vec![Row { + values: vec![Value { + value_data: Some(ValueData::StringValue("not_a_number".to_string())), + }], + }]; + let pending_rewrites = vec![PendingTraceColumnRewrite { + col_idx: 0, + target_type: ColumnDataType::Int64, + column_name: "span_attributes.attr_int".to_string(), + }]; + + let err = validate_trace_column_rewrites(&rows, &pending_rewrites, "trace_type_atomicity") + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + } + + #[test] + fn test_enrich_trace_reconcile_error_includes_existing_type() { + let err = enrich_trace_reconcile_error( + "trace_type_atomicity", + "span_attributes.attr_int", + &[ColumnDataType::String, ColumnDataType::Int64], + Some(ColumnDataType::Boolean), + ); + + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + assert!(err.to_string().contains("span_attributes.attr_int")); + assert!(err.to_string().contains("Boolean")); + } + + #[test] + fn test_is_trace_reconcile_candidate_type_filters_non_scalar_types() { + assert!(is_trace_reconcile_candidate_type(ColumnDataType::String)); + assert!(is_trace_reconcile_candidate_type(ColumnDataType::Boolean)); + assert!(!is_trace_reconcile_candidate_type(ColumnDataType::Binary)); + assert!(!is_trace_reconcile_candidate_type( + ColumnDataType::TimestampMillisecond + )); + } + + #[test] + fn test_push_observed_trace_type_deduplicates_types() { + let mut observed_types = Vec::new(); + + 
push_observed_trace_type(&mut observed_types, ColumnDataType::Int64); + push_observed_trace_type(&mut observed_types, ColumnDataType::Int64); + push_observed_trace_type(&mut observed_types, ColumnDataType::Float64); + + assert_eq!( + observed_types, + vec![ColumnDataType::Int64, ColumnDataType::Float64] + ); + } +} diff --git a/src/frontend/src/instance/promql.rs b/src/frontend/src/instance/promql.rs index 419be8d96e..3a3aba2307 100644 --- a/src/frontend/src/instance/promql.rs +++ b/src/frontend/src/instance/promql.rs @@ -31,7 +31,7 @@ use snafu::{OptionExt, ResultExt}; use crate::error::{ CatalogSnafu, CollectRecordbatchSnafu, ExecLogicalPlanSnafu, PrometheusLabelValuesQueryPlanSnafu, PrometheusMetricNamesQueryPlanSnafu, ReadTableSnafu, - Result, TableNotFoundSnafu, + Result, TableNotFoundSnafu, TableSnafu, }; use crate::instance::Instance; @@ -120,20 +120,32 @@ impl Instance { }) .unwrap_or_else(|| ctx.current_schema()); + let full_table_name = format_full_table_name(ctx.current_catalog(), &table_schema, &metric); let table = self .catalog_manager .table(ctx.current_catalog(), &table_schema, &metric, Some(ctx)) .await .context(CatalogSnafu)? .with_context(|| TableNotFoundSnafu { - table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric), + table_name: full_table_name.clone(), })?; + // Check label column existence before building the query plan so a missing label can be + // reported as `TableColumnNotFound` and handled like Prometheus expects. 
+ if table.schema().column_schema_by_name(&label_name).is_none() { + return table::error::ColumnNotExistsSnafu { + column_name: label_name, + table_name: full_table_name, + } + .fail() + .context(TableSnafu); + } + let dataframe = self .query_engine .read_table(table.clone()) .with_context(|_| ReadTableSnafu { - table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric), + table_name: full_table_name, })?; let scan_plan = dataframe.into_unoptimized_plan(); diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index eadb7cdc75..51d2b4d37b 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -24,7 +24,6 @@ use common_base::Plugins; use common_config::Configurable; #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] use common_meta::distributed_time_constants::META_LEASE_SECS; -use common_meta::election::CANDIDATE_LEASE_SECS; use common_meta::election::etcd::EtcdElection; use common_meta::kv_backend::chroot::ChrootKvBackend; use common_meta::kv_backend::etcd::EtcdStore; @@ -290,6 +289,7 @@ pub async fn metasrv_builder( use std::time::Duration; use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS; + use common_meta::election::CANDIDATE_LEASE_SECS; use common_meta::election::rds::postgres::{ElectionPgClient, PgElection}; use common_meta::kv_backend::rds::PgStore; use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod}; @@ -354,6 +354,7 @@ pub async fn metasrv_builder( (None, BackendImpl::MysqlStore) => { use std::time::Duration; + use common_meta::election::CANDIDATE_LEASE_SECS; use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection}; use common_meta::kv_backend::rds::MySqlStore; diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index 7b7983b1ba..a0f800f981 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -1136,6 +1136,12 @@ impl Error { Error::RetryLater { .. } | Error::RetryLaterWithSource { .. 
} | Error::MailboxTimeout { .. } + ) || matches!( + self, + Error::AllocateRegions { source, .. } if source.is_retry_later() + ) || matches!( + self, + Error::DeallocateRegions { source, .. } if source.is_retry_later() ) } } @@ -1324,3 +1330,35 @@ pub(crate) fn match_for_io_error(err_status: &tonic::Status) -> Option<&std::io: err = err.source()?; } } + +#[cfg(test)] +mod tests { + use common_error::mock::MockError; + use common_error::status_code::StatusCode; + use snafu::ResultExt; + + use super::DeallocateRegionsSnafu; + + #[test] + fn test_deallocate_regions_is_retryable_when_source_is_retry_later() { + let source = common_meta::error::Error::retry_later(MockError::new(StatusCode::Internal)); + let err = Err::<(), _>(source) + .context(DeallocateRegionsSnafu { table_id: 1024_u32 }) + .unwrap_err(); + + assert!(err.is_retryable()); + } + + #[test] + fn test_deallocate_regions_is_not_retryable_when_source_is_not_retry_later() { + let source = common_meta::error::UnexpectedSnafu { + err_msg: "mock error", + } + .build(); + let err = Err::<(), _>(source) + .context(DeallocateRegionsSnafu { table_id: 1024_u32 }) + .unwrap_err(); + + assert!(!err.is_retryable()); + } +} diff --git a/src/meta-srv/src/procedure/repartition.rs b/src/meta-srv/src/procedure/repartition.rs index 37c7745ae5..db8bfeadc5 100644 --- a/src/meta-srv/src/procedure/repartition.rs +++ b/src/meta-srv/src/procedure/repartition.rs @@ -23,7 +23,7 @@ pub mod repartition_start; pub mod utils; use std::any::Any; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Display}; use std::time::{Duration, Instant}; @@ -40,15 +40,15 @@ use common_meta::key::table_route::TableRouteValue; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; use common_meta::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock}; use common_meta::node_manager::NodeManagerRef; -use common_meta::region_keeper::MemoryRegionKeeperRef; +use 
common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; use common_meta::region_registry::LeaderRegionRegistryRef; -use common_meta::rpc::router::RegionRoute; +use common_meta::rpc::router::{RegionRoute, operating_leader_regions}; use common_procedure::error::{FromJsonSnafu, ToJsonSnafu}; use common_procedure::{ BoxedProcedure, Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, ProcedureManagerRef, Result as ProcedureResult, Status, StringKey, UserMetadata, }; -use common_telemetry::{error, info}; +use common_telemetry::{error, info, warn}; use partition::expr::PartitionExpr; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -56,6 +56,8 @@ use store_api::storage::{RegionNumber, TableId}; use table::table_name::TableName; use crate::error::{self, Result}; +use crate::procedure::repartition::collect::ProcedureMeta; +use crate::procedure::repartition::deallocate_region::DeallocateRegion; use crate::procedure::repartition::group::{ Context as RepartitionGroupContext, RepartitionGroupProcedure, }; @@ -74,6 +76,12 @@ pub struct PersistentContext { pub table_name: String, pub table_id: TableId, pub plans: Vec, + /// Records failed sub-procedures for metadata rollback. + #[serde(default)] + pub failed_procedures: Vec, + #[serde(default)] + /// Records unknown sub-procedures for metadata rollback. + pub unknown_procedures: Vec, /// The timeout for repartition operations. 
#[serde(with = "humantime_serde", default = "default_timeout")] pub timeout: Duration, @@ -102,6 +110,8 @@ impl PersistentContext { table_name, table_id, plans: vec![], + failed_procedures: vec![], + unknown_procedures: vec![], timeout: timeout.unwrap_or_else(default_timeout), } } @@ -393,6 +403,23 @@ impl Context { .await; Ok(()) } + + pub fn register_operating_regions( + memory_region_keeper: &MemoryRegionKeeperRef, + region_routes: &[RegionRoute], + ) -> Result> { + let mut operating_guards = Vec::with_capacity(region_routes.len()); + for (region_id, datanode_id) in operating_leader_regions(region_routes) { + let guard = memory_region_keeper + .register(datanode_id, region_id) + .context(error::RegionOperatingRaceSnafu { + peer_id: datanode_id, + region_id, + })?; + operating_guards.push(guard); + } + Ok(operating_guards) + } } #[async_trait::async_trait] @@ -456,6 +483,131 @@ impl RepartitionProcedure { Ok(Self { state, context }) } + + /// Returns whether parent rollback should remove this repartition's allocated regions. + /// + /// This uses an "after AllocateRegion" semantic: once execution reaches + /// `AllocateRegion` or any later state, rollback must try to remove this round's + /// `allocated_region_ids` from table-route metadata when they exist. + /// + /// State flow: + /// `RepartitionStart -> AllocateRegion -> Dispatch -> Collect -> DeallocateRegion -> RepartitionEnd` + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// rollback allocated regions in metadata + /// + /// Notes: + /// - `RepartitionStart`: no-op, because allocation has not happened yet. + /// - `AllocateRegion` / `Dispatch` / `Collect` rollback-active. + /// - `DeallocateRegion`: is not rollback-active. + /// - `RepartitionEnd`: no-op. 
+ fn should_rollback_allocated_regions(&self) -> bool { + self.state.as_any().is::() + || self.state.as_any().is::() + || self.state.as_any().is::() + } + + fn rollback_allocated_region_ids(&self) -> HashSet { + if self.state.as_any().is::() + || self.state.as_any().is::() + { + return self + .context + .persistent_ctx + .plans + .iter() + .flat_map(|plan| plan.allocated_region_ids.iter().copied()) + .collect(); + } + + self.context + .persistent_ctx + .failed_procedures + .iter() + .chain(self.context.persistent_ctx.unknown_procedures.iter()) + .flat_map(|procedure_meta| { + let plan_index = procedure_meta.plan_index; + self.context.persistent_ctx.plans[plan_index] + .allocated_region_ids + .iter() + .copied() + }) + .collect() + } + + fn filter_allocated_region_routes( + region_routes: &[RegionRoute], + allocated_region_ids: &HashSet, + ) -> Vec { + region_routes + .iter() + .filter(|route| !allocated_region_ids.contains(&route.region.id)) + .cloned() + .collect() + } + + async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { + if !self.should_rollback_allocated_regions() { + return Ok(()); + } + + let table_id = self.context.persistent_ctx.table_id; + let allocated_region_ids = self.rollback_allocated_region_ids(); + if allocated_region_ids.is_empty() { + return Ok(()); + } + + let table_lock = TableLock::Write(table_id).into(); + let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; + let table_route_value = self.context.get_table_route_value().await?; + let current_region_routes = table_route_value.region_routes().unwrap(); + let allocated_region_routes = DeallocateRegion::filter_deallocatable_region_routes( + table_id, + current_region_routes, + &allocated_region_ids, + ); + if !allocated_region_routes.is_empty() { + let table = TableName { + catalog_name: self.context.persistent_ctx.catalog_name.clone(), + schema_name: self.context.persistent_ctx.schema_name.clone(), + table_name: 
self.context.persistent_ctx.table_name.clone(), + }; + // Memory guards are not required here, + // because the table metadata still contains routes for the deallocating regions. + if let Err(err) = DeallocateRegion::deallocate_regions( + &self.context.node_manager, + &self.context.leader_region_registry, + table, + table_id, + &allocated_region_routes, + ) + .await + { + warn!(err; "Failed to drop allocated regions during repartition rollback, table_id: {}, regions: {:?}", table_id, allocated_region_ids); + } + } + + let new_region_routes = + Self::filter_allocated_region_routes(current_region_routes, &allocated_region_ids); + + if new_region_routes.len() != current_region_routes.len() { + self.context + .update_table_route(&table_route_value, new_region_routes, HashMap::new()) + .await + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to rollback allocated region routes for repartition table: {}", + table_id + ), + })?; + } + + if let Err(err) = self.context.invalidate_table_cache().await { + warn!(err; "Failed to invalidate table cache during repartition rollback, table_id: {}", table_id); + } + + Ok(()) + } } #[async_trait::async_trait] @@ -497,9 +649,14 @@ impl Procedure for RepartitionProcedure { } } + async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { + self.rollback_inner(ctx) + .await + .map_err(ProcedureError::external) + } + fn rollback_supported(&self) -> bool { - // TODO(weny): support rollback. 
- false + true } fn dump(&self) -> ProcedureResult { @@ -624,3 +781,642 @@ impl RepartitionProcedureFactory for DefaultRepartitionProcedureFactory { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, Ordering}; + + use common_error::ext::BoxedError; + use common_error::mock::MockError; + use common_error::status_code::StatusCode; + use common_meta::ddl::test_util::datanode_handler::{ + DatanodeWatcher, NaiveDatanodeHandler, UnexpectedErrorDatanodeHandler, + }; + use common_meta::error; + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use common_meta::test_util::MockDatanodeManager; + use common_procedure::{Error as ProcedureError, Procedure, ProcedureId, ProcedureState}; + use store_api::storage::RegionId; + use table::table_name::TableName; + use tokio::sync::mpsc; + use uuid::Uuid; + + use super::*; + use crate::procedure::repartition::allocate_region::AllocateRegion; + use crate::procedure::repartition::collect::Collect; + use crate::procedure::repartition::deallocate_region::DeallocateRegion; + use crate::procedure::repartition::dispatch::Dispatch; + use crate::procedure::repartition::plan::RegionDescriptor; + use crate::procedure::repartition::repartition_end::RepartitionEnd; + use crate::procedure::repartition::test_util::{ + TestingEnv, assert_parent_state, current_parent_region_routes, extract_subprocedure_ids, + new_parent_context, procedure_context_with_receivers, procedure_state_receiver, range_expr, + test_region_route, test_region_wal_options, + }; + + fn test_plan(table_id: TableId) -> RepartitionPlanEntry { + RepartitionPlanEntry { + group_id: uuid::Uuid::new_v4(), + source_regions: vec![RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }], + target_regions: vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 50), + }, + 
RegionDescriptor { + region_id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100), + }, + ], + allocated_region_ids: vec![RegionId::new(table_id, 3)], + pending_deallocate_region_ids: vec![], + transition_map: vec![vec![0, 1]], + } + } + + fn test_procedure(state: Box, context: Context) -> RepartitionProcedure { + RepartitionProcedure { state, context } + } + + fn test_context(env: &TestingEnv, table_id: TableId) -> Context { + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + + Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ) + } + + #[test] + fn test_filter_allocated_region_routes() { + let table_id = 1024; + let region_routes = vec![ + test_region_route(RegionId::new(table_id, 1), "a"), + test_region_route(RegionId::new(table_id, 2), "b"), + ]; + let allocated_region_ids = HashSet::from([RegionId::new(table_id, 2)]); + + let new_region_routes = RepartitionProcedure::filter_allocated_region_routes( + &region_routes, + &allocated_region_ids, + ); + + assert_eq!(new_region_routes.len(), 1); + assert_eq!(new_region_routes[0].region.id, RegionId::new(table_id, 1)); + } + + #[test] + fn test_should_rollback_allocated_regions() { + let env = TestingEnv::new(); + let table_id = 1024; + + let procedure = test_procedure( + Box::new(RepartitionStart::new(vec![], vec![])), + test_context(&env, table_id), + ); + assert!(!procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure( + Box::new(AllocateRegion::new(vec![])), + test_context(&env, table_id), + ); + assert!(procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure(Box::new(Dispatch), test_context(&env, table_id)); + assert!(procedure.should_rollback_allocated_regions());
+ + let procedure = + test_procedure(Box::new(Collect::new(vec![])), test_context(&env, table_id)); + assert!(procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure(Box::new(DeallocateRegion), test_context(&env, table_id)); + assert!(!procedure.should_rollback_allocated_regions()); + + let procedure = test_procedure(Box::new(RepartitionEnd), test_context(&env, table_id)); + assert!(!procedure.should_rollback_allocated_regions()); + } + + #[tokio::test] + async fn test_repartition_rollback_removes_allocated_routes_from_dispatch() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 50, 100).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + persistent_ctx.plans = vec![test_plan(table_id)]; + persistent_ctx.failed_procedures = vec![ProcedureMeta { + plan_index: 0, + group_id: Uuid::new_v4(), + procedure_id: ProcedureId::random(), + }]; + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(Dispatch), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!(region_routes.len(), 2); + 
assert_eq!(region_routes[0].region.id, RegionId::new(table_id, 1)); + assert_eq!(region_routes[1].region.id, RegionId::new(table_id, 2)); + } + + #[tokio::test] + async fn test_repartition_rollback_removes_allocated_routes_from_allocate() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 50, 100).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + persistent_ctx.plans = vec![test_plan(table_id)]; + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(AllocateRegion::new(vec![])), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!(region_routes.len(), 2); + assert_eq!(region_routes[0].region.id, RegionId::new(table_id, 1)); + assert_eq!(region_routes[1].region.id, RegionId::new(table_id, 2)); + } + + #[tokio::test] + async fn test_repartition_rollback_from_collect_only_removes_failed_allocated_routes() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + 
test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + test_region_route(RegionId::new(table_id, 4), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2, 3, 4]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + let failed_plan = test_plan(table_id); + let succeeded_plan = RepartitionPlanEntry { + group_id: Uuid::new_v4(), + source_regions: vec![RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 100, 200), + }], + target_regions: vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 100, 150), + }, + RegionDescriptor { + region_id: RegionId::new(table_id, 4), + partition_expr: range_expr("x", 150, 200), + }, + ], + allocated_region_ids: vec![RegionId::new(table_id, 4)], + pending_deallocate_region_ids: vec![], + transition_map: vec![vec![0]], + }; + persistent_ctx.plans = vec![failed_plan, succeeded_plan]; + persistent_ctx.failed_procedures = vec![ProcedureMeta { + plan_index: 0, + group_id: persistent_ctx.plans[0].group_id, + procedure_id: ProcedureId::random(), + }]; + + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(Collect::new(vec![])), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!(region_routes.len(), 3); + assert_eq!(region_routes[0].region.id, RegionId::new(table_id, 1)); + 
assert_eq!(region_routes[1].region.id, RegionId::new(table_id, 2)); + assert_eq!(region_routes[2].region.id, RegionId::new(table_id, 4)); + } + + #[tokio::test] + async fn test_repartition_rollback_is_idempotent() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler)); + let ddl_ctx = env.ddl_context(node_manager); + let original_region_routes = vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 50, 100).as_json_str().unwrap(), + ), + test_region_route(RegionId::new(table_id, 3), ""), + ]; + env.create_physical_table_metadata_with_wal_options( + table_id, + original_region_routes, + test_region_wal_options(&[1, 2]), + ) + .await; + + let mut persistent_ctx = PersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + persistent_ctx.plans = vec![test_plan(table_id)]; + persistent_ctx.failed_procedures = vec![ProcedureMeta { + plan_index: 0, + group_id: Uuid::new_v4(), + procedure_id: ProcedureId::random(), + }]; + let context = Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut procedure = RepartitionProcedure { + state: Box::new(Dispatch), + context, + }; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + let once = current_parent_region_routes(&procedure.context).await; + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + let twice = current_parent_region_routes(&procedure.context).await; + + assert_eq!(once, twice); + assert_eq!(once.len(), 2); + assert_eq!(once[0].region.id, RegionId::new(table_id, 1)); + assert_eq!(once[1].region.id, RegionId::new(table_id, 2)); + } + + #[tokio::test] + async fn 
test_repartition_procedure_flow_split_failed_and_full_rollback() { + let env = TestingEnv::new(); + let table_id = 1024; + let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); + + env.create_physical_table_metadata_for_repartition( + table_id, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ], + test_region_wal_options(&[1, 2]), + ) + .await; + + let context = new_parent_context(&env, node_manager, table_id); + let mut procedure = RepartitionProcedure::new( + vec![range_expr("x", 0, 100)], + vec![range_expr("x", 0, 50), range_expr("x", 50, 100)], + context, + ); + + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!start_status.need_persist()); + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(start_status.need_persist()); + assert_parent_state::(&procedure); + + let allocate_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(allocate_status.need_persist()); + assert_parent_state::(&procedure); + assert_eq!(procedure.context.persistent_ctx.plans.len(), 1); + let plan = &procedure.context.persistent_ctx.plans[0]; + let expected_plan = test_plan(table_id); + assert_eq!(plan.source_regions, expected_plan.source_regions); + assert_eq!(plan.target_regions, expected_plan.target_regions); + assert_eq!( + plan.allocated_region_ids, + expected_plan.allocated_region_ids + ); + assert_eq!( + plan.pending_deallocate_region_ids, + expected_plan.pending_deallocate_region_ids + ); + assert_eq!(plan.transition_map, expected_plan.transition_map); + assert_eq!( + current_parent_region_routes(&procedure.context).await, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + 
), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(0)), + ..Default::default() + }, + ] + ); + + let dispatch_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!dispatch_status.need_persist()); + let subprocedure_ids = extract_subprocedure_ids(dispatch_status); + assert_eq!(subprocedure_ids.len(), 1); + assert_parent_state::(&procedure); + + let failed_state = ProcedureState::failed(Arc::new(ProcedureError::external( + MockError::new(StatusCode::Internal), + ))); + let collect_ctx = procedure_context_with_receivers(HashMap::from([( + subprocedure_ids[0], + procedure_state_receiver(failed_state), + )])); + + let err = procedure.execute(&collect_ctx).await.unwrap_err(); + assert!(!err.is_retry_later()); + assert_parent_state::(&procedure); + + procedure + .rollback(&TestingEnv::procedure_context()) + .await + .unwrap(); + + let region_routes = current_parent_region_routes(&procedure.context).await; + assert_eq!( + region_routes, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ] + ); + } + + #[tokio::test] + async fn test_repartition_procedure_flow_split_allocate_retryable_then_resume() { + common_telemetry::init_default_ut_logging(); + let env = TestingEnv::new(); + let table_id = 1024; + let (tx, _rx) = mpsc::channel(8); + let should_retry = Arc::new(AtomicBool::new(true)); + let datanode_handler = DatanodeWatcher::new(tx).with_handler(move |_, _| { + if should_retry.swap(false, Ordering::SeqCst) { + return Err(error::Error::RetryLater { + source: BoxedError::new( + error::UnexpectedSnafu { + 
err_msg: "retry later", + } + .build(), + ), + clean_poisons: false, + }); + } + + Ok(api::region::RegionResponse::new(0)) + }); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); + + env.create_physical_table_metadata_for_repartition( + table_id, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ], + test_region_wal_options(&[1, 2]), + ) + .await; + + let context = new_parent_context(&env, node_manager, table_id); + let mut procedure = RepartitionProcedure::new( + vec![range_expr("x", 0, 100)], + vec![range_expr("x", 0, 50), range_expr("x", 50, 100)], + context, + ); + + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!start_status.need_persist()); + let start_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(start_status.need_persist()); + assert_parent_state::(&procedure); + + let err = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap_err(); + assert!(err.is_retry_later()); + assert_parent_state::(&procedure); + assert!(!procedure.context.persistent_ctx.plans.is_empty()); + assert_eq!( + current_parent_region_routes(&procedure.context).await, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + ] + ); + + let allocate_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(allocate_status.need_persist()); + assert_parent_state::(&procedure); + + assert_eq!(procedure.context.persistent_ctx.plans.len(), 1); + let plan = &procedure.context.persistent_ctx.plans[0]; + let expected_plan = test_plan(table_id); + 
assert_eq!(plan.source_regions, expected_plan.source_regions); + assert_eq!(plan.target_regions, expected_plan.target_regions); + assert_eq!( + plan.allocated_region_ids, + expected_plan.allocated_region_ids + ); + assert_eq!(plan.transition_map, expected_plan.transition_map); + assert_eq!( + current_parent_region_routes(&procedure.context).await, + vec![ + test_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + ), + test_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + ), + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(0)), + ..Default::default() + }, + ] + ); + + let dispatch_status = procedure + .execute(&TestingEnv::procedure_context()) + .await + .unwrap(); + assert!(!dispatch_status.need_persist()); + let subprocedure_ids = extract_subprocedure_ids(dispatch_status); + assert_eq!(subprocedure_ids.len(), 1); + assert_parent_state::(&procedure); + } +} diff --git a/src/meta-srv/src/procedure/repartition/allocate_region.rs b/src/meta-srv/src/procedure/repartition/allocate_region.rs index b1bf93d986..12ffac9918 100644 --- a/src/meta-srv/src/procedure/repartition/allocate_region.rs +++ b/src/meta-srv/src/procedure/repartition/allocate_region.rs @@ -21,12 +21,11 @@ use common_meta::ddl::create_table::template::{ }; use common_meta::lock_key::TableLock; use common_meta::node_manager::NodeManagerRef; -use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; -use common_meta::rpc::router::{RegionRoute, operating_leader_regions}; +use common_meta::rpc::router::RegionRoute; use common_procedure::{Context as ProcedureContext, Status}; -use common_telemetry::info; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt}; +use common_telemetry::{debug, info}; +use serde::{Deserialize, 
Deserializer, Serialize}; +use snafu::ResultExt; use store_api::storage::{RegionNumber, TableId}; use table::metadata::TableInfo; use table::table_reference::TableReference; @@ -40,14 +39,103 @@ use crate::procedure::repartition::plan::{ }; use crate::procedure::repartition::{Context, State}; +#[derive(Debug, Clone, Serialize)] +pub enum AllocateRegion { + Build(BuildPlan), + Execute(ExecutePlan), +} + +impl<'de> Deserialize<'de> for AllocateRegion { + fn deserialize(deserializer: D) -> std::result::Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + enum CurrentAllocateRegion { + Build(BuildPlan), + Execute(ExecutePlan), + } + + #[derive(Deserialize)] + struct LegacyAllocateRegion { + plan_entries: Vec, + } + + #[derive(Deserialize)] + #[serde(untagged)] + enum AllocateRegionRepr { + Current(CurrentAllocateRegion), + Legacy(LegacyAllocateRegion), + } + + match AllocateRegionRepr::deserialize(deserializer)? { + AllocateRegionRepr::Current(CurrentAllocateRegion::Build(build_plan)) => { + Ok(Self::Build(build_plan)) + } + AllocateRegionRepr::Current(CurrentAllocateRegion::Execute(execute_plan)) => { + Ok(Self::Execute(execute_plan)) + } + AllocateRegionRepr::Legacy(legacy) => Ok(Self::Build(BuildPlan { + plan_entries: legacy.plan_entries, + })), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AllocateRegion { +pub struct BuildPlan { plan_entries: Vec, } -#[async_trait::async_trait] -#[typetag::serde] -impl State for AllocateRegion { +impl BuildPlan { + async fn next( + &mut self, + ctx: &mut Context, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let timer = Instant::now(); + let table_id = ctx.persistent_ctx.table_id; + let table_route_value = ctx.get_table_route_value().await?; + let mut next_region_number = + AllocateRegion::get_next_region_number(table_route_value.max_region_number().unwrap()); + + // Converts allocation plan to repartition plan. 
+ let repartition_plan_entries = AllocateRegion::convert_to_repartition_plans( + table_id, + &mut next_region_number, + &self.plan_entries, + ); + let plan_count = repartition_plan_entries.len(); + let to_allocate = AllocateRegion::count_regions_to_allocate(&repartition_plan_entries); + info!( + "Repartition allocate regions start, table_id: {}, groups: {}, regions_to_allocate: {}", + table_id, plan_count, to_allocate + ); + + // If no region to allocate, directly dispatch the plan. + if AllocateRegion::count_regions_to_allocate(&repartition_plan_entries) == 0 { + ctx.persistent_ctx.plans = repartition_plan_entries; + ctx.update_allocate_region_elapsed(timer.elapsed()); + return Ok((Box::new(Dispatch), Status::executing(true))); + } + + ctx.persistent_ctx.plans = repartition_plan_entries; + debug!( + "Repartition allocate regions build plan completed, table_id: {}, elapsed: {:?}", + table_id, + timer.elapsed() + ); + Ok(( + Box::new(AllocateRegion::Execute(ExecutePlan)), + Status::executing(true), + )) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExecutePlan; + +impl ExecutePlan { async fn next( &mut self, ctx: &mut Context, @@ -55,36 +143,13 @@ impl State for AllocateRegion { ) -> Result<(Box, Status)> { let timer = Instant::now(); let table_id = ctx.persistent_ctx.table_id; + let allocate_regions = AllocateRegion::collect_allocate_regions(&ctx.persistent_ctx.plans); + let region_number_and_partition_exprs = + AllocateRegion::prepare_region_allocation_data(&allocate_regions)?; + let table_info_value = ctx.get_table_info_value().await?; let table_route_value = ctx.get_table_route_value().await?; // Safety: it is physical table route value. let region_routes = table_route_value.region_routes().unwrap(); - let mut next_region_number = - Self::get_next_region_number(table_route_value.max_region_number().unwrap()); - - // Converts allocation plan to repartition plan. 
- let repartition_plan_entries = Self::convert_to_repartition_plans( - table_id, - &mut next_region_number, - &self.plan_entries, - ); - let plan_count = repartition_plan_entries.len(); - let to_allocate = Self::count_regions_to_allocate(&repartition_plan_entries); - info!( - "Repartition allocate regions start, table_id: {}, groups: {}, regions_to_allocate: {}", - table_id, plan_count, to_allocate - ); - - // If no region to allocate, directly dispatch the plan. - if Self::count_regions_to_allocate(&repartition_plan_entries) == 0 { - ctx.persistent_ctx.plans = repartition_plan_entries; - ctx.update_allocate_region_elapsed(timer.elapsed()); - return Ok((Box::new(Dispatch), Status::executing(true))); - } - - let allocate_regions = Self::collect_allocate_regions(&repartition_plan_entries); - let region_number_and_partition_exprs = - Self::prepare_region_allocation_data(&allocate_regions)?; - let table_info_value = ctx.get_table_info_value().await?; let new_allocated_region_routes = ctx .region_routes_allocator .allocate( @@ -122,12 +187,13 @@ impl State for AllocateRegion { table_id, new_region_count, new_regions_brief ); - let _operating_guards = Self::register_operating_regions( + // The table route metadata is not updated yet; register it in memory for region lease renewal. + let _operating_guards = Context::register_operating_regions( &ctx.memory_region_keeper, &new_allocated_region_routes, )?; // Allocates the regions on datanodes. - Self::allocate_regions( + AllocateRegion::allocate_regions( &ctx.node_manager, &table_info_value.table_info, &new_allocated_region_routes, @@ -135,21 +201,33 @@ impl State for AllocateRegion { ) .await?; - // TODO(weny): for metric engine, sync logical regions from the the central region. - // Updates the table routes. 
let table_lock = TableLock::Write(table_id).into(); let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; let new_region_routes = - Self::generate_region_routes(region_routes, &new_allocated_region_routes); + AllocateRegion::generate_region_routes(region_routes, &new_allocated_region_routes); ctx.update_table_route(&table_route_value, new_region_routes, wal_options) .await?; ctx.invalidate_table_cache().await?; - ctx.persistent_ctx.plans = repartition_plan_entries; ctx.update_allocate_region_elapsed(timer.elapsed()); Ok((Box::new(Dispatch), Status::executing(true))) } +} + +#[async_trait::async_trait] +#[typetag::serde] +impl State for AllocateRegion { + async fn next( + &mut self, + ctx: &mut Context, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + match self { + AllocateRegion::Build(build_plan) => build_plan.next(ctx, procedure_ctx).await, + AllocateRegion::Execute(execute_plan) => execute_plan.next(ctx, procedure_ctx).await, + } + } fn as_any(&self) -> &dyn Any { self @@ -158,24 +236,7 @@ impl State for AllocateRegion { impl AllocateRegion { pub fn new(plan_entries: Vec) -> Self { - Self { plan_entries } - } - - fn register_operating_regions( - memory_region_keeper: &MemoryRegionKeeperRef, - region_routes: &[RegionRoute], - ) -> Result> { - let mut operating_guards = Vec::with_capacity(region_routes.len()); - for (region_id, datanode_id) in operating_leader_regions(region_routes) { - let guard = memory_region_keeper - .register(datanode_id, region_id) - .context(error::RegionOperatingRaceSnafu { - peer_id: datanode_id, - region_id, - })?; - operating_guards.push(guard); - } - Ok(operating_guards) + AllocateRegion::Build(BuildPlan { plan_entries }) } fn generate_region_routes( @@ -300,6 +361,7 @@ mod tests { use uuid::Uuid; use super::*; + use crate::procedure::repartition::State; use crate::procedure::repartition::test_util::range_expr; fn create_region_descriptor( @@ -488,4 +550,71 @@ mod tests { 
assert!(!result[0].1.is_empty()); assert!(!result[1].1.is_empty()); } + + #[test] + fn test_allocate_region_state_backward_compatibility() { + // Arrange + let serialized = r#"{"repartition_state":"AllocateRegion","plan_entries":[]}"#; + + // Act + let state: Box = serde_json::from_str(serialized).unwrap(); + + // Assert + let allocate_region = state + .as_any() + .downcast_ref::() + .expect("expected AllocateRegion state"); + match allocate_region { + AllocateRegion::Build(build_plan) => assert!(build_plan.plan_entries.is_empty()), + AllocateRegion::Execute(_) => panic!("expected build plan"), + } + } + + #[test] + fn test_allocate_region_state_round_trip() { + // Arrange + let state: Box = Box::new(AllocateRegion::new(vec![])); + + // Act + let serialized = serde_json::to_string(&state).unwrap(); + let deserialized: Box = serde_json::from_str(&serialized).unwrap(); + + // Assert + assert_eq!( + serialized, + r#"{"repartition_state":"AllocateRegion","Build":{"plan_entries":[]}}"# + ); + let allocate_region = deserialized + .as_any() + .downcast_ref::() + .expect("expected AllocateRegion state"); + match allocate_region { + AllocateRegion::Build(build_plan) => assert!(build_plan.plan_entries.is_empty()), + AllocateRegion::Execute(_) => panic!("expected build plan"), + } + } + + #[test] + fn test_allocate_region_execute_state_round_trip() { + // Arrange + let state: Box = Box::new(AllocateRegion::Execute(ExecutePlan)); + + // Act + let serialized = serde_json::to_string(&state).unwrap(); + let deserialized: Box = serde_json::from_str(&serialized).unwrap(); + + // Assert + assert_eq!( + serialized, + r#"{"repartition_state":"AllocateRegion","Execute":null}"# + ); + let allocate_region = deserialized + .as_any() + .downcast_ref::() + .expect("expected AllocateRegion state"); + match allocate_region { + AllocateRegion::Execute(_) => {} + AllocateRegion::Build(_) => panic!("expected execute plan"), + } + } } diff --git a/src/meta-srv/src/procedure/repartition/collect.rs 
b/src/meta-srv/src/procedure/repartition/collect.rs index d413158b94..1a6d0c6257 100644 --- a/src/meta-srv/src/procedure/repartition/collect.rs +++ b/src/meta-srv/src/procedure/repartition/collect.rs @@ -94,17 +94,28 @@ impl State for Collect { } } - let inflight = self.inflight_procedures.len(); let succeeded = self.succeeded_procedures.len(); let failed = self.failed_procedures.len(); let unknown = self.unknown_procedures.len(); info!( - "Collected repartition group results for table_id: {}, inflight: {}, succeeded: {}, failed: {}, unknown: {}", - table_id, inflight, succeeded, failed, unknown + "Collected repartition group results for table_id: {}, succeeded: {}, failed: {}, unknown: {}", + table_id, succeeded, failed, unknown ); if failed > 0 || unknown > 0 { - // TODO(weny): retry the failed or unknown procedures. + ctx.persistent_ctx + .failed_procedures + .extend(self.failed_procedures.iter()); + ctx.persistent_ctx + .unknown_procedures + .extend(self.unknown_procedures.iter()); + return crate::error::UnexpectedSnafu { + violated: format!( + "Repartition groups failed or became unknown, table_id: {}, failed: {}, unknown: {}", + table_id, failed, unknown + ), + } + .fail(); } if let Some(start_time) = ctx.volatile_ctx.dispatch_start_time.take() { @@ -118,3 +129,139 @@ impl State for Collect { self } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_error::mock::MockError; + use common_error::status_code::StatusCode; + use common_meta::test_util::MockDatanodeManager; + use common_procedure::{ + Context as ProcedureContext, ContextProvider, Error as ProcedureError, ProcedureId, + ProcedureState, + }; + use common_procedure_test::MockContextProvider; + use tokio::sync::watch; + + use super::*; + use crate::procedure::repartition::PersistentContext; + use crate::procedure::repartition::test_util::TestingEnv; + + struct FailedProcedureContextProvider { + receiver: watch::Receiver, + inner: MockContextProvider, + } + + #[async_trait::async_trait] 
+ impl ContextProvider for FailedProcedureContextProvider { + async fn procedure_state( + &self, + procedure_id: ProcedureId, + ) -> common_procedure::Result> { + self.inner.procedure_state(procedure_id).await + } + + async fn procedure_state_receiver( + &self, + _procedure_id: ProcedureId, + ) -> common_procedure::Result>> { + Ok(Some(self.receiver.clone())) + } + + async fn try_put_poison( + &self, + key: &common_procedure::PoisonKey, + procedure_id: ProcedureId, + ) -> common_procedure::Result<()> { + self.inner.try_put_poison(key, procedure_id).await + } + + async fn acquire_lock( + &self, + key: &common_procedure::StringKey, + ) -> common_procedure::local::DynamicKeyLockGuard { + self.inner.acquire_lock(key).await + } + } + + #[tokio::test] + async fn test_collect_returns_error_when_unknown_exists() { + let env = TestingEnv::new(); + let ddl_ctx = env.ddl_context(Arc::new(MockDatanodeManager::new(()))); + let persistent_ctx = PersistentContext::new( + table::table_name::TableName::new("test_catalog", "test_schema", "test_table"), + 1024, + None, + ); + let mut ctx = crate::procedure::repartition::Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let mut state = Collect { + inflight_procedures: vec![], + succeeded_procedures: vec![], + failed_procedures: vec![], + unknown_procedures: vec![ProcedureMeta { + plan_index: 0, + group_id: uuid::Uuid::new_v4(), + procedure_id: common_procedure::ProcedureId::random(), + }], + }; + + let err = state + .next(&mut ctx, &TestingEnv::procedure_context()) + .await + .unwrap_err(); + + assert!(!err.is_retryable()); + } + + #[tokio::test] + async fn test_collect_returns_error_when_failed_exists() { + let env = TestingEnv::new(); + let ddl_ctx = env.ddl_context(Arc::new(MockDatanodeManager::new(()))); + let persistent_ctx = PersistentContext::new( + table::table_name::TableName::new("test_catalog", "test_schema", "test_table"), + 1024, + None, + ); + let mut ctx = 
crate::procedure::repartition::Context::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ); + let procedure_id = common_procedure::ProcedureId::random(); + let (tx, rx) = watch::channel(ProcedureState::Running); + tx.send(ProcedureState::failed(Arc::new(ProcedureError::external( + MockError::new(StatusCode::Internal), + )))) + .unwrap(); + let procedure_ctx = ProcedureContext { + procedure_id: ProcedureId::random(), + provider: Arc::new(FailedProcedureContextProvider { + receiver: rx, + inner: MockContextProvider::default(), + }), + }; + let mut state = Collect { + inflight_procedures: vec![ProcedureMeta { + plan_index: 0, + group_id: uuid::Uuid::new_v4(), + procedure_id, + }], + succeeded_procedures: vec![], + failed_procedures: vec![], + unknown_procedures: vec![], + }; + + let err = state.next(&mut ctx, &procedure_ctx).await.unwrap_err(); + + assert_eq!(state.failed_procedures.len(), 1); + assert_eq!(state.unknown_procedures.len(), 0); + assert!(!err.is_retryable()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/deallocate_region.rs b/src/meta-srv/src/procedure/repartition/deallocate_region.rs index 12233c27e7..3f5dc5bd8e 100644 --- a/src/meta-srv/src/procedure/repartition/deallocate_region.rs +++ b/src/meta-srv/src/procedure/repartition/deallocate_region.rs @@ -88,7 +88,8 @@ impl State for DeallocateRegion { &ctx.persistent_ctx.schema_name, &ctx.persistent_ctx.table_name, ); - // Deallocates the regions on datanodes. + // Memory guards are not required here, + // because the table metadata still contains routes for the deallocating regions. 
Self::deallocate_regions( &ctx.node_manager, &ctx.leader_region_registry, @@ -116,7 +117,7 @@ impl State for DeallocateRegion { } impl DeallocateRegion { - async fn deallocate_regions( + pub(crate) async fn deallocate_regions( node_manager: &NodeManagerRef, leader_region_registry: &LeaderRegionRegistryRef, table: TableName, @@ -141,7 +142,7 @@ impl DeallocateRegion { Ok(()) } - fn filter_deallocatable_region_routes( + pub(crate) fn filter_deallocatable_region_routes( table_id: TableId, region_routes: &[RegionRoute], pending_deallocate_region_ids: &HashSet, @@ -165,7 +166,7 @@ impl DeallocateRegion { .collect::>() } - fn generate_region_routes( + pub(crate) fn generate_region_routes( region_routes: &[RegionRoute], pending_deallocate_region_ids: &HashSet, ) -> Vec { @@ -181,12 +182,21 @@ impl DeallocateRegion { #[cfg(test)] mod tests { use std::collections::HashSet; + use std::sync::Arc; + use common_meta::ddl::test_util::datanode_handler::RetryErrorDatanodeHandler; use common_meta::peer::Peer; use common_meta::rpc::router::{Region, RegionRoute}; + use common_meta::test_util::MockDatanodeManager; use store_api::storage::{RegionId, TableId}; + use crate::error::Error; + use crate::procedure::repartition::State; use crate::procedure::repartition::deallocate_region::DeallocateRegion; + use crate::procedure::repartition::plan::RepartitionPlanEntry; + use crate::procedure::repartition::test_util::{ + TestingEnv, current_parent_region_routes, new_parent_context, + }; fn test_region_routes(table_id: TableId) -> Vec { vec![ @@ -238,4 +248,36 @@ mod tests { assert_eq!(new_region_routes.len(), 1); assert_eq!(new_region_routes[0].region.id, RegionId::new(table_id, 2)); } + + #[tokio::test] + async fn test_next_retryable_when_deallocate_regions_retry_later() { + let env = TestingEnv::new(); + let table_id = 1024; + let original_routes = test_region_routes(table_id); + + env.create_physical_table_metadata(table_id, original_routes.clone()) + .await; + + let node_manager = 
Arc::new(MockDatanodeManager::new(RetryErrorDatanodeHandler)); + let mut ctx = new_parent_context(&env, node_manager, table_id); + ctx.persistent_ctx.plans = vec![RepartitionPlanEntry { + group_id: uuid::Uuid::new_v4(), + source_regions: vec![], + target_regions: vec![], + allocated_region_ids: vec![], + pending_deallocate_region_ids: vec![RegionId::new(table_id, 1)], + transition_map: vec![], + }]; + + let mut state = DeallocateRegion; + + let err = state + .next(&mut ctx, &TestingEnv::procedure_context()) + .await + .unwrap_err(); + + assert!(matches!(err, Error::DeallocateRegions { .. })); + assert!(err.is_retryable()); + assert_eq!(current_parent_region_routes(&ctx).await, original_routes); + } } diff --git a/src/meta-srv/src/procedure/repartition/dispatch.rs b/src/meta-srv/src/procedure/repartition/dispatch.rs index 02dc73362d..3a9f9376f1 100644 --- a/src/meta-srv/src/procedure/repartition/dispatch.rs +++ b/src/meta-srv/src/procedure/repartition/dispatch.rs @@ -31,7 +31,7 @@ use crate::procedure::repartition::{self, Context, State}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Dispatch; -fn build_region_mapping( +pub(crate) fn build_region_mapping( source_regions: &[RegionDescriptor], target_regions: &[RegionDescriptor], transition_map: &[Vec], @@ -106,7 +106,11 @@ impl State for Dispatch { Ok(( Box::new(Collect::new(procedure_metas)), - Status::suspended(procedures, true), + // The state is not persisted after sub-procedures are spawned. + // If metasrv restarts before all sub-procedures complete, + // it restores from the `Dispatch` state and re-dispatches them. + // This is safe because the sub-procedures are idempotent. 
+ Status::suspended(procedures, false), )) } diff --git a/src/meta-srv/src/procedure/repartition/group.rs b/src/meta-srv/src/procedure/repartition/group.rs index f0cb1c4dd0..e5a06f79a8 100644 --- a/src/meta-srv/src/procedure/repartition/group.rs +++ b/src/meta-srv/src/procedure/repartition/group.rs @@ -41,14 +41,18 @@ use common_procedure::{ Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, Result as ProcedureResult, Status, StringKey, UserMetadata, }; -use common_telemetry::{error, info}; +use common_telemetry::{error, info, warn}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; use store_api::storage::{RegionId, TableId}; use uuid::Uuid; use crate::error::{self, Result}; +use crate::procedure::repartition::group::apply_staging_manifest::ApplyStagingManifest; +use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion; +use crate::procedure::repartition::group::remap_manifest::RemapManifest; use crate::procedure::repartition::group::repartition_start::RepartitionStart; +use crate::procedure::repartition::group::update_metadata::UpdateMetadata; use crate::procedure::repartition::plan::RegionDescriptor; use crate::procedure::repartition::utils::get_datanode_table_value; use crate::procedure::repartition::{self}; @@ -192,6 +196,62 @@ impl RepartitionGroupProcedure { Ok(Self { state, context }) } + + async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { + if !self.should_rollback_metadata() { + return Ok(()); + } + + let table_lock = + common_meta::lock_key::TableLock::Write(self.context.persistent_ctx.table_id).into(); + let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; + UpdateMetadata::RollbackStaging + .rollback_staging_regions(&mut self.context) + .await?; + + if let Err(err) = self.context.invalidate_table_cache().await { + warn!( + err; + "Failed to broadcast the invalidate table cache message during repartition group rollback" + ); + } + + 
Ok(()) + } + + /// Returns whether group rollback should revert staging metadata. + /// + /// This uses an "after metadata apply, before exit staging" semantic. + /// Once execution reaches `UpdateMetadata::ApplyStaging` or any later staging state, + /// rollback must restore table-route metadata back to the pre-apply view. + /// + /// State flow: + /// `RepartitionStart -> SyncRegion -> UpdateMetadata::ApplyStaging -> EnterStagingRegion` + /// ` -> RemapManifest -> ApplyStagingManifest -> UpdateMetadata::ExitStaging -> RepartitionEnd` + /// ` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^` + /// ` rollback staging metadata` + /// + /// Notes: + /// - `RepartitionStart` / `SyncRegion`: no-op, metadata has not been staged yet. + /// - `UpdateMetadata::ApplyStaging` / `EnterStagingRegion` / `RemapManifest` / + /// `ApplyStagingManifest` / `UpdateMetadata::RollbackStaging`: rollback-active. + /// - `UpdateMetadata::ExitStaging` / `RepartitionEnd`: excluded, because metadata has + /// already moved into the post-commit exit path. 
+ fn should_rollback_metadata(&self) -> bool { + self.state.as_any().is::() + || self.state.as_any().is::() + || self.state.as_any().is::() + || self + .state + .as_any() + .downcast_ref::() + .is_some_and(|state| { + matches!( + state, + UpdateMetadata::ApplyStaging | UpdateMetadata::RollbackStaging + ) + }) + } } #[async_trait::async_trait] @@ -200,6 +260,12 @@ impl Procedure for RepartitionGroupProcedure { Self::TYPE_NAME } + async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { + self.rollback_inner(ctx) + .await + .map_err(ProcedureError::external) + } + #[tracing::instrument(skip_all, fields( state = %self.state.name(), table_id = self.context.persistent_ctx.table_id, @@ -238,7 +304,7 @@ impl Procedure for RepartitionGroupProcedure { } fn rollback_supported(&self) -> bool { - false + true } fn dump(&self) -> ProcedureResult { @@ -304,7 +370,7 @@ impl Context { pub struct GroupPrepareResult { /// The validated source region routes. pub source_routes: Vec, - /// The validated target region routes. + /// Validated target region routes used for metadata rollback (logical rollback). pub target_routes: Vec, /// The primary source region id (first source region), used for retrieving region options. 
pub central_region: RegionId, @@ -599,12 +665,149 @@ pub(crate) trait State: Sync + Send + Debug { mod tests { use std::assert_matches; use std::sync::Arc; + use std::time::Duration; use common_meta::key::TableMetadataManager; use common_meta::kv_backend::test_util::MockKvBackendBuilder; + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId}; + use common_procedure_test::MockContextProvider; + use partition::expr::PartitionExpr; + use store_api::storage::RegionId; + use super::{ + Context, PersistentContext, RepartitionGroupProcedure, RepartitionStart, State, + region_routes, + }; use crate::error::Error; - use crate::procedure::repartition::test_util::{TestingEnv, new_persistent_context}; + use crate::procedure::repartition::dispatch::build_region_mapping; + use crate::procedure::repartition::group::apply_staging_manifest::ApplyStagingManifest; + use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion; + use crate::procedure::repartition::group::remap_manifest::RemapManifest; + use crate::procedure::repartition::group::repartition_start::RepartitionStart as GroupRepartitionStart; + use crate::procedure::repartition::group::sync_region::SyncRegion; + use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::plan; + use crate::procedure::repartition::repartition_start::RepartitionStart as ParentRepartitionStart; + use crate::procedure::repartition::test_util::{ + TestingEnv, new_persistent_context, range_expr, + }; + + struct GroupRollbackFixture { + context: Context, + original_region_routes: Vec, + next_state: Option>, + } + + async fn new_group_rollback_fixture( + original_region_routes: Vec, + from_exprs: Vec, + to_exprs: Vec, + sync_region: bool, + ) -> GroupRollbackFixture { + let env = TestingEnv::new(); + let procedure_ctx = TestingEnv::procedure_context(); + let table_id 
= 1024; + let mut next_region_number = 10; + + env.create_physical_table_metadata(table_id, original_region_routes.clone()) + .await; + + let (_, physical_route) = env + .table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await + .unwrap(); + let allocation_plans = + ParentRepartitionStart::build_plan(&physical_route, &from_exprs, &to_exprs).unwrap(); + assert_eq!(allocation_plans.len(), 1); + + let repartition_plan = plan::convert_allocation_plan_to_repartition_plan( + table_id, + &mut next_region_number, + &allocation_plans[0], + ); + let region_mapping = build_region_mapping( + &repartition_plan.source_regions, + &repartition_plan.target_regions, + &repartition_plan.transition_map, + ); + let persistent_context = PersistentContext::new( + repartition_plan.group_id, + table_id, + "test_catalog".to_string(), + "test_schema".to_string(), + repartition_plan.source_regions, + repartition_plan.target_regions, + region_mapping, + sync_region, + repartition_plan.allocated_region_ids, + repartition_plan.pending_deallocate_region_ids, + Duration::from_secs(120), + ); + let mut context = env.create_context(persistent_context); + let (next_state, _) = GroupRepartitionStart + .next(&mut context, &procedure_ctx) + .await + .unwrap(); + + GroupRollbackFixture { + context, + original_region_routes, + next_state: Some(next_state), + } + } + + async fn new_split_group_rollback_fixture(sync_region: bool) -> GroupRollbackFixture { + new_group_rollback_fixture( + vec![ + new_region_route(RegionId::new(1024, 1), Some(range_expr("x", 0, 100))), + new_region_route(RegionId::new(1024, 2), Some(range_expr("x", 100, 200))), + new_region_route(RegionId::new(1024, 10), None), + ], + vec![range_expr("x", 0, 100)], + vec![range_expr("x", 0, 50), range_expr("x", 50, 100)], + sync_region, + ) + .await + } + + async fn new_merge_group_rollback_fixture(sync_region: bool) -> GroupRollbackFixture { + new_group_rollback_fixture( + vec![ + 
new_region_route(RegionId::new(1024, 1), Some(range_expr("x", 0, 100))), + new_region_route(RegionId::new(1024, 2), Some(range_expr("x", 100, 200))), + new_region_route(RegionId::new(1024, 3), Some(range_expr("x", 200, 300))), + ], + vec![range_expr("x", 0, 100), range_expr("x", 100, 200)], + vec![range_expr("x", 0, 200)], + sync_region, + ) + .await + } + + async fn stage_metadata(context: &mut Context) { + UpdateMetadata::ApplyStaging + .apply_staging_regions(context) + .await + .unwrap(); + } + + fn new_region_route(region_id: RegionId, partition_expr: Option) -> RegionRoute { + RegionRoute { + region: Region { + id: region_id, + partition_expr: partition_expr + .map(|expr| expr.as_json_str().unwrap()) + .unwrap_or_default(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + } + } #[tokio::test] async fn test_get_table_route_value_not_found_error() { @@ -653,4 +856,198 @@ mod tests { let err = ctx.get_datanode_table_value(1024, 1).await.unwrap_err(); assert!(err.is_retryable()); } + + #[tokio::test] + async fn test_group_rollback_supported() { + let env = TestingEnv::new(); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let procedure = RepartitionGroupProcedure { + state: Box::new(RepartitionStart), + context: env.create_context(persistent_context), + }; + + assert!(procedure.rollback_supported()); + } + + #[tokio::test] + async fn test_group_rollback_is_noop_before_apply_staging() { + let env = TestingEnv::new(); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context.clone()); + let mut procedure = RepartitionGroupProcedure { + state: Box::new(RepartitionStart), + context: ctx, + }; + let provider = Arc::new(MockContextProvider::new(Default::default())); + let procedure_ctx = ProcedureContext { + procedure_id: ProcedureId::random(), + provider, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + + 
assert!(procedure.state.as_any().is::()); + assert_eq!(procedure.context.persistent_ctx, persistent_context); + } + + async fn assert_noop_rollback( + fixture: GroupRollbackFixture, + state: Box, + assert_state: impl FnOnce(&dyn State), + ) { + let original_region_routes = fixture.original_region_routes.clone(); + let procedure_ctx = TestingEnv::procedure_context(); + let mut procedure = RepartitionGroupProcedure { + state, + context: fixture.context, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + + assert_state(&*procedure.state); + let table_route_value = procedure + .context + .get_table_route_value() + .await + .unwrap() + .into_inner(); + let region_routes = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap(); + assert_eq!(region_routes.clone(), original_region_routes); + } + + async fn assert_metadata_rollback_restores_table_route( + mut fixture: GroupRollbackFixture, + state: Box, + ) { + let original_region_routes = fixture.original_region_routes.clone(); + let procedure_ctx = TestingEnv::procedure_context(); + stage_metadata(&mut fixture.context).await; + let mut procedure = RepartitionGroupProcedure { + state, + context: fixture.context, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + + let table_route_value = procedure + .context + .get_table_route_value() + .await + .unwrap() + .into_inner(); + let region_routes = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap(); + assert_eq!(region_routes.clone(), original_region_routes); + } + + #[tokio::test] + async fn test_group_rollback_is_noop_in_sync_region() { + let mut fixture = new_split_group_rollback_fixture(true).await; + assert!( + fixture + .next_state + .as_ref() + .unwrap() + .as_any() + .is::() + ); + let state = fixture.next_state.take().unwrap(); + + assert_noop_rollback(fixture, state, |state| { + assert!(state.as_any().is::()); + }) + .await; + } + + #[tokio::test] + async fn 
test_group_rollback_is_noop_in_exit_staging() { + let fixture = new_split_group_rollback_fixture(false).await; + + assert_noop_rollback(fixture, Box::new(UpdateMetadata::ExitStaging), |state| { + assert!(state.as_any().is::()); + assert!(matches!( + state.as_any().downcast_ref::(), + Some(UpdateMetadata::ExitStaging) + )); + }) + .await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_apply_staging() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route( + fixture, + Box::new(UpdateMetadata::ApplyStaging), + ) + .await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_enter_staging_region() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route(fixture, Box::new(EnterStagingRegion)).await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_remap_manifest() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route(fixture, Box::new(RemapManifest)).await; + } + + #[tokio::test] + async fn test_group_rollback_restores_split_routes_from_apply_staging_manifest() { + let fixture = new_split_group_rollback_fixture(false).await; + assert_metadata_rollback_restores_table_route(fixture, Box::new(ApplyStagingManifest)) + .await; + } + + #[tokio::test] + async fn test_group_rollback_restores_merge_routes_and_is_idempotent() { + let mut fixture = new_merge_group_rollback_fixture(false).await; + let original_region_routes = fixture.original_region_routes.clone(); + let procedure_ctx = TestingEnv::procedure_context(); + stage_metadata(&mut fixture.context).await; + let mut procedure = RepartitionGroupProcedure { + state: Box::new(UpdateMetadata::ApplyStaging), + context: fixture.context, + }; + + procedure.rollback(&procedure_ctx).await.unwrap(); + let table_route_value = procedure + .context + 
.get_table_route_value() + .await + .unwrap() + .into_inner(); + let once = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap() + .clone(); + procedure.rollback(&procedure_ctx).await.unwrap(); + let table_route_value = procedure + .context + .get_table_route_value() + .await + .unwrap() + .into_inner(); + let twice = region_routes( + procedure.context.persistent_ctx.table_id, + &table_route_value, + ) + .unwrap() + .clone(); + + assert_eq!(once, original_region_routes); + assert_eq!(once, twice); + } } diff --git a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs index 2020e9e2f4..43e5ee31d9 100644 --- a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs +++ b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs @@ -332,7 +332,14 @@ impl ApplyStagingManifest { ); Ok(()) - } + }, + Err(error::Error::MailboxChannelClosed {..})=> error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending apply staging manifests to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + }.fail()?, Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( "Mailbox received timeout for apply staging manifests on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs index 59de569c13..911e881ac3 100644 --- a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs @@ -315,7 +315,14 @@ impl EnterStagingRegion { ); Ok(()) - } + }, + Err(error::Error::MailboxChannelClosed {..})=> error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending enter staging regions to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + }.fail()?, Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( "Mailbox received timeout for enter staging regions on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs index 6e3460c2ce..1d6a75100e 100644 --- a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs +++ b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs @@ -184,6 +184,14 @@ impl RemapManifest { Self::handle_remap_manifest_reply(remap.region_id, reply, &now, peer) } + Err(error::Error::MailboxChannelClosed { .. }) => error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending remap manifests to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + } + .fail()?, Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( "Mailbox received timeout for remap manifests on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/sync_region.rs b/src/meta-srv/src/procedure/repartition/group/sync_region.rs index dcd58c21e9..7422ae8607 100644 --- a/src/meta-srv/src/procedure/repartition/group/sync_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/sync_region.rs @@ -273,6 +273,14 @@ impl SyncRegion { } Ok(()) } + Err(error::Error::MailboxChannelClosed { .. }) => error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending sync region to datanode {:?}, elapsed: {:?}", + peer, + now.elapsed() + ), + } + .fail()?, Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( "Mailbox received timeout for sync regions on datanode {:?}, elapsed: {:?}", diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs index ecde5f0507..ff01161ff5 100644 --- a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs @@ -30,7 +30,7 @@ impl UpdateMetadata { /// Abort: /// - Target region not found. /// - Source region not found. - fn apply_staging_region_routes( + pub(crate) fn apply_staging_region_routes( group_id: GroupId, sources: &[RegionDescriptor], targets: &[RegionDescriptor], @@ -50,10 +50,12 @@ impl UpdateMetadata { region_id: target.region_id, }, )?; + // Set the new partition expression for the target region route. region_route.region.partition_expr = target .partition_expr .as_json_str() .context(error::SerializePartitionExprSnafu)?; + // Set leader staging state and write route policy for the target region route. 
region_route.set_leader_staging(); region_route.clear_ignore_all_writes(); } @@ -65,6 +67,7 @@ impl UpdateMetadata { region_id: source.region_id, }, )?; + // Set leader staging state for the source region route. region_route.set_leader_staging(); if pending_deallocate_region_ids.contains(&source.region_id) { // When a region is pending deallocation, it should ignore all writes. diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs index e9bef4cf8e..4e6bf67fc8 100644 --- a/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs @@ -18,10 +18,12 @@ use common_error::ext::BoxedError; use common_meta::rpc::router::RegionRoute; use common_telemetry::{error, info}; use snafu::{OptionExt, ResultExt}; +use store_api::storage::RegionId; use crate::error::{self, Result}; use crate::procedure::repartition::group::update_metadata::UpdateMetadata; use crate::procedure::repartition::group::{Context, GroupId, region_routes}; +use crate::procedure::repartition::plan::RegionDescriptor; impl UpdateMetadata { /// Rolls back the staging regions. @@ -31,8 +33,9 @@ impl UpdateMetadata { /// - Target region not found. 
fn rollback_staging_region_routes( group_id: GroupId, - source_routes: &[RegionRoute], - target_routes: &[RegionRoute], + sources: &[RegionDescriptor], + original_target_routes: &[RegionRoute], + pending_deallocate_region_ids: &[RegionId], current_region_routes: &[RegionRoute], ) -> Result> { let mut region_routes = current_region_routes.to_vec(); @@ -40,26 +43,35 @@ impl UpdateMetadata { .iter_mut() .map(|route| (route.region.id, route)) .collect::>(); - - for source in source_routes { - let region_route = region_routes_map.get_mut(&source.region.id).context( + for source in sources { + let region_route = region_routes_map.get_mut(&source.region_id).context( error::RepartitionSourceRegionMissingSnafu { group_id, - region_id: source.region.id, + region_id: source.region_id, }, )?; - region_route.region.partition_expr = source.region.partition_expr.clone(); + // Clean leader staging state for source regions. region_route.clear_leader_staging(); - region_route.clear_ignore_all_writes(); + if pending_deallocate_region_ids.contains(&source.region_id) { + // Clean ignore all writes state for source regions if it's pending to be deallocated, + // which means the source region is merged into the target region. + region_route.clear_ignore_all_writes(); + } } - for target in target_routes { + for target in original_target_routes { let region_route = region_routes_map.get_mut(&target.region.id).context( error::RepartitionTargetRegionMissingSnafu { group_id, region_id: target.region.id, }, )?; + + // Revert the partition expression and write route policy to the original value for the target region. + region_route.region.partition_expr = target.region.partition_expr.clone(); + region_route.write_route_policy = target.write_route_policy; + + // Clean leader staging state for target regions. 
region_route.clear_leader_staging(); } @@ -83,8 +95,9 @@ impl UpdateMetadata { let prepare_result = ctx.persistent_ctx.group_prepare_result.as_ref().unwrap(); let new_region_routes = Self::rollback_staging_region_routes( group_id, - &prepare_result.source_routes, + &ctx.persistent_ctx.sources, &prepare_result.target_routes, + &ctx.persistent_ctx.pending_deallocate_region_ids, region_routes, )?; @@ -113,87 +126,176 @@ impl UpdateMetadata { #[cfg(test)] mod tests { + use std::collections::HashSet; + use common_meta::peer::Peer; use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; use store_api::storage::RegionId; use uuid::Uuid; use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::plan::RegionDescriptor; use crate::procedure::repartition::test_util::range_expr; + fn new_region_route( + region_id: RegionId, + partition_expr: &str, + leader_state: Option, + ignore_all_writes: bool, + ) -> RegionRoute { + let mut route = RegionRoute { + region: Region { + id: region_id, + partition_expr: partition_expr.to_string(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state, + ..Default::default() + }; + + if ignore_all_writes { + route.set_ignore_all_writes(); + } + + route + } + + fn original_target_routes( + region_routes: &[RegionRoute], + targets: &[RegionDescriptor], + ) -> Vec { + let target_ids = targets + .iter() + .map(|target| target.region_id) + .collect::>(); + region_routes + .iter() + .filter(|route| target_ids.contains(&route.region.id)) + .cloned() + .collect() + } + #[test] - fn test_rollback_staging_region_routes() { + fn test_rollback_staging_region_routes_split_case() { let group_id = Uuid::new_v4(); let table_id = 1024; - let region_routes = vec![ - { - let mut route = RegionRoute { - region: Region { - id: RegionId::new(table_id, 1), - partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), - ..Default::default() - }, - leader_peer: 
Some(Peer::empty(1)), - leader_state: Some(LeaderState::Staging), - ..Default::default() - }; - route.set_ignore_all_writes(); - route + let original_region_routes = vec![ + new_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + None, + false, + ), + new_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + None, + false, + ), + new_region_route(RegionId::new(table_id, 3), "", None, false), + ]; + let sources = vec![RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }]; + let targets = vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 50), }, - RegionRoute { - region: Region { - id: RegionId::new(table_id, 2), - partition_expr: String::new(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - leader_state: Some(LeaderState::Staging), - ..Default::default() - }, - RegionRoute { - region: Region { - id: RegionId::new(table_id, 3), - partition_expr: String::new(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - leader_state: Some(LeaderState::Downgrading), - ..Default::default() + RegionDescriptor { + region_id: RegionId::new(table_id, 3), + partition_expr: range_expr("x", 50, 100), }, ]; - let source_routes = vec![RegionRoute { - region: Region { - id: RegionId::new(table_id, 1), - partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - ..Default::default() - }]; - let target_routes = vec![RegionRoute { - region: Region { - id: RegionId::new(table_id, 2), - partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), - ..Default::default() - }, - leader_peer: Some(Peer::empty(1)), - ..Default::default() - }]; - let new_region_routes = UpdateMetadata::rollback_staging_region_routes( + let applied_region_routes = UpdateMetadata::apply_staging_region_routes( 
group_id, - &source_routes, - &target_routes, - ®ion_routes, + &sources, + &targets, + &[], + &original_region_routes, ) .unwrap(); - assert!(!new_region_routes[0].is_leader_staging()); - assert!(!new_region_routes[0].is_ignore_all_writes()); - assert_eq!( - new_region_routes[0].region.partition_expr, - range_expr("x", 0, 20).as_json_str().unwrap(), - ); - assert!(!new_region_routes[1].is_leader_staging()); - assert!(!new_region_routes[1].is_ignore_all_writes()); - assert!(new_region_routes[2].is_leader_downgrading()); + let target_routes = original_target_routes(&original_region_routes, &targets); + let new_region_routes = UpdateMetadata::rollback_staging_region_routes( + group_id, + &sources, + &target_routes, + &[], + &applied_region_routes, + ) + .unwrap(); + + assert_eq!(new_region_routes, original_region_routes); + } + + #[test] + fn test_rollback_staging_region_routes_merge_case_is_idempotent() { + let group_id = Uuid::new_v4(); + let table_id = 1024; + let original_region_routes = vec![ + new_region_route( + RegionId::new(table_id, 1), + &range_expr("x", 0, 100).as_json_str().unwrap(), + None, + false, + ), + new_region_route( + RegionId::new(table_id, 2), + &range_expr("x", 100, 200).as_json_str().unwrap(), + None, + false, + ), + new_region_route( + RegionId::new(table_id, 3), + &range_expr("x", 200, 300).as_json_str().unwrap(), + None, + false, + ), + ]; + let sources = vec![ + RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }, + RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 100, 200), + }, + ]; + let targets = vec![RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 200), + }]; + let target_routes = original_target_routes(&original_region_routes, &targets); + let applied_region_routes = UpdateMetadata::apply_staging_region_routes( + group_id, + &sources, + &targets, + &[RegionId::new(table_id, 2)], + 
&original_region_routes, + ) + .unwrap(); + + let once = UpdateMetadata::rollback_staging_region_routes( + group_id, + &sources, + &target_routes, + &[RegionId::new(table_id, 2)], + &applied_region_routes, + ) + .unwrap(); + let twice = UpdateMetadata::rollback_staging_region_routes( + group_id, + &sources, + &target_routes, + &[RegionId::new(table_id, 2)], + &once, + ) + .unwrap(); + + assert_eq!(once, original_region_routes); + assert_eq!(once, twice); } } diff --git a/src/meta-srv/src/procedure/repartition/repartition_start.rs b/src/meta-srv/src/procedure/repartition/repartition_start.rs index 1f657d58f2..5c6bcfdb06 100644 --- a/src/meta-srv/src/procedure/repartition/repartition_start.rs +++ b/src/meta-srv/src/procedure/repartition/repartition_start.rs @@ -102,7 +102,7 @@ impl State for RepartitionStart { } impl RepartitionStart { - fn build_plan( + pub(crate) fn build_plan( physical_route: &PhysicalTableRouteValue, from_exprs: &[PartitionExpr], to_exprs: &[PartitionExpr], diff --git a/src/meta-srv/src/procedure/repartition/test_util.rs b/src/meta-srv/src/procedure/repartition/test_util.rs index 3cefd4a095..83856a49e6 100644 --- a/src/meta-srv/src/procedure/repartition/test_util.rs +++ b/src/meta-srv/src/procedure/repartition/test_util.rs @@ -16,22 +16,41 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; +use common_meta::ddl::DdlContext; +use common_meta::key::table_route::TableRouteValue; +use common_meta::key::test_utils::{new_test_table_info, new_test_table_info_with_name}; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; +use common_meta::kv_backend::KvBackendRef; use common_meta::kv_backend::memory::MemoryKvBackend; +use common_meta::node_manager::NodeManagerRef; +use common_meta::peer::Peer; +use common_meta::rpc::router::{Region, RegionRoute}; use common_meta::sequence::SequenceBuilder; +use common_meta::test_util::new_ddl_context_with_kv_backend; +use common_procedure::{ + Context as 
ProcedureContext, ContextProvider, ProcedureId, ProcedureState, Status, +}; +use common_procedure_test::MockContextProvider; +use common_wal::options::{KafkaWalOptions, WalOptions}; use datatypes::value::Value; use partition::expr::{PartitionExpr, col}; -use store_api::storage::TableId; +use store_api::storage::{RegionId, RegionNumber, TableId}; +use table::table_name::TableName; +use tokio::sync::watch; use uuid::Uuid; use crate::cache_invalidator::MetasrvCacheInvalidator; use crate::metasrv::MetasrvInfo; use crate::procedure::repartition::group::{Context, PersistentContext, VolatileContext}; use crate::procedure::repartition::plan::RegionDescriptor; +use crate::procedure::repartition::{ + Context as ParentContext, PersistentContext as ParentPersistentContext, RepartitionProcedure, +}; use crate::procedure::test_util::MailboxContext; /// `TestingEnv` provides components during the tests. pub struct TestingEnv { + pub kv_backend: KvBackendRef, pub table_metadata_manager: TableMetadataManagerRef, pub mailbox_ctx: MailboxContext, pub server_addr: String, @@ -45,13 +64,14 @@ impl Default for TestingEnv { impl TestingEnv { pub fn new() -> Self { - let kv_backend = Arc::new(MemoryKvBackend::new()); + let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef; let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); let mailbox_sequence = SequenceBuilder::new("test_heartbeat_mailbox", kv_backend.clone()).build(); let mailbox_ctx = MailboxContext::new(mailbox_sequence); Self { + kv_backend, table_metadata_manager, mailbox_ctx, server_addr: "localhost".to_string(), @@ -76,6 +96,65 @@ impl TestingEnv { volatile_ctx: VolatileContext::default(), } } + + pub fn procedure_context() -> ProcedureContext { + ProcedureContext { + procedure_id: ProcedureId::random(), + provider: Arc::new(MockContextProvider::default()), + } + } + + pub async fn create_physical_table_metadata( + &self, + table_id: TableId, + region_routes: Vec, + ) { + 
self.create_physical_table_metadata_with_wal_options( + table_id, + region_routes, + HashMap::default(), + ) + .await; + } + + pub async fn create_physical_table_metadata_with_wal_options( + &self, + table_id: TableId, + region_routes: Vec, + region_wal_options: HashMap, + ) { + self.table_metadata_manager + .create_table_metadata( + new_test_table_info(table_id), + TableRouteValue::physical(region_routes), + region_wal_options, + ) + .await + .unwrap(); + } + + pub async fn create_physical_table_metadata_for_repartition( + &self, + table_id: TableId, + region_routes: Vec, + region_wal_options: HashMap, + ) { + let mut table_info = new_test_table_info_with_name(table_id, "test_table"); + table_info.meta.column_ids = vec![0, 1, 2]; + + self.table_metadata_manager + .create_table_metadata( + table_info, + TableRouteValue::physical(region_routes), + region_wal_options, + ) + .await + .unwrap(); + } + + pub fn ddl_context(&self, node_manager: NodeManagerRef) -> DdlContext { + new_ddl_context_with_kv_backend(node_manager, self.kv_backend.clone()) + } } pub fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { @@ -84,6 +163,18 @@ pub fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { .and(col(col_name).lt(Value::Int64(end))) } +pub fn test_region_wal_options(region_numbers: &[RegionNumber]) -> HashMap { + let wal_options = serde_json::to_string(&WalOptions::Kafka(KafkaWalOptions { + topic: "test_topic".to_string(), + })) + .unwrap(); + + region_numbers + .iter() + .map(|region_number| (*region_number, wal_options.clone())) + .collect() +} + pub fn new_persistent_context( table_id: TableId, sources: Vec, @@ -105,3 +196,110 @@ pub fn new_persistent_context( timeout: Duration::from_secs(120), } } + +pub fn test_region_route(region_id: RegionId, partition_expr: &str) -> RegionRoute { + RegionRoute { + region: Region { + id: region_id, + partition_expr: partition_expr.to_string(), + ..Default::default() + }, + leader_peer: 
Some(Peer::empty(1)), + ..Default::default() + } +} + +pub async fn current_parent_region_routes(ctx: &ParentContext) -> Vec { + let table_route_value = ctx.get_table_route_value().await.unwrap().into_inner(); + table_route_value.region_routes().unwrap().clone() +} + +pub fn new_parent_context( + env: &TestingEnv, + node_manager: NodeManagerRef, + table_id: TableId, +) -> ParentContext { + let ddl_ctx = env.ddl_context(node_manager); + let persistent_ctx = ParentPersistentContext::new( + TableName::new("test_catalog", "test_schema", "test_table"), + table_id, + None, + ); + + ParentContext::new( + &ddl_ctx, + env.mailbox_ctx.mailbox().clone(), + env.server_addr.clone(), + persistent_ctx, + ) +} + +pub fn assert_parent_state(procedure: &RepartitionProcedure) { + assert!(procedure.state.as_any().is::()); +} + +pub fn extract_subprocedure_ids(status: Status) -> Vec { + let Status::Suspended { subprocedures, .. } = status else { + panic!("expected suspended status"); + }; + + subprocedures + .into_iter() + .map(|procedure| procedure.id) + .collect() +} + +pub fn procedure_state_receiver(state: ProcedureState) -> watch::Receiver { + let (tx, rx) = watch::channel(ProcedureState::Running); + tx.send(state).unwrap(); + rx +} + +pub fn procedure_context_with_receivers( + receivers: HashMap>, +) -> ProcedureContext { + ProcedureContext { + procedure_id: ProcedureId::random(), + provider: Arc::new(ProcedureStateReceiverProvider { + receivers, + inner: MockContextProvider::default(), + }), + } +} + +struct ProcedureStateReceiverProvider { + receivers: HashMap>, + inner: MockContextProvider, +} + +#[async_trait::async_trait] +impl ContextProvider for ProcedureStateReceiverProvider { + async fn procedure_state( + &self, + procedure_id: ProcedureId, + ) -> common_procedure::Result> { + self.inner.procedure_state(procedure_id).await + } + + async fn procedure_state_receiver( + &self, + procedure_id: ProcedureId, + ) -> common_procedure::Result>> { + 
Ok(self.receivers.get(&procedure_id).cloned()) + } + + async fn try_put_poison( + &self, + key: &common_procedure::PoisonKey, + procedure_id: ProcedureId, + ) -> common_procedure::Result<()> { + self.inner.try_put_poison(key, procedure_id).await + } + + async fn acquire_lock( + &self, + key: &common_procedure::StringKey, + ) -> common_procedure::local::DynamicKeyLockGuard { + self.inner.acquire_lock(key).await + } +} diff --git a/src/meta-srv/src/procedure/utils.rs b/src/meta-srv/src/procedure/utils.rs index bea2195573..5ea8e00038 100644 --- a/src/meta-srv/src/procedure/utils.rs +++ b/src/meta-srv/src/procedure/utils.rs @@ -190,6 +190,23 @@ pub(crate) async fn flush_region( operation: "Flush regions", } .fail(), + Err(error::Error::MailboxChannelClosed { .. }) => match error_strategy { + ErrorStrategy::Ignore => { + warn!( + "Failed to flush regions({:?}), the datanode({}) is unreachable(MailboxChannelClosed). Skip flush operation.", + region_ids, datanode + ); + Ok(()) + } + ErrorStrategy::Retry => error::RetryLaterSnafu { + reason: format!( + "Mailbox closed when sending flush region to datanode {:?}, elapsed: {:?}", + datanode, + now.elapsed() + ), + } + .fail()?, + }, Err(err) => Err(err), } } diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs index 300bd34647..942dae1136 100644 --- a/src/metric-engine/src/engine/bulk_insert.rs +++ b/src/metric-engine/src/engine/bulk_insert.rs @@ -528,7 +528,7 @@ mod tests { async fn test_bulk_insert_physical_region_passthrough() { // Use flat format so that BulkMemtable is used (supports write_bulk). let mito_config = MitoConfig { - default_experimental_flat_format: true, + default_flat_format: true, ..Default::default() }; let env = TestEnv::with_mito_config("", mito_config, Default::default()).await; @@ -585,7 +585,7 @@ mod tests { async fn test_bulk_insert_physical_region_empty_batch() { // Use flat format so that BulkMemtable is used (supports write_bulk). 
let mito_config = MitoConfig { - default_experimental_flat_format: true, + default_flat_format: true, ..Default::default() }; let env = TestEnv::with_mito_config("", mito_config, Default::default()).await; diff --git a/src/metric-engine/src/engine/flush.rs b/src/metric-engine/src/engine/flush.rs index 5d7479c5d0..8c0f33aaf3 100644 --- a/src/metric-engine/src/engine/flush.rs +++ b/src/metric-engine/src/engine/flush.rs @@ -121,6 +121,10 @@ mod tests { .map(|path| path.replace(&e.file_id, "")); e.file_id = "".to_string(); e.index_version = 0; + // Round down sizes to nearest 1000 to avoid exact size + // comparisons that break when the SST format changes. + e.file_size = e.file_size / 1000 * 1000; + e.index_file_size = e.index_file_size.map(|s| s / 1000 * 1000); format!("\n{:?}", e) }) .sorted() @@ -129,12 +133,12 @@ mod tests { assert_eq!( debug_format, r#" -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, 
visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, 
num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#, +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3000, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(0), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3000, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(0), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 4000, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, 
region_group: 1, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3000, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3000, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(0), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3000, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#, ); // list from storage let storage_entries = mito diff --git a/src/mito-codec/Cargo.toml b/src/mito-codec/Cargo.toml index 07d64482e0..c5a6625cc2 100644 --- a/src/mito-codec/Cargo.toml +++ b/src/mito-codec/Cargo.toml @@ -15,7 +15,6 @@ common-base.workspace = true common-decimal.workspace = true common-error.workspace = true common-macro.workspace = true -common-query.workspace = true common-recordbatch.workspace = true common-telemetry.workspace = true common-time.workspace = true 
@@ -27,6 +26,7 @@ snafu.workspace = true store-api.workspace = true [dev-dependencies] +common-query.workspace = true criterion = "0.7" datafusion-common.workspace = true datafusion-expr.workspace = true diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index a43fa8a0a6..944c51ebd6 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -848,7 +848,7 @@ impl CompactionSstReaderBuilder<'_> { } fn build_scan_input(self) -> Result { - let mapper = ProjectionMapper::all(&self.metadata, true)?; + let mapper = ProjectionMapper::all(&self.metadata)?; let mut scan_input = ScanInput::new(self.sst_layer, mapper) .with_files(self.inputs.to_vec()) .with_append_mode(self.append_mode) @@ -857,8 +857,7 @@ impl CompactionSstReaderBuilder<'_> { .with_filter_deleted(self.filter_deleted) // We ignore file not found error during compaction. .with_ignore_file_not_found(true) - .with_merge_mode(self.merge_mode) - .with_flat_format(true); + .with_merge_mode(self.merge_mode); // This serves as a workaround of https://github.com/GreptimeTeam/greptimedb/issues/3944 // by converting time ranges into predicate. 
diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index b03e6415e8..ff4317331f 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -322,11 +322,7 @@ impl DefaultCompactor { .region_options .sst_format .map(|format| format == FormatType::Flat) - .unwrap_or( - compaction_region - .engine_config - .default_experimental_flat_format, - ); + .unwrap_or(compaction_region.engine_config.default_flat_format); let index_config = compaction_region.engine_config.index.clone(); let inverted_index_config = compaction_region.engine_config.inverted_index.clone(); diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index da0ec74022..120b5adbe3 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -33,8 +33,6 @@ use crate::memtable::MemtableConfig; use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; const MULTIPART_UPLOAD_MINIMUM_SIZE: ReadableSize = ReadableSize::mb(5); -/// Default channel size for parallel scan task. -pub(crate) const DEFAULT_SCAN_CHANNEL_SIZE: usize = 32; /// Default maximum number of SST files to scan concurrently. pub(crate) const DEFAULT_MAX_CONCURRENT_SCAN_FILES: usize = 384; @@ -93,7 +91,9 @@ pub struct MitoConfig { pub max_background_compactions: usize, /// Max number of running background purge jobs (default: number of cpu cores). pub max_background_purges: usize, - /// Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. + /// Memory budget for compaction tasks. + /// Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%"). + /// Setting it to 0 or "unlimited" disables the limit. pub experimental_compaction_memory_limit: MemoryLimit, /// Behavior when compaction cannot acquire memory from the budget. pub experimental_compaction_on_exhausted: OnExhaustedPolicy, @@ -142,8 +142,6 @@ pub struct MitoConfig { // Other configs: /// Buffer size for SST writing. 
pub sst_write_buffer_size: ReadableSize, - /// Capacity of the channel to send data from parallel scan tasks to the main task (default 32). - pub parallel_scan_channel_size: usize, /// Maximum number of SST files to scan concurrently (default 384). pub max_concurrent_scan_files: usize, /// Whether to allow stale entries read during replay. @@ -177,9 +175,9 @@ pub struct MitoConfig { #[serde(with = "humantime_serde")] pub min_compaction_interval: Duration, - /// Whether to enable experimental flat format as the default format. + /// Whether to enable flat format as the default SST format. /// When enabled, forces using BulkMemtable and BulkMemtableBuilder. - pub default_experimental_flat_format: bool, + pub default_flat_format: bool, pub gc: GcConfig, } @@ -217,7 +215,6 @@ impl Default for MitoConfig { enable_refill_cache_on_read: true, manifest_cache_size: ReadableSize::mb(256), sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, - parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, allow_stale_entries: false, scan_memory_limit: MemoryLimit::default(), @@ -230,7 +227,7 @@ impl Default for MitoConfig { vector_index: VectorIndexConfig::default(), memtable: MemtableConfig::default(), min_compaction_interval: Duration::from_secs(0), - default_experimental_flat_format: false, + default_flat_format: true, gc: GcConfig::default(), }; @@ -295,14 +292,6 @@ impl MitoConfig { ); } - if self.parallel_scan_channel_size < 1 { - self.parallel_scan_channel_size = DEFAULT_SCAN_CHANNEL_SIZE; - warn!( - "Sanitize scan channel size to {}", - self.parallel_scan_channel_size - ); - } - // Sets write cache path if it is empty. 
if self.write_cache_path.trim().is_empty() { self.write_cache_path = data_home.to_string(); diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index d1c30c3ff6..d006067f0d 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -1027,7 +1027,6 @@ impl EngineInner { request, CacheStrategy::EnableAll(cache_manager), ) - .with_parallel_scan_channel_size(self.config.parallel_scan_channel_size) .with_max_concurrent_scan_files(self.config.max_concurrent_scan_files) .with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled()) .with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled()) diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index b8ba06f0b9..05ba5dae25 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -141,7 +141,7 @@ async fn test_alter_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -213,7 +213,7 @@ async fn test_alter_region_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -267,7 +267,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -318,7 +318,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -387,7 +387,7 @@ async fn test_alter_region_retry_with_format(flat_format: bool) { let mut env = TestEnv::new().await; 
let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -457,7 +457,7 @@ async fn test_alter_on_flushing_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -574,7 +574,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -681,7 +681,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -718,7 +718,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -816,7 +816,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -853,7 +853,7 @@ async fn test_alter_region_ttl_options_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -916,7 +916,7 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -994,7 +994,7 @@ async fn 
test_alter_region_sst_format_with_flush() { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }) .await; @@ -1085,7 +1085,7 @@ async fn test_alter_region_sst_format_with_flush() { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }, ) @@ -1118,7 +1118,7 @@ async fn test_alter_region_sst_format_without_flush() { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }) .await; @@ -1203,7 +1203,7 @@ async fn test_alter_region_sst_format_without_flush() { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: false, + default_flat_format: false, ..Default::default() }, ) @@ -1231,6 +1231,250 @@ async fn test_alter_region_sst_format_without_flush() { assert_eq!(expected_all_data, batches.pretty_print().unwrap()); } +#[tokio::test] +async fn test_alter_region_sst_format_flat_to_pk_with_flush() { + common_telemetry::init_default_ut_logging(); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_flat_format: true, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + let table_dir = request.table_dir.clone(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // Inserts some data with flat format + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + // 
Flushes to create SST files with flat format + flush_region(&engine, region_id, None).await; + + let expected_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_data, batches.pretty_print().unwrap()); + + // Alters sst_format from flat to primary_key + let alter_format_request = RegionAlterRequest { + kind: AlterKind::SetRegionOptions { + options: vec![SetRegionOption::Format("primary_key".to_string())], + }, + }; + engine + .handle_request(region_id, RegionRequest::Alter(alter_format_request)) + .await + .unwrap(); + + // Inserts more data after alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(3, 6), + }; + put_rows(&engine, region_id, rows).await; + + // Flushes to create SST files with primary_key format + flush_region(&engine, region_id, None).await; + + let expected_all_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | +| 5 | 5.0 | 1970-01-01T00:00:05 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); + + // Reopens region to verify format persists + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_flat_format: 
false, + ..Default::default() + }, + ) + .await; + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir, + path_type: PathType::Bare, + options: HashMap::default(), + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); +} + +#[tokio::test] +async fn test_alter_region_sst_format_flat_to_pk_without_flush() { + common_telemetry::init_default_ut_logging(); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_flat_format: true, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + let table_dir = request.table_dir.clone(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let check_format = |engine: &MitoEngine, expected: Option| { + let current_format = engine + .get_region(region_id) + .unwrap() + .version() + .options + .sst_format; + assert_eq!(current_format, expected); + }; + check_format(&engine, Some(FormatType::Flat)); + + // Inserts some data with flat format + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + // Alters sst_format from flat to primary_key + let alter_format_request = RegionAlterRequest { + kind: AlterKind::SetRegionOptions { + options: vec![SetRegionOption::Format("primary_key".to_string())], + }, + }; + engine + 
.handle_request(region_id, RegionRequest::Alter(alter_format_request)) + .await + .unwrap(); + + check_format(&engine, Some(FormatType::PrimaryKey)); + + // Inserts more data after alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(3, 6), + }; + put_rows(&engine, region_id, rows).await; + + let expected_all_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | +| 5 | 5.0 | 1970-01-01T00:00:05 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); + + // Reopens region to verify format persists + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_flat_format: false, + ..Default::default() + }, + ) + .await; + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir, + path_type: PathType::Bare, + options: HashMap::default(), + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + + check_format(&engine, Some(FormatType::PrimaryKey)); + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); +} + #[tokio::test] async fn test_alter_region_append_mode_with_flush() { common_telemetry::init_default_ut_logging(); diff --git a/src/mito2/src/engine/append_mode_test.rs b/src/mito2/src/engine/append_mode_test.rs index 61488b6592..fa7db1f573 100644 --- 
a/src/mito2/src/engine/append_mode_test.rs +++ b/src/mito2/src/engine/append_mode_test.rs @@ -44,7 +44,7 @@ async fn test_append_mode_write_query_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -112,7 +112,7 @@ async fn test_append_mode_compaction_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -211,7 +211,7 @@ async fn test_append_mode_compaction_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -238,7 +238,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -329,7 +329,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool) .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -376,7 +376,7 @@ async fn test_put_single_range_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -474,7 +474,7 @@ async fn test_put_single_range_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git 
a/src/mito2/src/engine/apply_staging_manifest_test.rs b/src/mito2/src/engine/apply_staging_manifest_test.rs index 401e6572a2..a82fcfe049 100644 --- a/src/mito2/src/engine/apply_staging_manifest_test.rs +++ b/src/mito2/src/engine/apply_staging_manifest_test.rs @@ -62,7 +62,7 @@ async fn test_apply_staging_manifest_invalid_region_state_with_format(flat_forma let mut env = TestEnv::with_prefix("invalid-region-state").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -125,7 +125,7 @@ async fn test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_ let mut env = TestEnv::with_prefix("mismatched-partition-expr").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -205,7 +205,7 @@ async fn test_apply_staging_manifest_success_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("success").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -406,7 +406,7 @@ async fn test_apply_staging_manifest_invalid_files_to_add_with_format(flat_forma let mut env = TestEnv::with_prefix("invalid-files-to-add").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -483,7 +483,7 @@ async fn test_apply_staging_manifest_change_edit_different_columns_fails_with_fo let mut env = TestEnv::with_prefix("apply-change-edit-different-columns").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -599,7 +599,7 @@ async fn 
test_apply_staging_manifest_preserves_unflushed_memtable_with_format(fl let mut env = TestEnv::with_prefix("apply-preserve-memtable").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index ed92c2b4ac..5c2bd4fd4e 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -56,7 +56,7 @@ async fn test_engine_new_stop_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("engine-stop").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -93,7 +93,7 @@ async fn test_write_to_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("write-to-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -134,7 +134,7 @@ async fn test_region_replay_with_format(factory: Option, flat_f .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -169,7 +169,7 @@ async fn test_region_replay_with_format(factory: Option, flat_f .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -234,7 +234,7 @@ async fn test_write_query_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -278,7 +278,7 @@ async fn test_different_order_with_format(flat_format: bool) { 
let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -339,7 +339,7 @@ async fn test_different_order_and_type_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -403,7 +403,7 @@ async fn test_put_delete_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -465,7 +465,7 @@ async fn test_delete_not_null_fields_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -524,7 +524,7 @@ async fn test_put_overwrite_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -594,7 +594,7 @@ async fn test_absent_and_invalid_columns_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -650,7 +650,7 @@ async fn test_region_usage_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("region_usage").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -716,7 +716,7 @@ async fn 
test_engine_with_write_cache_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let path = env.data_home().to_str().unwrap().to_string(); let mito_config = MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() } .enable_write_cache(path, ReadableSize::mb(512), None); @@ -765,7 +765,7 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, vector_cache_size: ReadableSize::mb(32), ..Default::default() }) @@ -896,7 +896,7 @@ async fn test_list_ssts_with_format( let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -1002,7 +1002,7 @@ async fn test_all_index_metas_list_all_types_with_format(flat_format: bool, expe let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/batch_catchup_test.rs b/src/mito2/src/engine/batch_catchup_test.rs index d8c744a733..dc0b552adc 100644 --- a/src/mito2/src/engine/batch_catchup_test.rs +++ b/src/mito2/src/engine/batch_catchup_test.rs @@ -49,7 +49,7 @@ async fn test_batch_catchup_with_format(factory: Option, flat_f .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -135,7 +135,7 @@ async fn test_batch_catchup_with_format(factory: Option, flat_f .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -216,7 +216,7 
@@ async fn test_batch_catchup_err_with_format(factory: Option, fl .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/batch_open_test.rs b/src/mito2/src/engine/batch_open_test.rs index c718ef248c..6b16b3c120 100644 --- a/src/mito2/src/engine/batch_open_test.rs +++ b/src/mito2/src/engine/batch_open_test.rs @@ -49,7 +49,7 @@ async fn test_batch_open_with_format(factory: Option, flat_form .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -157,7 +157,7 @@ async fn test_batch_open_with_format(factory: Option, flat_form .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -193,7 +193,7 @@ async fn test_batch_open_err_with_format(factory: Option, flat_ .with_log_store_factory(factory.clone()); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/bump_committed_sequence_test.rs b/src/mito2/src/engine/bump_committed_sequence_test.rs index 00d2c0f51c..12db0044c5 100644 --- a/src/mito2/src/engine/bump_committed_sequence_test.rs +++ b/src/mito2/src/engine/bump_committed_sequence_test.rs @@ -35,7 +35,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -97,7 +97,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -136,7 +136,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/engine/catchup_test.rs b/src/mito2/src/engine/catchup_test.rs index 718462e8a8..e10e91b51b 100644 --- a/src/mito2/src/engine/catchup_test.rs +++ b/src/mito2/src/engine/catchup_test.rs @@ -701,7 +701,7 @@ async fn test_catchup_not_exist_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/close_test.rs b/src/mito2/src/engine/close_test.rs index 965a4f6fff..4c06583b0b 100644 --- a/src/mito2/src/engine/close_test.rs +++ b/src/mito2/src/engine/close_test.rs @@ -29,7 +29,7 @@ async fn test_engine_close_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("close").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/compaction_test.rs b/src/mito2/src/engine/compaction_test.rs index df8521535f..cbcad3a58a 100644 --- a/src/mito2/src/engine/compaction_test.rs +++ b/src/mito2/src/engine/compaction_test.rs @@ -147,7 +147,7 @@ async fn test_compaction_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -223,7 +223,7 @@ async fn test_infer_compaction_time_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env 
.create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -374,7 +374,7 @@ async fn test_compaction_overlapping_files_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -445,7 +445,7 @@ async fn test_compaction_region_with_overlapping_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -503,7 +503,7 @@ async fn test_compaction_region_with_overlapping_delete_all_with_format(flat_for let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -571,7 +571,7 @@ async fn test_readonly_during_compaction_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, // Ensure there is only one background worker for purge task. 
max_background_purges: 1, ..Default::default() @@ -730,7 +730,7 @@ async fn test_compaction_update_time_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -836,7 +836,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -938,7 +938,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -981,7 +981,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -1040,7 +1040,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/engine/copy_region_from_test.rs b/src/mito2/src/engine/copy_region_from_test.rs index e9f8398302..0cf2686fca 100644 --- a/src/mito2/src/engine/copy_region_from_test.rs +++ b/src/mito2/src/engine/copy_region_from_test.rs @@ -41,7 +41,7 @@ async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index: let mut env = TestEnv::with_prefix("copy-region-from").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) 
.await; @@ -156,7 +156,7 @@ async fn test_engine_copy_region_failure_with_format(flat_format: bool) { let mut env = TestEnv::new().await.with_mock_layer(mock_layer); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -283,7 +283,7 @@ async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -328,7 +328,7 @@ async fn test_engine_copy_region_unexpected_state_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/create_test.rs b/src/mito2/src/engine/create_test.rs index e5980d9442..6dff346539 100644 --- a/src/mito2/src/engine/create_test.rs +++ b/src/mito2/src/engine/create_test.rs @@ -36,7 +36,7 @@ async fn test_engine_create_new_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("new-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -61,7 +61,7 @@ async fn test_engine_create_existing_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("create-existing").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -91,7 +91,7 @@ async fn test_engine_create_close_create_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("create-close-create").await; let engine = env .create_engine(MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -131,7 +131,7 @@ async fn test_engine_create_with_different_id_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -160,7 +160,7 @@ async fn test_engine_create_with_different_schema_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -190,7 +190,7 @@ async fn test_engine_create_with_different_primary_key_with_format(flat_format: let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -220,7 +220,7 @@ async fn test_engine_create_with_options_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -253,7 +253,7 @@ async fn test_engine_create_with_custom_store_with_format(flat_format: bool) { let engine = env .create_engine_with_multiple_object_stores( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -301,7 +301,7 @@ async fn test_engine_create_with_memtable_opts_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -353,7 +353,7 @@ async fn create_with_partition_expr_persists_manifest_with_format(flat_format: b let mut env 
= TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -401,7 +401,7 @@ async fn test_engine_create_with_format_one_case(create_format: &str, default_fl let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: default_flat_format, + default_flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/drop_test.rs b/src/mito2/src/engine/drop_test.rs index b3da775117..a34a5d1172 100644 --- a/src/mito2/src/engine/drop_test.rs +++ b/src/mito2/src/engine/drop_test.rs @@ -45,7 +45,7 @@ async fn test_engine_drop_region_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -175,7 +175,7 @@ async fn test_engine_drop_region_for_custom_store_with_format(flat_format: bool) let engine = env .create_engine_with_multiple_object_stores( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, diff --git a/src/mito2/src/engine/edit_region_test.rs b/src/mito2/src/engine/edit_region_test.rs index 01bdf60070..4a92d3494f 100644 --- a/src/mito2/src/engine/edit_region_test.rs +++ b/src/mito2/src/engine/edit_region_test.rs @@ -54,7 +54,7 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) { let (tx, mut rx) = oneshot::channel(); let config = MitoConfig { min_compaction_interval: Duration::from_secs(60 * 60), - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }; let time_provider = Arc::new(MockTimeProvider::new(current_time_millis())); @@ -154,7 +154,7 @@ async fn test_edit_region_fill_cache_with_format(flat_format: bool) { MitoConfig { // Write cache must be enabled to download the 
ingested SST file. enable_write_cache: true, - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -268,7 +268,7 @@ async fn test_edit_region_concurrently_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, // Suppress the compaction to not impede the speed of this kinda stress testing. min_compaction_interval: Duration::from_secs(60 * 60), ..Default::default() diff --git a/src/mito2/src/engine/filter_deleted_test.rs b/src/mito2/src/engine/filter_deleted_test.rs index c40fc7ba02..497583b8bc 100644 --- a/src/mito2/src/engine/filter_deleted_test.rs +++ b/src/mito2/src/engine/filter_deleted_test.rs @@ -36,7 +36,7 @@ async fn test_scan_without_filtering_deleted_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/flush_test.rs b/src/mito2/src/engine/flush_test.rs index 78bae2b461..b86e75c72a 100644 --- a/src/mito2/src/engine/flush_test.rs +++ b/src/mito2/src/engine/flush_test.rs @@ -49,7 +49,7 @@ async fn test_manual_flush_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -112,7 +112,7 @@ async fn test_flush_engine_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -191,7 +191,7 @@ async fn test_write_stall_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - 
default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -274,7 +274,7 @@ async fn test_flush_empty_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -447,7 +447,7 @@ async fn test_auto_flush_engine_with_format(flat_format: bool) { .create_engine_with_time( MitoConfig { auto_flush_interval: Duration::from_secs(60 * 5), - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), @@ -523,7 +523,7 @@ async fn test_flush_workers_with_format(flat_format: bool) { .create_engine_with( MitoConfig { num_workers: 2, - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager.clone()), diff --git a/src/mito2/src/engine/merge_mode_test.rs b/src/mito2/src/engine/merge_mode_test.rs index 097d5e2b91..40a87642ae 100644 --- a/src/mito2/src/engine/merge_mode_test.rs +++ b/src/mito2/src/engine/merge_mode_test.rs @@ -39,7 +39,7 @@ async fn test_merge_mode_write_query_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -107,7 +107,7 @@ async fn test_merge_mode_compaction_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -220,7 +220,7 @@ async fn test_merge_mode_compaction_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + 
default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/engine/open_test.rs b/src/mito2/src/engine/open_test.rs index 5ee25fb9ff..28ad1de71e 100644 --- a/src/mito2/src/engine/open_test.rs +++ b/src/mito2/src/engine/open_test.rs @@ -48,7 +48,7 @@ async fn test_engine_open_empty_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("open-empty").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -87,7 +87,7 @@ async fn test_engine_open_existing_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("open-exiting").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -126,7 +126,7 @@ async fn test_engine_reopen_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("reopen-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -153,7 +153,7 @@ async fn test_engine_open_readonly_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -207,7 +207,7 @@ async fn test_engine_region_open_with_options_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -260,7 +260,7 @@ async fn test_engine_region_open_with_custom_store_with_format(flat_format: bool let engine = env .create_engine_with_multiple_object_stores( MitoConfig { - default_experimental_flat_format: flat_format, + 
default_flat_format: flat_format, ..Default::default() }, None, @@ -332,7 +332,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -376,7 +376,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -415,7 +415,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -462,7 +462,7 @@ async fn test_open_region_wait_for_opening_region_ok_with_format(flat_format: bo let mut env = TestEnv::with_prefix("wait-for-opening-region-ok").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -513,7 +513,7 @@ async fn test_open_region_wait_for_opening_region_err_with_format(flat_format: b let mut env = TestEnv::with_prefix("wait-for-opening-region-err").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -569,7 +569,7 @@ async fn test_open_compaction_region() { async fn test_open_compaction_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let mut mito_config = MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }; mito_config diff --git a/src/mito2/src/engine/parallel_test.rs b/src/mito2/src/engine/parallel_test.rs index cf5b6491a7..b88a60739b 100644 --- 
a/src/mito2/src/engine/parallel_test.rs +++ b/src/mito2/src/engine/parallel_test.rs @@ -33,13 +33,11 @@ async fn scan_in_parallel( region_id: RegionId, table_dir: &str, parallelism: usize, - channel_size: usize, flat_format: bool, ) { let engine = env .open_engine(MitoConfig { - default_experimental_flat_format: flat_format, - parallel_scan_channel_size: channel_size, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -85,7 +83,7 @@ async fn test_parallel_scan_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -146,15 +144,13 @@ async fn test_parallel_scan_with_format(flat_format: bool) { engine.stop().await.unwrap(); - scan_in_parallel(&mut env, region_id, &table_dir, 0, 1, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 0, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 1, 1, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 1, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 2, 1, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 2, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 2, 8, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 4, flat_format).await; - scan_in_parallel(&mut env, region_id, &table_dir, 4, 8, flat_format).await; - - scan_in_parallel(&mut env, region_id, &table_dir, 8, 2, flat_format).await; + scan_in_parallel(&mut env, region_id, &table_dir, 8, flat_format).await; } diff --git a/src/mito2/src/engine/partition_filter_test.rs b/src/mito2/src/engine/partition_filter_test.rs index fdea7d547f..61db52484e 100644 --- a/src/mito2/src/engine/partition_filter_test.rs +++ b/src/mito2/src/engine/partition_filter_test.rs @@ -58,7 +58,7 @@ async fn 
test_partition_filter_basic_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/projection_test.rs b/src/mito2/src/engine/projection_test.rs index 7726005b0b..afa505a3ee 100644 --- a/src/mito2/src/engine/projection_test.rs +++ b/src/mito2/src/engine/projection_test.rs @@ -84,7 +84,7 @@ async fn test_scan_projection_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -141,7 +141,7 @@ async fn test_scan_projection_without_primary_key_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/prune_test.rs b/src/mito2/src/engine/prune_test.rs index beb5e2644a..599547ec8d 100644 --- a/src/mito2/src/engine/prune_test.rs +++ b/src/mito2/src/engine/prune_test.rs @@ -32,7 +32,7 @@ async fn check_prune_row_groups(exprs: Vec, expected: &str, flat_format: b let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -180,7 +180,7 @@ async fn test_prune_memtable_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -264,7 +264,7 @@ async fn test_prune_memtable_complex_expr_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env 
.create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -327,7 +327,7 @@ async fn test_mem_range_prune_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -392,7 +392,7 @@ async fn test_scan_filter_field_after_delete_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/remap_manifests_test.rs b/src/mito2/src/engine/remap_manifests_test.rs index 339896450c..b893eb5b97 100644 --- a/src/mito2/src/engine/remap_manifests_test.rs +++ b/src/mito2/src/engine/remap_manifests_test.rs @@ -37,7 +37,7 @@ async fn test_remap_manifests_invalid_partition_expr_with_format(flat_format: bo let mut env = TestEnv::with_prefix("invalid-partition-expr").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -83,7 +83,7 @@ async fn test_remap_manifests_invalid_region_state_with_format(flat_format: bool let mut env = TestEnv::with_prefix("invalid-region-state").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -123,7 +123,7 @@ async fn test_remap_manifests_invalid_input_regions_with_format(flat_format: boo let mut env = TestEnv::with_prefix("invalid-input-regions").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -166,7 +166,7 @@ async fn 
test_remap_manifests_success_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("engine-stop").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/scan_test.rs b/src/mito2/src/engine/scan_test.rs index 46f4cc6cf2..119b4493fd 100644 --- a/src/mito2/src/engine/scan_test.rs +++ b/src/mito2/src/engine/scan_test.rs @@ -12,11 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; + use api::v1::Rows; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_recordbatch::RecordBatches; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datatypes::arrow::array::AsArray; +use datatypes::arrow::datatypes::{Float64Type, TimestampMillisecondType}; use futures::TryStreamExt; use store_api::region_engine::{PrepareRequest, RegionEngine, RegionScanner}; use store_api::region_request::RegionRequest; @@ -37,7 +41,7 @@ async fn test_scan_with_min_sst_sequence_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_scan_with_min_sst_sequence").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -172,7 +176,7 @@ async fn test_max_concurrent_scan_files() { async fn test_max_concurrent_scan_files_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_max_concurrent_scan_files").await; let config = MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, max_concurrent_scan_files: 2, ..Default::default() }; @@ -222,11 +226,16 @@ async fn test_max_concurrent_scan_files_with_format(flat_format: bool) { } #[tokio::test] -async fn test_series_scan_primarykey() { +async fn test_series_scan() { + 
test_series_scan_with_format(false).await; + test_series_scan_with_format(true).await; +} + +async fn test_series_scan_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("test_series_scan").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: false, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -295,10 +304,27 @@ async fn test_series_scan_primarykey() { }) .unwrap(); + let actual_rows = collect_partition_rows_round_robin(&scanner, 3).await; + + let mut expected_rows = Vec::new(); + for value in [0_i64, 1, 2, 3, 4, 5, 3600, 3601, 3602, 7200, 7201, 7202] { + expected_rows.push((value.to_string(), value as f64, value * 1000)); + } + expected_rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.cmp(&b.2))); + + assert_eq!(expected_rows, actual_rows); +} + +/// Scans all partitions in round-robin fashion and returns rows sorted by (tag, ts). +/// Also asserts that each series appears in only one partition. +async fn collect_partition_rows_round_robin( + scanner: &dyn RegionScanner, + num_partitions: usize, +) -> Vec<(String, f64, i64)> { let metrics_set = ExecutionPlanMetricsSet::default(); - let mut partition_batches = vec![vec![]; 3]; - let mut streams: Vec<_> = (0..3) + let mut partition_batches = vec![vec![]; num_partitions]; + let mut streams: Vec<_> = (0..num_partitions) .map(|partition| { let stream = scanner .scan_partition(&Default::default(), &metrics_set, partition) @@ -309,11 +335,11 @@ async fn test_series_scan_primarykey() { let mut num_done = 0; let mut schema = None; // Pull streams in round-robin fashion to get the consistent output from the sender. 
- while num_done < 3 { + while num_done < num_partitions { if schema.is_none() { schema = Some(streams[0].as_ref().unwrap().schema().clone()); } - for i in 0..3 { + for i in 0..num_partitions { let Some(mut stream) = streams[i].take() else { continue; }; @@ -326,189 +352,54 @@ async fn test_series_scan_primarykey() { } } - let mut check_result = |expected| { - let batches = - RecordBatches::try_new(schema.clone().unwrap(), partition_batches.remove(0)).unwrap(); - assert_eq!(expected, batches.pretty_print().unwrap()); - }; - - // Output series order is 0, 1, 2, 3, 3600, 3601, 3602, 4, 5, 7200, 7201, 7202 - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 0 | 0.0 | 1970-01-01T00:00:00 | -| 3 | 3.0 | 1970-01-01T00:00:03 | -| 3602 | 3602.0 | 1970-01-01T01:00:02 | -| 7200 | 7200.0 | 1970-01-01T02:00:00 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 1 | 1.0 | 1970-01-01T00:00:01 | -| 3600 | 3600.0 | 1970-01-01T01:00:00 | -| 4 | 4.0 | 1970-01-01T00:00:04 | -| 7201 | 7201.0 | 1970-01-01T02:00:01 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 2 | 2.0 | 1970-01-01T00:00:02 | -| 3601 | 3601.0 | 1970-01-01T01:00:01 | -| 5 | 5.0 | 1970-01-01T00:00:05 | -| 7202 | 7202.0 | 1970-01-01T02:00:02 | -+-------+---------+---------------------+"; - check_result(expected); + let schema = schema.unwrap(); + collect_and_assert_partition_rows(schema, partition_batches) } -#[tokio::test] -async fn test_series_scan_flat() { - let mut env = TestEnv::with_prefix("test_series_scan").await; - let engine = env - .create_engine(MitoConfig { - default_experimental_flat_format: true, - 
..Default::default() - }) - .await; +/// Collects rows sorted by (tag, ts) from partition batches. +/// Also asserts that each series appears in only one partition. +fn collect_and_assert_partition_rows( + schema: datatypes::schema::SchemaRef, + partition_batches: Vec>, +) -> Vec<(String, f64, i64)> { + let mut series_to_partition = BTreeMap::new(); + let mut actual_rows = Vec::new(); - let region_id = RegionId::new(1, 1); - let request = CreateRequestBuilder::new() - .insert_option("compaction.type", "twcs") - .insert_option("compaction.twcs.time_window", "1h") - .build(); - let column_schemas = test_util::rows_schema(&request); + for (partition, batches) in partition_batches.into_iter().enumerate() { + let batches = RecordBatches::try_new(schema.clone(), batches).unwrap(); + let mut partition_series = Vec::new(); - engine - .handle_request(region_id, RegionRequest::Create(request)) - .await - .unwrap(); + for batch in batches.iter() { + let tags = batch.column_by_name("tag_0").unwrap().as_string::(); + let fields = batch + .column_by_name("field_0") + .unwrap() + .as_primitive::(); + let ts = batch + .column_by_name("ts") + .unwrap() + .as_primitive::(); - let put_flush_rows = async |start, end| { - let rows = Rows { - schema: column_schemas.clone(), - rows: test_util::build_rows(start, end), - }; - test_util::put_rows(&engine, region_id, rows).await; - test_util::flush_region(&engine, region_id, None).await; - }; - // generates 3 SST files - put_flush_rows(0, 3).await; - put_flush_rows(2, 6).await; - put_flush_rows(3600, 3603).await; - // Put to memtable. 
- let rows = Rows { - schema: column_schemas.clone(), - rows: test_util::build_rows(7200, 7203), - }; - test_util::put_rows(&engine, region_id, rows).await; - - let request = ScanRequest { - distribution: Some(TimeSeriesDistribution::PerSeries), - ..Default::default() - }; - let scanner = engine.scanner(region_id, request).await.unwrap(); - let Scanner::Series(mut scanner) = scanner else { - panic!("Scanner should be series scan"); - }; - // 3 partition ranges for 3 time window. - assert_eq!( - 3, - scanner.properties().partitions[0].len(), - "unexpected ranges: {:?}", - scanner.properties().partitions - ); - let raw_ranges: Vec<_> = scanner - .properties() - .partitions - .iter() - .flatten() - .cloned() - .collect(); - let mut new_ranges = Vec::with_capacity(3); - for range in raw_ranges { - new_ranges.push(vec![range]); - } - scanner - .prepare(PrepareRequest { - ranges: Some(new_ranges), - ..Default::default() - }) - .unwrap(); - - let metrics_set = ExecutionPlanMetricsSet::default(); - - let mut partition_batches = vec![vec![]; 3]; - let mut streams: Vec<_> = (0..3) - .map(|partition| { - let stream = scanner - .scan_partition(&Default::default(), &metrics_set, partition) - .unwrap(); - Some(stream) - }) - .collect(); - let mut num_done = 0; - let mut schema = None; - // Pull streams in round-robin fashion to get the consistent output from the sender. 
- while num_done < 3 { - if schema.is_none() { - schema = Some(streams[0].as_ref().unwrap().schema().clone()); + for row in 0..batch.num_rows() { + let tag = tags.value(row).to_string(); + let field = fields.value(row); + let ts = ts.value(row); + partition_series.push(tag.clone()); + actual_rows.push((tag, field, ts)); + } } - for i in 0..3 { - let Some(mut stream) = streams[i].take() else { - continue; - }; - let Some(rb) = stream.try_next().await.unwrap() else { - num_done += 1; - continue; - }; - partition_batches[i].push(rb); - streams[i] = Some(stream); + + partition_series.sort(); + partition_series.dedup(); + for tag in partition_series { + let prev = series_to_partition.insert(tag.clone(), partition); + assert_eq!( + None, prev, + "series {tag} appears in multiple partitions: {prev:?} and {partition}" + ); } } - let mut check_result = |expected| { - let batches = - RecordBatches::try_new(schema.clone().unwrap(), partition_batches.remove(0)).unwrap(); - assert_eq!(expected, batches.pretty_print().unwrap()); - }; - - // Output series order is 0, 1, 2, 3, 3600, 3601, 3602, 4, 5, 7200, 7201, 7202 - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 0 | 0.0 | 1970-01-01T00:00:00 | -| 1 | 1.0 | 1970-01-01T00:00:01 | -| 2 | 2.0 | 1970-01-01T00:00:02 | -| 3 | 3.0 | 1970-01-01T00:00:03 | -| 7200 | 7200.0 | 1970-01-01T02:00:00 | -| 7201 | 7201.0 | 1970-01-01T02:00:01 | -| 7202 | 7202.0 | 1970-01-01T02:00:02 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | ts | -+-------+---------+---------------------+ -| 3600 | 3600.0 | 1970-01-01T01:00:00 | -| 3601 | 3601.0 | 1970-01-01T01:00:01 | -| 3602 | 3602.0 | 1970-01-01T01:00:02 | -+-------+---------+---------------------+"; - check_result(expected); - - let expected = "\ -+-------+---------+---------------------+ -| tag_0 | field_0 | 
ts | -+-------+---------+---------------------+ -| 4 | 4.0 | 1970-01-01T00:00:04 | -| 5 | 5.0 | 1970-01-01T00:00:05 | -+-------+---------+---------------------+"; - check_result(expected); + actual_rows.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.cmp(&b.2))); + actual_rows } diff --git a/src/mito2/src/engine/set_role_state_test.rs b/src/mito2/src/engine/set_role_state_test.rs index fd90cd99f7..4fb15ab7fe 100644 --- a/src/mito2/src/engine/set_role_state_test.rs +++ b/src/mito2/src/engine/set_role_state_test.rs @@ -70,7 +70,7 @@ async fn test_set_role_state_gracefully_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -141,7 +141,7 @@ async fn test_set_role_state_gracefully_not_exist_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -166,7 +166,7 @@ async fn test_write_downgrading_region_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("write-to-downgrading-region").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -220,7 +220,7 @@ async fn test_unified_state_transitions_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -329,7 +329,7 @@ async fn test_restricted_state_transitions_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() 
}) .await; diff --git a/src/mito2/src/engine/staging_test.rs b/src/mito2/src/engine/staging_test.rs index e47a77bea0..bd90779e0b 100644 --- a/src/mito2/src/engine/staging_test.rs +++ b/src/mito2/src/engine/staging_test.rs @@ -72,7 +72,7 @@ async fn test_staging_state_integration_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -130,7 +130,7 @@ async fn test_staging_blocks_alter_operations_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -171,7 +171,7 @@ async fn test_staging_blocks_truncate_operations_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -308,7 +308,7 @@ async fn test_staging_write_partition_expr_version_with_format(flat_format: bool let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -505,7 +505,7 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -657,7 +657,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ 
-883,7 +883,7 @@ async fn test_enter_staging_writes_partition_expr_change_action_with_format(flat let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -947,7 +947,7 @@ async fn test_staging_exit_conflict_partition_expr_change_and_change_with_format let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -1032,7 +1032,7 @@ async fn test_write_stall_on_enter_staging_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, None, @@ -1156,7 +1156,7 @@ async fn test_enter_staging_error(env: &mut TestEnv, flat_format: bool) { let partition_expr = default_partition_expr(); let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/sync_test.rs b/src/mito2/src/engine/sync_test.rs index 6c3b91c130..17d73b1848 100644 --- a/src/mito2/src/engine/sync_test.rs +++ b/src/mito2/src/engine/sync_test.rs @@ -80,7 +80,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -112,7 +112,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { // Open the region on the follower engine let follower_engine = env .create_follower_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -189,7 +189,7 @@ async fn 
test_sync_after_alter_region_with_format(flat_format: bool) { let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -224,7 +224,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) { // Open the region on the follower engine let follower_engine = env .create_follower_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; diff --git a/src/mito2/src/engine/truncate_test.rs b/src/mito2/src/engine/truncate_test.rs index 223cc2b488..818da17faa 100644 --- a/src/mito2/src/engine/truncate_test.rs +++ b/src/mito2/src/engine/truncate_test.rs @@ -41,7 +41,7 @@ async fn test_engine_truncate_region_basic_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-basic").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -104,7 +104,7 @@ async fn test_engine_put_data_after_truncate_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-put").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -180,7 +180,7 @@ async fn test_engine_truncate_after_flush_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-flush").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }) .await; @@ -270,7 +270,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) { let mut env = TestEnv::with_prefix("truncate-reopen").await; let engine = env .create_engine(MitoConfig { - default_experimental_flat_format: flat_format, + 
default_flat_format: flat_format, ..Default::default() }) .await; @@ -310,7 +310,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) @@ -355,7 +355,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) { let engine = env .create_engine_with( MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, Some(write_buffer_manager), @@ -436,7 +436,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) { .reopen_engine( engine, MitoConfig { - default_experimental_flat_format: flat_format, + default_flat_format: flat_format, ..Default::default() }, ) diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index fedac95d27..7be81dec8d 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -634,7 +634,7 @@ impl RegionFlushTask { .options .sst_format .map(|f| f == FormatType::Flat) - .unwrap_or(self.engine_config.default_experimental_flat_format); + .unwrap_or(self.engine_config.default_flat_format); SstWriteRequest { op_type: OperationType::Flush, metadata: version.metadata.clone(), diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index 154d062e07..e1494aa47b 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -421,7 +421,7 @@ impl MemtableBuilderProvider { let flat_format = options .sst_format .map(|format| format == FormatType::Flat) - .unwrap_or(self.config.default_experimental_flat_format); + .unwrap_or(self.config.default_flat_format); if flat_format { if options.memtable.is_some() { common_telemetry::info!( diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs index 84931b9f37..db7dfd1958 100644 --- a/src/mito2/src/read.rs +++ b/src/mito2/src/read.rs @@ -175,6 +175,7 @@ impl Batch { } /// Create an empty [`Batch`]. 
+ #[allow(dead_code)] pub(crate) fn empty() -> Self { Self { primary_key: vec![], @@ -677,6 +678,7 @@ impl Batch { /// Checks the batch is monotonic by timestamps. #[cfg(debug_assertions)] + #[allow(dead_code)] pub(crate) fn check_monotonic(&self) -> Result<(), String> { use std::cmp::Ordering; if self.timestamps_native().is_none() { @@ -719,6 +721,7 @@ impl Batch { /// Returns Ok if the given batch is behind the current batch. #[cfg(debug_assertions)] + #[allow(dead_code)] pub(crate) fn check_next_batch(&self, other: &Batch) -> Result<(), String> { // Checks the primary key if self.primary_key() < other.primary_key() { @@ -798,6 +801,7 @@ impl Batch { /// A struct to check the batch is monotonic. #[cfg(debug_assertions)] #[derive(Default)] +#[allow(dead_code)] pub(crate) struct BatchChecker { last_batch: Option, start: Option, @@ -805,6 +809,7 @@ pub(crate) struct BatchChecker { } #[cfg(debug_assertions)] +#[allow(dead_code)] impl BatchChecker { /// Attaches the given start timestamp to the checker. pub(crate) fn with_start(mut self, start: Option) -> Self { diff --git a/src/mito2/src/read/compat.rs b/src/mito2/src/read/compat.rs index fd88749827..90d664a4bd 100644 --- a/src/mito2/src/read/compat.rs +++ b/src/mito2/src/read/compat.rs @@ -98,6 +98,7 @@ pub(crate) enum CompatBatch { impl CompatBatch { /// Returns the inner primary key batch adapter if this is a PrimaryKey format. 
+ #[allow(dead_code)] pub(crate) fn as_primary_key(&self) -> Option<&PrimaryKeyCompatBatch> { match self { CompatBatch::PrimaryKey(batch) => Some(batch), @@ -980,7 +981,6 @@ mod tests { use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; use datatypes::value::ValueRef; - use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, UInt8Vector, UInt64Vector}; use mito_codec::row_converter::{ DensePrimaryKeyCodec, PrimaryKeyCodecExt, SparsePrimaryKeyCodec, }; @@ -992,7 +992,6 @@ mod tests { use crate::read::flat_projection::FlatProjectionMapper; use crate::sst::parquet::flat_format::FlatReadFormat; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; - use crate::test_util::{VecBatchReader, check_reader_result}; /// Creates a new [RegionMetadata]. fn new_metadata( @@ -1053,44 +1052,6 @@ mod tests { buffer } - /// Creates a batch for specific primary `key`. - /// - /// `fields`: [(column_id of the field, is null)] - fn new_batch( - primary_key: &[u8], - fields: &[(ColumnId, bool)], - start_ts: i64, - num_rows: usize, - ) -> Batch { - let timestamps = Arc::new(TimestampMillisecondVector::from_values( - start_ts..start_ts + num_rows as i64, - )); - let sequences = Arc::new(UInt64Vector::from_values(0..num_rows as u64)); - let op_types = Arc::new(UInt8Vector::from_vec(vec![OpType::Put as u8; num_rows])); - let field_columns = fields - .iter() - .map(|(id, is_null)| { - let data = if *is_null { - Arc::new(Int64Vector::from(vec![None; num_rows])) - } else { - Arc::new(Int64Vector::from_vec(vec![*id as i64; num_rows])) - }; - BatchColumn { - column_id: *id, - data, - } - }) - .collect(); - Batch::new( - primary_key.to_vec(), - timestamps, - sequences, - op_types, - field_columns, - ) - .unwrap() - } - #[test] fn test_invalid_pk_len() { let reader_meta = new_metadata( @@ -1213,311 +1174,6 @@ mod tests { assert!(may_compat_fields(&mapper, &reader_meta).unwrap().is_none()) } - #[tokio::test] - async fn test_compat_reader() { - let 
reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (3, SemanticType::Tag, ConcreteDataType::string_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1, 3], - )); - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - let k1 = encode_key(&[Some("a"), None]); - let k2 = encode_key(&[Some("b"), None]); - check_reader_result( - &mut compat_reader, - &[ - new_batch(&k1, &[(2, false), (4, true)], 1000, 3), - new_batch(&k2, &[(2, false), (4, true)], 1000, 3), - ], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_different_order() { - let reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (3, SemanticType::Field, ConcreteDataType::int64_datatype()), - (2, SemanticType::Field, 
ConcreteDataType::int64_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[ - new_batch(&k1, &[(3, true), (2, false), (4, true)], 1000, 3), - new_batch(&k2, &[(3, true), (2, false), (4, true)], 1000, 3), - ], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_different_types() { - let actual_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::string_datatype()), - ], - &[1], - )); - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let fn_batch_cast = |batch: Batch| { - let mut new_fields = batch.fields.clone(); - new_fields[0].data = new_fields[0] - .data - .cast(&ConcreteDataType::string_datatype()) - .unwrap(); - - batch.with_fields(new_fields).unwrap() - }; - let mut compat_reader = CompatReader::new(&mapper, actual_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[ - 
fn_batch_cast(new_batch(&k1, &[(2, false)], 1000, 3)), - fn_batch_cast(new_batch(&k2, &[(2, false)], 1000, 3)), - ], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_projection() { - let reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (3, SemanticType::Field, ConcreteDataType::int64_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - // tag_1, field_2, field_3 - let mapper = ProjectionMapper::new(&expect_meta, [1, 3, 2].into_iter(), false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let source_reader = VecBatchReader::new(&[new_batch(&k1, &[(2, false)], 1000, 3)]); - - let mut compat_reader = - CompatReader::new(&mapper, reader_meta.clone(), source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[new_batch(&k1, &[(3, true), (2, false)], 1000, 3)], - ) - .await; - - // tag_1, field_4, field_3 - let mapper = ProjectionMapper::new(&expect_meta, [1, 4, 2].into_iter(), false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let source_reader = VecBatchReader::new(&[new_batch(&k1, &[], 1000, 3)]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[new_batch(&k1, &[(3, true), (4, true)], 1000, 3)], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_projection_read_superset() { - let reader_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - 
ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - let expect_meta = Arc::new(new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (3, SemanticType::Field, ConcreteDataType::int64_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (4, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - )); - // Output: tag_1, field_3, field_2. Read also includes field_4. - let mapper = ProjectionMapper::new_with_read_columns( - &expect_meta, - [1, 3, 2].into_iter(), - false, - vec![1, 3, 2, 4], - ) - .unwrap(); - let k1 = encode_key(&[Some("a")]); - let source_reader = VecBatchReader::new(&[new_batch(&k1, &[(2, false)], 1000, 3)]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - check_reader_result( - &mut compat_reader, - &[new_batch(&k1, &[(3, true), (2, false), (4, true)], 1000, 3)], - ) - .await; - } - - #[tokio::test] - async fn test_compat_reader_different_pk_encoding() { - let mut reader_meta = new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - ], - &[1], - ); - reader_meta.primary_key_encoding = PrimaryKeyEncoding::Dense; - let reader_meta = Arc::new(reader_meta); - let mut expect_meta = new_metadata( - &[ - ( - 0, - SemanticType::Timestamp, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - (1, SemanticType::Tag, ConcreteDataType::string_datatype()), - (2, SemanticType::Field, ConcreteDataType::int64_datatype()), - (3, SemanticType::Tag, ConcreteDataType::string_datatype()), - (4, SemanticType::Field, 
ConcreteDataType::int64_datatype()), - ], - &[1, 3], - ); - expect_meta.primary_key_encoding = PrimaryKeyEncoding::Sparse; - let expect_meta = Arc::new(expect_meta); - - let mapper = ProjectionMapper::all(&expect_meta, false).unwrap(); - let k1 = encode_key(&[Some("a")]); - let k2 = encode_key(&[Some("b")]); - let source_reader = VecBatchReader::new(&[ - new_batch(&k1, &[(2, false)], 1000, 3), - new_batch(&k2, &[(2, false)], 1000, 3), - ]); - - let mut compat_reader = CompatReader::new(&mapper, reader_meta, source_reader).unwrap(); - let k1 = encode_sparse_key(&[(1, Some("a")), (3, None)]); - let k2 = encode_sparse_key(&[(1, Some("b")), (3, None)]); - check_reader_result( - &mut compat_reader, - &[ - new_batch(&k1, &[(2, false), (4, true)], 1000, 3), - new_batch(&k2, &[(2, false), (4, true)], 1000, 3), - ], - ) - .await; - } - /// Creates a primary key array for flat format testing. fn build_flat_test_pk_array(primary_keys: &[&[u8]]) -> ArrayRef { let mut builder = BinaryDictionaryBuilder::::new(); diff --git a/src/mito2/src/read/flat_merge.rs b/src/mito2/src/read/flat_merge.rs index 90df227ae9..946f2a610c 100644 --- a/src/mito2/src/read/flat_merge.rs +++ b/src/mito2/src/read/flat_merge.rs @@ -19,9 +19,10 @@ use std::time::Instant; use async_stream::try_stream; use common_telemetry::debug; -use datatypes::arrow::array::{Int64Array, UInt64Array}; +use datatypes::arrow::array::{Array, AsArray, Int64Array, UInt64Array}; use datatypes::arrow::compute::interleave; -use datatypes::arrow::datatypes::SchemaRef; +use datatypes::arrow::datatypes::{ArrowNativeType, BinaryType, DataType, SchemaRef, Utf8Type}; +use datatypes::arrow::error::ArrowError; use datatypes::arrow::record_batch::RecordBatch; use datatypes::arrow_array::BinaryArray; use datatypes::timestamp::timestamp_array_to_primitive; @@ -39,6 +40,62 @@ use crate::sst::parquet::flat_format::{ }; use crate::sst::parquet::format::PrimaryKeyArray; +/// Checks whether interleaving the selected rows from byte columns would 
overflow +/// i32 offsets. Similar to arrow-rs `interleave_bytes()`, accumulates offsets and +/// returns an error if the capacity exceeds `i32::MAX`. +/// +/// TODO(yingwen): Remove this after upgrading to arrow >= 58.1.0, which handles +/// offset overflow in `interleave_bytes()` natively. +/// +/// See: +fn check_interleave_bytes_overflow( + batches: &[(usize, RecordBatch)], + col_idx: usize, + indices: &[(usize, usize)], +) -> std::result::Result<(), ArrowError> { + // Quick check: if concatenating all value data won't overflow, interleaving + // a subset of rows definitely won't either. + let total: usize = batches + .iter() + .map(|(_, batch)| batch.column(col_idx).as_bytes::().value_data().len()) + .sum(); + if T::Offset::from_usize(total).is_some() { + return Ok(()); + } + // Total exceeds the offset limit, do the precise per-row check. + let mut capacity: usize = 0; + for &(a, b) in indices { + let array = batches[a].1.column(col_idx).as_bytes::(); + let o = array.value_offsets(); + let element_len = o[b + 1].as_usize() - o[b].as_usize(); + capacity += element_len; + T::Offset::from_usize(capacity).ok_or(ArrowError::OffsetOverflowError(capacity))?; + } + Ok(()) +} + +/// Checks whether `interleave()` would overflow i32 offsets for `Utf8` or `Binary` columns. 
+fn check_interleave_overflow( + batches: &[(usize, RecordBatch)], + schema: &SchemaRef, + indices: &[(usize, usize)], +) -> Result<()> { + for (col_idx, field) in schema.fields.iter().enumerate() { + match field.data_type() { + DataType::Utf8 => { + check_interleave_bytes_overflow::(batches, col_idx, indices) + .context(ComputeArrowSnafu)?; + } + DataType::Binary => { + check_interleave_bytes_overflow::(batches, col_idx, indices) + .context(ComputeArrowSnafu)?; + } + _ => continue, + } + } + Ok(()) +} + /// Keeps track of the current position in a batch #[derive(Debug, Copy, Clone, Default)] struct BatchCursor { @@ -121,6 +178,8 @@ impl BatchBuilder { return Ok(None); } + check_interleave_overflow(&self.batches, &self.schema, &self.indices)?; + let columns = (0..self.schema.fields.len()) .map(|column_idx| { let arrays: Vec<_> = self diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index 1dc4102311..e087e12094 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -45,6 +45,7 @@ use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupRea /// /// This reader is different from the [MergeMode](crate::region::options::MergeMode) as /// it focus on time series (the same key). +#[allow(dead_code)] pub(crate) struct LastRowReader { /// Inner reader. reader: BoxedBatchReader, @@ -52,6 +53,7 @@ pub(crate) struct LastRowReader { selector: LastRowSelector, } +#[allow(dead_code)] impl LastRowReader { /// Creates a new `LastRowReader`. 
pub(crate) fn new(reader: BoxedBatchReader) -> Self { diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs index b5b6904521..d22c87bcc2 100644 --- a/src/mito2/src/read/projection.rs +++ b/src/mito2/src/read/projection.rs @@ -52,51 +52,27 @@ impl ProjectionMapper { pub fn new( metadata: &RegionMetadataRef, projection: impl Iterator + Clone, - flat_format: bool, ) -> Result { - if flat_format { - Ok(ProjectionMapper::Flat(FlatProjectionMapper::new( - metadata, projection, - )?)) - } else { - Ok(ProjectionMapper::PrimaryKey( - PrimaryKeyProjectionMapper::new(metadata, projection)?, - )) - } + Ok(ProjectionMapper::Flat(FlatProjectionMapper::new( + metadata, projection, + )?)) } /// Returns a new mapper with output projection and explicit read columns. pub fn new_with_read_columns( metadata: &RegionMetadataRef, projection: impl Iterator, - flat_format: bool, read_column_ids: Vec, ) -> Result { let projection: Vec<_> = projection.collect(); - if flat_format { - Ok(ProjectionMapper::Flat( - FlatProjectionMapper::new_with_read_columns(metadata, projection, read_column_ids)?, - )) - } else { - Ok(ProjectionMapper::PrimaryKey( - PrimaryKeyProjectionMapper::new_with_read_columns( - metadata, - projection, - read_column_ids, - )?, - )) - } + Ok(ProjectionMapper::Flat( + FlatProjectionMapper::new_with_read_columns(metadata, projection, read_column_ids)?, + )) } /// Returns a new mapper without projection. - pub fn all(metadata: &RegionMetadataRef, flat_format: bool) -> Result { - if flat_format { - Ok(ProjectionMapper::Flat(FlatProjectionMapper::all(metadata)?)) - } else { - Ok(ProjectionMapper::PrimaryKey( - PrimaryKeyProjectionMapper::all(metadata)?, - )) - } + pub fn all(metadata: &RegionMetadataRef) -> Result { + Ok(ProjectionMapper::Flat(FlatProjectionMapper::all(metadata)?)) } /// Returns the metadata that created the mapper. 
@@ -159,6 +135,7 @@ impl ProjectionMapper { } /// Handles projection and converts a projected [Batch] to a projected [RecordBatch]. +#[allow(dead_code)] pub struct PrimaryKeyProjectionMapper { /// Metadata of the region. metadata: RegionMetadataRef, @@ -178,6 +155,7 @@ pub struct PrimaryKeyProjectionMapper { is_empty_projection: bool, } +#[allow(dead_code)] impl PrimaryKeyProjectionMapper { /// Returns a new mapper with projection. /// If `projection` is empty, it outputs [RecordBatch] without any column but only a row count. @@ -413,6 +391,7 @@ pub(crate) fn read_column_ids_from_projection( /// Index of a vector in a [Batch]. #[derive(Debug, Clone, Copy)] +#[allow(dead_code)] enum BatchIndex { /// Index in primary keys. Tag((usize, ColumnId)), @@ -480,53 +459,6 @@ mod tests { }; use super::*; - use crate::cache::CacheManager; - use crate::read::BatchBuilder; - - fn new_batch( - ts_start: i64, - tags: &[i64], - fields: &[(ColumnId, i64)], - num_rows: usize, - ) -> Batch { - let converter = DensePrimaryKeyCodec::with_fields( - (0..tags.len()) - .map(|idx| { - ( - idx as u32, - SortField::new(ConcreteDataType::int64_datatype()), - ) - }) - .collect(), - ); - let primary_key = converter - .encode(tags.iter().map(|v| ValueRef::Int64(*v))) - .unwrap(); - - let mut builder = BatchBuilder::new(primary_key); - builder - .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values( - (0..num_rows).map(|i| ts_start + i as i64 * 1000), - ))) - .unwrap() - .sequences_array(Arc::new(UInt64Array::from_iter_values(0..num_rows as u64))) - .unwrap() - .op_types_array(Arc::new(UInt8Array::from_iter_values( - (0..num_rows).map(|_| OpType::Put as u8), - ))) - .unwrap(); - for (column_id, field) in fields { - builder - .push_field_array( - *column_id, - Arc::new(Int64Array::from_iter_values(std::iter::repeat_n( - *field, num_rows, - ))), - ) - .unwrap(); - } - builder.build().unwrap() - } fn print_record_batch(record_batch: RecordBatch) -> String { 
pretty::pretty_format_batches(&[record_batch.into_df_record_batch()]) @@ -534,166 +466,6 @@ mod tests { .to_string() } - #[test] - fn test_projection_mapper_all() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Create the enum wrapper with default format (primary key) - let mapper = ProjectionMapper::all(&metadata, false).unwrap(); - assert_eq!([0, 1, 2, 3, 4], mapper.column_ids()); - assert_eq!( - [ - (3, ConcreteDataType::int64_datatype()), - (4, ConcreteDataType::int64_datatype()) - ], - mapper.as_primary_key().unwrap().batch_fields() - ); - - // With vector cache. - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let batch = new_batch(0, &[1, 2], &[(3, 3), (4, 4)], 3); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - let expect = "\ -+---------------------+----+----+----+----+ -| ts | k0 | k1 | v0 | v1 | -+---------------------+----+----+----+----+ -| 1970-01-01T00:00:00 | 1 | 2 | 3 | 4 | -| 1970-01-01T00:00:01 | 1 | 2 | 3 | 4 | -| 1970-01-01T00:00:02 | 1 | 2 | 3 | 4 | -+---------------------+----+----+----+----+"; - assert_eq!(expect, print_record_batch(record_batch)); - - assert!( - cache - .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(1)) - .is_some() - ); - assert!( - cache - .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(2)) - .is_some() - ); - assert!( - cache - .get_repeated_vector(&ConcreteDataType::int64_datatype(), &Value::Int64(3)) - .is_none() - ); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - assert_eq!(expect, print_record_batch(record_batch)); - } - - #[test] - fn test_projection_mapper_with_projection() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Columns v1, k0 
- let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), false).unwrap(); - assert_eq!([4, 1], mapper.column_ids()); - assert_eq!( - [(4, ConcreteDataType::int64_datatype())], - mapper.as_primary_key().unwrap().batch_fields() - ); - - let batch = new_batch(0, &[1, 2], &[(4, 4)], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - let expect = "\ -+----+----+ -| v1 | k0 | -+----+----+ -| 4 | 1 | -| 4 | 1 | -| 4 | 1 | -+----+----+"; - assert_eq!(expect, print_record_batch(record_batch)); - } - - #[test] - fn test_projection_mapper_read_superset() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Output columns v1, k0. Read also includes v0. - let mapper = ProjectionMapper::new_with_read_columns( - &metadata, - [4, 1].into_iter(), - false, - vec![4, 1, 3], - ) - .unwrap(); - assert_eq!([4, 1, 3], mapper.column_ids()); - - let batch = new_batch(0, &[1, 2], &[(3, 3), (4, 4)], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let record_batch = mapper - .as_primary_key() - .unwrap() - .convert(&batch, &cache) - .unwrap(); - let expect = "\ -+----+----+ -| v1 | k0 | -+----+----+ -| 4 | 1 | -| 4 | 1 | -| 4 | 1 | -+----+----+"; - assert_eq!(expect, print_record_batch(record_batch)); - } - - #[test] - fn test_projection_mapper_empty_projection() { - let metadata = Arc::new( - TestRegionMetadataBuilder::default() - .num_tags(2) - .num_fields(2) - .build(), - ); - // Empty projection - let mapper = ProjectionMapper::new(&metadata, [].into_iter(), false).unwrap(); - assert_eq!([0], mapper.column_ids()); // Should still read the time index column - assert!(mapper.output_schema().is_empty()); - let pk_mapper = 
mapper.as_primary_key().unwrap(); - assert!(pk_mapper.batch_fields().is_empty()); - assert!(!pk_mapper.has_tags); - assert!(pk_mapper.batch_indices.is_empty()); - assert!(pk_mapper.is_empty_projection); - - let batch = new_batch(0, &[1, 2], &[], 3); - let cache = CacheManager::builder().vector_cache_size(1024).build(); - let cache = CacheStrategy::EnableAll(Arc::new(cache)); - let record_batch = pk_mapper.convert(&batch, &cache).unwrap(); - assert_eq!(3, record_batch.num_rows()); - assert_eq!(0, record_batch.num_columns()); - assert!(record_batch.schema.is_empty()); - } - fn new_flat_batch( ts_start: Option, idx_tags: &[(usize, i64)], @@ -809,7 +581,7 @@ mod tests { .build(), ); let cache = CacheStrategy::Disabled; - let mapper = ProjectionMapper::all(&metadata, true).unwrap(); + let mapper = ProjectionMapper::all(&metadata).unwrap(); assert_eq!([0, 1, 2, 3, 4], mapper.column_ids()); assert_eq!( [ @@ -845,7 +617,7 @@ mod tests { ); let cache = CacheStrategy::Disabled; // Columns v1, k0 - let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter()).unwrap(); assert_eq!([4, 1], mapper.column_ids()); assert_eq!( [ @@ -879,13 +651,9 @@ mod tests { ); let cache = CacheStrategy::Disabled; // Output columns v1, k0. Read also includes v0. 
- let mapper = ProjectionMapper::new_with_read_columns( - &metadata, - [4, 1].into_iter(), - true, - vec![4, 1, 3], - ) - .unwrap(); + let mapper = + ProjectionMapper::new_with_read_columns(&metadata, [4, 1].into_iter(), vec![4, 1, 3]) + .unwrap(); assert_eq!([4, 1, 3], mapper.column_ids()); let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3); @@ -911,7 +679,7 @@ mod tests { ); let cache = CacheStrategy::Disabled; // Empty projection - let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [].into_iter()).unwrap(); assert_eq!([0], mapper.column_ids()); // Should still read the time index column assert!(mapper.output_schema().is_empty()); let flat_mapper = mapper.as_flat().unwrap(); diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index 6766bf3f38..55ad504e6f 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -30,11 +30,13 @@ use crate::sst::file::FileTimeRange; use crate::sst::parquet::file_range::FileRangeContextRef; use crate::sst::parquet::reader::{FlatRowGroupReader, ReaderMetrics, RowGroupReader}; +#[allow(dead_code)] pub enum Source { RowGroup(RowGroupReader), LastRow(RowGroupLastRowCachedReader), } +#[allow(dead_code)] impl Source { async fn next_batch(&mut self) -> Result> { match self { @@ -44,6 +46,7 @@ impl Source { } } +#[allow(dead_code)] pub struct PruneReader { /// Context for file ranges. 
context: FileRangeContextRef, @@ -53,6 +56,7 @@ pub struct PruneReader { skip_fields: bool, } +#[allow(dead_code)] impl PruneReader { pub(crate) fn new_with_row_group_reader( ctx: FileRangeContextRef, diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs index 5fc8931691..2431a21f6a 100644 --- a/src/mito2/src/read/range_cache.rs +++ b/src/mito2/src/read/range_cache.rs @@ -515,7 +515,7 @@ mod tests { ) -> (StreamContext, PartitionRange) { let env = SchedulerEnv::new().await; let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); - let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); let file_id = FileId::random(); let file = sst_file_handle_with_file_id( @@ -527,8 +527,7 @@ mod tests { .with_predicate(predicate) .with_time_range(query_time_range) .with_files(vec![file]) - .with_cache(test_cache_strategy()) - .with_flat_format(true); + .with_cache(test_cache_strategy()); let range_meta = RangeMeta { time_range: partition_time_range, indices: smallvec![SourceIndex { diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index e7cae7e7b8..c447685822 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -46,7 +46,7 @@ use tokio_stream::wrappers::ReceiverStream; use crate::access_layer::AccessLayerRef; use crate::cache::CacheStrategy; -use crate::config::{DEFAULT_MAX_CONCURRENT_SCAN_FILES, DEFAULT_SCAN_CHANNEL_SIZE}; +use crate::config::DEFAULT_MAX_CONCURRENT_SCAN_FILES; use crate::error::{InvalidPartitionExprSnafu, InvalidRequestSnafu, Result}; #[cfg(feature = "enterprise")] use crate::extension::{BoxedExtensionRange, BoxedExtensionRangeProvider}; @@ -63,7 +63,6 @@ use crate::read::unordered_scan::UnorderedScan; use crate::read::{Batch, BoxedRecordBatchStream, RecordBatch, 
Source}; use crate::region::options::MergeMode; use crate::region::version::VersionRef; -use crate::sst::FormatType; use crate::sst::file::FileHandle; use crate::sst::index::bloom_filter::applier::{ BloomFilterIndexApplierBuilder, BloomFilterIndexApplierRef, @@ -77,8 +76,6 @@ use crate::sst::index::vector_index::applier::{VectorIndexApplier, VectorIndexAp use crate::sst::parquet::file_range::PreFilterMode; use crate::sst::parquet::reader::ReaderMetrics; -/// Parallel scan channel size for flat format. -const FLAT_SCAN_CHANNEL_SIZE: usize = 2; #[cfg(feature = "vector_index")] const VECTOR_INDEX_OVERFETCH_MULTIPLIER: usize = 2; @@ -222,8 +219,6 @@ pub(crate) struct ScanRegion { request: ScanRequest, /// Cache. cache_strategy: CacheStrategy, - /// Capacity of the channel to send data from parallel scan tasks to the main task. - parallel_scan_channel_size: usize, /// Maximum number of SST files to scan concurrently. max_concurrent_scan_files: usize, /// Whether to ignore inverted index. @@ -254,7 +249,6 @@ impl ScanRegion { access_layer, request, cache_strategy, - parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, ignore_inverted_index: false, ignore_fulltext_index: false, @@ -266,16 +260,6 @@ impl ScanRegion { } } - /// Sets parallel scan task channel size. - #[must_use] - pub(crate) fn with_parallel_scan_channel_size( - mut self, - parallel_scan_channel_size: usize, - ) -> Self { - self.parallel_scan_channel_size = parallel_scan_channel_size; - self - } - /// Sets maximum number of SST files to scan concurrently. #[must_use] pub(crate) fn with_max_concurrent_scan_files( @@ -399,19 +383,12 @@ impl ScanRegion { self.request.distribution == Some(TimeSeriesDistribution::PerSeries) } - /// Returns true if the region use flat format. 
- fn use_flat_format(&self) -> bool { - self.request.force_flat_format - || self.version.options.sst_format.unwrap_or_default() == FormatType::Flat - } - /// Creates a scan input. #[tracing::instrument(skip_all, fields(region_id = %self.region_id()))] - async fn scan_input(mut self) -> Result { + async fn scan_input(self) -> Result { let sst_min_sequence = self.request.sst_min_sequence.and_then(NonZeroU64::new); let time_range = self.build_time_range_predicate(); let predicate = PredicateGroup::new(&self.version.metadata, &self.request.filters)?; - let flat_format = self.use_flat_format(); let read_column_ids = match &self.request.projection { Some(p) => self.build_read_column_ids(p, &predicate)?, @@ -429,10 +406,9 @@ impl ScanRegion { Some(p) => ProjectionMapper::new_with_read_columns( &self.version.metadata, p.iter().copied(), - flat_format, read_column_ids.clone(), )?, - None => ProjectionMapper::all(&self.version.metadata, flat_format)?, + None => ProjectionMapper::all(&self.version.metadata)?, }; let ssts = &self.version.ssts; @@ -496,14 +472,13 @@ impl ScanRegion { let region_id = self.region_id(); debug!( - "Scan region {}, request: {:?}, time range: {:?}, memtables: {}, ssts_to_read: {}, append_mode: {}, flat_format: {}", + "Scan region {}, request: {:?}, time range: {:?}, memtables: {}, ssts_to_read: {}, append_mode: {}", region_id, self.request, time_range, mem_range_builders.len(), files.len(), self.version.options.append_mode, - flat_format, ); let (non_field_filters, field_filters) = self.partition_by_field_filters(); @@ -530,11 +505,6 @@ impl ScanRegion { } }); - if flat_format { - // The batch is already large enough so we use a small channel size here. 
- self.parallel_scan_channel_size = FLAT_SCAN_CHANNEL_SIZE; - } - let input = ScanInput::new(self.access_layer, mapper) .with_time_range(Some(time_range)) .with_predicate(predicate) @@ -544,7 +514,6 @@ impl ScanRegion { .with_inverted_index_appliers(inverted_index_appliers) .with_bloom_filter_index_appliers(bloom_filter_appliers) .with_fulltext_index_appliers(fulltext_index_appliers) - .with_parallel_scan_channel_size(self.parallel_scan_channel_size) .with_max_concurrent_scan_files(self.max_concurrent_scan_files) .with_start_time(self.start_time) .with_append_mode(self.version.options.append_mode) @@ -552,7 +521,9 @@ impl ScanRegion { .with_merge_mode(self.version.options.merge_mode()) .with_series_row_selector(self.request.series_row_selector) .with_distribution(self.request.distribution) - .with_flat_format(flat_format); + .with_explain_flat_format( + self.version.options.sst_format == Some(crate::sst::FormatType::Flat), + ); #[cfg(feature = "vector_index")] let input = input .with_vector_index_applier(vector_index_applier) @@ -829,8 +800,6 @@ pub struct ScanInput { pub(crate) cache_strategy: CacheStrategy, /// Ignores file not found error. ignore_file_not_found: bool, - /// Capacity of the channel to send data from parallel scan tasks to the main task. - pub(crate) parallel_scan_channel_size: usize, /// Maximum number of SST files to scan concurrently. pub(crate) max_concurrent_scan_files: usize, /// Index appliers. @@ -855,8 +824,8 @@ pub struct ScanInput { pub(crate) series_row_selector: Option, /// Hint for the required distribution of the scanner. pub(crate) distribution: Option, - /// Whether to use flat format. - pub(crate) flat_format: bool, + /// Whether the region's configured SST format is flat. + explain_flat_format: bool, /// Whether this scan is for compaction. 
pub(crate) compaction: bool, #[cfg(feature = "enterprise")] @@ -878,7 +847,6 @@ impl ScanInput { files: Vec::new(), cache_strategy: CacheStrategy::Disabled, ignore_file_not_found: false, - parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, inverted_index_appliers: [None, None], bloom_filter_index_appliers: [None, None], @@ -893,7 +861,7 @@ impl ScanInput { merge_mode: MergeMode::default(), series_row_selector: None, distribution: None, - flat_format: false, + explain_flat_format: false, compaction: false, #[cfg(feature = "enterprise")] extension_ranges: Vec::new(), @@ -943,16 +911,6 @@ impl ScanInput { self } - /// Sets scan task channel size. - #[must_use] - pub(crate) fn with_parallel_scan_channel_size( - mut self, - parallel_scan_channel_size: usize, - ) -> Self { - self.parallel_scan_channel_size = parallel_scan_channel_size; - self - } - /// Sets maximum number of SST files to scan concurrently. #[must_use] pub(crate) fn with_max_concurrent_scan_files( @@ -1049,6 +1007,13 @@ impl ScanInput { self } + /// Sets whether the region's configured SST format is flat for explain output. + #[must_use] + pub(crate) fn with_explain_flat_format(mut self, explain_flat_format: bool) -> Self { + self.explain_flat_format = explain_flat_format; + self + } + /// Sets the time series row selector. #[must_use] pub(crate) fn with_series_row_selector( @@ -1059,13 +1024,6 @@ impl ScanInput { self } - /// Sets whether to use flat format. - #[must_use] - pub(crate) fn with_flat_format(mut self, flat_format: bool) -> Self { - self.flat_format = flat_format; - self - } - /// Sets whether this scan is for compaction. 
#[must_use] pub(crate) fn with_compaction(mut self, compaction: bool) -> Self { @@ -1087,6 +1045,7 @@ impl ScanInput { &self, sources: Vec, semaphore: Arc, + channel_size: usize, ) -> Result> { if sources.len() <= 1 { return Ok(sources); @@ -1096,7 +1055,7 @@ impl ScanInput { let sources = sources .into_iter() .map(|source| { - let (sender, receiver) = mpsc::channel(self.parallel_scan_channel_size); + let (sender, receiver) = mpsc::channel(channel_size); self.spawn_scan_task(source, semaphore.clone(), sender); let stream = Box::pin(ReceiverStream::new(receiver)); Source::Stream(stream) @@ -1165,7 +1124,6 @@ impl ScanInput { }; let res = reader .expected_metadata(Some(self.mapper.metadata().clone())) - .flat_format(self.flat_format) .compaction(self.compaction) .pre_filter_mode(filter_mode) .decode_primary_key_values(decode_pk_values) @@ -1272,6 +1230,7 @@ impl ScanInput { &self, sources: Vec, semaphore: Arc, + channel_size: usize, ) -> Result> { if sources.len() <= 1 { return Ok(sources); @@ -1281,7 +1240,7 @@ impl ScanInput { let sources = sources .into_iter() .map(|source| { - let (sender, receiver) = mpsc::channel(self.parallel_scan_channel_size); + let (sender, receiver) = mpsc::channel(channel_size); self.spawn_flat_scan_task(source, semaphore.clone(), sender); let stream = Box::pin(ReceiverStream::new(receiver)); Box::pin(stream) as _ @@ -1421,8 +1380,7 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { /// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible /// for partition range caching. 
pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option { - let eligible = input.flat_format - && !input.compaction + let eligible = !input.compaction && !input.files.is_empty() && matches!(input.cache_strategy, CacheStrategy::EnableAll(_)); @@ -1709,8 +1667,7 @@ impl StreamContext { .entries(self.input.files.iter().map(|file| FileWrapper { file })) .finish()?; } - write!(f, ", \"flat_format\": {}", self.input.flat_format)?; - + write!(f, ", \"flat_format\": {}", self.input.explain_flat_format)?; #[cfg(feature = "enterprise")] self.format_extension_ranges(f)?; @@ -1881,9 +1838,7 @@ mod tests { use crate::cache::CacheManager; use crate::memtable::time_partition::TimePartitions; use crate::read::range_cache::ScanRequestFingerprintBuilder; - use crate::region::options::RegionOptions; use crate::region::version::VersionBuilder; - use crate::sst::FormatType; use crate::test_util::memtable_util::{EmptyMemtableBuilder, metadata_with_primary_key}; use crate::test_util::scheduler_util::SchedulerEnv; @@ -1897,30 +1852,9 @@ mod tests { Arc::new(VersionBuilder::new(metadata, mutable).build()) } - fn new_version_with_sst_format( - metadata: RegionMetadataRef, - sst_format: Option, - ) -> VersionRef { - let mutable = Arc::new(TimePartitions::new( - metadata.clone(), - Arc::new(EmptyMemtableBuilder::default()), - 0, - None, - )); - let options = RegionOptions { - sst_format, - ..Default::default() - }; - Arc::new( - VersionBuilder::new(metadata, mutable) - .options(options) - .build(), - ) - } - async fn new_scan_input(metadata: RegionMetadataRef, filters: Vec) -> ScanInput { let env = SchedulerEnv::new().await; - let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); let file = FileHandle::new( crate::sst::file::FileMeta::default(), @@ -1934,7 +1868,6 @@ mod tests { 
.range_result_cache_size(1024) .build(), ))) - .with_flat_format(true) .with_files(vec![file]) } @@ -2018,45 +1951,6 @@ mod tests { assert_eq!(vec![4, 1, 3], read_ids); } - #[tokio::test] - async fn test_use_flat_format_honors_request_override() { - let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); - let env = SchedulerEnv::new().await; - - let primary_key_version = - new_version_with_sst_format(metadata.clone(), Some(FormatType::PrimaryKey)); - let request = ScanRequest::default(); - let scan_region = ScanRegion::new( - primary_key_version.clone(), - env.access_layer.clone(), - request, - CacheStrategy::Disabled, - ); - assert!(!scan_region.use_flat_format()); - - let request = ScanRequest { - force_flat_format: true, - ..Default::default() - }; - let scan_region = ScanRegion::new( - primary_key_version, - env.access_layer.clone(), - request, - CacheStrategy::Disabled, - ); - assert!(scan_region.use_flat_format()); - - let flat_version = new_version_with_sst_format(metadata, Some(FormatType::Flat)); - let request = ScanRequest::default(); - let scan_region = ScanRegion::new( - flat_version, - env.access_layer.clone(), - request, - CacheStrategy::Disabled, - ); - assert!(scan_region.use_flat_format()); - } - /// Helper to create a timestamp millisecond literal. 
fn ts_lit(val: i64) -> datafusion_expr::Expr { lit(ScalarValue::TimestampMillisecond(Some(val), None)) @@ -2128,17 +2022,11 @@ mod tests { let disabled = ScanInput::new( SchedulerEnv::new().await.access_layer.clone(), - ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(), + ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(), ) - .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap()) - .with_flat_format(true); + .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap()); assert!(build_scan_fingerprint(&disabled).is_none()); - let non_flat = new_scan_input(metadata.clone(), filters.clone()) - .await - .with_flat_format(false); - assert!(build_scan_fingerprint(&non_flat).is_none()); - let compaction = new_scan_input(metadata.clone(), filters.clone()) .await .with_compaction(true); diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index d065657242..597f592de6 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -43,16 +43,16 @@ use crate::read::merge::{MergeMetrics, MergeMetricsReport}; use crate::read::pruner::PartitionPruner; use crate::read::range::{RangeMeta, RowGroupIndex}; use crate::read::scan_region::StreamContext; -use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source}; +use crate::read::{BoxedRecordBatchStream, ScannerMetrics}; use crate::sst::file::{FileTimeRange, RegionFileId}; use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics; use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics; use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics; -use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE; use crate::sst::parquet::file_range::FileRange; use crate::sst::parquet::flat_format::time_index_column_index; use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics}; use 
crate::sst::parquet::row_group::ParquetFetchMetrics; +use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE}; /// Per-file scan metrics. #[derive(Default, Clone)] @@ -1186,45 +1186,6 @@ pub(crate) struct SeriesDistributorMetrics { pub(crate) divider_cost: Duration, } -/// Scans memtable ranges at `index`. -#[tracing::instrument( - skip_all, - fields( - region_id = %stream_ctx.input.region_metadata().region_id, - file_or_mem_index = %index.index, - row_group_index = %index.row_group_index, - source = "mem" - ) -)] -pub(crate) fn scan_mem_ranges( - stream_ctx: Arc, - part_metrics: PartitionMetrics, - index: RowGroupIndex, - time_range: FileTimeRange, -) -> impl Stream> { - try_stream! { - let ranges = stream_ctx.input.build_mem_ranges(index); - part_metrics.inc_num_mem_ranges(ranges.len()); - for range in ranges { - let build_reader_start = Instant::now(); - let mem_scan_metrics = Some(MemScanMetrics::default()); - let iter = range.build_prune_iter(time_range, mem_scan_metrics.clone())?; - part_metrics.inc_build_reader_cost(build_reader_start.elapsed()); - - let mut source = Source::Iter(iter); - while let Some(batch) = source.next_batch().await? { - yield batch; - } - - // Report the memtable scan metrics to partition metrics - if let Some(ref metrics) = mem_scan_metrics { - let data = metrics.data(); - part_metrics.report_mem_scan_metrics(&data); - } - } - } -} - /// Scans memtable ranges at `index` using flat format that returns RecordBatch. #[tracing::instrument( skip_all, @@ -1270,15 +1231,19 @@ const NUM_SERIES_THRESHOLD: u64 = 10240; /// 60 samples per hour. const BATCH_SIZE_THRESHOLD: u64 = 50; -/// Returns true if splitting flat record batches may improve merge performance. +/// Returns the estimated rows per batch after splitting if splitting flat record batches +/// may improve merge performance. Returns `None` if splitting is not beneficial. 
pub(crate) fn should_split_flat_batches_for_merge( stream_ctx: &Arc, range_meta: &RangeMeta, -) -> bool { +) -> Option { // Number of files to split and scan. let mut num_files_to_split = 0; let mut num_mem_rows = 0; let mut num_mem_series = 0; + // Total rows and series for estimating batch size after splitting. + let mut total_rows: u64 = 0; + let mut total_series: u64 = 0; // Checks each file range, returns early if any range is not splittable. // For mem ranges, we collect the total number of rows and series because the number of rows in a // mem range may be too small. @@ -1300,23 +1265,49 @@ pub(crate) fn should_split_flat_batches_for_merge( debug_assert!(file.meta_ref().num_rows > 0); if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) { // We can't split batches in a file. - return false; + return None; } else { num_files_to_split += 1; + total_rows += file.meta_ref().num_rows; + total_series += file.meta_ref().num_series; } } // Skips non-file and non-mem ranges. } - if num_files_to_split > 0 { + let should_split = if num_files_to_split > 0 { // We mainly consider file ranges because they have enough data for sampling. true - } else if num_mem_series > 0 && num_mem_rows > 0 { - // If we don't have files to scan, we check whether to split by the memtable. - can_split_series(num_mem_rows as u64, num_mem_series as u64) + } else if num_mem_series > 0 + && num_mem_rows > 0 + && can_split_series(num_mem_rows as u64, num_mem_series as u64) + { + total_rows += num_mem_rows as u64; + total_series += num_mem_series as u64; + true } else { false + }; + + if !should_split { + return None; } + + // Estimate rows per batch after splitting. + let estimated_batch_size = if total_series > 0 && total_rows > 0 { + ((total_rows / total_series) as usize).clamp(1, DEFAULT_READ_BATCH_SIZE) + } else { + // No valid estimate available, use a conservative fallback. 
+ DEFAULT_READ_BATCH_SIZE / 4 + }; + Some(estimated_batch_size) +} + +/// Computes the channel size for parallel scan based on the estimated rows per batch. +/// The channel should buffer approximately `2 * DEFAULT_READ_BATCH_SIZE` rows. +pub(crate) fn compute_parallel_channel_size(estimated_rows_per_batch: usize) -> usize { + let size = 2 * DEFAULT_READ_BATCH_SIZE / estimated_rows_per_batch.max(1); + size.clamp(2, 64) } fn can_split_series(num_rows: u64, num_series: u64) -> bool { @@ -1342,59 +1333,6 @@ fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics { } } -/// Scans file ranges at `index`. -#[tracing::instrument( - skip_all, - fields( - region_id = %stream_ctx.input.region_metadata().region_id, - row_group_index = %index.index, - source = read_type - ) -)] -pub(crate) async fn scan_file_ranges( - stream_ctx: Arc, - part_metrics: PartitionMetrics, - index: RowGroupIndex, - read_type: &'static str, - partition_pruner: Arc, -) -> Result>> { - let mut reader_metrics = ReaderMetrics { - filter_metrics: new_filter_metrics(part_metrics.explain_verbose()), - ..Default::default() - }; - let ranges = partition_pruner - .build_file_ranges(index, &part_metrics, &mut reader_metrics) - .await?; - part_metrics.inc_num_file_ranges(ranges.len()); - part_metrics.merge_reader_metrics(&reader_metrics, None); - - // Creates initial per-file metrics with build_part_cost. - let init_per_file_metrics = if part_metrics.explain_verbose() { - let file = stream_ctx.input.file_from_index(index); - let file_id = file.file_id(); - - let mut map = HashMap::new(); - map.insert( - file_id, - FileScanMetrics { - build_part_cost: reader_metrics.build_cost, - ..Default::default() - }, - ); - Some(map) - } else { - None - }; - - Ok(build_file_range_scan_stream( - stream_ctx, - part_metrics, - read_type, - ranges, - init_per_file_metrics, - )) -} - /// Scans file ranges at `index` using flat reader that returns RecordBatch. 
#[tracing::instrument( skip_all, @@ -1448,70 +1386,6 @@ pub(crate) async fn scan_flat_file_ranges( )) } -/// Build the stream of scanning the input [`FileRange`]s. -#[tracing::instrument( - skip_all, - fields(read_type = read_type, range_count = ranges.len()) -)] -pub fn build_file_range_scan_stream( - stream_ctx: Arc, - part_metrics: PartitionMetrics, - read_type: &'static str, - ranges: SmallVec<[FileRange; 2]>, - mut per_file_metrics: Option>, -) -> impl Stream> { - try_stream! { - let fetch_metrics = if part_metrics.explain_verbose() { - Some(Arc::new(ParquetFetchMetrics::default())) - } else { - None - }; - let reader_metrics = &mut ReaderMetrics { - fetch_metrics: fetch_metrics.clone(), - ..Default::default() - }; - for range in ranges { - let build_reader_start = Instant::now(); - let Some(reader) = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await? else { - continue; - }; - let build_cost = build_reader_start.elapsed(); - part_metrics.inc_build_reader_cost(build_cost); - let compat_batch = range.compat_batch(); - let mut source = Source::PruneReader(reader); - while let Some(mut batch) = source.next_batch().await? { - if let Some(compact_batch) = compat_batch { - batch = compact_batch.as_primary_key().unwrap().compat_batch(batch)?; - } - yield batch; - } - if let Source::PruneReader(reader) = source { - let prune_metrics = reader.metrics(); - - // Update per-file metrics if tracking is enabled - if let Some(file_metrics_map) = per_file_metrics.as_mut() { - let file_id = range.file_handle().file_id(); - let file_metrics = file_metrics_map - .entry(file_id) - .or_insert_with(FileScanMetrics::default); - - file_metrics.num_ranges += 1; - file_metrics.num_rows += prune_metrics.num_rows; - file_metrics.build_reader_cost += build_cost; - file_metrics.scan_cost += prune_metrics.scan_cost; - } - - reader_metrics.merge_from(&prune_metrics); - } - } - - // Reports metrics. 
- reader_metrics.observe_rows(read_type); - reader_metrics.filter_metrics.observe(); - part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref()); - } -} - /// Build the stream of scanning the input [`FileRange`]s using flat reader that returns RecordBatch. #[tracing::instrument( skip_all, @@ -1591,47 +1465,6 @@ pub fn build_flat_file_range_scan_stream( } } -/// Build the stream of scanning the extension range denoted by the [`RowGroupIndex`]. -#[cfg(feature = "enterprise")] -pub(crate) async fn scan_extension_range( - context: Arc, - index: RowGroupIndex, - partition_metrics: PartitionMetrics, -) -> Result { - use snafu::ResultExt; - - let range = context.input.extension_range(index.index); - let reader = range.reader(context.as_ref()); - let stream = reader - .read(context, partition_metrics, index) - .await - .context(crate::error::ScanExternalRangeSnafu)?; - Ok(stream) -} - -pub(crate) async fn maybe_scan_other_ranges( - context: &Arc, - index: RowGroupIndex, - metrics: &PartitionMetrics, -) -> Result { - #[cfg(feature = "enterprise")] - { - scan_extension_range(context.clone(), index, metrics.clone()).await - } - - #[cfg(not(feature = "enterprise"))] - { - let _ = context; - let _ = index; - let _ = metrics; - - crate::error::UnexpectedSnafu { - reason: "no other ranges scannable", - } - .fail() - } -} - /// Build the stream of scanning the extension range in flat format denoted by the [`RowGroupIndex`]. 
#[cfg(feature = "enterprise")] pub(crate) async fn scan_flat_extension_range( @@ -1752,3 +1585,235 @@ pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeq batches.push_back(record_batch.slice(start, rows_in_batch)); } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::time::Instant; + + use common_time::Timestamp; + use smallvec::{SmallVec, smallvec}; + use store_api::storage::RegionId; + + use super::*; + use crate::cache::CacheStrategy; + use crate::memtable::{ + BoxedBatchIterator, BoxedRecordBatchIterator, IterBuilder, MemtableRange, + MemtableRangeContext, MemtableStats, + }; + use crate::read::projection::ProjectionMapper; + use crate::read::range::{MemRangeBuilder, SourceIndex}; + use crate::read::scan_region::ScanInput; + use crate::sst::file::{FileHandle, FileMeta}; + use crate::sst::file_purger::NoopFilePurger; + use crate::test_util::memtable_util::metadata_for_test; + use crate::test_util::scheduler_util::SchedulerEnv; + + struct EmptyIterBuilder; + + impl IterBuilder for EmptyIterBuilder { + fn build(&self, _metrics: Option) -> Result { + Ok(Box::new(std::iter::empty())) + } + + fn is_record_batch(&self) -> bool { + true + } + + fn build_record_batch( + &self, + _time_range: Option<(Timestamp, Timestamp)>, + _metrics: Option, + ) -> Result { + Ok(Box::new(std::iter::empty())) + } + } + + async fn new_test_stream_ctx( + files: Vec, + memtables: Vec, + ) -> Arc { + let env = SchedulerEnv::new().await; + let metadata = metadata_for_test(); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter()).unwrap(); + let input = ScanInput::new(env.access_layer.clone(), mapper) + .with_cache(CacheStrategy::Disabled) + .with_memtables(memtables) + .with_files(files); + + Arc::new(StreamContext { + input, + ranges: Vec::new(), + scan_fingerprint: None, + query_start: Instant::now(), + }) + } + + fn new_test_file(num_rows: u64, num_series: u64) -> FileHandle { + let meta = FileMeta { + region_id: RegionId::new(123, 
456), + file_id: Default::default(), + time_range: ( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + ), + num_rows, + num_series, + ..Default::default() + }; + FileHandle::new(meta, Arc::new(NoopFilePurger)) + } + + fn new_test_memtable(num_rows: usize, series_count: usize) -> MemRangeBuilder { + let context = Arc::new(MemtableRangeContext::new( + 0, + Box::new(EmptyIterBuilder), + Default::default(), + )); + let stats = MemtableStats { + time_range: Some(( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + )), + num_rows, + num_ranges: 1, + series_count, + ..Default::default() + }; + let range = MemtableRange::new(context, stats.clone()); + MemRangeBuilder::new(range, stats) + } + + fn new_test_range_meta(row_group_indices: SmallVec<[RowGroupIndex; 2]>) -> RangeMeta { + let indices = row_group_indices + .iter() + .map(|row_group_index| SourceIndex { + index: row_group_index.index, + num_row_groups: 1, + }) + .collect(); + + RangeMeta { + time_range: ( + Timestamp::new_millisecond(0), + Timestamp::new_millisecond(1000), + ), + indices, + row_group_indices, + num_rows: 0, + } + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_uses_splittable_file_rows_per_series() { + let num_rows = SPLIT_ROW_THRESHOLD * 2; + let num_series = (num_rows / 100).max(1); + let stream_ctx = + new_test_stream_ctx(vec![new_test_file(num_rows, num_series)], vec![]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + Some((num_rows / num_series) as usize), + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_skips_small_or_unknown_series_files() { + let stream_ctx = new_test_stream_ctx( + vec![ + new_test_file(SPLIT_ROW_THRESHOLD.saturating_sub(1), 1), + new_test_file(SPLIT_ROW_THRESHOLD * 2, 0), + ], + vec![], + ) + .await; + let range_meta = 
new_test_range_meta(smallvec![ + RowGroupIndex { + index: 0, + row_group_index: 0, + }, + RowGroupIndex { + index: 1, + row_group_index: 0, + } + ]); + + assert_eq!( + None, + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_returns_none_for_unsplittable_file() { + let num_series = + (SPLIT_ROW_THRESHOLD / (BATCH_SIZE_THRESHOLD - 1)).max(NUM_SERIES_THRESHOLD) + 1; + let stream_ctx = + new_test_stream_ctx(vec![new_test_file(SPLIT_ROW_THRESHOLD, num_series)], vec![]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + None, + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_falls_back_to_memtables() { + let stream_ctx = new_test_stream_ctx(vec![], vec![new_test_memtable(5_000, 100)]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + Some(50), + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[tokio::test] + async fn test_should_split_flat_batches_for_merge_clamps_estimate() { + let stream_ctx = + new_test_stream_ctx(vec![new_test_file(SPLIT_ROW_THRESHOLD * 2, 1)], vec![]).await; + let range_meta = new_test_range_meta(smallvec![RowGroupIndex { + index: 0, + row_group_index: 0, + }]); + + assert_eq!( + Some(DEFAULT_READ_BATCH_SIZE), + should_split_flat_batches_for_merge(&stream_ctx, &range_meta) + ); + } + + #[test] + fn test_compute_parallel_channel_size_clamps_to_max_for_small_batches() { + assert_eq!(64, compute_parallel_channel_size(0)); + assert_eq!(64, compute_parallel_channel_size(1)); + } + + #[test] + fn test_compute_parallel_channel_size_returns_expected_mid_range_size() { + assert_eq!( + 4, + compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE / 2) + ); + } + + #[test] + fn 
test_compute_parallel_channel_size_clamps_to_min_for_large_batches() { + assert_eq!(2, compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE)); + assert_eq!( + 2, + compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE * 2) + ); + } +} diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index a1b3b8f350..15ab435425 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -27,7 +27,7 @@ use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::physical_plan::{DisplayAs, DisplayFormatType}; use datatypes::schema::SchemaRef; use futures::{StreamExt, TryStreamExt}; -use snafu::{OptionExt, ensure}; +use snafu::ensure; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ PartitionRange, PrepareRequest, QueryScanContext, RegionScanner, ScannerProperties, @@ -35,24 +35,19 @@ use store_api::region_engine::{ use store_api::storage::TimeSeriesRowSelector; use tokio::sync::Semaphore; -use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, UnexpectedSnafu}; -use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; +use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu}; use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeReader; -use crate::read::last_row::{FlatLastRowReader, LastRowReader}; -use crate::read::merge::MergeReaderBuilder; +use crate::read::last_row::FlatLastRowReader; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::range::RangeMeta; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_file_ranges, - scan_flat_file_ranges, scan_flat_mem_ranges, scan_mem_ranges, - should_split_flat_batches_for_merge, + PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, compute_parallel_channel_size, + scan_flat_file_ranges, 
scan_flat_mem_ranges, should_split_flat_batches_for_merge, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use crate::read::{ - Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream, ScannerMetrics, Source, scan_util, -}; +use crate::read::{BoxedRecordBatchStream, ScannerMetrics, scan_util}; use crate::region::options::MergeMode; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; @@ -121,7 +116,7 @@ impl SeqScan { let streams = (0..self.properties.partitions.len()) .map(|partition| { let metrics = self.new_partition_metrics(false, &metrics_set, partition); - self.scan_batch_in_partition(partition, metrics) + self.scan_flat_batch_in_partition(partition, metrics) }) .collect::>>()?; @@ -181,58 +176,14 @@ impl SeqScan { partition_ranges.len(), sources.len() ); - Self::build_flat_reader_from_sources(stream_ctx, sources, None, None).await - } - - /// Builds a reader to read sources. If `semaphore` is provided, reads sources in parallel - /// if possible. - #[tracing::instrument(level = tracing::Level::DEBUG, skip_all)] - pub(crate) async fn build_reader_from_sources( - stream_ctx: &StreamContext, - mut sources: Vec, - semaphore: Option>, - part_metrics: Option<&PartitionMetrics>, - ) -> Result { - if let Some(semaphore) = semaphore.as_ref() { - // Read sources in parallel. 
- if sources.len() > 1 { - sources = stream_ctx - .input - .create_parallel_sources(sources, semaphore.clone())?; - } - } - - let mut builder = MergeReaderBuilder::from_sources(sources); - if let Some(metrics) = part_metrics { - builder.with_metrics_reporter(Some(metrics.merge_metrics_reporter())); - } - let reader = builder.build().await?; - - let dedup = !stream_ctx.input.append_mode; - let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter()); - let reader = if dedup { - match stream_ctx.input.merge_mode { - MergeMode::LastRow => Box::new(DedupReader::new( - reader, - LastRow::new(stream_ctx.input.filter_deleted), - dedup_metrics_reporter, - )) as _, - MergeMode::LastNonNull => Box::new(DedupReader::new( - reader, - LastNonNull::new(stream_ctx.input.filter_deleted), - dedup_metrics_reporter, - )) as _, - } - } else { - Box::new(reader) as _ - }; - - let reader = match &stream_ctx.input.series_row_selector { - Some(TimeSeriesRowSelector::LastRow) => Box::new(LastRowReader::new(reader)) as _, - None => reader, - }; - - Ok(reader) + Self::build_flat_reader_from_sources( + stream_ctx, + sources, + None, + None, + compute_parallel_channel_size(DEFAULT_READ_BATCH_SIZE), + ) + .await } /// Builds a flat reader to read sources that returns RecordBatch. If `semaphore` is provided, reads sources in parallel @@ -243,13 +194,16 @@ impl SeqScan { mut sources: Vec, semaphore: Option>, part_metrics: Option<&PartitionMetrics>, + channel_size: usize, ) -> Result { if let Some(semaphore) = semaphore.as_ref() { // Read sources in parallel. 
if sources.len() > 1 { - sources = stream_ctx - .input - .create_parallel_flat_sources(sources, semaphore.clone())?; + sources = stream_ctx.input.create_parallel_flat_sources( + sources, + semaphore.clone(), + channel_size, + )?; } } @@ -318,13 +272,7 @@ impl SeqScan { let metrics = self.new_partition_metrics(ctx.explain_verbose, metrics_set, partition); let input = &self.stream_ctx.input; - let batch_stream = if input.flat_format { - // Use flat scan for bulk memtables - self.scan_flat_batch_in_partition(partition, metrics.clone())? - } else { - // Use regular batch scan for normal memtables - self.scan_batch_in_partition(partition, metrics.clone())? - }; + let batch_stream = self.scan_flat_batch_in_partition(partition, metrics.clone())?; let record_batch_stream = ConvertBatchStream::new( batch_stream, input.mapper.clone(), @@ -338,125 +286,6 @@ impl SeqScan { ))) } - #[tracing::instrument( - skip_all, - fields( - region_id = %self.stream_ctx.input.mapper.metadata().region_id, - partition = partition - ) - )] - fn scan_batch_in_partition( - &self, - partition: usize, - part_metrics: PartitionMetrics, - ) -> Result { - ensure!( - partition < self.properties.partitions.len(), - PartitionOutOfRangeSnafu { - given: partition, - all: self.properties.partitions.len(), - } - ); - - if self.properties.partitions[partition].is_empty() { - return Ok(Box::pin(futures::stream::empty())); - } - - let stream_ctx = self.stream_ctx.clone(); - let semaphore = self.new_semaphore(); - let partition_ranges = self.properties.partitions[partition].clone(); - let compaction = self.stream_ctx.input.compaction; - let distinguish_range = self.properties.distinguish_partition_range; - let file_scan_semaphore = if compaction { None } else { semaphore.clone() }; - let pruner = self.pruner.clone(); - // Initializes ref counts for the pruner. - // If we call scan_batch_in_partition() multiple times but don't read all batches from the stream, - // then the ref count won't be decremented. 
- // This is a rare case and keeping all remaining entries still uses less memory than a per partition cache. - pruner.add_partition_ranges(&partition_ranges); - let partition_pruner = Arc::new(PartitionPruner::new(pruner, &partition_ranges)); - - let stream = try_stream! { - part_metrics.on_first_poll(); - // Start fetch time before building sources so scan cost contains - // build part cost. - let mut fetch_start = Instant::now(); - - let _mapper = stream_ctx.input.mapper.as_primary_key().context(UnexpectedSnafu { - reason: "Unexpected format", - })?; - // Scans each part. - for part_range in partition_ranges { - let mut sources = Vec::new(); - build_sources( - &stream_ctx, - &part_range, - compaction, - &part_metrics, - partition_pruner.clone(), - &mut sources, - file_scan_semaphore.clone(), - ).await?; - - let mut reader = - Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics)) - .await?; - #[cfg(debug_assertions)] - let mut checker = crate::read::BatchChecker::default() - .with_start(Some(part_range.start)) - .with_end(Some(part_range.end)); - - let mut metrics = ScannerMetrics { - scan_cost: fetch_start.elapsed(), - ..Default::default() - }; - fetch_start = Instant::now(); - - while let Some(batch) = reader.next_batch().await? { - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_batches += 1; - metrics.num_rows += batch.num_rows(); - - debug_assert!(!batch.is_empty()); - if batch.is_empty() { - fetch_start = Instant::now(); - continue; - } - - #[cfg(debug_assertions)] - checker.ensure_part_range_batch( - "SeqScan", - _mapper.metadata().region_id, - partition, - part_range, - &batch, - ); - - let yield_start = Instant::now(); - yield ScanBatch::Normal(batch); - metrics.yield_cost += yield_start.elapsed(); - - fetch_start = Instant::now(); - } - - // Yields an empty part to indicate this range is terminated. - // The query engine can use this to optimize some queries. 
- if distinguish_range { - let yield_start = Instant::now(); - yield ScanBatch::Normal(Batch::empty()); - metrics.yield_cost += yield_start.elapsed(); - } - - metrics.scan_cost += fetch_start.elapsed(); - fetch_start = Instant::now(); - part_metrics.merge_metrics(&metrics); - } - - part_metrics.on_finish(); - }; - Ok(Box::pin(stream)) - } - #[tracing::instrument( skip_all, fields( @@ -503,7 +332,7 @@ impl SeqScan { // Scans each part. for part_range in partition_ranges { let mut sources = Vec::new(); - build_flat_sources( + let split_batch_size = build_flat_sources( &stream_ctx, &part_range, compaction, @@ -513,8 +342,11 @@ impl SeqScan { file_scan_semaphore.clone(), ).await?; + let channel_size = compute_parallel_channel_size( + split_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE), + ); let mut reader = - Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics)) + Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics), channel_size) .await?; let mut metrics = ScannerMetrics { @@ -709,109 +541,8 @@ impl fmt::Debug for SeqScan { } } -/// Builds sources for the partition range and push them to the `sources` vector. -pub(crate) async fn build_sources( - stream_ctx: &Arc, - part_range: &PartitionRange, - compaction: bool, - part_metrics: &PartitionMetrics, - partition_pruner: Arc, - sources: &mut Vec, - semaphore: Option>, -) -> Result<()> { - // Gets range meta. - let range_meta = &stream_ctx.ranges[part_range.identifier]; - #[cfg(debug_assertions)] - if compaction { - // Compaction expects input sources are not been split. - debug_assert_eq!(range_meta.indices.len(), range_meta.row_group_indices.len()); - for (i, row_group_idx) in range_meta.row_group_indices.iter().enumerate() { - // It should scan all row groups. 
- debug_assert_eq!( - -1, row_group_idx.row_group_index, - "Expect {} range scan all row groups, given: {}", - i, row_group_idx.row_group_index, - ); - } - } - - let read_type = if compaction { - "compaction" - } else { - "seq_scan_files" - }; - let num_indices = range_meta.row_group_indices.len(); - if num_indices == 0 { - return Ok(()); - } - - sources.reserve(num_indices); - let mut ordered_sources = Vec::with_capacity(num_indices); - ordered_sources.resize_with(num_indices, || None); - let mut file_scan_tasks = Vec::new(); - - for (position, index) in range_meta.row_group_indices.iter().enumerate() { - if stream_ctx.is_mem_range_index(*index) { - let stream = scan_mem_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - range_meta.time_range, - ); - ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); - } else if stream_ctx.is_file_range_index(*index) { - if let Some(semaphore_ref) = semaphore.as_ref() { - // run in parallel, controlled by semaphore - let stream_ctx = stream_ctx.clone(); - let part_metrics = part_metrics.clone(); - let partition_pruner = partition_pruner.clone(); - let semaphore = Arc::clone(semaphore_ref); - let row_group_index = *index; - file_scan_tasks.push(async move { - let _permit = semaphore.acquire().await.unwrap(); - let stream = scan_file_ranges( - stream_ctx, - part_metrics, - row_group_index, - read_type, - partition_pruner, - ) - .await?; - Ok((position, Source::Stream(Box::pin(stream) as _))) - }); - } else { - // no semaphore, run sequentially - let stream = scan_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - read_type, - partition_pruner.clone(), - ) - .await?; - ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); - } - } else { - let stream = - scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await?; - ordered_sources[position] = Some(Source::Stream(stream)); - } - } - - if !file_scan_tasks.is_empty() { - let results = 
futures::future::try_join_all(file_scan_tasks).await?; - for (position, source) in results { - ordered_sources[position] = Some(source); - } - } - - for source in ordered_sources.into_iter().flatten() { - sources.push(source); - } - Ok(()) -} - /// Builds flat sources for the partition range and push them to the `sources` vector. +/// Returns the estimated rows per batch after splitting if splitting is applied, or `None`. pub(crate) async fn build_flat_sources( stream_ctx: &Arc, part_range: &PartitionRange, @@ -820,7 +551,7 @@ pub(crate) async fn build_flat_sources( partition_pruner: Arc, sources: &mut Vec, semaphore: Option>, -) -> Result<()> { +) -> Result> { // Gets range meta. let range_meta = &stream_ctx.ranges[part_range.identifier]; #[cfg(debug_assertions)] @@ -844,10 +575,11 @@ pub(crate) async fn build_flat_sources( }; let num_indices = range_meta.row_group_indices.len(); if num_indices == 0 { - return Ok(()); + return Ok(None); } - let should_split = should_split_flat_batches_for_merge(stream_ctx, range_meta); + let split_batch_size = should_split_flat_batches_for_merge(stream_ctx, range_meta); + let should_split = split_batch_size.is_some(); sources.reserve(num_indices); let mut ordered_sources = Vec::with_capacity(num_indices); ordered_sources.resize_with(num_indices, || None); @@ -925,7 +657,7 @@ pub(crate) async fn build_flat_sources( ); } - Ok(()) + Ok(split_batch_size) } #[cfg(test)] diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index 5109120d92..bf7ed072ab 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -30,7 +30,7 @@ use datatypes::arrow::array::BinaryArray; use datatypes::arrow::record_batch::RecordBatch; use datatypes::schema::SchemaRef; use futures::{StreamExt, TryStreamExt}; -use smallvec::{SmallVec, smallvec}; +use smallvec::SmallVec; use snafu::{OptionExt, ResultExt, ensure}; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ @@ -44,12 
+44,15 @@ use crate::error::{ Error, InvalidSenderSnafu, PartitionOutOfRangeSnafu, Result, ScanMultiTimesSnafu, ScanSeriesSnafu, TooManyFilesToReadSnafu, }; +use crate::read::ScannerMetrics; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::scan_region::{ScanInput, StreamContext}; -use crate::read::scan_util::{PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics}; -use crate::read::seq_scan::{SeqScan, build_flat_sources, build_sources}; +use crate::read::scan_util::{ + PartitionMetrics, PartitionMetricsList, SeriesDistributorMetrics, compute_parallel_channel_size, +}; +use crate::read::seq_scan::{SeqScan, build_flat_sources}; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use crate::read::{Batch, ScannerMetrics}; +use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::PrimaryKeyArray; @@ -443,11 +446,7 @@ impl SeriesDistributor { fields(region_id = %self.stream_ctx.input.mapper.metadata().region_id) )] async fn execute(&mut self) { - let result = if self.stream_ctx.input.flat_format { - self.scan_partitions_flat().await - } else { - self.scan_partitions().await - }; + let result = self.scan_partitions_flat().await; if let Err(e) = result { self.senders.send_error(e).await; @@ -486,10 +485,11 @@ impl SeriesDistributor { // Scans all parts. let mut sources = Vec::with_capacity(self.partitions.len()); + let mut min_batch_size: Option = None; for partition in &self.partitions { sources.reserve(partition.len()); for part_range in partition { - build_flat_sources( + let split_batch_size = build_flat_sources( &self.stream_ctx, part_range, false, @@ -499,15 +499,21 @@ impl SeriesDistributor { self.semaphore.clone(), ) .await?; + if let Some(size) = split_batch_size { + min_batch_size = Some(min_batch_size.map_or(size, |cur| cur.min(size))); + } } } // Builds a flat reader that merge sources from all parts. 
+ let channel_size = + compute_parallel_channel_size(min_batch_size.unwrap_or(DEFAULT_READ_BATCH_SIZE)); let mut reader = SeqScan::build_flat_reader_from_sources( &self.stream_ctx, sources, self.semaphore.clone(), Some(&part_metrics), + channel_size, ) .await?; let mut metrics = SeriesDistributorMetrics::default(); @@ -559,151 +565,11 @@ impl SeriesDistributor { Ok(()) } - - /// Scans all parts. - #[tracing::instrument( - skip_all, - fields(region_id = %self.stream_ctx.input.mapper.metadata().region_id) - )] - async fn scan_partitions(&mut self) -> Result<()> { - // Initialize reference counts for all partition ranges. - for partition_ranges in &self.partitions { - self.pruner.add_partition_ranges(partition_ranges); - } - - // Create PartitionPruner covering all partitions - let all_partition_ranges: Vec<_> = self.partitions.iter().flatten().cloned().collect(); - let partition_pruner = Arc::new(PartitionPruner::new( - self.pruner.clone(), - &all_partition_ranges, - )); - - let part_metrics = new_partition_metrics( - &self.stream_ctx, - self.explain_verbose, - &self.metrics_set, - self.partitions.len(), - &self.metrics_list, - ); - part_metrics.on_first_poll(); - // Start fetch time before building sources so scan cost contains - // build part cost. - let mut fetch_start = Instant::now(); - - // Scans all parts. - let mut sources = Vec::with_capacity(self.partitions.len()); - for partition in &self.partitions { - sources.reserve(partition.len()); - for part_range in partition { - build_sources( - &self.stream_ctx, - part_range, - false, - &part_metrics, - partition_pruner.clone(), - &mut sources, - self.semaphore.clone(), - ) - .await?; - } - } - - // Builds a reader that merge sources from all parts. 
- let mut reader = SeqScan::build_reader_from_sources( - &self.stream_ctx, - sources, - self.semaphore.clone(), - Some(&part_metrics), - ) - .await?; - let mut metrics = SeriesDistributorMetrics::default(); - - let mut current_series = PrimaryKeySeriesBatch::default(); - while let Some(batch) = reader.next_batch().await? { - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_batches += 1; - metrics.num_rows += batch.num_rows(); - - debug_assert!(!batch.is_empty()); - if batch.is_empty() { - fetch_start = Instant::now(); - continue; - } - - let Some(last_key) = current_series.current_key() else { - current_series.push(batch); - fetch_start = Instant::now(); - continue; - }; - - if last_key == batch.primary_key() { - current_series.push(batch); - fetch_start = Instant::now(); - continue; - } - - // We find a new series, send the current one. - let to_send = - std::mem::replace(&mut current_series, PrimaryKeySeriesBatch::single(batch)); - let yield_start = Instant::now(); - self.senders - .send_batch(SeriesBatch::PrimaryKey(to_send)) - .await?; - metrics.yield_cost += yield_start.elapsed(); - fetch_start = Instant::now(); - } - - if !current_series.is_empty() { - let yield_start = Instant::now(); - self.senders - .send_batch(SeriesBatch::PrimaryKey(current_series)) - .await?; - metrics.yield_cost += yield_start.elapsed(); - } - - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_series_send_timeout = self.senders.num_timeout; - metrics.num_series_send_full = self.senders.num_full; - part_metrics.set_distributor_metrics(&metrics); - - part_metrics.on_finish(); - - Ok(()) - } -} - -/// Batches of the same series in primary key format. -#[derive(Default, Debug)] -pub struct PrimaryKeySeriesBatch { - pub batches: SmallVec<[Batch; 4]>, -} - -impl PrimaryKeySeriesBatch { - /// Creates a new [PrimaryKeySeriesBatch] from a single [Batch]. 
- fn single(batch: Batch) -> Self { - Self { - batches: smallvec![batch], - } - } - - fn current_key(&self) -> Option<&[u8]> { - self.batches.first().map(|batch| batch.primary_key()) - } - - fn push(&mut self, batch: Batch) { - self.batches.push(batch); - } - - /// Returns true if there is no batch. - fn is_empty(&self) -> bool { - self.batches.is_empty() - } } /// Batches of the same series. #[derive(Debug)] pub enum SeriesBatch { - PrimaryKey(PrimaryKeySeriesBatch), Flat(FlatSeriesBatch), } @@ -711,7 +577,6 @@ impl SeriesBatch { /// Returns the number of batches. pub fn num_batches(&self) -> usize { match self { - SeriesBatch::PrimaryKey(primary_key_batch) => primary_key_batch.batches.len(), SeriesBatch::Flat(flat_batch) => flat_batch.batches.len(), } } @@ -719,9 +584,6 @@ impl SeriesBatch { /// Returns the total number of rows across all batches. pub fn num_rows(&self) -> usize { match self { - SeriesBatch::PrimaryKey(primary_key_batch) => { - primary_key_batch.batches.iter().map(|x| x.num_rows()).sum() - } SeriesBatch::Flat(flat_batch) => flat_batch.batches.iter().map(|x| x.num_rows()).sum(), } } diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs index 80002147ea..c8547fdf0c 100644 --- a/src/mito2/src/read/stream.rs +++ b/src/mito2/src/read/stream.rs @@ -27,14 +27,12 @@ use snafu::ResultExt; use crate::cache::CacheStrategy; use crate::error::Result; -use crate::read::Batch; use crate::read::projection::ProjectionMapper; use crate::read::scan_util::PartitionMetrics; use crate::read::series_scan::SeriesBatch; /// All kinds of [`Batch`]es to produce in scanner. 
pub enum ScanBatch { - Normal(Batch), Series(SeriesBatch), RecordBatch(DfRecordBatch), } @@ -45,6 +43,7 @@ pub type ScanBatchStream = BoxStream<'static, Result>; pub(crate) struct ConvertBatchStream { inner: ScanBatchStream, projection_mapper: Arc, + #[allow(dead_code)] cache_strategy: CacheStrategy, partition_metrics: PartitionMetrics, pending: VecDeque, @@ -68,41 +67,19 @@ impl ConvertBatchStream { fn convert(&mut self, batch: ScanBatch) -> common_recordbatch::error::Result { match batch { - ScanBatch::Normal(batch) => { - // Safety: Only primary key format returns this batch. - let mapper = self.projection_mapper.as_primary_key().unwrap(); - - if batch.is_empty() { - Ok(mapper.empty_record_batch()) - } else { - mapper.convert(&batch, &self.cache_strategy) - } - } ScanBatch::Series(series) => { debug_assert!( self.pending.is_empty(), "ConvertBatchStream should not convert a new SeriesBatch when pending batches exist" ); - match series { - SeriesBatch::PrimaryKey(primary_key_batch) => { - // Safety: Only primary key format returns this batch. - let mapper = self.projection_mapper.as_primary_key().unwrap(); + let SeriesBatch::Flat(flat_batch) = series; + // Safety: Only flat format returns this batch. + let mapper = self.projection_mapper.as_flat().unwrap(); - for batch in primary_key_batch.batches { - self.pending - .push_back(mapper.convert(&batch, &self.cache_strategy)?); - } - } - SeriesBatch::Flat(flat_batch) => { - // Safety: Only flat format returns this batch. 
- let mapper = self.projection_mapper.as_flat().unwrap(); - - for batch in flat_batch.batches { - self.pending - .push_back(mapper.convert(&batch, &self.cache_strategy)?); - } - } + for batch in flat_batch.batches { + self.pending + .push_back(mapper.convert(&batch, &self.cache_strategy)?); } let output_schema = self.projection_mapper.output_schema(); diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index 2d557e8871..9763d14cd2 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -37,11 +37,10 @@ use crate::error::{PartitionOutOfRangeSnafu, Result}; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, scan_file_ranges, scan_flat_file_ranges, - scan_flat_mem_ranges, scan_mem_ranges, + PartitionMetrics, PartitionMetricsList, scan_flat_file_ranges, scan_flat_mem_ranges, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; -use crate::read::{Batch, ScannerMetrics, scan_util}; +use crate::read::{ScannerMetrics, scan_util}; /// Scans a region without providing any output ordering guarantee. /// @@ -103,59 +102,6 @@ impl UnorderedScan { Ok(stream) } - /// Scans a [PartitionRange] by its `identifier` and returns a stream. - #[tracing::instrument( - skip_all, - fields( - region_id = %stream_ctx.input.region_metadata().region_id, - part_range_id = part_range_id - ) - )] - fn scan_partition_range( - stream_ctx: Arc, - part_range_id: usize, - part_metrics: PartitionMetrics, - partition_pruner: Arc, - ) -> impl Stream> { - try_stream! { - // Gets range meta. 
- let range_meta = &stream_ctx.ranges[part_range_id]; - for index in &range_meta.row_group_indices { - if stream_ctx.is_mem_range_index(*index) { - let stream = scan_mem_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - range_meta.time_range, - ); - for await batch in stream { - yield batch?; - } - } else if stream_ctx.is_file_range_index(*index) { - let stream = scan_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - "unordered_scan_files", - partition_pruner.clone(), - ).await?; - for await batch in stream { - yield batch?; - } - } else { - let stream = scan_util::maybe_scan_other_ranges( - &stream_ctx, - *index, - &part_metrics, - ).await?; - for await batch in stream { - yield batch?; - } - } - } - } - } - /// Scans a [PartitionRange] by its `identifier` and returns a flat stream of RecordBatch. #[tracing::instrument( skip_all, @@ -216,7 +162,7 @@ impl UnorderedScan { let streams = (0..self.properties.partitions.len()) .map(|partition| { let metrics = self.partition_metrics(false, partition, &metrics_set); - self.scan_batch_in_partition(partition, metrics) + self.scan_flat_batch_in_partition(partition, metrics) }) .collect::>>()?; @@ -265,13 +211,7 @@ impl UnorderedScan { let metrics = self.partition_metrics(ctx.explain_verbose, partition, metrics_set); let input = &self.stream_ctx.input; - let batch_stream = if input.flat_format { - // Use flat scan for bulk memtables - self.scan_flat_batch_in_partition(partition, metrics.clone())? - } else { - // Use regular batch scan for normal memtables - self.scan_batch_in_partition(partition, metrics.clone())? 
- }; + let batch_stream = self.scan_flat_batch_in_partition(partition, metrics.clone())?; let record_batch_stream = ConvertBatchStream::new( batch_stream, @@ -286,100 +226,6 @@ impl UnorderedScan { ))) } - #[tracing::instrument( - skip_all, - fields( - region_id = %self.stream_ctx.input.mapper.metadata().region_id, - partition = partition - ) - )] - fn scan_batch_in_partition( - &self, - partition: usize, - part_metrics: PartitionMetrics, - ) -> Result { - ensure!( - partition < self.properties.partitions.len(), - PartitionOutOfRangeSnafu { - given: partition, - all: self.properties.partitions.len(), - } - ); - - let stream_ctx = self.stream_ctx.clone(); - let part_ranges = self.properties.partitions[partition].clone(); - let distinguish_range = self.properties.distinguish_partition_range; - let pruner = self.pruner.clone(); - // Initializes ref counts for the pruner. - // If we call scan_batch_in_partition() multiple times but don't read all batches from the stream, - // then the ref count won't be decremented. - // This is a rare case and keeping all remaining entries still uses less memory than a per partition cache. - pruner.add_partition_ranges(&part_ranges); - let partition_pruner = Arc::new(PartitionPruner::new(pruner, &part_ranges)); - - let stream = try_stream! { - part_metrics.on_first_poll(); - - // Scans each part. 
- for part_range in part_ranges { - let mut metrics = ScannerMetrics::default(); - let mut fetch_start = Instant::now(); - let _mapper = &stream_ctx.input.mapper; - #[cfg(debug_assertions)] - let mut checker = crate::read::BatchChecker::default() - .with_start(Some(part_range.start)) - .with_end(Some(part_range.end)); - - let stream = Self::scan_partition_range( - stream_ctx.clone(), - part_range.identifier, - part_metrics.clone(), - partition_pruner.clone(), - ); - for await batch in stream { - let batch = batch?; - metrics.scan_cost += fetch_start.elapsed(); - metrics.num_batches += 1; - metrics.num_rows += batch.num_rows(); - - debug_assert!(!batch.is_empty()); - if batch.is_empty() { - continue; - } - - #[cfg(debug_assertions)] - checker.ensure_part_range_batch( - "UnorderedScan", - _mapper.metadata().region_id, - partition, - part_range, - &batch, - ); - - let yield_start = Instant::now(); - yield ScanBatch::Normal(batch); - metrics.yield_cost += yield_start.elapsed(); - - fetch_start = Instant::now(); - } - - // Yields an empty part to indicate this range is terminated. - // The query engine can use this to optimize some queries. 
- if distinguish_range { - let yield_start = Instant::now(); - yield ScanBatch::Normal(Batch::empty()); - metrics.yield_cost += yield_start.elapsed(); - } - - metrics.scan_cost += fetch_start.elapsed(); - part_metrics.merge_metrics(&metrics); - } - - part_metrics.on_finish(); - }; - Ok(Box::pin(stream)) - } - #[tracing::instrument( skip_all, fields( diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 9aa6454f75..b23e73557d 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -269,7 +269,7 @@ impl RegionOpener { // Sets the sst_format based on options or flat_format flag let sst_format = if let Some(format) = options.sst_format { format - } else if config.default_experimental_flat_format { + } else if config.default_flat_format { options.sst_format = Some(FormatType::Flat); FormatType::Flat } else { @@ -309,7 +309,7 @@ impl RegionOpener { debug!( "Create region {} with options: {:?}, default_flat_format: {}", - region_id, options, config.default_experimental_flat_format + region_id, options, config.default_flat_format ); let version = VersionBuilder::new(metadata, mutable) @@ -626,8 +626,10 @@ pub(crate) fn sanitize_region_options(manifest: &RegionManifest, options: &mut R manifest.sst_format, manifest.metadata.region_id, ); - options.sst_format = Some(manifest.sst_format); } + // Always set sst_format from manifest to ensure it's explicitly stored, + // even when the default matches the manifest value. 
+ options.sst_format = Some(manifest.sst_format); if let Some(manifest_append_mode) = manifest.append_mode && options.append_mode != manifest_append_mode { diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 4a3466a29c..2ca83ca8cf 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -895,7 +895,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -960,7 +959,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -1015,7 +1013,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -1549,7 +1546,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -1652,7 +1648,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .bloom_filter_index_appliers([None, bloom_filter_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -1712,7 +1707,6 @@ mod tests { let builder = ParquetReaderBuilder::new(FILE_DIR.to_string(), PathType::Bare, handle, object_store) - .flat_format(true) .predicate(Some(Predicate::new(vec![col("tag_0").eq(lit("a"))]))); let mut metrics = ReaderMetrics::default(); @@ -1774,7 +1768,6 @@ mod tests { let builder = ParquetReaderBuilder::new(FILE_DIR.to_string(), 
PathType::Bare, handle, object_store) - .flat_format(true) .predicate(Some(Predicate::new(vec![col("tag_0").eq(lit("a"))]))); let mut metrics = ReaderMetrics::default(); @@ -1884,7 +1877,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -1991,7 +1983,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .bloom_filter_index_appliers([None, bloom_filter_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -2255,7 +2246,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .fulltext_index_appliers([None, fulltext_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); @@ -2304,7 +2294,6 @@ mod tests { handle.clone(), object_store.clone(), ) - .flat_format(true) .predicate(Some(Predicate::new(preds))) .fulltext_index_appliers([None, fulltext_applier.clone()]) .cache(CacheStrategy::EnableAll(cache.clone())); diff --git a/src/mito2/src/sst/parquet/file_range.rs b/src/mito2/src/sst/parquet/file_range.rs index 8b4a61acb7..bf86e4a764 100644 --- a/src/mito2/src/sst/parquet/file_range.rs +++ b/src/mito2/src/sst/parquet/file_range.rs @@ -175,6 +175,7 @@ impl FileRange { } /// Returns a reader to read the [FileRange]. + #[allow(dead_code)] pub(crate) async fn reader( &self, selector: Option, diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 8832cd4a16..73ca7748e9 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -141,8 +141,6 @@ pub struct ParquetReaderBuilder { /// This is usually the latest metadata of the region. The reader use /// it get the correct column id of a column by name. expected_metadata: Option, - /// Whether to use flat format for reading. 
- flat_format: bool, /// Whether this reader is for compaction. compaction: bool, /// Mode to pre-filter columns. @@ -176,7 +174,6 @@ impl ParquetReaderBuilder { #[cfg(feature = "vector_index")] vector_index_k: None, expected_metadata: None, - flat_format: false, compaction: false, pre_filter_mode: PreFilterMode::All, decode_primary_key_values: false, @@ -257,13 +254,6 @@ impl ParquetReaderBuilder { self } - /// Sets the flat format flag. - #[must_use] - pub fn flat_format(mut self, flat_format: bool) -> Self { - self.flat_format = flat_format; - self - } - /// Sets the compaction flag. #[must_use] pub fn compaction(mut self, compaction: bool) -> Self { @@ -304,8 +294,7 @@ impl ParquetReaderBuilder { pub async fn build(&self) -> Result> { let mut metrics = ReaderMetrics::default(); - let Some((context, selection)) = self.build_reader_input_inner(&mut metrics, true).await? - else { + let Some((context, selection)) = self.build_reader_input_inner(&mut metrics).await? else { return Ok(None); }; ParquetReader::new(Arc::new(context), selection) @@ -327,14 +316,12 @@ impl ParquetReaderBuilder { &self, metrics: &mut ReaderMetrics, ) -> Result> { - self.build_reader_input_inner(metrics, self.flat_format) - .await + self.build_reader_input_inner(metrics).await } async fn build_reader_input_inner( &self, metrics: &mut ReaderMetrics, - flat_format: bool, ) -> Result> { let start = Instant::now(); @@ -376,7 +363,6 @@ impl ParquetReaderBuilder { // before compat handling. let compaction_projection_mapper = if self.compaction && !is_same_region_partition - && flat_format && region_meta.primary_key_encoding == PrimaryKeyEncoding::Sparse { Some(CompactionProjectionMapper::try_new(&region_meta)?) @@ -388,7 +374,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(column_ids), - flat_format, + true, // Always reads as flat format.
Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -404,7 +390,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(&column_ids), - flat_format, + true, // Always reads as flat format. Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -2060,6 +2046,7 @@ impl RowGroupReaderContext for FileRangeContextRef { /// [RowGroupReader] that reads from [FileRange]. pub(crate) type RowGroupReader = RowGroupReaderBase; +#[allow(dead_code)] impl RowGroupReader { /// Creates a new reader from file range. pub(crate) fn new( @@ -2084,6 +2071,7 @@ pub(crate) struct RowGroupReaderBase { override_sequence: Option, } +#[allow(dead_code)] impl RowGroupReaderBase where T: RowGroupReaderContext, diff --git a/src/mito2/src/worker/handle_alter.rs b/src/mito2/src/worker/handle_alter.rs index 459aa8dd32..6fa560e90c 100644 --- a/src/mito2/src/worker/handle_alter.rs +++ b/src/mito2/src/worker/handle_alter.rs @@ -216,15 +216,6 @@ impl RegionWorkerLoop { // If the format is unchanged, we also consider the option is altered. if new_format != current_options.sst_format.unwrap_or_default() { all_options_altered = false; - - // Validates the format type. - ensure!( - new_format == FormatType::Flat, - store_api::metadata::InvalidRegionRequestSnafu { - region_id: region.region_id, - err: "Only allow changing format type to flat", - } - ); } } SetRegionOption::AppendMode(new_append_mode) => { @@ -274,8 +265,6 @@ fn new_region_options_on_empty_memtable( SetRegionOption::Format(format_str) => { // Safety: handle_alter_region_options_fast() has validated this. 
let new_format = format_str.parse::().unwrap(); - assert_eq!(FormatType::Flat, new_format); - current_options.sst_format = Some(new_format); } SetRegionOption::AppendMode(new_append_mode) => { diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 2ef251d04d..7247c5892c 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -21,7 +21,6 @@ derive_builder = { workspace = true, optional = true } futures.workspace = true humantime-serde.workspace = true lazy_static.workspace = true -moka = { workspace = true, features = ["future"] } opendal = { version = "0.54", features = [ "layers-tracing", "layers-prometheus", diff --git a/src/promql/Cargo.toml b/src/promql/Cargo.toml index 306563d1ce..460be8ddd9 100644 --- a/src/promql/Cargo.toml +++ b/src/promql/Cargo.toml @@ -13,7 +13,6 @@ async-trait.workspace = true bytemuck.workspace = true common-error.workspace = true common-macro.workspace = true -common-recordbatch.workspace = true common-telemetry.workspace = true datafusion.workspace = true datafusion-common.workspace = true @@ -27,6 +26,7 @@ prost.workspace = true snafu.workspace = true [dev-dependencies] +common-recordbatch.workspace = true criterion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true diff --git a/src/query/src/optimizer/windowed_sort.rs b/src/query/src/optimizer/windowed_sort.rs index 9365c8e1e8..3d3993d454 100644 --- a/src/query/src/optimizer/windowed_sort.rs +++ b/src/query/src/optimizer/windowed_sort.rs @@ -94,6 +94,8 @@ impl WindowedSortPhysicalRule { && scanner_info .time_index .contains(input_schema.field(column_expr.index()).name()) + && sort_exec.fetch().is_none() + // skip if there is a limit, as dyn filter alone is good enough in this case { } else { return Ok(Transformed::no(plan)); diff --git a/src/query/src/part_sort.rs b/src/query/src/part_sort.rs index e12479cc5a..19a114c8ce 100644 --- a/src/query/src/part_sort.rs +++ b/src/query/src/part_sort.rs @@ -237,14
+237,10 @@ impl ExecutionPlan for PartSortExec { } else { internal_err!("No children found")? }; - // create a new dynamic filter when with_new_children, as the old filter is bound to the old input and cannot be reused - let new = Self::try_new( - self.expression.clone(), - self.limit, - self.partition_ranges.clone(), - new_input.clone(), - )?; - Ok(Arc::new(new)) + let mut new_exec = self.as_ref().clone(); + new_exec.input = new_input.clone(); + new_exec.properties = new_input.properties().clone(); + Ok(Arc::new(new_exec)) } fn execute( diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 55bb41ee51..2d68f17699 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -91,7 +91,7 @@ otel-arrow-rust.workspace = true parking_lot.workspace = true partition.workspace = true pg_interval = { version = "0.5.2", package = "pg_interval_2" } -pgwire = { version = "0.38.2", default-features = false, features = [ +pgwire = { version = "0.38.3", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } @@ -178,3 +178,7 @@ harness = false [[bench]] name = "loki_labels" harness = false + +[[bench]] +name = "flush_batch_physical" +harness = false diff --git a/src/servers/benches/flush_batch_physical.rs b/src/servers/benches/flush_batch_physical.rs new file mode 100644 index 0000000000..a3d190adf2 --- /dev/null +++ b/src/servers/benches/flush_batch_physical.rs @@ -0,0 +1,289 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use api::region::RegionResponse; +use api::v1::meta::Peer; +use api::v1::region::RegionRequest; +use arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; +use arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema, TimeUnit}; +use arrow::record_batch::RecordBatch; +use async_trait::async_trait; +use catalog::error::Result as CatalogResult; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datatypes::prelude::ConcreteDataType; +use datatypes::schema::{ColumnSchema as DtColumnSchema, Schema as DtSchema}; +use partition::error::Result as PartitionResult; +use partition::partition::{PartitionRule, PartitionRuleRef, RegionMask}; +use servers::error::{self, Result}; +use servers::pending_rows_batcher::{ + PhysicalFlushCatalogProvider, PhysicalFlushNodeRequester, PhysicalFlushPartitionProvider, + PhysicalTableMetadata, TableBatch, flush_batch_physical, +}; +use store_api::storage::RegionId; +use table::test_util::table_info::test_table_info; +use tokio::runtime::Runtime; + +// --------------------------------------------------------------------------- +// Mock implementations (memory-backed, no I/O) +// --------------------------------------------------------------------------- + +struct BenchCatalogProvider { + table: PhysicalTableMetadata, +} + +#[async_trait] +impl PhysicalFlushCatalogProvider for BenchCatalogProvider { + async fn physical_table( + &self, + _catalog: &str, + _schema: &str, + _table_name: &str, + _query_ctx: &session::context::QueryContext, + ) -> CatalogResult> { + Ok(Some(self.table.clone())) + } +} + +struct BenchPartitionProvider; + +struct SingleRegionPartitionRule; + +impl PartitionRule for SingleRegionPartitionRule { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn partition_columns(&self) -> &[String] { + 
&[] + } + + fn find_region( + &self, + _values: &[datatypes::prelude::Value], + ) -> PartitionResult { + Ok(1) + } + + fn split_record_batch( + &self, + record_batch: &RecordBatch, + ) -> PartitionResult> { + let n = record_batch.num_rows(); + Ok(HashMap::from([( + 1, + RegionMask::new(arrow::array::BooleanArray::from(vec![true; n]), n), + )])) + } +} + +#[async_trait] +impl PhysicalFlushPartitionProvider for BenchPartitionProvider { + async fn find_table_partition_rule( + &self, + _table_info: &table::metadata::TableInfo, + ) -> PartitionResult { + Ok(Arc::new(SingleRegionPartitionRule)) + } + + async fn find_region_leader(&self, _region_id: RegionId) -> Result { + Ok(Peer { + id: 1, + addr: "bench-node".to_string(), + }) + } +} + +struct BenchNodeRequester; + +#[async_trait] +impl PhysicalFlushNodeRequester for BenchNodeRequester { + async fn handle(&self, _peer: &Peer, _request: RegionRequest) -> error::Result { + Ok(RegionResponse::new(0)) + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn make_physical_table_metadata(num_tags: usize) -> PhysicalTableMetadata { + let mut columns = vec![ + DtColumnSchema::new("__primary_key", ConcreteDataType::binary_datatype(), false), + DtColumnSchema::new( + "greptime_timestamp", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + DtColumnSchema::new("greptime_value", ConcreteDataType::float64_datatype(), true), + ]; + + let mut name_to_ids = HashMap::new(); + let mut column_ids = vec![0u32, 1, 2]; + + for i in 0..num_tags { + let tag_name = format!("tag{}", i); + let col_id = (i + 3) as u32; + columns.push(DtColumnSchema::new( + &tag_name, + ConcreteDataType::string_datatype(), + true, + )); + name_to_ids.insert(tag_name, col_id); + column_ids.push(col_id); + } + + let schema = Arc::new(DtSchema::try_new(columns).unwrap()); + let mut table_info = test_table_info(1, "phy", 
"public", "greptime", schema); + table_info.meta.column_ids = column_ids; + + PhysicalTableMetadata { + table_info: Arc::new(table_info), + col_name_to_ids: Some(name_to_ids), + } +} + +fn make_tag_batch(tag_names: &[&str], num_rows: usize) -> RecordBatch { + let mut fields = vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + ]; + for tag in tag_names { + fields.push(Field::new(*tag, ArrowDataType::Utf8, true)); + } + + let schema = Arc::new(ArrowSchema::new(fields)); + + let ts: Vec = (0..num_rows as i64).collect(); + let vals: Vec = (0..num_rows).map(|i| i as f64).collect(); + + let mut arrays: Vec> = vec![ + Arc::new(TimestampMillisecondArray::from(ts)), + Arc::new(Float64Array::from(vals)), + ]; + + for (tag_idx, _tag) in tag_names.iter().enumerate() { + let values: Vec = (0..num_rows) + .map(|i| format!("val-{}-{}", tag_idx, i)) + .collect(); + arrays.push(Arc::new(StringArray::from(values))); + } + + RecordBatch::try_new(schema, arrays).unwrap() +} + +fn make_table_batches( + num_logical_tables: usize, + rows_per_table: usize, + tag_names: &[&str], +) -> Vec { + (0..num_logical_tables) + .map(|i| { + let batch = make_tag_batch(tag_names, rows_per_table); + let row_count = batch.num_rows(); + TableBatch { + table_name: format!("logical_{}", i), + table_id: (100 + i) as u32, + batches: vec![batch], + row_count, + } + }) + .collect() +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +fn bench_flush_batch_physical(c: &mut Criterion) { + let rt = Runtime::new().unwrap(); + let ctx = session::context::QueryContext::arc(); + + let num_tags = 5; + let tag_names: Vec = (0..num_tags).map(|i| format!("tag{}", i)).collect(); + let tag_refs: Vec<&str> = tag_names.iter().map(|s| s.as_str()).collect(); + + let 
catalog_provider = BenchCatalogProvider { + table: make_physical_table_metadata(num_tags), + }; + let partition_provider = BenchPartitionProvider; + let node_requester = BenchNodeRequester; + + let mut group = c.benchmark_group("flush_batch_physical"); + + // Vary the number of logical tables + for num_tables in [1, 10, 50, 100] { + let rows_per_table = 100; + let table_batches = make_table_batches(num_tables, rows_per_table, &tag_refs); + + group.bench_with_input( + BenchmarkId::new("tables", num_tables), + &table_batches, + |b, batches| { + b.iter(|| { + rt.block_on(async { + flush_batch_physical( + batches, + "phy", + &ctx, + &partition_provider, + &node_requester, + &catalog_provider, + ) + .await + .unwrap(); + }); + }); + }, + ); + } + + // Vary the number of rows per table + for rows_per_table in [10, 100, 1000, 5000] { + let num_tables = 10; + let table_batches = make_table_batches(num_tables, rows_per_table, &tag_refs); + + group.bench_with_input( + BenchmarkId::new("rows_per_table", rows_per_table), + &table_batches, + |b, batches| { + b.iter(|| { + rt.block_on(async { + flush_batch_physical( + batches, + "phy", + &ctx, + &partition_provider, + &node_requester, + &catalog_provider, + ) + .await + .unwrap(); + }); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_flush_batch_physical); +criterion_main!(benches); diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 682288b271..8a3c554058 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -15,6 +15,7 @@ use std::any::Any; use std::net::SocketAddr; use std::string::FromUtf8Error; +use std::sync::Arc; use axum::http::StatusCode as HttpStatusCode; use axum::response::{IntoResponse, Response}; @@ -51,6 +52,8 @@ pub enum Error { Arrow { #[snafu(source)] error: arrow_schema::ArrowError, + #[snafu(implicit)] + location: Location, }, #[snafu(display("Internal error: {}", err_msg))] @@ -685,6 +688,23 @@ pub enum Error { #[snafu(implicit)] location: 
Location, }, + + #[snafu(transparent)] + Partition { + source: partition::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(transparent)] + MetricEngine { + source: metric_engine::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to submit batch: {}", source))] + SubmitBatch { source: Arc }, } pub type Result = std::result::Result; @@ -818,6 +838,9 @@ impl ErrorExt for Error { MemoryLimitExceeded { .. } => StatusCode::RateLimited, GreptimeProto { source, .. } => source.status_code(), + Partition { source, .. } => source.status_code(), + MetricEngine { source, .. } => source.status_code(), + SubmitBatch { source, .. } => source.status_code(), } } diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index eb2086726a..d25be0485f 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -1312,7 +1312,7 @@ mod test { use std::io::Cursor; use std::sync::Arc; - use arrow_ipc::reader::FileReader; + use arrow_ipc::reader::StreamReader; use arrow_schema::DataType; use axum::handler::Handler; use axum::http::StatusCode; @@ -1684,8 +1684,8 @@ mod test { HttpResponse::Arrow(resp) => { let output = resp.data; - let mut reader = - FileReader::try_new(Cursor::new(output), None).expect("Arrow reader error"); + let mut reader = StreamReader::try_new(Cursor::new(output), None) + .expect("Arrow reader error"); let schema = reader.schema(); assert_eq!(schema.fields[0].name(), "numbers"); assert_eq!(schema.fields[0].data_type(), &DataType::UInt32); diff --git a/src/servers/src/http/result/arrow_result.rs b/src/servers/src/http/result/arrow_result.rs index d583c3a590..90f8513827 100644 --- a/src/servers/src/http/result/arrow_result.rs +++ b/src/servers/src/http/result/arrow_result.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use arrow::datatypes::Schema; use arrow_ipc::CompressionType; -use arrow_ipc::writer::{FileWriter, IpcWriteOptions}; +use arrow_ipc::writer::{IpcWriteOptions, StreamWriter}; use 
axum::http::{HeaderValue, header}; use axum::response::{IntoResponse, Response}; use common_error::status_code::StatusCode; @@ -48,7 +48,7 @@ async fn write_arrow_bytes( let options = IpcWriteOptions::default() .try_with_compression(compression) .context(error::ArrowSnafu)?; - let mut writer = FileWriter::try_new_with_options(&mut bytes, schema, options) + let mut writer = StreamWriter::try_new_with_options(&mut bytes, schema, options) .context(error::ArrowSnafu)?; while let Some(rb) = recordbatches.next().await { @@ -164,7 +164,7 @@ impl IntoResponse for ArrowResponse { mod test { use std::io::Cursor; - use arrow_ipc::reader::FileReader; + use arrow_ipc::reader::StreamReader; use arrow_schema::DataType; use common_recordbatch::{RecordBatch, RecordBatches}; use datatypes::prelude::*; @@ -200,8 +200,8 @@ mod test { match http_resp { HttpResponse::Arrow(resp) => { let output = resp.data; - let mut reader = - FileReader::try_new(Cursor::new(output), None).expect("Arrow reader error"); + let mut reader = StreamReader::try_new(Cursor::new(output), None) + .expect("Arrow reader error"); let schema = reader.schema(); assert_eq!(schema.fields[0].name(), "numbers"); assert_eq!(schema.fields[0].data_type(), &DataType::UInt32); diff --git a/src/servers/src/pending_rows_batcher.rs b/src/servers/src/pending_rows_batcher.rs index b6e07d2a81..4cd8331636 100644 --- a/src/servers/src/pending_rows_batcher.rs +++ b/src/servers/src/pending_rows_batcher.rs @@ -31,15 +31,17 @@ use common_grpc::flight::{FlightEncoder, FlightMessage}; use common_meta::node_manager::NodeManagerRef; use common_query::prelude::{GREPTIME_PHYSICAL_TABLE, greptime_timestamp, greptime_value}; use common_telemetry::tracing_context::TracingContext; -use common_telemetry::{debug, error, warn}; +use common_telemetry::{debug, warn}; use dashmap::DashMap; use dashmap::mapref::entry::Entry; use metric_engine::batch_modifier::{TagColumnInfo, modify_batch_sparse}; use partition::manager::PartitionRuleManagerRef; +use 
partition::partition::PartitionRuleRef; use session::context::QueryContextRef; use smallvec::SmallVec; -use snafu::{OptionExt, ensure}; +use snafu::{OptionExt, ResultExt, ensure}; use store_api::storage::{RegionId, TableId}; +use table::metadata::{TableInfo, TableInfoRef}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, broadcast, mpsc, oneshot}; use crate::error; @@ -86,6 +88,116 @@ pub trait PendingRowsSchemaAlterer: Send + Sync { pub type PendingRowsSchemaAltererRef = Arc; +#[derive(Clone)] +pub struct PhysicalTableMetadata { + pub table_info: TableInfoRef, + /// Mapping from column name to column id + pub col_name_to_ids: Option>, +} + +#[async_trait] +pub trait PhysicalFlushCatalogProvider: Send + Sync { + async fn physical_table( + &self, + catalog: &str, + schema: &str, + table_name: &str, + query_ctx: &session::context::QueryContext, + ) -> catalog::error::Result>; +} + +#[async_trait] +pub trait PhysicalFlushPartitionProvider: Send + Sync { + async fn find_table_partition_rule( + &self, + table_info: &TableInfo, + ) -> partition::error::Result; + + async fn find_region_leader(&self, region_id: RegionId) -> Result; +} + +#[async_trait] +pub trait PhysicalFlushNodeRequester: Send + Sync { + async fn handle( + &self, + peer: &Peer, + request: RegionRequest, + ) -> Result; +} + +#[derive(Clone)] +struct CatalogManagerPhysicalFlushAdapter { + catalog_manager: CatalogManagerRef, +} + +#[async_trait] +impl PhysicalFlushCatalogProvider for CatalogManagerPhysicalFlushAdapter { + async fn physical_table( + &self, + catalog: &str, + schema: &str, + table_name: &str, + query_ctx: &session::context::QueryContext, + ) -> catalog::error::Result> { + self.catalog_manager + .table(catalog, schema, table_name, Some(query_ctx)) + .await + .map(|table| { + table.map(|table| { + let table_info = table.table_info(); + let name_to_ids = table_info.name_to_ids(); + PhysicalTableMetadata { + table_info, + col_name_to_ids: name_to_ids, + } + }) + }) + } +} + +#[derive(Clone)] 
+struct PartitionManagerPhysicalFlushAdapter { + partition_manager: PartitionRuleManagerRef, +} + +#[async_trait] +impl PhysicalFlushPartitionProvider for PartitionManagerPhysicalFlushAdapter { + async fn find_table_partition_rule( + &self, + table_info: &TableInfo, + ) -> partition::error::Result { + self.partition_manager + .find_table_partition_rule(table_info) + .await + .map(|(rule, _)| rule) + } + + async fn find_region_leader(&self, region_id: RegionId) -> Result { + let peer = self.partition_manager.find_region_leader(region_id).await?; + Ok(peer) + } +} + +#[derive(Clone)] +struct NodeManagerPhysicalFlushAdapter { + node_manager: NodeManagerRef, +} + +#[async_trait] +impl PhysicalFlushNodeRequester for NodeManagerPhysicalFlushAdapter { + async fn handle( + &self, + peer: &Peer, + request: RegionRequest, + ) -> error::Result { + let datanode = self.node_manager.datanode(peer).await; + datanode + .handle(request) + .await + .context(error::CommonMetaSnafu) + } +} + #[derive(Debug, Clone, Hash, Eq, PartialEq)] struct BatchKey { catalog: String, @@ -94,11 +206,11 @@ struct BatchKey { } #[derive(Debug)] -struct TableBatch { - table_name: String, - table_id: TableId, - batches: Vec, - row_count: usize, +pub struct TableBatch { + pub table_name: String, + pub table_id: TableId, + pub batches: Vec, + pub row_count: usize, } /// Intermediate planning state for resolving and preparing logical tables @@ -114,14 +226,14 @@ struct TableResolutionPlan { struct PendingBatch { tables: HashMap, - created_at: Option, + created_at: Instant, total_row_count: usize, - ctx: Option, + ctx: QueryContextRef, waiters: Vec, } struct FlushWaiter { - response_tx: oneshot::Sender>, + response_tx: oneshot::Sender>>, _permit: OwnedSemaphorePermit, } @@ -142,7 +254,7 @@ enum WorkerCommand { table_batches: Vec<(String, u32, RecordBatch)>, total_rows: usize, ctx: QueryContextRef, - response_tx: oneshot::Sender>, + response_tx: oneshot::Sender>>, _permit: OwnedSemaphorePermit, }, } @@ -301,7 
+413,9 @@ impl PendingRowsBatcher { .await .map_err(|_| error::BatcherChannelClosedSnafu.build())? }; - result.map(|()| total_rows as u64) + result + .context(error::SubmitBatchSnafu) + .map(|()| total_rows as u64) } else { Ok(total_rows as u64) } @@ -706,12 +820,12 @@ impl Drop for PendingRowsBatcher { } impl PendingBatch { - fn new() -> Self { + fn new(ctx: QueryContextRef) -> Self { Self { tables: HashMap::new(), - created_at: None, + created_at: Instant::now(), total_row_count: 0, - ctx: None, + ctx, waiters: Vec::new(), } } @@ -733,7 +847,7 @@ fn start_worker( flush_semaphore: Arc, ) { tokio::spawn(async move { - let mut batch = PendingBatch::new(); + let mut batch = None; let mut interval = tokio::time::interval(flush_interval); let mut shutdown_rx = shutdown.subscribe(); let idle_deadline = tokio::time::Instant::now() + worker_idle_timeout; @@ -747,16 +861,15 @@ fn start_worker( Some(WorkerCommand::Submit { table_batches, total_rows, ctx, response_tx, _permit }) => { idle_timer.as_mut().reset(tokio::time::Instant::now() + worker_idle_timeout); - if batch.total_row_count == 0 { - batch.created_at = Some(Instant::now()); - batch.ctx = Some(ctx); + let pending_batch = batch.get_or_insert_with(||{ PENDING_BATCHES.inc(); - } + PendingBatch::new(ctx) + }); - batch.waiters.push(FlushWaiter { response_tx, _permit }); + pending_batch.waiters.push(FlushWaiter { response_tx, _permit }); for (table_name, table_id, record_batch) in table_batches { - let entry = batch.tables.entry(table_name.clone()).or_insert_with(|| TableBatch { + let entry = pending_batch.tables.entry(table_name.clone()).or_insert_with(|| TableBatch { table_name, table_id, batches: Vec::new(), @@ -766,10 +879,10 @@ fn start_worker( entry.batches.push(record_batch); } - batch.total_row_count += total_rows; + pending_batch.total_row_count += total_rows; PENDING_ROWS.add(total_rows as i64); - if batch.total_row_count >= max_batch_rows + if pending_batch.total_row_count >= max_batch_rows && let Some(flush) 
= drain_batch(&mut batch) { spawn_flush( flush, @@ -794,7 +907,10 @@ fn start_worker( } } _ = &mut idle_timer => { - if !should_close_worker_on_idle_timeout(batch.total_row_count, rx.len()) { + if !should_close_worker_on_idle_timeout( + batch.as_ref().map_or(0, |batch| batch.total_row_count), + rx.len(), + ) { idle_timer .as_mut() .reset(tokio::time::Instant::now() + worker_idle_timeout); @@ -810,9 +926,9 @@ fn start_worker( break; } _ = interval.tick() => { - if let Some(created_at) = batch.created_at - && batch.total_row_count > 0 - && created_at.elapsed() >= flush_interval + if batch + .as_ref() + .is_some_and(|batch| batch.created_at.elapsed() >= flush_interval) && let Some(flush) = drain_batch(&mut batch) { spawn_flush( flush, @@ -862,24 +978,16 @@ fn should_close_worker_on_idle_timeout(total_row_count: usize, queued_requests: total_row_count == 0 && queued_requests == 0 } -fn drain_batch(batch: &mut PendingBatch) -> Option { - if batch.total_row_count == 0 { +fn drain_batch(batch: &mut Option) -> Option { + let batch = batch.take()?; + let total_row_count = batch.total_row_count; + + if total_row_count == 0 { return None; } - let ctx = match batch.ctx.take() { - Some(ctx) => ctx, - None => { - flush_with_error(batch, "Pending batch missing context"); - return None; - } - }; - - let total_row_count = batch.total_row_count; - let table_batches = std::mem::take(&mut batch.tables).into_values().collect(); - let waiters = std::mem::take(&mut batch.waiters); - batch.total_row_count = 0; - batch.created_at = None; + let table_batches = batch.tables.into_values().collect(); + let waiters = batch.waiters; PENDING_ROWS.sub(total_row_count as i64); PENDING_BATCHES.dec(); @@ -887,7 +995,7 @@ fn drain_batch(batch: &mut PendingBatch) -> Option { Some(FlushBatch { table_batches, total_row_count, - ctx, + ctx: batch.ctx, waiters, }) } @@ -914,15 +1022,25 @@ async fn spawn_flush( } struct FlushRegionWrite { - region_id: RegionId, - row_count: usize, datanode: Peer, request: 
RegionRequest, } -enum FlushWriteResult { - Success { row_count: usize }, - Failed { row_count: usize, message: String }, +struct PlannedRegionBatch { + region_id: RegionId, + batch: RecordBatch, +} + +#[cfg(test)] +impl PlannedRegionBatch { + fn num_rows(&self) -> usize { + self.batch.num_rows() + } +} + +struct ResolvedRegionBatch { + planned: PlannedRegionBatch, + datanode: Peer, } fn should_dispatch_concurrently(region_write_count: usize) -> bool { @@ -1045,65 +1163,35 @@ fn strip_partition_columns_from_batch(batch: RecordBatch) -> Result } ); let essential_indices: Vec = (0..PHYSICAL_REGION_ESSENTIAL_COLUMN_COUNT).collect(); - batch - .project(&essential_indices) - .map_err(|err| Error::Internal { - err_msg: format!("Failed to project essential columns from RecordBatch: {err}"), - }) + batch.project(&essential_indices).context(error::ArrowSnafu) } async fn flush_region_writes_concurrently( - node_manager: NodeManagerRef, + node_manager: &(impl PhysicalFlushNodeRequester + ?Sized), writes: Vec, -) -> Vec { +) -> Result<()> { if !should_dispatch_concurrently(writes.len()) { - let mut results = Vec::with_capacity(writes.len()); for write in writes { - let datanode = node_manager.datanode(&write.datanode).await; let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_write_region"]) .start_timer(); - match datanode.handle(write.request).await { - Ok(_) => results.push(FlushWriteResult::Success { - row_count: write.row_count, - }), - Err(err) => results.push(FlushWriteResult::Failed { - row_count: write.row_count, - message: format!( - "Bulk insert flush failed for region {}: {:?}", - write.region_id, err - ), - }), - } + node_manager.handle(&write.datanode, write.request).await?; } - return results; + return Ok(()); } - let write_futures = writes.into_iter().map(|write| { - let node_manager = node_manager.clone(); - async move { - let datanode = node_manager.datanode(&write.datanode).await; - let _timer = 
PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_write_region"]) - .start_timer(); + let write_futures = writes.into_iter().map(|write| async move { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_write_region"]) + .start_timer(); - match datanode.handle(write.request).await { - Ok(_) => FlushWriteResult::Success { - row_count: write.row_count, - }, - Err(err) => FlushWriteResult::Failed { - row_count: write.row_count, - message: format!( - "Bulk insert flush failed for region {}: {:?}", - write.region_id, err - ), - }, - } - } + node_manager.handle(&write.datanode, write.request).await?; + Ok::<_, Error>(()) }); // todo(hl): should be bounded. - futures::future::join_all(write_futures).await + futures::future::try_join_all(write_futures).await?; + Ok(()) } async fn flush_batch( @@ -1119,7 +1207,6 @@ async fn flush_batch( waiters, } = flush; let start = Instant::now(); - let mut first_error: Option = None; // Physical-table-level flush: transform all logical table batches // into physical format and write them together. 
@@ -1127,169 +1214,148 @@ async fn flush_batch( .extension(PHYSICAL_TABLE_KEY) .unwrap_or(GREPTIME_PHYSICAL_TABLE) .to_string(); - flush_batch_physical( + let partition_provider = PartitionManagerPhysicalFlushAdapter { partition_manager }; + let node_requester = NodeManagerPhysicalFlushAdapter { node_manager }; + let catalog_provider = CatalogManagerPhysicalFlushAdapter { catalog_manager }; + let result = flush_batch_physical( &table_batches, - total_row_count, &physical_table_name, &ctx, - &partition_manager, - &node_manager, - &catalog_manager, - &mut first_error, + &partition_provider, + &node_requester, + &catalog_provider, ) .await; let elapsed = start.elapsed().as_secs_f64(); FLUSH_ELAPSED.observe(elapsed); + + if result.is_err() { + FLUSH_FAILURES.inc(); + FLUSH_DROPPED_ROWS.inc_by(total_row_count as u64); + } else { + FLUSH_TOTAL.inc(); + FLUSH_ROWS.observe(total_row_count as f64); + } + debug!( "Pending rows batch flushed, total rows: {}, elapsed time: {}s", total_row_count, elapsed ); - notify_waiters(waiters, &first_error); + notify_waiters(waiters, result); } -/// Attempts to flush all table batches by transforming them into the physical -/// table format (sparse primary key encoding) and writing directly to the -/// physical data regions. +/// Flushes a batch of logical table rows by transforming them into the physical table format +/// and writing them to the appropriate datanode regions. /// -/// This is the only flush path. Any failure in resolving or transforming the -/// physical flush inputs is recorded as flush failure and reported to waiters. -#[allow(clippy::too_many_arguments)] -async fn flush_batch_physical( +/// This function performs the end-to-end physical flush pipeline: +/// 1. Resolves the physical table metadata and column ID mapping. +/// 2. Fetches the physical table's partition rule. +/// 3. Transforms each logical table batch into the physical (sparse primary key) format. +/// 4. 
Concatenates all transformed batches into a single combined batch. +/// 5. Splits the combined batch by partition rule and sends region write requests +/// concurrently to the target datanodes. +pub async fn flush_batch_physical( table_batches: &[TableBatch], - total_row_count: usize, physical_table_name: &str, ctx: &QueryContextRef, - partition_manager: &PartitionRuleManagerRef, - node_manager: &NodeManagerRef, - catalog_manager: &CatalogManagerRef, - first_error: &mut Option, -) { - macro_rules! record_failure { - ($row_count:expr, $msg:expr) => {{ - let msg = $msg; - if first_error.is_none() { - *first_error = Some(msg.clone()); - } - mark_flush_failure($row_count, &msg); - }}; - } - + partition_manager: &(impl PhysicalFlushPartitionProvider + ?Sized), + node_manager: &(impl PhysicalFlushNodeRequester + ?Sized), + catalog_manager: &(impl PhysicalFlushCatalogProvider + ?Sized), +) -> Result<()> { // 1. Resolve the physical table and get column ID mapping let physical_table = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_resolve_table"]) .start_timer(); - match catalog_manager - .table( + catalog_manager + .physical_table( ctx.current_catalog(), &ctx.current_schema(), physical_table_name, - Some(ctx.as_ref()), + ctx.as_ref(), ) - .await - { - Ok(Some(table)) => table, - Ok(None) => { - record_failure!( - total_row_count, - format!( - "Physical table '{}' not found during pending flush", - physical_table_name - ) - ); - return; - } - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to resolve physical table '{}' for pending flush: {:?}", - physical_table_name, err - ) - ); - return; - } - } + .await? + .with_context(|| error::InternalSnafu { + err_msg: format!( + "Physical table '{}' not found during pending flush", + physical_table_name + ), + })? 
}; - let physical_table_info = physical_table.table_info(); - let name_to_ids = match physical_table_info.name_to_ids() { - Some(ids) => ids, - None => { - record_failure!( - total_row_count, - format!( - "Physical table '{}' has no column IDs for pending flush", - physical_table_name - ) - ); - return; - } - }; + let physical_table_info = physical_table.table_info; + let name_to_ids = physical_table + .col_name_to_ids + .with_context(|| error::InternalSnafu { + err_msg: format!( + "Physical table '{}' has no column IDs for pending flush", + physical_table_name + ), + })?; // 2. Get the physical table's partition rule (one lookup instead of N) let partition_rule = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_fetch_partition_rule"]) .start_timer(); - match partition_manager - .find_table_partition_rule(&physical_table_info) - .await - { - Ok(rule) => rule, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to fetch partition rule for physical table '{}': {:?}", - physical_table_name, err - ) - ); - return; - } - } + partition_manager + .find_table_partition_rule(physical_table_info.as_ref()) + .await? }; - let partition_columns = partition_rule.0.partition_columns(); + let partition_columns = partition_rule.partition_columns(); let partition_columns_set: HashSet<&str> = partition_columns.iter().map(String::as_str).collect(); // 3. Transform each logical table batch into physical format - let mut modified_batches: Vec = Vec::with_capacity(table_batches.len()); - let mut modified_row_count: usize = 0; + let modified_batches = + transform_logical_batches_to_physical(table_batches, &name_to_ids, &partition_columns_set)?; + + // 4. Concatenate all modified batches (all share the same physical schema) + let combined_batch = concat_modified_batches(&modified_batches)?; + + // 5. 
Split by physical partition rule and send to regions + let physical_table_id = physical_table_info.table_id(); + let planned_batches = plan_region_batches( + combined_batch, + physical_table_id, + partition_rule.as_ref(), + partition_columns, + )?; + + let resolved_batches = resolve_region_targets(planned_batches, partition_manager).await?; + let region_writes = encode_region_write_requests(resolved_batches)?; + flush_region_writes_concurrently(node_manager, region_writes).await +} + +/// Transforms logical table batches into physical format (sparse primary key encoding). +/// +/// It identifies tag columns and essential columns (timestamp, value) for each logical batch +/// and applies sparse primary key modification. +fn transform_logical_batches_to_physical( + table_batches: &[TableBatch], + name_to_ids: &HashMap, + partition_columns_set: &HashSet<&str>, +) -> Result> { + let mut modified_batches: Vec = + Vec::with_capacity(table_batches.iter().map(|b| b.batches.len()).sum()); let mut modify_elapsed = Duration::ZERO; let mut columns_taxonomy_elapsed = Duration::ZERO; - 'next_table: for table_batch in table_batches { + for table_batch in table_batches { let table_id = table_batch.table_id; - // Transform each chunk to physical format directly, avoiding an - // intermediate concat_batches per logical table. for batch in &table_batch.batches { - // Identify tag columns and non-tag columns from the logical batch schema. - // Chunks within a table_batch may have different schemas if new tag columns - // are added between submits. - // In prom batches, Float64 = value, Timestamp = timestamp, Utf8 = tags. 
let batch_schema = batch.schema(); let start = Instant::now(); - let (tag_columns, essential_col_indices) = match columns_taxonomy( + let (tag_columns, essential_col_indices) = columns_taxonomy( &batch_schema, &table_batch.table_name, - &name_to_ids, - &partition_columns_set, - ) { - Ok(columns) => columns, - Err(err) => { - warn!( - "Failed to resolve columns for logical table '{}': {:?}", - table_batch.table_name, err - ); - record_failure!(table_batch.row_count, err.to_string()); - continue 'next_table; - } - }; + name_to_ids, + partition_columns_set, + )?; columns_taxonomy_elapsed += start.elapsed(); if tag_columns.is_empty() && essential_col_indices.is_empty() { @@ -1298,30 +1364,16 @@ async fn flush_batch_physical( let modified = { let start = Instant::now(); - match modify_batch_sparse( + let batch = modify_batch_sparse( batch.clone(), table_id, &tag_columns, &essential_col_indices, - ) { - Ok(batch) => { - modify_elapsed += start.elapsed(); - batch - } - Err(err) => { - record_failure!( - table_batch.row_count, - format!( - "Failed to modify batch for logical table '{}': {:?}", - table_batch.table_name, err - ) - ); - continue 'next_table; - } - } + )?; + modify_elapsed += start.elapsed(); + batch }; - modified_row_count += modified.num_rows(); modified_batches.push(modified); } } @@ -1333,147 +1385,130 @@ async fn flush_batch_physical( .with_label_values(&["flush_physical_columns_taxonomy"]) .observe(columns_taxonomy_elapsed.as_secs_f64()); - if modified_batches.is_empty() { - if first_error.is_none() { - record_failure!( - total_row_count, - format!( - "No batches can be transformed for physical table '{}' during pending flush", - physical_table_name - ) - ); + ensure!( + !modified_batches.is_empty(), + error::InternalSnafu { + err_msg: "No batches can be transformed during pending flush", } - return; + ); + Ok(modified_batches) +} + +/// Concatenates all modified batches into a single large batch. 
+/// +/// All modified batches share the same physical schema. +fn concat_modified_batches(modified_batches: &[RecordBatch]) -> Result { + let combined_schema = modified_batches[0].schema(); + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_concat_all"]) + .start_timer(); + concat_batches(&combined_schema, modified_batches).context(error::ArrowSnafu) +} + +fn split_combined_batch_by_region( + combined_batch: &RecordBatch, + partition_rule: &dyn partition::partition::PartitionRule, +) -> Result> { + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_split_record_batch"]) + .start_timer(); + let map = partition_rule.split_record_batch(combined_batch)?; + Ok(map) +} + +fn prepare_physical_region_routing_batch( + combined_batch: RecordBatch, + partition_columns: &[String], +) -> Result { + if partition_columns.is_empty() { + return Ok(combined_batch); + } + strip_partition_columns_from_batch(combined_batch) +} + +fn plan_region_batch( + stripped_batch: &RecordBatch, + physical_table_id: TableId, + region_number: u32, + mask: &partition::partition::RegionMask, +) -> Result> { + if mask.select_none() { + return Ok(None); } - // 4. Concatenate all modified batches (all share the same physical schema) - let combined_batch = { - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_physical_concat_all"]) - .start_timer(); - let combined_schema = modified_batches[0].schema(); - // todo(hl): maybe limit max rows to concat. - match concat_batches(&combined_schema, &modified_batches) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - modified_row_count, - format!("Failed to concat modified batches: {:?}", err) - ); - return; - } - } - }; - - // 5. 
Split by physical partition rule and send to regions - let physical_table_id = physical_table_info.table_id(); - let region_masks = { - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_physical_split_record_batch"]) - .start_timer(); - match partition_rule.0.split_record_batch(&combined_batch) { - Ok(masks) => masks, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to split combined batch for physical table '{}': {:?}", - physical_table_name, err - ) - ); - return; - } - } - }; - - let stripped_batch = if partition_columns.is_empty() { - combined_batch + let region_batch = if mask.select_all() { + stripped_batch.clone() } else { - // Strip partition columns before encoding and sending requests. - match strip_partition_columns_from_batch(combined_batch) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to strip partition columns for physical table '{}': {:?}", - physical_table_name, err - ) - ); - return; - } - } + let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED + .with_label_values(&["flush_physical_filter_record_batch"]) + .start_timer(); + filter_record_batch(stripped_batch, mask.array()).context(error::ArrowSnafu)? 
}; - let mut region_writes = Vec::new(); + let row_count = region_batch.num_rows(); + if row_count == 0 { + return Ok(None); + } + + Ok(Some(PlannedRegionBatch { + region_id: RegionId::new(physical_table_id, region_number), + batch: region_batch, + })) +} + +fn plan_region_batches( + combined_batch: RecordBatch, + physical_table_id: TableId, + partition_rule: &dyn partition::partition::PartitionRule, + partition_columns: &[String], +) -> Result> { + let region_masks = split_combined_batch_by_region(&combined_batch, partition_rule)?; + let stripped_batch = prepare_physical_region_routing_batch(combined_batch, partition_columns)?; + + let mut planned_batches = Vec::new(); for (region_number, mask) in region_masks { - if mask.select_none() { - continue; + if let Some(planned_batch) = + plan_region_batch(&stripped_batch, physical_table_id, region_number, &mask)? + { + planned_batches.push(planned_batch); } + } - let region_batch = if mask.select_all() { - stripped_batch.clone() - } else { - let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED - .with_label_values(&["flush_physical_filter_record_batch"]) - .start_timer(); - match filter_record_batch(&stripped_batch, mask.array()) { - Ok(batch) => batch, - Err(err) => { - record_failure!( - total_row_count, - format!( - "Failed to filter combined batch for physical table '{}': {:?}", - physical_table_name, err - ) - ); - continue; - } - } - }; + Ok(planned_batches) +} - let row_count = region_batch.num_rows(); - if row_count == 0 { - continue; - } - - let region_id = RegionId::new(physical_table_id, region_number); +async fn resolve_region_targets( + planned_batches: Vec, + partition_manager: &(impl PhysicalFlushPartitionProvider + ?Sized), +) -> Result> { + let mut resolved_batches = Vec::with_capacity(planned_batches.len()); + for planned in planned_batches { let datanode = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_resolve_region_leader"]) .start_timer(); - match 
partition_manager.find_region_leader(region_id).await { - Ok(peer) => peer, - Err(err) => { - record_failure!( - row_count, - format!( - "Failed to resolve region leader for physical region {}: {:?}", - region_id, err - ) - ); - continue; - } - } + partition_manager + .find_region_leader(planned.region_id) + .await? }; + resolved_batches.push(ResolvedRegionBatch { planned, datanode }); + } + + Ok(resolved_batches) +} + +fn encode_region_write_requests( + resolved_batches: Vec, +) -> Result> { + let mut region_writes = Vec::with_capacity(resolved_batches.len()); + for resolved in resolved_batches { + let region_id = resolved.planned.region_id; let (schema_bytes, data_header, payload) = { let _timer = PENDING_ROWS_BATCH_FLUSH_STAGE_ELAPSED .with_label_values(&["flush_physical_encode_ipc"]) .start_timer(); - match record_batch_to_ipc(region_batch) { - Ok(encoded) => encoded, - Err(err) => { - record_failure!( - row_count, - format!( - "Failed to encode Arrow IPC for physical region {}: {:?}", - region_id, err - ) - ); - continue; - } - } + record_batch_to_ipc(resolved.planned.batch)? 
}; let request = RegionRequest { @@ -1493,65 +1528,25 @@ async fn flush_batch_physical( }; region_writes.push(FlushRegionWrite { - region_id, - row_count, - datanode, + datanode: resolved.datanode, request, }); } - for result in flush_region_writes_concurrently(node_manager.clone(), region_writes).await { - match result { - FlushWriteResult::Success { row_count } => { - FLUSH_TOTAL.inc(); - FLUSH_ROWS.observe(row_count as f64); - } - FlushWriteResult::Failed { row_count, message } => { - record_failure!(row_count, message); - } - } - } + Ok(region_writes) } -fn notify_waiters(waiters: Vec, first_error: &Option) { +fn notify_waiters(waiters: Vec, result: Result<()>) { + let shared_result = result.map_err(Arc::new); for waiter in waiters { - let result = match first_error { - Some(err_msg) => Err(Error::Internal { - err_msg: err_msg.clone(), - }), - None => Ok(()), - }; - let _ = waiter.response_tx.send(result); + let _ = waiter.response_tx.send(match &shared_result { + Ok(()) => Ok(()), + Err(error) => Err(Arc::clone(error)), + }); // waiter._permit is dropped here, releasing the inflight semaphore slot } } -fn mark_flush_failure(row_count: usize, message: &str) { - error!("Pending rows batch flush failed, message: {}", message); - FLUSH_FAILURES.inc(); - FLUSH_DROPPED_ROWS.inc_by(row_count as u64); -} - -fn flush_with_error(batch: &mut PendingBatch, message: &str) { - if batch.total_row_count == 0 { - return; - } - - let row_count = batch.total_row_count; - let waiters = std::mem::take(&mut batch.waiters); - batch.tables.clear(); - batch.total_row_count = 0; - batch.created_at = None; - batch.ctx = None; - - PENDING_ROWS.sub(row_count as i64); - PENDING_BATCHES.dec(); - - let err_msg = Some(message.to_string()); - notify_waiters(waiters, &err_msg); - mark_flush_failure(row_count, message); -} - fn record_batch_to_ipc(record_batch: RecordBatch) -> Result<(Bytes, Bytes, Bytes)> { let mut encoder = FlightEncoder::default(); let schema = 
encoder.encode_schema(record_batch.schema().as_ref()); @@ -1581,17 +1576,18 @@ mod tests { use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; - use std::time::Duration; + use std::time::{Duration, Instant}; use api::region::RegionResponse; use api::v1::flow::{DirtyWindowRequests, FlowRequest, FlowResponse}; use api::v1::meta::Peer; - use api::v1::region::{InsertRequests, RegionRequest}; + use api::v1::region::{InsertRequests, RegionRequest, region_request}; use api::v1::{ColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows}; - use arrow::array::{BinaryArray, StringArray, TimestampMillisecondArray}; + use arrow::array::{BinaryArray, BooleanArray, StringArray, TimestampMillisecondArray}; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use arrow::record_batch::RecordBatch; use async_trait::async_trait; + use catalog::error::Result as CatalogResult; use common_meta::error::Result as MetaResult; use common_meta::node_manager::{ Datanode, DatanodeManager, DatanodeRef, Flownode, FlownodeManager, FlownodeRef, @@ -1599,17 +1595,28 @@ mod tests { use common_query::request::QueryRequest; use common_recordbatch::SendableRecordBatchStream; use dashmap::DashMap; + use datatypes::schema::{ColumnSchema as DtColumnSchema, Schema as DtSchema}; + use partition::error::Result as PartitionResult; + use partition::partition::{PartitionRule, PartitionRuleRef, RegionMask}; use smallvec::SmallVec; + use snafu::ResultExt; use store_api::storage::RegionId; - use tokio::sync::mpsc; + use table::metadata::TableId; + use table::test_util::table_info::test_table_info; + use tokio::sync::{Semaphore, mpsc, oneshot}; use tokio::time::sleep; use super::{ - BatchKey, Error, FlushRegionWrite, FlushWriteResult, PendingRowsBatcher, PendingWorker, - WorkerCommand, columns_taxonomy, flush_region_writes_concurrently, - remove_worker_if_same_channel, should_close_worker_on_idle_timeout, + BatchKey, Error, 
FlushRegionWrite, FlushWaiter, PendingBatch, PendingRowsBatcher, + PendingWorker, PhysicalFlushCatalogProvider, PhysicalFlushNodeRequester, + PhysicalFlushPartitionProvider, PhysicalTableMetadata, PlannedRegionBatch, + ResolvedRegionBatch, TableBatch, WorkerCommand, columns_taxonomy, drain_batch, + encode_region_write_requests, flush_batch_physical, flush_region_writes_concurrently, + plan_region_batches, remove_worker_if_same_channel, should_close_worker_on_idle_timeout, should_dispatch_concurrently, strip_partition_columns_from_batch, + transform_logical_batches_to_physical, }; + use crate::error; fn mock_rows(row_count: usize, schema_name: &str) -> Rows { Rows { @@ -1621,6 +1628,190 @@ mod tests { } } + fn mock_tag_batch(tag_name: &str, tag_value: &str, ts: i64, val: f64) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new(tag_name, ArrowDataType::Utf8, true), + ])); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(vec![ts])), + Arc::new(arrow::array::Float64Array::from(vec![val])), + Arc::new(StringArray::from(vec![tag_value])), + ], + ) + .unwrap() + } + + fn mock_physical_table_metadata(table_id: TableId) -> PhysicalTableMetadata { + let schema = Arc::new( + DtSchema::try_new(vec![ + DtColumnSchema::new( + "__primary_key", + datatypes::prelude::ConcreteDataType::binary_datatype(), + false, + ), + DtColumnSchema::new( + "greptime_timestamp", + datatypes::prelude::ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + DtColumnSchema::new( + "greptime_value", + datatypes::prelude::ConcreteDataType::float64_datatype(), + true, + ), + DtColumnSchema::new( + "tag1", + datatypes::prelude::ConcreteDataType::string_datatype(), + true, + ), + ]) + .unwrap(), + ); + let mut table_info = 
test_table_info(table_id, "phy", "public", "greptime", schema); + table_info.meta.column_ids = vec![0, 1, 2, 3]; + + PhysicalTableMetadata { + table_info: Arc::new(table_info), + col_name_to_ids: Some(HashMap::from([("tag1".to_string(), 3)])), + } + } + + struct MockFlushCatalogProvider { + table: Option, + } + + #[async_trait] + impl PhysicalFlushCatalogProvider for MockFlushCatalogProvider { + async fn physical_table( + &self, + _catalog: &str, + _schema: &str, + _table_name: &str, + _query_ctx: &session::context::QueryContext, + ) -> CatalogResult> { + Ok(self.table.clone()) + } + } + + struct SingleRegionPartitionRule; + + impl PartitionRule for SingleRegionPartitionRule { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn partition_columns(&self) -> &[String] { + &[] + } + + fn find_region( + &self, + _values: &[datatypes::prelude::Value], + ) -> partition::error::Result { + unimplemented!() + } + + fn split_record_batch( + &self, + record_batch: &RecordBatch, + ) -> partition::error::Result> + { + Ok(HashMap::from([( + 1, + RegionMask::new( + arrow::array::BooleanArray::from(vec![true; record_batch.num_rows()]), + record_batch.num_rows(), + ), + )])) + } + } + + struct TwoRegionPartitionRule { + partition_columns: Vec, + } + + impl PartitionRule for TwoRegionPartitionRule { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn partition_columns(&self) -> &[String] { + &self.partition_columns + } + + fn find_region( + &self, + _values: &[datatypes::prelude::Value], + ) -> partition::error::Result { + unimplemented!() + } + + fn split_record_batch( + &self, + _record_batch: &RecordBatch, + ) -> partition::error::Result> + { + Ok(HashMap::from([ + (1, RegionMask::new(BooleanArray::from(vec![true, false]), 1)), + (2, RegionMask::new(BooleanArray::from(vec![false, true]), 1)), + ( + 3, + RegionMask::new(BooleanArray::from(vec![false, false]), 0), + ), + ])) + } + } + + struct MockFlushPartitionProvider { + partition_rule_calls: Arc, + 
region_leader_calls: Arc, + } + + #[async_trait] + impl PhysicalFlushPartitionProvider for MockFlushPartitionProvider { + async fn find_table_partition_rule( + &self, + _table_info: &table::metadata::TableInfo, + ) -> PartitionResult { + self.partition_rule_calls.fetch_add(1, Ordering::SeqCst); + Ok(Arc::new(SingleRegionPartitionRule)) + } + + async fn find_region_leader(&self, _region_id: RegionId) -> error::Result { + self.region_leader_calls.fetch_add(1, Ordering::SeqCst); + Ok(Peer { + id: 1, + addr: "node-1".to_string(), + }) + } + } + + #[derive(Default)] + struct MockFlushNodeRequester { + writes: Arc, + } + + #[async_trait] + impl PhysicalFlushNodeRequester for MockFlushNodeRequester { + async fn handle( + &self, + _peer: &Peer, + _request: RegionRequest, + ) -> error::Result { + self.writes.fetch_add(1, Ordering::SeqCst); + Ok(RegionResponse::new(0)) + } + } + #[test] fn test_collect_non_empty_table_rows_filters_empty_payloads() { let requests = RowInsertRequests { @@ -1648,6 +1839,38 @@ mod tests { assert_eq!(2, table_rows[0].1.rows.len()); } + #[test] + fn test_drain_batch_takes_initialized_pending_batch_from_option() { + let ctx = session::context::QueryContext::arc(); + let (response_tx, _response_rx) = oneshot::channel(); + let permit = Arc::new(Semaphore::new(1)).try_acquire_owned().unwrap(); + let mut batch = Some(PendingBatch { + tables: HashMap::from([( + "cpu".to_string(), + TableBatch { + table_name: "cpu".to_string(), + table_id: 42, + batches: vec![mock_tag_batch("tag1", "host-1", 1000, 1.0)], + row_count: 1, + }, + )]), + created_at: Instant::now(), + total_row_count: 1, + ctx: ctx.clone(), + waiters: vec![FlushWaiter { + response_tx, + _permit: permit, + }], + }); + + let flush = drain_batch(&mut batch).unwrap(); + + assert!(batch.is_none()); + assert_eq!(1, flush.total_row_count); + assert_eq!(1, flush.table_batches.len()); + assert_eq!(ctx.current_catalog(), flush.ctx.current_catalog()); + } + #[derive(Clone)] struct ConcurrentMockDatanode 
{ delay: Duration, @@ -1728,6 +1951,21 @@ mod tests { } } + #[async_trait] + impl PhysicalFlushNodeRequester for ConcurrentMockNodeManager { + async fn handle( + &self, + peer: &Peer, + request: RegionRequest, + ) -> error::Result { + let datanode = self.datanode(peer).await; + datanode + .handle(request) + .await + .context(error::CommonMetaSnafu) + } + } + #[test] fn test_remove_worker_if_same_channel_removes_matching_entry() { let workers = DashMap::new(); @@ -1798,8 +2036,6 @@ mod tests { let writes = vec![ FlushRegionWrite { - region_id: RegionId::new(1024, 1), - row_count: 10, datanode: Peer { id: 1, addr: "node1".to_string(), @@ -1807,8 +2043,6 @@ mod tests { request: RegionRequest::default(), }, FlushRegionWrite { - region_id: RegionId::new(1024, 2), - row_count: 12, datanode: Peer { id: 2, addr: "node2".to_string(), @@ -1817,13 +2051,9 @@ mod tests { }, ]; - let results = flush_region_writes_concurrently(node_manager, writes).await; - assert_eq!(2, results.len()); - assert!( - results - .iter() - .all(|result| matches!(result, FlushWriteResult::Success { .. })) - ); + flush_region_writes_concurrently(node_manager.as_ref(), writes) + .await + .unwrap(); assert!(max_inflight.load(Ordering::SeqCst) >= 2); } @@ -2108,4 +2338,304 @@ mod tests { "PK should be different because batch2 has tag2!" 
); } + + #[test] + fn test_transform_logical_batches_to_physical_success() { + let batch = mock_tag_batch("tag1", "v1", 1000, 1.0); + + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch], + row_count: 1, + }]; + + let name_to_ids = HashMap::from([("tag1".to_string(), 1)]); + let partition_columns = HashSet::new(); + let modified = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap(); + + assert_eq!(1, modified.len()); + assert_eq!(3, modified[0].num_columns()); + assert_eq!("__primary_key", modified[0].schema().field(0).name()); + assert_eq!("greptime_timestamp", modified[0].schema().field(1).name()); + assert_eq!("greptime_value", modified[0].schema().field(2).name()); + } + + #[test] + fn test_transform_logical_batches_to_physical_taxonomy_failure() { + let batch = mock_tag_batch("tag1", "v1", 1000, 1.0); + + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch], + row_count: 1, + }]; + + // tag1 is missing from name_to_ids, causing columns_taxonomy to fail. 
+ let name_to_ids = HashMap::new(); + let partition_columns = HashSet::new(); + let err = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap_err(); + + assert!( + err.to_string() + .contains("not found in physical table column IDs") + ); + } + + #[test] + fn test_transform_logical_batches_to_physical_multiple_batches() { + let batch1 = mock_tag_batch("tag1", "v1", 1000, 1.0); + let batch2 = mock_tag_batch("tag2", "v2", 2000, 2.0); + + let table_batches = vec![ + TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch1], + row_count: 1, + }, + TableBatch { + table_name: "t2".to_string(), + table_id: 2, + batches: vec![batch2], + row_count: 1, + }, + ]; + + let name_to_ids = HashMap::from([("tag1".to_string(), 1), ("tag2".to_string(), 2)]); + let partition_columns = HashSet::new(); + let modified = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap(); + + assert_eq!(2, modified.len()); + } + + #[test] + fn test_transform_logical_batches_to_physical_mixed_success_failure() { + let batch1 = mock_tag_batch("tag1", "v1", 1000, 1.0); + let batch2 = mock_tag_batch("tag2", "v2", 2000, 2.0); + + let table_batches = vec![ + TableBatch { + table_name: "t1".to_string(), + table_id: 1, + batches: vec![batch1], + row_count: 1, + }, + TableBatch { + table_name: "t2".to_string(), + table_id: 2, + batches: vec![batch2], + row_count: 1, + }, + ]; + + // tag1 is missing from name_to_ids, causing batch1 to fail. 
+ let name_to_ids = HashMap::from([("tag2".to_string(), 2)]); + let partition_columns = HashSet::new(); + let err = + transform_logical_batches_to_physical(&table_batches, &name_to_ids, &partition_columns) + .unwrap_err(); + + assert!(err.to_string().contains("tag1")); + } + + #[tokio::test] + async fn test_flush_batch_physical_uses_mockable_trait_dependencies() { + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 11, + batches: vec![mock_tag_batch("tag1", "host-1", 1000, 1.0)], + row_count: 1, + }]; + let partition_calls = Arc::new(AtomicUsize::new(0)); + let leader_calls = Arc::new(AtomicUsize::new(0)); + let node = MockFlushNodeRequester::default(); + let ctx = session::context::QueryContext::arc(); + + flush_batch_physical( + &table_batches, + "phy", + &ctx, + &MockFlushPartitionProvider { + partition_rule_calls: partition_calls.clone(), + region_leader_calls: leader_calls.clone(), + }, + &node, + &MockFlushCatalogProvider { + table: Some(mock_physical_table_metadata(1024)), + }, + ) + .await + .unwrap(); + + assert_eq!(1, partition_calls.load(Ordering::SeqCst)); + assert_eq!(1, leader_calls.load(Ordering::SeqCst)); + assert_eq!(1, node.writes.load(Ordering::SeqCst)); + } + + #[tokio::test] + async fn test_flush_batch_physical_stops_before_partition_and_node_when_table_missing() { + let table_batches = vec![TableBatch { + table_name: "t1".to_string(), + table_id: 11, + batches: vec![mock_tag_batch("tag1", "host-1", 1000, 1.0)], + row_count: 1, + }]; + let partition_calls = Arc::new(AtomicUsize::new(0)); + let leader_calls = Arc::new(AtomicUsize::new(0)); + let node = MockFlushNodeRequester::default(); + let ctx = session::context::QueryContext::arc(); + + let err = flush_batch_physical( + &table_batches, + "missing_phy", + &ctx, + &MockFlushPartitionProvider { + partition_rule_calls: partition_calls.clone(), + region_leader_calls: leader_calls.clone(), + }, + &node, + &MockFlushCatalogProvider { table: None }, + ) + .await + 
.unwrap_err(); + + assert!( + err.to_string() + .contains("Physical table 'missing_phy' not found") + ); + assert_eq!(0, partition_calls.load(Ordering::SeqCst)); + assert_eq!(0, leader_calls.load(Ordering::SeqCst)); + assert_eq!(0, node.writes.load(Ordering::SeqCst)); + } + + #[tokio::test] + async fn test_flush_batch_physical_aborts_immediately_on_transform_error() { + let table_batches = vec![ + TableBatch { + table_name: "broken".to_string(), + table_id: 11, + batches: vec![mock_tag_batch("unknown_tag", "host-1", 1000, 1.0)], + row_count: 1, + }, + TableBatch { + table_name: "healthy".to_string(), + table_id: 12, + batches: vec![mock_tag_batch("tag1", "host-2", 2000, 2.0)], + row_count: 1, + }, + ]; + let partition_calls = Arc::new(AtomicUsize::new(0)); + let leader_calls = Arc::new(AtomicUsize::new(0)); + let node = MockFlushNodeRequester::default(); + let ctx = session::context::QueryContext::arc(); + + let err = flush_batch_physical( + &table_batches, + "phy", + &ctx, + &MockFlushPartitionProvider { + partition_rule_calls: partition_calls.clone(), + region_leader_calls: leader_calls.clone(), + }, + &node, + &MockFlushCatalogProvider { + table: Some(mock_physical_table_metadata(1024)), + }, + ) + .await + .unwrap_err(); + + assert!(err.to_string().contains("unknown_tag")); + assert_eq!(1, partition_calls.load(Ordering::SeqCst)); + assert_eq!(0, leader_calls.load(Ordering::SeqCst)); + assert_eq!(0, node.writes.load(Ordering::SeqCst)); + } + + #[test] + fn test_plan_region_batches_splits_and_strips_partition_columns() { + let combined_batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + Field::new("host", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(BinaryArray::from(vec![b"k1".as_slice(), 
b"k2".as_slice()])), + Arc::new(TimestampMillisecondArray::from(vec![1000_i64, 2000_i64])), + Arc::new(arrow::array::Float64Array::from(vec![1.0_f64, 2.0_f64])), + Arc::new(StringArray::from(vec!["node-1", "node-2"])), + ], + ) + .unwrap(); + let mut planned_batches = plan_region_batches( + combined_batch, + 1024, + &TwoRegionPartitionRule { + partition_columns: vec!["host".to_string()], + }, + &["host".to_string()], + ) + .unwrap(); + planned_batches.sort_by_key(|planned| planned.region_id.region_number()); + + assert_eq!(2, planned_batches.len()); + assert_eq!(RegionId::new(1024, 1), planned_batches[0].region_id); + assert_eq!(1, planned_batches[0].num_rows()); + assert_eq!(3, planned_batches[0].batch.num_columns()); + assert_eq!(RegionId::new(1024, 2), planned_batches[1].region_id); + assert_eq!(1, planned_batches[1].num_rows()); + assert_eq!(3, planned_batches[1].batch.num_columns()); + } + + #[test] + fn test_encode_region_write_requests_builds_bulk_insert_requests() { + let planned_batch = PlannedRegionBatch { + region_id: RegionId::new(1024, 1), + batch: RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new( + "greptime_timestamp", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + Field::new("greptime_value", ArrowDataType::Float64, true), + ])), + vec![ + Arc::new(BinaryArray::from(vec![b"k1".as_slice()])), + Arc::new(TimestampMillisecondArray::from(vec![1000_i64])), + Arc::new(arrow::array::Float64Array::from(vec![1.0_f64])), + ], + ) + .unwrap(), + }; + let resolved_batch = ResolvedRegionBatch { + planned: planned_batch, + datanode: Peer { + id: 1, + addr: "node-1".to_string(), + }, + }; + let writes = encode_region_write_requests(vec![resolved_batch]).unwrap(); + + assert_eq!(1, writes.len()); + assert_eq!(1, writes[0].datanode.id); + let Some(region_request::Body::BulkInsert(request)) = &writes[0].request.body else { + panic!("expected 
bulk insert request"); + }; + assert_eq!(RegionId::new(1024, 1).as_u64(), request.region_id); + } } diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index 7e9b75c036..2b84b3aa30 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -529,7 +529,7 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner { .collect::>(); if let Some(schema) = &sql_plan.schema { - schema_to_pg(schema, &Format::UnifiedBinary, None) + schema_to_pg(schema, &Format::UnifiedText, None) .map(|fields| DescribeStatementResponse::new(param_types, fields)) .map_err(convert_err) } else { diff --git a/src/store-api/src/storage/requests.rs b/src/store-api/src/storage/requests.rs index d072ec1b39..db3fb0388a 100644 --- a/src/store-api/src/storage/requests.rs +++ b/src/store-api/src/storage/requests.rs @@ -128,8 +128,6 @@ pub struct ScanRequest { /// Optional hint for KNN vector search. When set, the scan should use /// vector index to find the k nearest neighbors. pub vector_search: Option, - /// Whether to force reading region data in flat format. 
- pub force_flat_format: bool, } impl Display for ScanRequest { @@ -220,14 +218,6 @@ impl Display for ScanRequest { vector_search.metric )?; } - if self.force_flat_format { - write!( - f, - "{}force_flat_format: {}", - delimiter.as_str(), - self.force_flat_format - )?; - } write!(f, " }}") } } @@ -282,15 +272,6 @@ mod tests { "ScanRequest { projection: [1, 2], limit: 10 }" ); - let request = ScanRequest { - force_flat_format: true, - ..Default::default() - }; - assert_eq!( - request.to_string(), - "ScanRequest { force_flat_format: true }" - ); - let request = ScanRequest { snapshot_on_scan: true, ..Default::default() diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index bee03ae7fe..10b7097f4f 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -63,22 +63,14 @@ meta-client.workspace = true meta-srv = { workspace = true, features = ["mock"] } mito2.workspace = true moka.workspace = true -mysql_async = { version = "0.35", default-features = false, features = [ - "time", - "default-rustls-ring", -] } object-store.workspace = true operator = { workspace = true, features = ["testing"] } prost.workspace = true query.workspace = true rand.workspace = true -rstest.workspace = true -rstest_reuse.workspace = true -sea-query.workspace = true serde_json.workspace = true servers = { workspace = true, features = ["testing"] } session.workspace = true -similar-asserts.workspace = true snafu.workspace = true sql.workspace = true sqlx = { workspace = true, features = [ @@ -108,6 +100,10 @@ hex.workspace = true http.workspace = true itertools.workspace = true jsonb.workspace = true +mysql_async = { version = "0.35", default-features = false, features = [ + "time", + "default-rustls-ring", +] } opentelemetry-proto.workspace = true otel-arrow-rust.workspace = true partition.workspace = true @@ -115,7 +111,11 @@ paste.workspace = true pipeline.workspace = true prost.workspace = true rand.workspace = true +rstest.workspace = true 
+rstest_reuse.workspace = true +sea-query.workspace = true session = { workspace = true, features = ["testing"] } +similar-asserts.workspace = true store-api.workspace = true tokio-postgres = { workspace = true } url = "2.3" diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 36ddb1bb38..29d4256864 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1176,6 +1176,20 @@ pub async fn test_prom_http_api(store_type: StorageType) { .await; assert_eq!(res.status(), StatusCode::OK); + // query non-exist label in metric table + let res = client + .get("/v1/prometheus/api/v1/label/not_exist_label/values?match[]=demo&start=0&end=600") + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let prom_resp = res.json::().await; + assert_eq!(prom_resp.status, "success"); + assert!(prom_resp.error.is_none() && prom_resp.error_type.is_none()); + assert_eq!( + prom_resp.data, + serde_json::from_value::(json!([])).unwrap() + ); + // query `__name__` without match[] // create a physical table and a logical table let res = client @@ -1552,12 +1566,11 @@ index_cache_percent = 20 enable_refill_cache_on_read = true manifest_cache_size = "256MiB" sst_write_buffer_size = "8MiB" -parallel_scan_channel_size = 32 max_concurrent_scan_files = 384 allow_stale_entries = false scan_memory_on_exhausted = "fail" min_compaction_interval = "0s" -default_experimental_flat_format = false +default_flat_format = true [region_engine.mito.index] aux_path = "" @@ -5509,6 +5522,202 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; + let existing_int_table_name = "trace_type_existing_int_widens_to_float"; + let existing_int_seed_req = make_trace_v1_request( + "type-existing-int", + vec![make_trace_v1_span( + "00000000000000000000000000000051", + "0000000000000051", + "existing-int-seed", + 1_736_480_942_445_490_000, + 1_736_480_942_445_590_000, + vec![make_int_attr("attr_num", 1)], + )], + ); + let res = 
send_trace_v1_req( + &client, + existing_int_table_name, + existing_int_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_int_req = make_trace_v1_request( + "type-existing-int", + vec![ + make_trace_v1_span( + "00000000000000000000000000000052", + "0000000000000052", + "existing-int-upcast-int", + 1_736_480_942_445_600_000, + 1_736_480_942_445_700_000, + vec![make_int_attr("attr_num", 2)], + ), + make_trace_v1_span( + "00000000000000000000000000000053", + "0000000000000053", + "existing-int-upcast-float", + 1_736_480_942_445_710_000, + 1_736_480_942_445_810_000, + vec![make_double_attr("attr_num", 3.5)], + ), + ], + ); + let res = send_trace_v1_req(&client, existing_int_table_name, existing_int_req, false).await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_existing_int_widens_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\" from {} order by trace_id;", + existing_int_table_name + ), + r#"[["00000000000000000000000000000051",1.0],["00000000000000000000000000000052",2.0],["00000000000000000000000000000053",3.5]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_int_widens_type", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_int_widens_to_float' and column_name = 'span_attributes.attr_num';", + r#"[["span_attributes.attr_num","double","FIELD"]]"#, + ) + .await; + + let existing_int_atomic_table_name = "trace_type_existing_int_widen_atomic"; + let existing_int_atomic_seed_req = make_trace_v1_request( + "type-existing-int-atomic", + vec![make_trace_v1_span( + "00000000000000000000000000000054", + "0000000000000054", + "existing-int-atomic-seed", + 1_736_480_942_445_720_000, + 1_736_480_942_445_820_000, + vec![ + make_int_attr("attr_num", 1), + make_int_attr("attr_parse", 10), + ], + )], + ); + let res = send_trace_v1_req( + &client, + 
existing_int_atomic_table_name, + existing_int_atomic_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_int_atomic_req = make_trace_v1_request( + "type-existing-int-atomic", + vec![make_trace_v1_span( + "00000000000000000000000000000055", + "0000000000000055", + "existing-int-atomic-invalid", + 1_736_480_942_445_830_000, + 1_736_480_942_445_930_000, + vec![ + make_double_attr("attr_num", 3.5), + make_string_attr("attr_parse", "not_a_number"), + ], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_atomic_table_name, + existing_int_atomic_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + let body = ExportTraceServiceResponse::decode(res.bytes().await).unwrap(); + let partial_success = body.partial_success.as_ref().unwrap(); + assert_eq!(partial_success.rejected_spans, 1); + assert!( + partial_success + .error_message + .contains("Accepted 0 spans, rejected 1 spans"), + "unexpected partial success body: {body:?}" + ); + + validate_data( + "otlp_traces_v1_existing_int_widen_atomic_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\", \"span_attributes.attr_parse\" from {} order by trace_id;", + existing_int_atomic_table_name + ), + r#"[["00000000000000000000000000000054",1,10]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_int_widen_atomic_types", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_int_widen_atomic' and column_name in ('span_attributes.attr_num', 'span_attributes.attr_parse') order by column_name;", + r#"[["span_attributes.attr_num","bigint","FIELD"],["span_attributes.attr_parse","bigint","FIELD"]]"#, + ) + .await; + + let existing_int_float_only_table_name = "trace_type_existing_int_float_only"; + let existing_int_float_only_seed_req = make_trace_v1_request( + "type-existing-int-float-only", + vec![make_trace_v1_span( + 
"00000000000000000000000000000061", + "0000000000000061", + "existing-int-float-only-seed", + 1_736_480_942_445_820_000, + 1_736_480_942_445_920_000, + vec![make_int_attr("attr_num", 1)], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_float_only_table_name, + existing_int_float_only_seed_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + let existing_int_float_only_req = make_trace_v1_request( + "type-existing-int-float-only", + vec![make_trace_v1_span( + "00000000000000000000000000000062", + "0000000000000062", + "existing-int-float-only-apply", + 1_736_480_942_445_930_000, + 1_736_480_942_446_030_000, + vec![make_double_attr("attr_num", 2.5)], + )], + ); + let res = send_trace_v1_req( + &client, + existing_int_float_only_table_name, + existing_int_float_only_req, + false, + ) + .await; + assert_eq!(StatusCode::OK, res.status()); + + validate_data( + "otlp_traces_v1_existing_int_float_only_rows", + &client, + &format!( + "select trace_id, \"span_attributes.attr_num\" from {} order by trace_id;", + existing_int_float_only_table_name + ), + r#"[["00000000000000000000000000000061",1.0],["00000000000000000000000000000062",2.5]]"#, + ) + .await; + validate_data( + "otlp_traces_v1_existing_int_float_only_type", + &client, + "select column_name, lower(data_type), semantic_type from information_schema.columns where table_name = 'trace_type_existing_int_float_only' and column_name = 'span_attributes.attr_num';", + r#"[["span_attributes.attr_num","double","FIELD"]]"#, + ) + .await; + validate_data( "otlp_traces_v1_type_coercion_rows", &client, diff --git a/tests/cases/distributed/explain/order_by.result b/tests/cases/distributed/explain/order_by.result index 362849afea..6ce8b4e170 100644 --- a/tests/cases/distributed/explain/order_by.result +++ b/tests/cases/distributed/explain/order_by.result @@ -126,8 +126,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_| | 1_| 0_|_ProjectionExec: 
expr=[i@0 as i, t@1 as alias_ts] REDACTED |_|_|_SortPreservingMergeExec: [test_pk.t__temp__0@2 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 DESC], preserve_partitioning=[true], filter=[t@1 IS NULL OR t@1 > 2] REDACTED |_|_|_ProjectionExec: expr=[i@0 as i, t@1 as t, t@1 as test_pk.t__temp__0] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| @@ -150,8 +149,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY alias_ts DESC LIMI |_|_|_| | 1_| 0_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts] REDACTED |_|_|_SortPreservingMergeExec: [t@1 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 DESC], preserve_partitioning=[true], filter=[t@1 IS NULL OR t@1 > 2] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| |_|_| Total rows: 5_| diff --git a/tests/cases/distributed/information_schema/cluster_info.result b/tests/cases/distributed/information_schema/cluster_info.result index 83512e6ffb..662c73a1ea 100644 --- a/tests/cases/distributed/information_schema/cluster_info.result +++ b/tests/cases/distributed/information_schema/cluster_info.result @@ -24,55 +24,55 @@ DESC TABLE CLUSTER_INFO; +----------------------+----------------------+-----+------+---------+---------------+ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- 
SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|0|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||1|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||2|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||0|FLOWNODE|Address|Version|Hash|Start_time|Duration|Duration||1|FRONTEND|Address|Version|Hash|Start_time|Duration|Duration||1|METASRV|Address|Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'METASRV' ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|1|METASRV|Address|Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'FRONTEND' ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|1|FRONTEND|Address|Version|Hash|Start_time|Duration|Duration|+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE != 'FRONTEND' ORDER BY peer_type; +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|0|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||1|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||2|DATANODE|Address|Version|Hash|Start_time|Duration|Duration||0|FLOWNODE|Address|Version|Hash|Start_time|Duration|Duration||1|METASRV|Address|Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID > 1 ORDER BY peer_type; diff --git a/tests/cases/distributed/information_schema/cluster_info.sql b/tests/cases/distributed/information_schema/cluster_info.sql index 3c2dcccaa0..9e85245859 100644 --- a/tests/cases/distributed/information_schema/cluster_info.sql +++ b/tests/cases/distributed/information_schema/cluster_info.sql @@ -3,47 +3,47 @@ USE INFORMATION_SCHEMA; DESC TABLE CLUSTER_INFO; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'METASRV' ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'FRONTEND' ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE != 'FRONTEND' ORDER BY peer_type; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash --- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time --- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE (\s127\.0\.0\.1:\d+\s) Address +-- SQLNESS REPLACE (\s[\-0-9T:\.]{19,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash +-- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID > 1 ORDER BY peer_type; diff --git a/tests/cases/standalone/common/alter/alter_format.result b/tests/cases/standalone/common/alter/alter_format.result index d38c63997d..a1019a8c93 100644 --- a/tests/cases/standalone/common/alter/alter_format.result +++ b/tests/cases/standalone/common/alter/alter_format.result @@ -42,6 +42,26 @@ ALTER TABLE test_alt_format SET 'sst_format' = 'flat'; Affected Rows: 0 +SHOW CREATE TABLE test_alt_format; + ++-----------------+------------------------------------------------+ +| Table | Create Table | ++-----------------+------------------------------------------------+ +| test_alt_format | CREATE TABLE IF NOT EXISTS "test_alt_format" ( | +| | "h" INT NULL, | +| | "i" INT NULL DEFAULT 0, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | "k" INT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("h") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | sst_format = 'flat' | +| | ) | ++-----------------+------------------------------------------------+ + -- SQLNESS SORT_RESULT 3 1 SELECT * FROM test_alt_format; 
@@ -116,11 +136,68 @@ SELECT i, h FROM test_alt_format; | 23 | 13 | +----+----+ --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE test_alt_format SET 'sst_format' = 'primary_key'; -Error: 1004(InvalidArguments), Invalid region request, region_id: REDACTED, err: Only allow changing format type to flat +Affected Rows: 0 + +SHOW CREATE TABLE test_alt_format; + ++-----------------+------------------------------------------------+ +| Table | Create Table | ++-----------------+------------------------------------------------+ +| test_alt_format | CREATE TABLE IF NOT EXISTS "test_alt_format" ( | +| | "h" INT NULL, | +| | "i" INT NULL DEFAULT 0, | +| | "j" TIMESTAMP(3) NOT NULL, | +| | "k" INT NULL, | +| | TIME INDEX ("j"), | +| | PRIMARY KEY ("h") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | sst_format = 'primary_key' | +| | ) | ++-----------------+------------------------------------------------+ + +INSERT INTO test_alt_format (h, j, i) VALUES (14, 4, 34); + +Affected Rows: 1 + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + ++----+----+-------------------------+----+ +| h | i | j | k | ++----+----+-------------------------+----+ +| 10 | 0 | 1970-01-01T00:00:00 | | +| 11 | 0 | 1970-01-01T00:00:00.001 | | +| 12 | 0 | 1970-01-01T00:00:00.002 | | +| 13 | 23 | 1970-01-01T00:00:00.003 | 33 | +| 14 | 34 | 1970-01-01T00:00:00.004 | | ++----+----+-------------------------+----+ + +ADMIN flush_table('test_alt_format'); + ++--------------------------------------+ +| ADMIN flush_table('test_alt_format') | ++--------------------------------------+ +| 0 | ++--------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + ++----+----+-------------------------+----+ +| h | i | j | k | ++----+----+-------------------------+----+ +| 10 | 0 | 1970-01-01T00:00:00 | | +| 11 | 0 | 1970-01-01T00:00:00.001 | | +| 12 | 0 | 
1970-01-01T00:00:00.002 | | +| 13 | 23 | 1970-01-01T00:00:00.003 | 33 | +| 14 | 34 | 1970-01-01T00:00:00.004 | | ++----+----+-------------------------+----+ DROP TABLE test_alt_format; @@ -167,6 +244,27 @@ ALTER TABLE alt_format_phy SET 'sst_format' = 'flat'; Affected Rows: 0 +SHOW CREATE TABLE alt_format_phy; + ++----------------+-----------------------------------------------+ +| Table | Create Table | ++----------------+-----------------------------------------------+ +| alt_format_phy | CREATE TABLE IF NOT EXISTS "alt_format_phy" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" DOUBLE NULL, | +| | "host" STRING NULL, | +| | "k" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("host", "k") | +| | ) | +| | | +| | ENGINE=metric | +| | WITH( | +| | physical_metric_table = '', | +| | sst_format = 'flat' | +| | ) | ++----------------+-----------------------------------------------+ + SELECT * FROM t1 ORDER BY ts ASC; +-------------+---+---------------------+------+ @@ -202,11 +300,47 @@ SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; | example.com | 2022-01-02T00:00:00 | 4.56 | +-------------+---------------------+------+ --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE alt_format_phy SET 'sst_format' = 'primary_key'; -Error: 1004(InvalidArguments), Invalid region request, region_id: REDACTED, err: Only allow changing format type to flat +Affected Rows: 0 + +SHOW CREATE TABLE alt_format_phy; + ++----------------+-----------------------------------------------+ +| Table | Create Table | ++----------------+-----------------------------------------------+ +| alt_format_phy | CREATE TABLE IF NOT EXISTS "alt_format_phy" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "val" DOUBLE NULL, | +| | "host" STRING NULL, | +| | "k" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("host", "k") | +| | ) | +| | | +| | ENGINE=metric | +| | 
WITH( | +| | physical_metric_table = '', | +| | sst_format = 'primary_key' | +| | ) | ++----------------+-----------------------------------------------+ + +INSERT INTO t1 (ts, val, host) VALUES + ('2022-01-01 00:00:02', 5.0, 'example.com'); + +Affected Rows: 1 + +SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; + ++-------------+---------------------+------+ +| host | ts | val | ++-------------+---------------------+------+ +| example.com | 2022-01-01T00:00:00 | 1.23 | +| example.com | 2022-01-01T00:00:01 | 3.0 | +| example.com | 2022-01-01T00:00:02 | 5.0 | +| example.com | 2022-01-02T00:00:00 | 4.56 | ++-------------+---------------------+------+ DROP TABLE t1; diff --git a/tests/cases/standalone/common/alter/alter_format.sql b/tests/cases/standalone/common/alter/alter_format.sql index e1472d28e1..c3b292875c 100644 --- a/tests/cases/standalone/common/alter/alter_format.sql +++ b/tests/cases/standalone/common/alter/alter_format.sql @@ -16,6 +16,8 @@ SELECT i, h FROM test_alt_format; ALTER TABLE test_alt_format SET 'sst_format' = 'flat'; +SHOW CREATE TABLE test_alt_format; + -- SQLNESS SORT_RESULT 3 1 SELECT * FROM test_alt_format; @@ -37,10 +39,21 @@ SELECT * FROM test_alt_format; -- SQLNESS SORT_RESULT 3 1 SELECT i, h FROM test_alt_format; --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE test_alt_format SET 'sst_format' = 'primary_key'; +SHOW CREATE TABLE test_alt_format; + +INSERT INTO test_alt_format (h, j, i) VALUES (14, 4, 34); + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + +ADMIN flush_table('test_alt_format'); + +-- SQLNESS SORT_RESULT 3 1 +SELECT * FROM test_alt_format; + DROP TABLE test_alt_format; CREATE TABLE alt_format_phy (ts timestamp time index, val double) engine=metric with ("physical_metric_table" = "", "sst_format" = "primary_key"); @@ -62,6 +75,8 @@ SELECT * FROM t1 ORDER BY ts ASC; ALTER TABLE 
alt_format_phy SET 'sst_format' = 'flat'; +SHOW CREATE TABLE alt_format_phy; + SELECT * FROM t1 ORDER BY ts ASC; SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; @@ -72,10 +87,16 @@ INSERT INTO t1 (ts, val, host) VALUES SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; --- not allow to change from flat to primary_key --- SQLNESS REPLACE \d+\(\d+,\s+\d+\) REDACTED +-- allow to change from flat to primary_key ALTER TABLE alt_format_phy SET 'sst_format' = 'primary_key'; +SHOW CREATE TABLE alt_format_phy; + +INSERT INTO t1 (ts, val, host) VALUES + ('2022-01-01 00:00:02', 5.0, 'example.com'); + +SELECT host, ts, val FROM t1 where host = 'example.com' ORDER BY ts ASC; + DROP TABLE t1; DROP TABLE alt_format_phy; diff --git a/tests/cases/standalone/common/function/system.result b/tests/cases/standalone/common/function/system.result index 0cb6839292..d2007b5240 100644 --- a/tests/cases/standalone/common/function/system.result +++ b/tests/cases/standalone/common/function/system.result @@ -8,7 +8,7 @@ SELECT build(); ++|build()|++|branch:BRANCH|commit:COMMIT|commit_short:COMMIT_SHORT|clean:CLEAN|version:VERSION++ --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) VERSION +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) VERSION -- SQLNESS REPLACE [\s\-]+ SELECT version(); diff --git a/tests/cases/standalone/common/function/system.sql b/tests/cases/standalone/common/function/system.sql index 8ae1475311..6504b48679 100644 --- a/tests/cases/standalone/common/function/system.sql +++ b/tests/cases/standalone/common/function/system.sql @@ -6,7 +6,7 @@ -- SQLNESS REPLACE [\s\-]+ SELECT build(); --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) VERSION +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
VERSION -- SQLNESS REPLACE [\s\-]+ SELECT version(); diff --git a/tests/cases/standalone/common/function/vector/vector_index_explain.result b/tests/cases/standalone/common/function/vector/vector_index_explain.result index 246a49f405..10351cce11 100644 --- a/tests/cases/standalone/common/function/vector/vector_index_explain.result +++ b/tests/cases/standalone/common/function/vector/vector_index_explain.result @@ -41,6 +41,9 @@ ADMIN FLUSH_TABLE('vectors_explain'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE ,\s"dyn_filters":\s\[.* REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain @@ -56,7 +59,7 @@ LIMIT 2; | 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| |_|_|_SortPreservingMergeExec: [vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| |_|_|_SortExec: TopK(fetch=2), expr=[vec_l2sq_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "dyn_filters": ["DynamicFilter [ vec_l2sq_distance(embedding@1, [1.0, 0.0]) < 0.010000004 OR vec_l2sq_distance(embedding@1, [1.0, 0.0]) = 0.010000004 AND vec_id@0 < 2 ]"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":893}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"]REDACTED |_|_|_| |_|_| Total rows: REDACTED_| 
+-+-+-+ @@ -122,6 +125,7 @@ ADMIN FLUSH_TABLE('vectors_explain_left'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT l.vec_id FROM vectors_explain_left l @@ -144,10 +148,10 @@ LIMIT 1; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_CooperativeExec metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "ts", "embedding"], "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":893}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: REDACTED |_|_|_| | 1_| 0_|_CooperativeExec metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "projection": ["vec_id", "note", "ts"], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED_| +|_|_|_SeqScan: REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ @@ -202,6 +206,9 @@ ADMIN FLUSH_TABLE('vectors_explain_metric'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric @@ -217,7 +224,7 @@ LIMIT 2; | 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| |_|_|_SortPreservingMergeExec: [vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| |_|_|_SortExec: TopK(fetch=2), expr=[vec_cos_distance(embedding@1, [1.0, 0.0]) ASC NULLS LAST, vec_id@0 ASC NULLS 
LAST], preserve_partitioning=[true] metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "dyn_filters": ["DynamicFilter [ vec_cos_distance(embedding@1, [1.0, 0.0]) < 1 OR vec_cos_distance(embedding@1, [1.0, 0.0]) = 1 AND vec_id@0 < 4 ]"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":895}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ @@ -236,6 +243,7 @@ LIMIT 2; -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric @@ -251,7 +259,7 @@ LIMIT 2; | 1_| 0_|_ProjectionExec: expr=[vec_id@0 as vec_id] metrics=REDACTED_| |_|_|_SortPreservingMergeExec: [vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], fetch=2 metrics=REDACTED_| |_|_|_SortExec: TopK(fetch=2), expr=[vec_dot_product(embedding@1, [1.0, 0.0]) DESC, vec_id@0 ASC NULLS LAST], preserve_partitioning=[true] metrics=REDACTED_| -|_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":1, "mem_ranges":0, "files":1, "file_ranges":1}, "projection": ["vec_id", "embedding"], "dyn_filters": ["DynamicFilter [ vec_dot_product(embedding@1, [1.0, 0.0]) IS NULL OR vec_dot_product(embedding@1, [1.0, 0.0]) > 0 OR vec_dot_product(embedding@1, [1.0, 0.0]) = 0 AND vec_id@0 < 2 ]"], "vector_index_k": 2, "files": [{"file_id":"REDACTED","time_range_start":"REDACTED","time_range_end":"REDACTED","rows":4,"size":REDACTED,"index_size":895}], "flat_format":REDACTED, "metrics_per_partition": REDACTED metrics=REDACTED | +|_|_|_SeqScan: REDACTED |_|_|_| |_|_| Total rows: REDACTED_| +-+-+-+ 
diff --git a/tests/cases/standalone/common/function/vector/vector_index_explain.sql b/tests/cases/standalone/common/function/vector/vector_index_explain.sql index 50bf3cdbaa..7a0330b0ea 100644 --- a/tests/cases/standalone/common/function/vector/vector_index_explain.sql +++ b/tests/cases/standalone/common/function/vector/vector_index_explain.sql @@ -33,6 +33,9 @@ ADMIN FLUSH_TABLE('vectors_explain'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE ,\s"dyn_filters":\s\[.* REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain @@ -84,6 +87,7 @@ ADMIN FLUSH_TABLE('vectors_explain_left'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT l.vec_id FROM vectors_explain_left l @@ -126,6 +130,9 @@ ADMIN FLUSH_TABLE('vectors_explain_metric'); -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED +-- SQLNESS REPLACE ,\s"vector_index_k":\s\d+ +-- SQLNESS REPLACE "index_size":\d+ "index_size":REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric @@ -146,6 +153,7 @@ LIMIT 2; -- SQLNESS REPLACE Total\s+rows:\s+\d+ Total rows: REDACTED -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE SeqScan:.* SeqScan: REDACTED EXPLAIN ANALYZE VERBOSE SELECT vec_id FROM vectors_explain_metric diff --git a/tests/cases/standalone/common/order/order_by.result b/tests/cases/standalone/common/order/order_by.result index 
13ac8caebe..6a2807e4b6 100644 --- a/tests/cases/standalone/common/order/order_by.result +++ b/tests/cases/standalone/common/order/order_by.result @@ -288,6 +288,7 @@ select tag from t where num > 6 order by ts; -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED -- SQLNESS REPLACE num_ranges=\d+ num_ranges=REDACTED +-- SQLNESS REPLACE ,\sfilter=\[[^]]+\] explain analyze select tag from t where num > 6 order by ts desc limit 2; +-+-+-+ @@ -295,18 +296,16 @@ explain analyze select tag from t where num > 6 order by ts desc limit 2; +-+-+-+ | 0_| 0_|_ProjectionExec: expr=[tag@0 as tag] REDACTED |_|_|_SortPreservingMergeExec: [ts@1 DESC], fetch=2 REDACTED -|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true], filter=[ts@1 IS NULL OR ts@1 > 6000] REDACTED +|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [ts@1 DESC], fetch=2 REDACTED -|_|_|_WindowedSortExec: expr=ts@1 DESC num_ranges=REDACTED fetch=2 REDACTED -|_|_|_PartSortExec: expr=ts@1 DESC num_ranges=REDACTED limit=2 REDACTED +|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: num@2 > 6, projection=[tag@0, ts@1] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| | 1_| 1_|_SortPreservingMergeExec: [ts@1 DESC], fetch=2 REDACTED -|_|_|_WindowedSortExec: expr=ts@1 DESC num_ranges=REDACTED fetch=2 REDACTED -|_|_|_PartSortExec: expr=ts@1 DESC num_ranges=REDACTED limit=2 REDACTED +|_|_|_SortExec: TopK(fetch=2), expr=[ts@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: num@2 > 6, projection=[tag@0, ts@1] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| diff --git 
a/tests/cases/standalone/common/order/order_by.sql b/tests/cases/standalone/common/order/order_by.sql index dd641613d9..95fd2c6f18 100644 --- a/tests/cases/standalone/common/order/order_by.sql +++ b/tests/cases/standalone/common/order/order_by.sql @@ -95,6 +95,7 @@ select tag from t where num > 6 order by ts; -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED -- SQLNESS REPLACE num_ranges=\d+ num_ranges=REDACTED +-- SQLNESS REPLACE ,\sfilter=\[[^]]+\] explain analyze select tag from t where num > 6 order by ts desc limit 2; drop table t; diff --git a/tests/cases/standalone/common/order/windowed_sort.result b/tests/cases/standalone/common/order/windowed_sort.result index 4e550bf311..f85aa8c04e 100644 --- a/tests/cases/standalone/common/order/windowed_sort.result +++ b/tests/cases/standalone/common/order/windowed_sort.result @@ -70,7 +70,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -103,8 +103,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t DESC LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -137,7 +136,7 @@ EXPLAIN 
ANALYZE SELECT * FROM test where i > 2 ORDER BY t LIMIT 4; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST], fetch=4 REDACTED -|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=REDACTED fetch=4 REDACTED +|_|_|_SortExec: TopK(fetch=4), expr=[t@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: i@0 > 2 REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| @@ -171,8 +170,7 @@ EXPLAIN ANALYZE SELECT * FROM test where i > 2 ORDER BY t DESC LIMIT 4; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 DESC], fetch=4 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=4 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=4 REDACTED +|_|_|_SortExec: TopK(fetch=4), expr=[t@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_FilterExec: i@0 > 2 REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| @@ -206,8 +204,7 @@ EXPLAIN ANALYZE SELECT * FROM test where t > 8 ORDER BY t DESC LIMIT 4; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@1 DESC], fetch=4 REDACTED -|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=REDACTED fetch=4 REDACTED -|_|_|_PartSortExec: expr=t@1 DESC num_ranges=REDACTED limit=4 REDACTED +|_|_|_SortExec: TopK(fetch=4), expr=[t@1 DESC], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":2, "mem_ranges":1, "files":1, "file_ranges":1} REDACTED |_|_|_| |_|_| Total rows: 4_| @@ -289,8 +286,7 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST 
num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -312,8 +308,7 @@ EXPLAIN ANALYZE VERBOSE SELECT * FROM test_pk ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":4, "mem_ranges":1, "REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -346,8 +341,7 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 DESC], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -381,8 +375,7 @@ EXPLAIN ANALYZE SELECT * FROM test_pk where pk > 7 ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":4, "mem_ranges":1, "files":3, "file_ranges":3} 
REDACTED |_|_|_| |_|_| Total rows: 5_| @@ -404,8 +397,7 @@ EXPLAIN ANALYZE VERBOSE SELECT * FROM test_pk where pk > 7 ORDER BY t LIMIT 5; |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[t@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_SeqScan: region=REDACTED, {"partition_count":{"count":4, "mem_ranges":1, "REDACTED |_|_|_| |_|_| Total rows: 5_| diff --git a/tests/cases/standalone/common/system/pg_catalog.result b/tests/cases/standalone/common/system/pg_catalog.result index ef0452e316..7a6b65c7c8 100644 --- a/tests/cases/standalone/common/system/pg_catalog.result +++ b/tests/cases/standalone/common/system/pg_catalog.result @@ -14,15 +14,12 @@ SELECT session_user is not null; +----------------------------+ -- SQLNESS REPLACE PostgreSQL.* VERSION +-- SQLNESS REPLACE [\s\-]+ -- current_schema -- SQLNESS PROTOCOL POSTGRES select current_schema(), current_schemas(true), current_schemas(false), version(), current_database(); -+------------------+---------------------------------------------------------+---------------------------------+---------------------------------------+--------------------+ -| current_schema() | current_schemas(Boolean(true)) | current_schemas(Boolean(false)) | version | current_database() | -+------------------+---------------------------------------------------------+---------------------------------+---------------------------------------+--------------------+ -| public | {public,information_schema,pg_catalog,greptime_private} | {public} | VERSION -+------------------+---------------------------------------------------------+---------------------------------+---------------------------------------+--------------------+ 
+++++++|current_schema()|current_schemas(Boolean(true))|current_schemas(Boolean(false))|version|current_database()|++++++|public|{public,information_schema,pg_catalog,greptime_private}|{public}|VERSION++++++ -- search_path for pg using schema for now FIXME when support real search_path -- SQLNESS PROTOCOL POSTGRES diff --git a/tests/cases/standalone/common/system/pg_catalog.sql b/tests/cases/standalone/common/system/pg_catalog.sql index ad59da372c..2e84ecd7ce 100644 --- a/tests/cases/standalone/common/system/pg_catalog.sql +++ b/tests/cases/standalone/common/system/pg_catalog.sql @@ -6,6 +6,7 @@ create database pg_catalog; SELECT session_user is not null; -- SQLNESS REPLACE PostgreSQL.* VERSION +-- SQLNESS REPLACE [\s\-]+ -- current_schema -- SQLNESS PROTOCOL POSTGRES select current_schema(), current_schemas(true), current_schemas(false), version(), current_database(); diff --git a/tests/cases/standalone/information_schema/cluster_info.result b/tests/cases/standalone/information_schema/cluster_info.result index 07fc4bd5c6..04567ff721 100644 --- a/tests/cases/standalone/information_schema/cluster_info.result +++ b/tests/cases/standalone/information_schema/cluster_info.result @@ -24,9 +24,9 @@ DESC TABLE CLUSTER_INFO; +----------------------+----------------------+-----+------+---------+---------------+ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO; @@ -34,9 +34,9 @@ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, a +++++++++|peer_id|peer_type|peer_addr|node_version|git_commit|start_time|uptime|active_time|+++++++++|0|STANDALONE||Version|Hash|Start_time|Duration||+++++++++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'STANDALONE'; @@ -49,9 +49,9 @@ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, a ++ -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID = 0; diff --git a/tests/cases/standalone/information_schema/cluster_info.sql b/tests/cases/standalone/information_schema/cluster_info.sql index 5e253fc43d..798e9eff28 100644 --- a/tests/cases/standalone/information_schema/cluster_info.sql +++ b/tests/cases/standalone/information_schema/cluster_info.sql @@ -3,17 +3,17 @@ USE INFORMATION_SCHEMA; DESC TABLE CLUSTER_INFO; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) 
Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE = 'STANDALONE'; @@ -21,9 +21,9 @@ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, a SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_TYPE != 'STANDALONE'; -- SQLNESS REPLACE version node_version --- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*-[a-zA-Z0-9.-]+) Version --- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) Start_time +-- SQLNESS REPLACE (\d+\.\d+(?:\.\d+)*(?:-[a-zA-Z0-9.-]+)?) Version +-- SQLNESS REPLACE (\s[a-z0-9]{7,10}\s) Hash -- SQLNESS REPLACE ((\d+(s|ms|m)\s)+) Duration -- SQLNESS REPLACE [\s\-]+ SELECT peer_id, peer_type, peer_addr, version, git_commit, start_time, uptime, active_time FROM CLUSTER_INFO WHERE PEER_ID = 0; diff --git a/tests/cases/standalone/optimizer/order_by.result b/tests/cases/standalone/optimizer/order_by.result index 8bc4c14816..06b06ae442 100644 --- a/tests/cases/standalone/optimizer/order_by.result +++ b/tests/cases/standalone/optimizer/order_by.result @@ -142,8 +142,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY t DESC LIMIT 5; |_|_|_| | 1_| 0_|_ProjectionExec: expr=[i@0 as i, alias_ts@1 as alias_ts] REDACTED |_|_|_SortPreservingMergeExec: [t@2 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[alias_ts@1 DESC], preserve_partitioning=[true], filter=[alias_ts@1 IS NULL OR alias_ts@1 > 2] REDACTED |_|_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts, t@1 as t] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, 
"file_ranges":0} REDACTED |_|_|_| @@ -165,8 +164,7 @@ EXPLAIN ANALYZE SELECT i, t AS alias_ts FROM test_pk ORDER BY alias_ts DESC LIMI |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_SortPreservingMergeExec: [alias_ts@1 DESC], fetch=5 REDACTED -|_|_|_WindowedSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED fetch=5 REDACTED -|_|_|_PartSortExec: expr=alias_ts@1 DESC num_ranges=REDACTED limit=5 REDACTED +|_|_|_SortExec: TopK(fetch=5), expr=[alias_ts@1 DESC], preserve_partitioning=[true], filter=[alias_ts@1 IS NULL OR alias_ts@1 > 2] REDACTED |_|_|_ProjectionExec: expr=[i@0 as i, t@1 as alias_ts] REDACTED |_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED |_|_|_| diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template index 3ec8a2f695..e68a76cc9a 100644 --- a/tests/conf/datanode-test.toml.template +++ b/tests/conf/datanode-test.toml.template @@ -6,7 +6,7 @@ rpc_runtime_size = 8 [[region_engine]] [region_engine.mito] {{ if enable_flat_format }} -default_experimental_flat_format = true +default_flat_format = true {{ endif }} [wal] diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template index 50c014e991..bcd263d0b5 100644 --- a/tests/conf/standalone-test.toml.template +++ b/tests/conf/standalone-test.toml.template @@ -5,7 +5,7 @@ require_lease_before_startup = true [[region_engine]] [region_engine.mito] {{ if enable_flat_format }} -default_experimental_flat_format = true +default_flat_format = true {{ endif }} [wal] diff --git a/tests/runner/src/cmd/bare.rs b/tests/runner/src/cmd/bare.rs index e9a4ff8b79..58199f959e 100644 --- a/tests/runner/src/cmd/bare.rs +++ b/tests/runner/src/cmd/bare.rs @@ -103,7 +103,7 @@ pub struct BareCommand { #[clap(long)] extra_args: Vec, - /// Enable flat format for storage engine (sets default_experimental_flat_format = true). 
+ /// Enable flat format for storage engine (sets default_flat_format = true). #[clap(long, default_value = "false")] enable_flat_format: bool, }