diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml
index d0d2804c6a..65546dcc25 100644
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -669,18 +669,28 @@ jobs:
- name: "Basic"
opts: ""
kafka: false
+ postgres: false
+ mysql: false
- name: "Remote WAL"
opts: "-w kafka -k 127.0.0.1:9092"
kafka: true
+ postgres: false
+ mysql: false
- name: "PostgreSQL KvBackend"
- opts: "--setup-pg"
+ opts: "--setup-pg postgresql://greptimedb:admin@127.0.0.1:5432/postgres"
kafka: false
- - name: "MySQL Kvbackend"
- opts: "--setup-mysql"
+ postgres: true
+ mysql: false
+ - name: "MySQL KvBackend"
+ opts: "--setup-mysql mysql://greptimedb:admin@127.0.0.1:3306/mysql"
kafka: false
+ postgres: false
+ mysql: true
- name: "Flat format"
opts: "--enable-flat-format"
kafka: false
+ postgres: false
+ mysql: false
timeout-minutes: 60
steps:
- uses: actions/checkout@v4
@@ -688,9 +698,19 @@ jobs:
persist-credentials: false
- if: matrix.mode.kafka
- name: Setup kafka server
+ name: Setup Kafka
working-directory: tests-integration/fixtures
- run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
+ run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
+
+ - if: matrix.mode.postgres
+ name: Setup PostgreSQL
+ working-directory: tests-integration/fixtures
+ run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait postgres
+
+ - if: matrix.mode.mysql
+ name: Setup MySQL
+ working-directory: tests-integration/fixtures
+ run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait mysql
- name: Download pre-built binaries
uses: actions/download-artifact@v4
diff --git a/.github/workflows/nightly-jsonbench.yaml b/.github/workflows/nightly-jsonbench.yaml
new file mode 100644
index 0000000000..3667ee26a6
--- /dev/null
+++ b/.github/workflows/nightly-jsonbench.yaml
@@ -0,0 +1,162 @@
+name: Nightly JSONBench
+
+on:
+ schedule:
+ # Trigger at 00:00 (Asia/Shanghai) on every weekday, i.e. 16:00 UTC Sunday through Thursday.
+ - cron: "0 16 * * 0-4"
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ allocate-runner:
+ name: Allocate runner
+ if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
+ runs-on: ubuntu-latest
+ outputs:
+ linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
+
+ # The following EC2 resource IDs are used later to release the runner.
+ linux-arm64-ec2-runner-label: ${{ steps.start-linux-arm64-runner.outputs.label }}
+ linux-arm64-ec2-runner-instance-id: ${{ steps.start-linux-arm64-runner.outputs.ec2-instance-id }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ persist-credentials: false
+
+ - name: Allocate Linux ARM64 runner
+ uses: ./.github/actions/start-runner
+ id: start-linux-arm64-runner
+ with:
+ runner: ${{ vars.DEFAULT_ARM64_RUNNER }}
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ vars.EC2_RUNNER_REGION }}
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+ image-id: ${{ vars.EC2_RUNNER_LINUX_ARM64_IMAGE_ID }}
+ security-group-id: ${{ vars.EC2_RUNNER_SECURITY_GROUP_ID }}
+ subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }}
+
+ jsonbench:
+ name: Run JSONBench
+ if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
+ needs: [ allocate-runner ]
+ runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }}
+ timeout-minutes: 120
+ env:
+ JSONBENCH_DATA_DIR: /home/runner/data/bluesky
+ JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ persist-credentials: false
+
+ - uses: arduino/setup-protoc@v3
+ with:
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+ - uses: actions-rust-lang/setup-rust-toolchain@v1
+
+ - name: Rust Cache
+ uses: Swatinem/rust-cache@v2
+ with:
+ shared-key: "nightly-jsonbench"
+ cache-all-crates: "true"
+ save-if: ${{ github.ref == 'refs/heads/main' }}
+
+ - name: Build GreptimeDB
+ run: cargo build --profile nightly --bin greptime
+
+ - name: Reclaim disk space
+ shell: bash
+ run: |
+ set -euo pipefail
+
+ mkdir -p "${RUNNER_TEMP}/greptimedb-bin"
+ cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime"
+ chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime"
+
+ rm -rf ./target
+
+ - name: Run JSONBench
+ shell: bash
+ run: |
+ set -euo pipefail
+
+ cd "${RUNNER_TEMP}"
+ cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime
+ chmod +x ./greptime
+
+ export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal
+ export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data
+ export GREPTIMEDB_STANDALONE__LOGGING__DIR=greptimedb_data/logs
+ export GREPTIMEDB_STANDALONE__LOGGING__APPEND_STDOUT=false
+ export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB
+ export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s
+
+ ./greptime standalone start > greptimedb.log 2>&1 &
+ greptime_pid=$!
+ trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT
+
+ until curl -s --fail -o /dev/null http://localhost:4000/health; do
+ if ! kill -0 "${greptime_pid}" 2>/dev/null; then
+ cat greptimedb.log
+ exit 1
+ fi
+ sleep 1
+ done
+
+ git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
+ cp ./greptime JSONBench/greptimedb/greptime
+
+ cd JSONBench/greptimedb
+ ./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
+
+ - name: Upload JSONBench results
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: jsonbench-results
+ path: |
+ ${{ runner.temp }}/greptimedb.log
+ ${{ runner.temp }}/JSONBench/greptimedb/*.log
+ ${{ runner.temp }}/JSONBench/greptimedb/*.total_size
+ ${{ runner.temp }}/JSONBench/greptimedb/*.data_size
+ ${{ runner.temp }}/JSONBench/greptimedb/*.index_size
+ ${{ runner.temp }}/JSONBench/greptimedb/*.count
+ ${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime
+ ${{ runner.temp }}/JSONBench/greptimedb/*.query_results
+ if-no-files-found: ignore
+ retention-days: 7
+
+ stop-linux-arm64-runner:
+ name: Stop Linux ARM64 runner
+ # Always runs as the last job in the workflow, even when earlier jobs fail, so that the runner is released.
+ if: ${{ always() }}
+ runs-on: ubuntu-latest
+ needs: [
+ allocate-runner,
+ jsonbench,
+ ]
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ persist-credentials: false
+
+ - name: Stop Linux ARM64 runner
+ uses: ./.github/actions/stop-runner
+ with:
+ label: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-label }}
+ ec2-instance-id: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id }}
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: ${{ vars.EC2_RUNNER_REGION }}
+ github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
diff --git a/Cargo.lock b/Cargo.lock
index aafa225b4b..63ba289947 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1321,9 +1321,9 @@ dependencies = [
[[package]]
name = "bitpacking"
-version = "0.9.2"
+version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
+checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019"
dependencies = [
"crunchy",
]
@@ -1832,7 +1832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7d8d1efd5109b9c1cd3b7966bd071cdfb53bb6eb0b22a473a68c2f70a11a1eb"
dependencies = [
"parse-zoneinfo",
- "phf_codegen",
+ "phf_codegen 0.12.1",
"phf_shared 0.12.1",
"uncased",
]
@@ -4380,6 +4380,12 @@ dependencies = [
"tracing",
]
+[[package]]
+name = "datasketches"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745"
+
[[package]]
name = "datatypes"
version = "1.1.0"
@@ -5486,12 +5492,12 @@ dependencies = [
[[package]]
name = "fs4"
-version = "0.8.4"
+version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8"
+checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4"
dependencies = [
- "rustix 0.38.44",
- "windows-sys 0.52.0",
+ "rustix 1.0.7",
+ "windows-sys 0.59.0",
]
[[package]]
@@ -5820,7 +5826,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=dfd2a6d7d3d9c718cb159fcf9abae144b74fc503#dfd2a6d7d3d9c718cb159fcf9abae144b74fc503"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=7224c2ad6d11db612fbdb621c36135fc37ffce35#7224c2ad6d11db612fbdb621c36135fc37ffce35"
dependencies = [
"prost 0.14.1",
"prost-types 0.14.1",
@@ -6564,27 +6570,37 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed"
[[package]]
name = "include-flate"
-version = "0.3.0"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e"
+checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347"
dependencies = [
"include-flate-codegen",
- "lazy_static",
- "libflate",
+ "include-flate-compress",
]
[[package]]
name = "include-flate-codegen"
-version = "0.2.0"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7"
+checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969"
dependencies = [
- "libflate",
+ "include-flate-compress",
+ "proc-macro-error2",
"proc-macro2",
"quote",
"syn 2.0.117",
]
+[[package]]
+name = "include-flate-compress"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff"
+dependencies = [
+ "libflate",
+ "zstd",
+]
+
[[package]]
name = "include_dir"
version = "0.7.4"
@@ -6918,25 +6934,25 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "jieba-macros"
-version = "0.8.0"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6105f38f083bb1a79ad523bd32fa0d8ffcb6abd2fc4da9da203c32bca5b6ace3"
+checksum = "661344b2412fb00aee1841d2405c9a31f7c91cf6e578a8e953647c43dd1a8b0a"
dependencies = [
- "phf_codegen",
+ "phf_codegen 0.13.1",
]
[[package]]
name = "jieba-rs"
-version = "0.8.0"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47982a320106da83b0c5d6aec0fb83e109f0132b69670b063adaa6fa5b4f3f4a"
+checksum = "d7ef90d6209fcff084a01b488c4199d882e3764b15ff0e7a6b5d7efaa46e1e4f"
dependencies = [
"cedarwood",
- "fxhash",
"include-flate",
"jieba-macros",
- "phf 0.12.1",
+ "phf 0.13.1",
"regex",
+ "rustc-hash 2.1.1",
]
[[package]]
@@ -7483,25 +7499,25 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
[[package]]
name = "libflate"
-version = "2.1.0"
+version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
+checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df"
dependencies = [
"adler32",
- "core2",
"crc32fast",
"dary_heap",
"libflate_lz77",
+ "no_std_io2",
]
[[package]]
name = "libflate_lz77"
-version = "2.1.0"
+version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
+checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd"
dependencies = [
- "core2",
- "hashbrown 0.14.5",
+ "hashbrown 0.16.1",
+ "no_std_io2",
"rle-decode-fast",
]
@@ -7816,6 +7832,15 @@ dependencies = [
"hashbrown 0.15.4",
]
+[[package]]
+name = "lru"
+version = "0.16.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
[[package]]
name = "lru-slab"
version = "0.1.2"
@@ -8299,6 +8324,7 @@ dependencies = [
"either",
"futures",
"greptime-proto",
+ "humantime",
"humantime-serde",
"index",
"itertools 0.14.0",
@@ -8434,7 +8460,7 @@ dependencies = [
"flate2",
"io-enum",
"libc",
- "lru",
+ "lru 0.12.5",
"mysql_common 0.34.1",
"named_pipe",
"pem",
@@ -8497,7 +8523,7 @@ dependencies = [
"futures-sink",
"futures-util",
"keyed_priority_queue",
- "lru",
+ "lru 0.12.5",
"mysql_common 0.34.1",
"pem",
"percent-encoding",
@@ -8695,6 +8721,15 @@ dependencies = [
"libc",
]
+[[package]]
+name = "no_std_io2"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "nohash"
version = "0.2.0"
@@ -9635,6 +9670,15 @@ dependencies = [
"serde",
]
+[[package]]
+name = "ordered-float"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
+dependencies = [
+ "num-traits",
+]
+
[[package]]
name = "ordered-multimap"
version = "0.4.3"
@@ -10122,6 +10166,15 @@ checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
dependencies = [
"phf_macros",
"phf_shared 0.12.1",
+]
+
+[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_shared 0.13.1",
"serde",
]
@@ -10131,10 +10184,20 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61"
dependencies = [
- "phf_generator",
+ "phf_generator 0.12.1",
"phf_shared 0.12.1",
]
+[[package]]
+name = "phf_codegen"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+]
+
[[package]]
name = "phf_generator"
version = "0.12.1"
@@ -10145,13 +10208,23 @@ dependencies = [
"phf_shared 0.12.1",
]
+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared 0.13.1",
+]
+
[[package]]
name = "phf_macros"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368"
dependencies = [
- "phf_generator",
+ "phf_generator 0.12.1",
"phf_shared 0.12.1",
"proc-macro2",
"quote",
@@ -10178,6 +10251,15 @@ dependencies = [
"uncased",
]
+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
[[package]]
name = "pin-project"
version = "1.1.10"
@@ -11415,16 +11497,6 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
-[[package]]
-name = "rand_distr"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
-dependencies = [
- "num-traits",
- "rand 0.8.5",
-]
-
[[package]]
name = "rand_xorshift"
version = "0.4.0"
@@ -12705,6 +12777,7 @@ dependencies = [
"metric-engine",
"mime_guess",
"mysql_async",
+ "mysql_common 0.34.1",
"notify",
"object-pool",
"once_cell",
@@ -12960,9 +13033,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "sketches-ddsketch"
-version = "0.3.0"
+version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a"
+checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea"
dependencies = [
"serde",
]
@@ -13863,9 +13936,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]]
name = "tantivy"
-version = "0.24.2"
+version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43"
+checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07"
dependencies = [
"aho-corasick",
"arc-swap",
@@ -13876,17 +13949,17 @@ dependencies = [
"census",
"crc32fast",
"crossbeam-channel",
+ "datasketches",
"downcast-rs",
"fastdivide",
"fnv",
"fs4",
"htmlescape",
- "hyperloglogplus",
"itertools 0.14.0",
"levenshtein_automata",
"log",
- "lru",
- "lz4_flex 0.11.6",
+ "lru 0.16.4",
+ "lz4_flex 0.13.1",
"measure_time",
"memmap2",
"once_cell",
@@ -13909,6 +13982,7 @@ dependencies = [
"tempfile",
"thiserror 2.0.17",
"time",
+ "typetag",
"uuid",
"winapi",
"zstd",
@@ -13916,18 +13990,18 @@ dependencies = [
[[package]]
name = "tantivy-bitpacker"
-version = "0.8.0"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494"
+checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4"
dependencies = [
"bitpacking",
]
[[package]]
name = "tantivy-columnar"
-version = "0.5.0"
+version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344"
+checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc"
dependencies = [
"downcast-rs",
"fastdivide",
@@ -13941,9 +14015,9 @@ dependencies = [
[[package]]
name = "tantivy-common"
-version = "0.9.0"
+version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f"
+checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5"
dependencies = [
"async-trait",
"byteorder",
@@ -13965,9 +14039,9 @@ dependencies = [
[[package]]
name = "tantivy-jieba"
-version = "0.16.0"
+version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b08147cc130e323ecc522117927b198bec617fe1df562a0b6449905858d0363"
+checksum = "3392170e86f1c387170aba7d171a466ffdc98a8b55b006e19ac64b123a7b690a"
dependencies = [
"jieba-rs",
"lazy_static",
@@ -13976,20 +14050,22 @@ dependencies = [
[[package]]
name = "tantivy-query-grammar"
-version = "0.24.0"
+version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a"
+checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82"
dependencies = [
+ "fnv",
"nom 7.1.3",
+ "ordered-float 5.3.0",
"serde",
"serde_json",
]
[[package]]
name = "tantivy-sstable"
-version = "0.5.0"
+version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416"
+checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606"
dependencies = [
"futures-util",
"itertools 0.14.0",
@@ -14001,20 +14077,19 @@ dependencies = [
[[package]]
name = "tantivy-stacker"
-version = "0.5.0"
+version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1"
+checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951"
dependencies = [
"murmurhash32",
- "rand_distr",
"tantivy-common",
]
[[package]]
name = "tantivy-tokenizer-api"
-version = "0.5.0"
+version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d"
+checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98"
dependencies = [
"serde",
]
@@ -15017,9 +15092,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "typetag"
-version = "0.2.20"
+version = "0.2.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f"
+checksum = "c5a897b12c6c1151ad0b138b8db50252dc301f93bc3b027db05eec82aeed298c"
dependencies = [
"erased-serde",
"inventory",
@@ -15030,9 +15105,9 @@ dependencies = [
[[package]]
name = "typetag-impl"
-version = "0.2.20"
+version = "0.2.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952"
+checksum = "cf808357c6ed7e13ba0f3277ec8d8f21b2d501274895104263985330c726c1c5"
dependencies = [
"proc-macro2",
"quote",
diff --git a/Cargo.toml b/Cargo.toml
index eeddc7099f..32407f31cf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -158,7 +158,7 @@ fs2 = "0.4"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "dfd2a6d7d3d9c718cb159fcf9abae144b74fc503" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "7224c2ad6d11db612fbdb621c36135fc37ffce35" }
hex = "0.4"
http = "1"
humantime = "2.1"
diff --git a/README.md b/README.md
index 4ed99fa306..127dd1ba85 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
One database for metrics, logs, and traces
replacing Prometheus, Loki, and Elasticsearch
-> The unified OpenTelemetry backend — with SQL + PromQL on object storage.
+> The unified OpenTelemetry backend — with SQL + PromQL on object storage.
@@ -30,11 +30,11 @@ replacing Prometheus, Loki, and Elasticsearch
-
-
+
+
-
-
+
+
@@ -51,7 +51,8 @@ replacing Prometheus, Loki, and Elasticsearch
- [Introduction](#introduction)
-- [⭐ Key Features](#features)
+- [Overview](#overview)
+- [Features](#features)
- [How GreptimeDB Compares](#how-greptimedb-compares)
- [Architecture](#architecture)
- [Try GreptimeDB](#try-greptimedb)
@@ -69,37 +70,47 @@ replacing Prometheus, Loki, and Elasticsearch
**GreptimeDB** is an open-source observability database built for [Observability 2.0](https://docs.greptime.com/user-guide/concepts/observability-2/) — treating metrics, logs, and traces as one unified data model (wide events) instead of three separate pillars.
-Use it as the single OpenTelemetry backend — replacing Prometheus, Loki, and Elasticsearch with one database built on object storage. Query with SQL and PromQL, scale without pain, cut costs up to 50x.
+Use it as the single OpenTelemetry backend — replacing Prometheus, Loki, and Elasticsearch with one database built on object storage. Query with SQL and PromQL, scale without pain, cut costs up to 50×.
+
+## Overview
+
+A quick overview of what GreptimeDB ingests, how it connects to other systems, and what its distributed engine lets you do.
+
+
+
+
+
+
## Features
-| Feature | Description |
-| --------- | ----------- |
-| Drop-in replacement | [PromQL](https://docs.greptime.com/user-guide/query-data/promql/), [Prometheus remote write](https://docs.greptime.com/user-guide/ingest-data/for-observability/prometheus/), [Jaeger](https://docs.greptime.com/user-guide/query-data/jaeger/), and [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) native. Use as your single backend for all three signals, or migrate one at a time.|
-| 50x lower cost | Object storage (S3, GCS, Azure Blob etc.) as [primary storage](https://docs.greptime.com/user-guide/deployments-administration/configuration/#storage-options). Compute-storage separation scales without pain.|
-| SQL + PromQL | Monitor with [PromQL](https://docs.greptime.com/user-guide/query-data/promql), analyze with [SQL](https://docs.greptime.com/user-guide/query-data/sql). One database replaces Prometheus + your data warehouse.|
-| Sub-second at PB-EB scale | Columnar engine with [fulltext, inverted, and skipping indexes](https://docs.greptime.com/user-guide/manage-data/data-index). Written in Rust.|
+| Feature | Description |
+|---------|-------------|
+| **Observability 2.0 native** | Logs, metrics, and traces in one engine with [SQL + PromQL](https://docs.greptime.com/user-guide/query-data/overview/). Native [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/), [Prometheus remote write](https://docs.greptime.com/user-guide/ingest-data/for-observability/prometheus/), and [Jaeger](https://docs.greptime.com/user-guide/query-data/jaeger/). Migrate one signal at a time, or use as a single backend. |
+| **Elastic compute-storage separation** | Scale reads independently with horizontal replicas. Serve high-concurrency workloads from dashboards, alerting, and AI agents — without resharding or data migration. |
+| **Sub-second on PB–EB-scale data** | Columnar engine with [fulltext, inverted, and skipping indexes](https://docs.greptime.com/user-guide/manage-data/data-index). Written in Rust. Designed for high-concurrency point queries, not just analytical scans. |
+| **50× lower cost** | Object storage (S3, GCS, Azure Blob) as [primary storage](https://docs.greptime.com/user-guide/deployments-administration/configuration/#storage-options), with a tiered cache (memory + local disk) to keep writes and queries fast. |
- ✅ **Perfect for:**
- * Replacing Prometheus + Loki + Elasticsearch with one database
+**Perfect for:**
+ * Replacing Prometheus + Loki + Elasticsearch with a single observability backend
* Scaling past Prometheus — high cardinality, long-term storage, no Thanos/Mimir overhead
- * Cutting observability costs with object storage (up to 50x savings on traces, 30% on logs)
- * AI/LLM observability — store and analyze high-volume conversation data, agent traces, and token metrics via [OpenTelemetry GenAI conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)
+ * AI/agent workloads — store GenAI telemetry ([OTel GenAI conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)), and serve high-concurrency reads from SRE/developer agents via horizontal read replicas
+ * Cutting observability costs with object storage (up to 50× savings on traces, 30% on logs)
* Edge-to-cloud observability with unified APIs on resource-constrained devices
-> **Why Observability 2.0?** The three-pillar model (separate databases for metrics, logs, traces) creates data silos and operational complexity. GreptimeDB treats all observability data as timestamped wide events in a single columnar engine — enabling cross-signal SQL JOINs, eliminating redundant infrastructure, and naturally supporting emerging workloads like AI agent observability. Read more: [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).
+> **Why Observability 2.0?** Three separate databases for metrics, logs, and traces means three storage layers, three query languages, and three sets of dashboards. GreptimeDB stores all three as timestamped wide events in one columnar engine — JOIN across signals in SQL, run one stack instead of three, and ingest AI agent telemetry the same way. Read more: [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).
Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb).
## How GreptimeDB Compares
-| Feature | GreptimeDB | Prometheus / Thanos / Mimir | Grafana Loki | Elasticsearch |
+| Capability | GreptimeDB | Prometheus / Thanos / Mimir | Grafana Loki | Elasticsearch |
|---|---|---|---|---|
| Data types | Metrics, logs, traces | Metrics only | Logs only | Logs, traces |
| Query language | SQL + PromQL | PromQL | LogQL | Query DSL |
| Storage | Native object storage (S3, etc.) | Local disk + object storage (Thanos/Mimir) | Object storage (chunks) | Local disk |
| Scaling | Compute-storage separation, stateless nodes | Federation / Thanos / Mimir — multi-component, ops heavy | Stateless + object storage | Shard-based, ops heavy |
-| Cost efficiency | Up to 50x lower storage | High at scale | Moderate | High (inverted index overhead) |
+| Cost efficiency | Up to 50× lower storage cost | High at scale | Moderate | High (inverted index overhead) |
| OpenTelemetry | Native (metrics + logs + traces) | Partial (metrics only) | Partial (logs only) | Via instrumentation |
**Benchmarks:**
@@ -110,19 +121,26 @@ Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why
## Architecture
GreptimeDB can run in two modes:
-* **Standalone Mode** - Single binary for development and small deployments
-* **Distributed Mode** - Separate components for production scale:
- - Frontend: Query processing and protocol handling
- - Datanode: Data storage and retrieval
- - Metasrv: Metadata management and coordination
-
-Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB:
-
+* **Standalone** — single binary for development and small deployments.
+* **Distributed** — four components, each independently scalable:
+ - **Frontend** — protocol entry (OTel, Prometheus, MySQL/PostgreSQL, gRPC, ingestion APIs for Elasticsearch/InfluxDB/Loki) and the distributed query engine. Stateless, scales horizontally.
+ - **Datanode** — region engine with WAL, memtable, SST, cache, compaction, and indexes. Persists data to object storage. Elastic.
+ - **Metasrv** — metadata, routing, repartitioning, autopilot, and security. Backed by a pluggable KV layer (etcd or RDS).
+ - **Flownode** (optional) — continuous flow computation (streaming and materialized views).
+
+For deeper coverage, see the [architecture doc](https://docs.greptime.com/contributor-guide/overview/#architecture) or [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview).
+
+
+
+
## Try GreptimeDB
-```shell
-docker pull greptime/greptimedb
+**For AI agents** — paste this prompt into your agent:
+
+```text
+Read https://docs.greptime.com/SKILL.md and follow the instructions
+to deploy, configure, ingest, and query GreptimeDB.
```
```shell
@@ -131,7 +149,7 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \
--name greptime --rm \
greptime/greptimedb:latest standalone start \
--http-addr 0.0.0.0:4000 \
- --grpc-bind-addr 0.0.0.0:4001 \
+ --rpc-bind-addr 0.0.0.0:4001 \
--mysql-addr 0.0.0.0:4002 \
--postgres-addr 0.0.0.0:4003
```
@@ -153,20 +171,30 @@ Read more in the [full Install Guide](https://docs.greptime.com/getting-started/
## Build From Source
**Prerequisites:**
-* [Rust toolchain](https://www.rust-lang.org/tools/install) (nightly)
+* [Rust toolchain](https://www.rust-lang.org/tools/install) — nightly, pinned by [`rust-toolchain.toml`](https://github.com/GreptimeTeam/greptimedb/blob/main/rust-toolchain.toml)
* [Protobuf compiler](https://grpc.io/docs/protoc-installation/) (>= 3.15)
-* C/C++ building essentials, including `gcc`/`g++`/`autoconf` and glibc library (eg. `libc6-dev` on Ubuntu and `glibc-devel` on Fedora)
-* Python toolchain (optional): Required only if using some test scripts.
+* C/C++ build essentials: `gcc`, `g++`, `autoconf`, and the glibc development package (`libc6-dev` on Ubuntu, `glibc-devel` on Fedora)
+* Python toolchain (optional, only for some test scripts)
-**Build and Run:**
+**Build and run:**
```bash
-make
-cargo run -- standalone start
+make # build greptime binary
+cargo run -- standalone start # start in standalone mode
```
+**Common dev commands:**
+```bash
+make fmt # format Rust code
+make clippy # lint (fails on warnings)
+make test # unit + integration tests (uses cargo-nextest)
+make sqlness-test # SQL regression tests
+```
+
+See the [Contribution Guidelines](CONTRIBUTING.md) for the full developer workflow.
+
## Tools & Extensions
-- **Kubernetes**: [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator)
+- **Kubernetes**: [GreptimeDB Operator](https://github.com/GreptimeTeam/greptimedb-operator)
- **Helm Charts**: [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts)
- **Dashboard**: [Web UI](https://github.com/GreptimeTeam/dashboard)
- **gRPC Ingester**: [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust), [.NET](https://github.com/GreptimeTeam/greptimedb-ingester-dotnet)
@@ -175,18 +203,11 @@ cargo run -- standalone start
## Project Status
-> **Status:** [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) — generally available and production-ready! 🎉
+GreptimeDB is at [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) with stable APIs and regular releases. It runs in production at scale — [OceanBase Cloud](https://greptime.com/blogs/2025-07-22-user-case-obcloud-log-management-greptimedb) operates 80+ GreptimeDB clusters managing 300 TB of logs, cutting log storage cost by 60% after migrating from Grafana Loki. See more in [case studies](https://greptime.com/blogs/?category=Use%20Case).
-- Deployed in production handling billions of data points daily
-- Stable APIs, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version))
+Read the [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026), or browse the [version reference](https://docs.greptime.com/nightly/reference/about-greptimedb-version).
-GreptimeDB v1.0 marks a major milestone — stable APIs, production readiness, and proven performance at scale.
-
-**Learn more:** [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026).
-
-For production use, we recommend v1.0 or later.
-
-If you find this project useful, a ⭐ would mean a lot to us!
+If GreptimeDB is useful to you, please star the repo.
[](https://www.star-history.com/#GreptimeTeam/GreptimeDB&Date)
@@ -216,15 +237,19 @@ We offer enterprise add-ons, services, training, and consulting.
## Contributing
-- Read our [Contribution Guidelines](https://github.com/GreptimeTeam/greptimedb/blob/main/CONTRIBUTING.md).
+- Read our [Contribution Guidelines](CONTRIBUTING.md).
- Explore [Internal Concepts](https://docs.greptime.com/contributor-guide/overview.html) and [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb).
- Pick up a [good first issue](https://github.com/GreptimeTeam/greptimedb/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and join the #contributors [Slack](https://greptime.com/slack) channel.
## Acknowledgement
-Special thanks to all contributors! See [AUTHORS.md](https://github.com/GreptimeTeam/greptimedb/blob/main/AUTHOR.md).
+Special thanks to all contributors! See [AUTHOR.md](AUTHOR.md).
- Uses [Apache Arrow™](https://arrow.apache.org/) (memory model)
- [Apache Parquet™](https://parquet.apache.org/) (file storage)
-- [Apache DataFusion™](https://arrow.apache.org/datafusion/) (query engine)
+- [Apache DataFusion™](https://datafusion.apache.org/) (query engine)
- [Apache OpenDAL™](https://opendal.apache.org/) (data access abstraction)
+
+---
+
+*All trademarks, logos, and brand names referenced in this README and in the Overview diagram are the property of their respective owners. Their use is for identification purposes only and does not imply endorsement or affiliation.*
diff --git a/config/config.md b/config/config.md
index b1630d97ad..0fae0caaa4 100644
--- a/config/config.md
+++ b/config/config.md
@@ -155,6 +155,8 @@
| `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache. If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache. If not set, it's default to 1/8 of OS memory. |
| `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache. If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.range_result_cache_size` | String | Auto | Cache size for flat range scan results. Setting it to 0 to disable the cache. If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.prefilter_result_cache_size` | String | Auto | Cache size for prefilter results. Setting it to 0 to disable the cache. If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.enable_write_cache` | Bool | `false` | Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. |
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
@@ -543,6 +545,8 @@
| `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache. If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
| `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache. If not set, it's default to 1/8 of OS memory. |
| `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache. If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.range_result_cache_size` | String | Auto | Cache size for flat range scan results. Setting it to 0 to disable the cache. If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.prefilter_result_cache_size` | String | Auto | Cache size for prefilter results. Setting it to 0 to disable the cache. If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
| `region_engine.mito.enable_write_cache` | Bool | `false` | Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. |
| `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
| `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
diff --git a/config/datanode.example.toml b/config/datanode.example.toml
index 170045a090..d558918daf 100644
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -480,6 +480,16 @@ auto_flush_interval = "1h"
## @toml2docs:none-default="Auto"
#+ selector_result_cache_size = "512MB"
+## Cache size for flat range scan results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
+## @toml2docs:none-default="Auto"
+#+ range_result_cache_size = "512MB"
+
+## Cache size for prefilter results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
+## @toml2docs:none-default="Auto"
+#+ prefilter_result_cache_size = "128MB"
+
## Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance.
enable_write_cache = false
diff --git a/config/standalone.example.toml b/config/standalone.example.toml
index 24249270b2..d5c42e744c 100644
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -599,6 +599,16 @@ auto_flush_interval = "1h"
## @toml2docs:none-default="Auto"
#+ selector_result_cache_size = "512MB"
+## Cache size for flat range scan results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
+## @toml2docs:none-default="Auto"
+#+ range_result_cache_size = "512MB"
+
+## Cache size for prefilter results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
+## @toml2docs:none-default="Auto"
+#+ prefilter_result_cache_size = "128MB"
+
## Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance.
enable_write_cache = false
diff --git a/docs/architecture.png b/docs/architecture.png
index 992b6c856d..697292ef2f 100644
Binary files a/docs/architecture.png and b/docs/architecture.png differ
diff --git a/docs/overview.png b/docs/overview.png
new file mode 100644
index 0000000000..5ab20834a4
Binary files /dev/null and b/docs/overview.png differ
diff --git a/src/auth/src/permission.rs b/src/auth/src/permission.rs
index 88adfda633..8914635290 100644
--- a/src/auth/src/permission.rs
+++ b/src/auth/src/permission.rs
@@ -16,6 +16,7 @@ use std::fmt::Debug;
use std::sync::Arc;
use api::v1::greptime_request::Request;
+use api::v1::query_request::Query;
use common_telemetry::debug;
use sql::statements::statement::Statement;
@@ -42,10 +43,12 @@ impl<'a> PermissionReq<'a> {
/// Returns true if the permission request is for read operations.
pub fn is_readonly(&self) -> bool {
match self {
- PermissionReq::GrpcRequest(Request::Query(_))
- | PermissionReq::PromQuery
- | PermissionReq::LogQuery
- | PermissionReq::PromStoreRead => true,
+ PermissionReq::GrpcRequest(Request::Query(query_request)) => {
+ !matches!(query_request.query, Some(Query::InsertIntoPlan(_)))
+ }
+ PermissionReq::PromQuery | PermissionReq::LogQuery | PermissionReq::PromStoreRead => {
+ true
+ }
PermissionReq::SqlStatement(stmt) => stmt.is_readonly(),
PermissionReq::GrpcRequest(_)
@@ -196,4 +199,14 @@ mod tests {
assert!(matches!(read_result, PermissionResp::Reject));
assert!(matches!(write_result, PermissionResp::Allow));
}
+
+ #[test]
+ fn test_grpc_insert_into_plan_is_write_request() {
+ let request = Request::Query(api::v1::QueryRequest {
+ query: Some(Query::InsertIntoPlan(api::v1::InsertIntoPlan::default())),
+ });
+ let req = PermissionReq::GrpcRequest(&request);
+
+ assert!(req.is_write());
+ }
}
diff --git a/src/catalog/src/system_schema/information_schema.rs b/src/catalog/src/system_schema/information_schema.rs
index 9715aa9402..a35950194c 100644
--- a/src/catalog/src/system_schema/information_schema.rs
+++ b/src/catalog/src/system_schema/information_schema.rs
@@ -20,6 +20,7 @@ pub mod key_column_usage;
mod partitions;
mod procedure_info;
pub mod process_list;
+mod region_info;
pub mod region_peers;
mod region_statistics;
pub mod schemata;
@@ -47,6 +48,8 @@ use datatypes::schema::SchemaRef;
use lazy_static::lazy_static;
use paste::paste;
use process_list::InformationSchemaProcessList;
+use region_info::InformationSchemaRegionInfo;
+use store_api::region_info::RegionInfoEntry;
use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry};
use store_api::storage::{ScanRequest, TableId};
use table::TableRef;
@@ -242,6 +245,9 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
self.catalog_manager.clone(),
),
) as _),
+ REGION_INFO => Some(Arc::new(InformationSchemaRegionInfo::new(
+ self.catalog_manager.clone(),
+ )) as _),
PROCESS_LIST => self
.process_manager
.as_ref()
@@ -320,6 +326,10 @@ impl InformationSchemaProvider {
REGION_STATISTICS.to_string(),
self.build_table(REGION_STATISTICS).unwrap(),
);
+ tables.insert(
+ REGION_INFO.to_string(),
+ self.build_table(REGION_INFO).unwrap(),
+ );
tables.insert(
SSTS_MANIFEST.to_string(),
self.build_table(SSTS_MANIFEST).unwrap(),
@@ -447,6 +457,8 @@ pub enum DatanodeInspectKind {
SstStorage,
/// List index metadata collected from manifest
SstIndexMeta,
+ /// List region runtime and manifest info
+ RegionInfo,
}
impl DatanodeInspectRequest {
@@ -456,6 +468,7 @@ impl DatanodeInspectRequest {
DatanodeInspectKind::SstManifest => ManifestSstEntry::build_plan(self.scan),
DatanodeInspectKind::SstStorage => StorageSstEntry::build_plan(self.scan),
DatanodeInspectKind::SstIndexMeta => PuffinIndexMetaEntry::build_plan(self.scan),
+ DatanodeInspectKind::RegionInfo => RegionInfoEntry::build_plan(self.scan),
}
}
}
@@ -488,3 +501,28 @@ impl InformationExtension for NoopInformationExtension {
Ok(common_recordbatch::RecordBatches::empty().as_stream())
}
}
+
+#[cfg(test)]
+mod tests {
+ use store_api::region_info::RegionInfoEntry;
+
+ use super::*;
+
+ #[test]
+ fn test_datanode_inspect_region_info_build_plan() {
+ let plan = DatanodeInspectRequest {
+ kind: DatanodeInspectKind::RegionInfo,
+ scan: ScanRequest::default(),
+ }
+ .build_plan()
+ .unwrap();
+
+ let LogicalPlan::TableScan(scan) = plan else {
+ panic!("expected table scan");
+ };
+ assert_eq!(
+ scan.table_name.to_string(),
+ RegionInfoEntry::reserved_table_name_for_inspection()
+ );
+ }
+}
diff --git a/src/catalog/src/system_schema/information_schema/region_info.rs b/src/catalog/src/system_schema/information_schema/region_info.rs
new file mode 100644
index 0000000000..ffc9dfc7ae
--- /dev/null
+++ b/src/catalog/src/system_schema/information_schema/region_info.rs
@@ -0,0 +1,86 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::{Arc, Weak};
+
+use common_catalog::consts::INFORMATION_SCHEMA_REGION_INFO_TABLE_ID;
+use common_error::ext::BoxedError;
+use common_recordbatch::SendableRecordBatchStream;
+use common_recordbatch::adapter::AsyncRecordBatchStreamAdapter;
+use datatypes::schema::SchemaRef;
+use snafu::ResultExt;
+use store_api::region_info::RegionInfoEntry;
+use store_api::storage::{ScanRequest, TableId};
+
+use crate::CatalogManager;
+use crate::error::{ProjectSchemaSnafu, Result};
+use crate::information_schema::{
+ DatanodeInspectKind, DatanodeInspectRequest, InformationTable, REGION_INFO,
+};
+use crate::system_schema::utils;
+
+/// Information schema table `region_info`, served by forwarding a `DatanodeInspectKind::RegionInfo` inspect request.
+pub struct InformationSchemaRegionInfo {
+ schema: SchemaRef,
+ catalog_manager: Weak,
+}
+
+impl InformationSchemaRegionInfo {
+ pub(super) fn new(catalog_manager: Weak) -> Self {
+ Self {
+ schema: RegionInfoEntry::schema(),
+ catalog_manager,
+ }
+ }
+}
+
+impl InformationTable for InformationSchemaRegionInfo {
+ fn table_id(&self) -> TableId {
+ INFORMATION_SCHEMA_REGION_INFO_TABLE_ID
+ }
+
+ fn table_name(&self) -> &'static str {
+ REGION_INFO
+ }
+
+ fn schema(&self) -> SchemaRef {
+ self.schema.clone()
+ }
+
+ fn to_stream(&self, request: ScanRequest) -> Result {
+ let schema = if let Some(p) = request.projection_indices() {
+ Arc::new(self.schema.try_project(p).context(ProjectSchemaSnafu)?)
+ } else {
+ self.schema.clone()
+ };
+
+ let info_ext = utils::information_extension(&self.catalog_manager)?;
+ let req = DatanodeInspectRequest {
+ kind: DatanodeInspectKind::RegionInfo,
+ scan: request,
+ };
+
+ let future = async move {
+ info_ext
+ .inspect_datanode(req)
+ .await
+ .map_err(BoxedError::new)
+ .context(common_recordbatch::error::ExternalSnafu)
+ };
+ Ok(Box::pin(AsyncRecordBatchStreamAdapter::new(
+ schema,
+ Box::pin(future),
+ )))
+ }
+}
diff --git a/src/catalog/src/system_schema/information_schema/table_names.rs b/src/catalog/src/system_schema/information_schema/table_names.rs
index 2a3329fece..3a4c86487a 100644
--- a/src/catalog/src/system_schema/information_schema/table_names.rs
+++ b/src/catalog/src/system_schema/information_schema/table_names.rs
@@ -45,6 +45,7 @@ pub const CLUSTER_INFO: &str = "cluster_info";
pub const VIEWS: &str = "views";
pub const FLOWS: &str = "flows";
pub const PROCEDURE_INFO: &str = "procedure_info";
+pub const REGION_INFO: &str = "region_info";
pub const REGION_STATISTICS: &str = "region_statistics";
pub const PROCESS_LIST: &str = "process_list";
pub const SSTS_MANIFEST: &str = "ssts_manifest";
diff --git a/src/catalog/src/table_source/dummy_catalog.rs b/src/catalog/src/table_source/dummy_catalog.rs
index db49db0eed..20637c3a3a 100644
--- a/src/catalog/src/table_source/dummy_catalog.rs
+++ b/src/catalog/src/table_source/dummy_catalog.rs
@@ -22,6 +22,7 @@ use async_trait::async_trait;
use common_catalog::format_full_table_name;
use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider};
use datafusion::datasource::TableProvider;
+use session::context::QueryContextRef;
use snafu::OptionExt;
use table::table::adapter::DfTableProviderAdapter;
@@ -32,12 +33,27 @@ use crate::error::TableNotExistSnafu;
#[derive(Clone)]
pub struct DummyCatalogList {
catalog_manager: CatalogManagerRef,
+ query_ctx: Option,
}
impl DummyCatalogList {
- /// Creates a new catalog list with the given catalog manager.
+ /// Creates a new catalog list with the given catalog manager (no query context).
pub fn new(catalog_manager: CatalogManagerRef) -> Self {
- Self { catalog_manager }
+ Self {
+ catalog_manager,
+ query_ctx: None,
+ }
+ }
+
+ /// Creates a new catalog list with the given catalog manager and query context.
+ pub fn new_with_query_ctx(
+ catalog_manager: CatalogManagerRef,
+ query_ctx: QueryContextRef,
+ ) -> Self {
+ Self {
+ catalog_manager,
+ query_ctx: Some(query_ctx),
+ }
}
}
@@ -68,6 +84,7 @@ impl CatalogProviderList for DummyCatalogList {
Some(Arc::new(DummyCatalogProvider {
catalog_name: catalog_name.to_string(),
catalog_manager: self.catalog_manager.clone(),
+ query_ctx: self.query_ctx.clone(),
}))
}
}
@@ -77,6 +94,7 @@ impl CatalogProviderList for DummyCatalogList {
struct DummyCatalogProvider {
catalog_name: String,
catalog_manager: CatalogManagerRef,
+ query_ctx: Option,
}
impl CatalogProvider for DummyCatalogProvider {
@@ -93,6 +111,7 @@ impl CatalogProvider for DummyCatalogProvider {
catalog_name: self.catalog_name.clone(),
schema_name: schema_name.to_string(),
catalog_manager: self.catalog_manager.clone(),
+ query_ctx: self.query_ctx.clone(),
}))
}
}
@@ -111,6 +130,7 @@ struct DummySchemaProvider {
catalog_name: String,
schema_name: String,
catalog_manager: CatalogManagerRef,
+ query_ctx: Option,
}
#[async_trait]
@@ -126,7 +146,12 @@ impl SchemaProvider for DummySchemaProvider {
async fn table(&self, name: &str) -> datafusion::error::Result>> {
let table = self
.catalog_manager
- .table(&self.catalog_name, &self.schema_name, name, None)
+ .table(
+ &self.catalog_name,
+ &self.schema_name,
+ name,
+ self.query_ctx.as_deref(),
+ )
.await?
.with_context(|| TableNotExistSnafu {
table: format_full_table_name(&self.catalog_name, &self.schema_name, name),
diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs
index 3c069a72be..db0f576a4e 100644
--- a/src/cli/src/data/export_v2/command.rs
+++ b/src/cli/src/data/export_v2/command.rs
@@ -15,6 +15,7 @@
//! Export V2 CLI commands.
use std::collections::HashSet;
+use std::io::{self, Write};
use std::time::Duration;
use async_trait::async_trait;
@@ -28,7 +29,7 @@ use crate::Tool;
use crate::common::ObjectStoreConfig;
use crate::data::export_v2::coordinator::export_data;
use crate::data::export_v2::error::{
- ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu,
+ ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu, IoSnafu,
ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu,
SchemaOnlyModeMismatchSnafu, SnapshotVerifyFailedSnafu, UnexpectedValueTypeSnafu,
};
@@ -38,7 +39,9 @@ use crate::data::export_v2::manifest::{
};
use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR, SCHEMAS_FILE};
use crate::data::path::{data_dir_for_schema_chunk, ddl_path_for_schema};
-use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::data::snapshot_storage::{
+ OpenDalStorage, SnapshotStorage, validate_snapshot_uri, validate_uri,
+};
use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
use crate::database::{DatabaseClient, parse_proxy_opts};
@@ -51,6 +54,8 @@ pub enum ExportV2Command {
List(ExportListCommand),
/// Verify snapshot integrity.
Verify(ExportVerifyCommand),
+ /// Delete a snapshot and all data under it.
+ Delete(ExportDeleteCommand),
}
impl ExportV2Command {
@@ -59,6 +64,7 @@ impl ExportV2Command {
ExportV2Command::Create(cmd) => cmd.build().await,
ExportV2Command::List(cmd) => cmd.build().await,
ExportV2Command::Verify(cmd) => cmd.build().await,
+ ExportV2Command::Delete(cmd) => cmd.build().await,
}
}
}
@@ -172,6 +178,75 @@ impl ExportVerify {
}
}
+/// Delete a snapshot and all data under it.
+#[derive(Debug, Parser)]
+pub struct ExportDeleteCommand {
+ /// Snapshot storage location (e.g., s3://bucket/path, file:///tmp/backup).
+ #[clap(long)]
+ snapshot: String,
+
+ /// Skip interactive confirmation.
+ #[clap(long = "no-confirm", alias = "yes")]
+ skip_confirmation: bool,
+
+ /// Object store configuration for remote storage backends.
+ #[clap(flatten)]
+ storage: ObjectStoreConfig,
+}
+
+impl ExportDeleteCommand {
+ pub async fn build(&self) -> std::result::Result, BoxedError> {
+ validate_snapshot_uri(&self.snapshot).map_err(BoxedError::new)?;
+ let storage =
+ OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
+
+ Ok(Box::new(ExportDelete {
+ snapshot: self.snapshot.clone(),
+ skip_confirmation: self.skip_confirmation,
+ storage,
+ }))
+ }
+}
+
+/// Export delete tool implementation.
+pub struct ExportDelete {
+ snapshot: String,
+ skip_confirmation: bool,
+ storage: OpenDalStorage,
+}
+
+#[async_trait]
+impl Tool for ExportDelete {
+ async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+ self.run().await.map_err(BoxedError::new)
+ }
+}
+
+impl ExportDelete {
+ async fn run(&self) -> Result<()> {
+ self.run_with_confirmation(confirm_delete).await
+ }
+
+ async fn run_with_confirmation(&self, confirm: F) -> Result<()>
+ where
+ F: FnOnce(&str) -> Result,
+ {
+ let manifest = self.storage.read_manifest().await?;
+ print_delete_summary(&self.snapshot, &manifest);
+
+ if !self.skip_confirmation && !confirm(&self.snapshot)? {
+ println!("Deletion cancelled.");
+ return Ok(());
+ }
+
+ println!("Deleting snapshot...");
+ self.storage.delete_snapshot().await?;
+ println!("Snapshot deleted successfully.");
+
+ Ok(())
+ }
+}
+
/// Create a new snapshot.
#[derive(Debug, Parser)]
pub struct ExportCreateCommand {
@@ -1239,6 +1314,79 @@ fn print_verify_report(snapshot: &str, report: &VerifyReport) {
);
}
+fn print_delete_summary(snapshot: &str, manifest: &Manifest) {
+ println!("Snapshot: {}", manifest.snapshot_id);
+ println!(" Location: {}", snapshot);
+ println!(
+ " Created: {} UTC",
+ manifest.created_at.format("%Y-%m-%d %H:%M:%S")
+ );
+ println!(" Catalog: {}", manifest.catalog);
+ println!(" Schemas: {}", manifest.schemas.join(", "));
+ println!(" Chunks: {}", format_delete_chunks(manifest));
+}
+
+fn format_delete_chunks(manifest: &Manifest) -> String {
+ if manifest.schema_only {
+ return "0 (schema-only)".to_string();
+ }
+
+ let summary = summarize_chunks(manifest);
+ if manifest.is_complete() {
+ format!("{} (all processed)", summary.total)
+ } else {
+ format!(
+ "{} ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
+ summary.total,
+ summary.completed,
+ summary.skipped,
+ summary.pending,
+ summary.in_progress,
+ summary.failed
+ )
+ }
+}
+
+fn confirm_delete(snapshot: &str) -> Result {
+ println!();
+ println!(
+ "Warning: this removes the entire snapshot directory/prefix, not only files listed in manifest."
+ );
+ println!("This will permanently delete all data under:");
+ println!(" {}", display_snapshot_prefix(snapshot));
+ print!("Type 'yes' to confirm deletion: ");
+ io::stdout().flush().map_err(|error| {
+ IoSnafu {
+ operation: "flushing delete confirmation prompt",
+ error,
+ }
+ .build()
+ })?;
+
+ let mut input = String::new();
+ io::stdin().read_line(&mut input).map_err(|error| {
+ IoSnafu {
+ operation: "reading delete confirmation",
+ error,
+ }
+ .build()
+ })?;
+
+ Ok(delete_confirmation_matches(&input))
+}
+
+fn delete_confirmation_matches(input: &str) -> bool {
+ input.trim() == "yes"
+}
+
+fn display_snapshot_prefix(snapshot: &str) -> String {
+ if snapshot.ends_with('/') {
+ snapshot.to_string()
+ } else {
+ format!("{}/", snapshot)
+ }
+}
+
#[cfg(test)]
mod tests {
use chrono::TimeZone;
@@ -1563,6 +1711,7 @@ mod tests {
);
assert_eq!(snapshot_status(&complete), "complete");
assert_eq!(format_list_chunks(&complete), "2/2");
+ assert_eq!(format_delete_chunks(&complete), "2 (all processed)");
let incomplete = test_manifest(
chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
@@ -1571,6 +1720,150 @@ mod tests {
);
assert_eq!(snapshot_status(&incomplete), "incomplete");
assert_eq!(format_list_chunks(&incomplete), "1/2");
+ assert_eq!(
+ format_delete_chunks(&incomplete),
+ "2 (1 completed, 0 skipped, 1 pending, 0 in_progress, 0 failed)"
+ );
+ }
+
+ #[tokio::test]
+ async fn test_delete_build_rejects_bucket_root_uri() {
+ let cmd = ExportDeleteCommand::parse_from([
+ "export-v2-delete",
+ "--snapshot",
+ "s3://bucket",
+ "--no-confirm",
+ ]);
+
+ let error = cmd.build().await.err().unwrap().to_string();
+ assert!(error.contains("non-empty path"));
+ }
+
+ #[test]
+ fn test_delete_skip_confirmation_aliases() {
+ let no_confirm = ExportDeleteCommand::parse_from([
+ "export-v2-delete",
+ "--snapshot",
+ "s3://bucket/snapshot",
+ "--no-confirm",
+ ]);
+ assert!(no_confirm.skip_confirmation);
+
+ let yes = ExportDeleteCommand::parse_from([
+ "export-v2-delete",
+ "--snapshot",
+ "s3://bucket/snapshot",
+ "--yes",
+ ]);
+ assert!(yes.skip_confirmation);
+ }
+
+ #[tokio::test]
+ async fn test_delete_snapshot_with_no_confirm_removes_snapshot_contents() {
+ let parent = tempdir().unwrap();
+ let snapshot = parent.path().join("snapshot");
+ let sibling = parent.path().join("sibling");
+ std::fs::create_dir_all(&snapshot).unwrap();
+ std::fs::create_dir_all(&sibling).unwrap();
+ std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
+ write_root_manifest(
+ &snapshot,
+ test_manifest(
+ chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+ true,
+ true,
+ ),
+ );
+ write_snapshot_file(&snapshot, "schema/schemas.json", b"[]");
+
+ let uri = Url::from_directory_path(&snapshot).unwrap().to_string();
+ let delete = ExportDelete {
+ snapshot: uri,
+ skip_confirmation: true,
+ storage: file_storage_for_dir(&snapshot),
+ };
+
+ delete
+ .run_with_confirmation(|_| unreachable!())
+ .await
+ .unwrap();
+
+ assert!(!snapshot.join(MANIFEST_FILE).exists());
+ assert!(!snapshot.join("schema/schemas.json").exists());
+ assert!(sibling.join("keep.txt").exists());
+ }
+
+ #[tokio::test]
+ async fn test_delete_snapshot_requires_manifest() {
+ let dir = tempdir().unwrap();
+ let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+ let delete = ExportDelete {
+ snapshot: uri,
+ skip_confirmation: true,
+ storage: file_storage_for_dir(dir.path()),
+ };
+
+ let error = delete
+ .run_with_confirmation(|_| unreachable!())
+ .await
+ .err()
+ .unwrap()
+ .to_string();
+
+ assert!(error.contains("Snapshot not found"));
+ assert!(dir.path().exists());
+ }
+
+ #[tokio::test]
+ async fn test_delete_snapshot_cancels_without_exact_confirmation() {
+ let dir = tempdir().unwrap();
+ write_root_manifest(
+ dir.path(),
+ test_manifest(
+ chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+ true,
+ true,
+ ),
+ );
+ write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
+ let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+ let delete = ExportDelete {
+ snapshot: uri.clone(),
+ skip_confirmation: false,
+ storage: file_storage_for_dir(dir.path()),
+ };
+
+ delete
+ .run_with_confirmation(|snapshot| {
+ assert_eq!(snapshot, uri);
+ Ok(false)
+ })
+ .await
+ .unwrap();
+
+ assert!(dir.path().join(MANIFEST_FILE).exists());
+ assert!(dir.path().join("schema/schemas.json").exists());
+ }
+
+ #[test]
+ fn test_delete_confirmation_requires_exact_yes() {
+ assert!(delete_confirmation_matches("yes"));
+ assert!(delete_confirmation_matches(" yes\n"));
+ assert!(!delete_confirmation_matches("YES"));
+ assert!(!delete_confirmation_matches("y"));
+ assert!(!delete_confirmation_matches("yes please"));
+ }
+
+ #[test]
+ fn test_display_snapshot_prefix_adds_trailing_slash() {
+ assert_eq!(
+ display_snapshot_prefix("s3://bucket/snapshot"),
+ "s3://bucket/snapshot/"
+ );
+ assert_eq!(
+ display_snapshot_prefix("s3://bucket/snapshot/"),
+ "s3://bucket/snapshot/"
+ );
}
#[tokio::test]
diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs
index 8d9a53f186..e16e3a6176 100644
--- a/src/cli/src/data/export_v2/error.rs
+++ b/src/cli/src/data/export_v2/error.rs
@@ -71,6 +71,14 @@ pub enum Error {
location: Location,
},
+ #[snafu(display("I/O error while {}: {}", operation, error))]
+ Io {
+ operation: &'static str,
+ error: std::io::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
#[snafu(display(
"Cannot resume snapshot with a different schema_only mode (existing: {}, requested: {}). Use --force to recreate.",
existing_schema_only,
@@ -223,6 +231,8 @@ impl ErrorExt for Error {
| Error::UnexpectedValueType { .. }
| Error::UrlParse { .. } => StatusCode::Internal,
+ Error::Io { .. } => StatusCode::External,
+
Error::Database { error, .. } => error.status_code(),
Error::SnapshotNotFound { .. } => StatusCode::InvalidArguments,
diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs
index da8fdf6ab1..93e211628a 100644
--- a/src/cli/src/data/snapshot_storage.rs
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -18,6 +18,7 @@
//! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).
use std::collections::BTreeSet;
+use std::path::Component;
use async_trait::async_trait;
use futures::TryStreamExt;
@@ -131,6 +132,92 @@ pub fn validate_uri(uri: &str) -> Result {
StorageScheme::from_uri(uri)
}
+/// Validates a URI for snapshot-scoped destructive operations.
+///
+/// Unlike read-only parent scans, destructive commands must target a concrete
+/// snapshot directory instead of a bucket/container root or filesystem root.
+/// Remote storage buckets/containers already provide namespace isolation, so a
+/// non-empty object prefix is enough; local filesystem paths require at least
+/// two non-root path segments to avoid deleting broad system directories.
+pub fn validate_snapshot_uri(uri: &str) -> Result {
+ let scheme = validate_uri(uri)?;
+ reject_query_or_fragment(uri)?;
+ match scheme {
+ StorageScheme::File => validate_file_snapshot_uri(uri)?,
+ StorageScheme::S3 | StorageScheme::Oss | StorageScheme::Gcs | StorageScheme::Azblob => {
+ extract_remote_location_with_root_policy(uri, false)?;
+ }
+ }
+ Ok(scheme)
+}
+
+fn reject_query_or_fragment(uri: &str) -> Result<()> {
+ let url = Url::parse(uri).context(UrlParseSnafu)?;
+ if url.query().is_some() || url.fragment().is_some() {
+ return InvalidUriSnafu {
+ uri,
+ reason: "snapshot URI must not include query or fragment",
+ }
+ .fail();
+ }
+
+ Ok(())
+}
+
+fn validate_file_snapshot_uri(uri: &str) -> Result<()> {
+ if has_explicit_dot_segment(uri) {
+ return InvalidUriSnafu {
+ uri,
+ reason: "file snapshot URI must not contain '.' or '..' path segments",
+ }
+ .fail();
+ }
+
+ let path = extract_file_path_from_uri(uri)?;
+ let mut normal_component_count = 0;
+
+ // This is only a path-shape guard for destructive operations. It does not
+ // resolve symlinks. Drive prefixes and root separators also do not count
+ // toward depth; delete still relies on the manifest check and explicit
+ // confirmation before removing the rooted storage prefix.
+ for component in std::path::Path::new(&path).components() {
+ match component {
+ Component::Normal(_) => normal_component_count += 1,
+ Component::CurDir | Component::ParentDir => {
+ return InvalidUriSnafu {
+ uri,
+ reason: "file snapshot URI must not contain '.' or '..' path segments",
+ }
+ .fail();
+ }
+ Component::Prefix(_) | Component::RootDir => {}
+ }
+ }
+
+ if normal_component_count < 2 {
+ return InvalidUriSnafu {
+ uri,
+ reason: "file snapshot URI must point to a directory at least two levels deep",
+ }
+ .fail();
+ }
+
+ Ok(())
+}
+
+fn has_explicit_dot_segment(uri: &str) -> bool {
+ // Defense in depth: catch dot segments at the raw URI level before
+ // `Url::to_file_path()` can normalize them away. The caller's `Path::components()`
+ // check still runs afterwards because URL decoding can reintroduce them.
+ let without_fragment = uri.split_once('#').map_or(uri, |(path, _)| path);
+ let path = without_fragment
+ .split_once('?')
+ .map_or(without_fragment, |(path, _)| path);
+
+ path.split('/')
+ .any(|segment| segment == "." || segment == "..")
+}
+
fn schema_index_path() -> String {
format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE)
}
@@ -708,6 +795,43 @@ mod tests {
assert!(OpenDalStorage::from_parent_uri("s3://bucket", &storage).is_ok());
}
+ /// `validate_snapshot_uri` must return an error for every URI shape below.
+ #[test]
+ fn test_validate_snapshot_uri_rejects_dangerous_roots() {
+ // Object-store URIs that name only a bucket/container, with no path under it.
+ assert!(validate_snapshot_uri("s3://bucket").is_err());
+ assert!(validate_snapshot_uri("s3://bucket/").is_err());
+ assert!(validate_snapshot_uri("oss://bucket").is_err());
+ assert!(validate_snapshot_uri("gs://bucket").is_err());
+ assert!(validate_snapshot_uri("azblob://container").is_err());
+ // URIs carrying a query string or a fragment.
+ assert!(validate_snapshot_uri("s3://bucket/snapshot?version=1").is_err());
+ assert!(validate_snapshot_uri("file:///tmp/backup#fragment").is_err());
+ // file:// paths with fewer than two path components.
+ assert!(validate_snapshot_uri("file:///").is_err());
+ assert!(validate_snapshot_uri("file:///tmp").is_err());
+ // file:// paths ending in an explicit '.' or '..' segment.
+ assert!(validate_snapshot_uri("file:///tmp/backup/.").is_err());
+ assert!(validate_snapshot_uri("file:///tmp/backup/..").is_err());
+ }
+
+ /// `validate_snapshot_uri` accepts a prefixed S3 URI and a nested local
+ /// directory, and reports the matching `StorageScheme` for each.
+ #[test]
+ fn test_validate_snapshot_uri_accepts_snapshot_paths() {
+ assert_eq!(
+ validate_snapshot_uri("s3://bucket/snapshots/prod").unwrap(),
+ StorageScheme::S3
+ );
+
+ // Build the file:// URI from a real directory created under a temp dir.
+ let dir = tempdir().unwrap();
+ let snapshot = dir.path().join("snapshot");
+ std::fs::create_dir_all(&snapshot).unwrap();
+ let uri = Url::from_directory_path(snapshot).unwrap().to_string();
+ assert_eq!(validate_snapshot_uri(&uri).unwrap(), StorageScheme::File);
+ }
+
+ /// Windows only: the drive prefix does not count toward the required depth,
+ /// so `C:/` and `C:/Users` are rejected while `C:/Users/snapshot` is accepted.
+ #[cfg(windows)]
+ #[test]
+ fn test_validate_snapshot_uri_windows_drive_prefix_depth() {
+ assert!(validate_snapshot_uri("file:///C:/").is_err());
+ assert!(validate_snapshot_uri("file:///C:/Users").is_err());
+ assert!(validate_snapshot_uri("file:///C:/Users/snapshot").is_ok());
+ }
+
#[cfg(not(windows))]
#[test]
fn test_extract_path_from_uri_unix_examples() {
diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs
index a298430c83..65f194d19f 100644
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -588,6 +588,8 @@ async fn build_cache_manager(
.vector_cache_size(config.vector_cache_size.as_bytes())
.page_cache_size(config.page_cache_size.as_bytes())
.selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+ .range_result_cache_size(config.range_result_cache_size.as_bytes())
+ .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
.index_metadata_size(config.index.metadata_cache_size.as_bytes())
.index_content_size(config.index.content_cache_size.as_bytes())
.index_content_page_size(config.index.content_cache_page_size.as_bytes())
diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs
index b0601088cf..e0f2c673ff 100644
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -20,6 +20,7 @@ use std::{fs, path};
use async_trait::async_trait;
use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry};
+use catalog::CatalogManagerRef;
use catalog::information_schema::InformationExtensionRef;
use catalog::kvbackend::{CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder};
use catalog::process_manager::ProcessManager;
@@ -28,7 +29,8 @@ use common_base::Plugins;
use common_catalog::consts::{MIN_USER_FLOW_ID, MIN_USER_TABLE_ID};
use common_config::{Configurable, metadata_store_dir};
use common_error::ext::BoxedError;
-use common_meta::cache::LayeredCacheRegistryBuilder;
+use common_meta::DatanodeId;
+use common_meta::cache::{LayeredCacheRegistryBuilder, LayeredCacheRegistryRef};
use common_meta::ddl::flow_meta::FlowMetadataAllocator;
use common_meta::ddl::table_meta::TableMetadataAllocator;
use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl};
@@ -53,8 +55,8 @@ use datanode::config::DatanodeOptions;
use datanode::datanode::{Datanode, DatanodeBuilder};
use datanode::region_server::RegionServer;
use flow::{
- FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker,
- GrpcQueryHandlerWithBoxedError,
+ FlowDualEngineRef, FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient,
+ FrontendInvoker, GrpcQueryHandlerWithBoxedError,
};
use frontend::frontend::Frontend;
use frontend::instance::StandaloneDatanodeManager;
@@ -124,8 +126,8 @@ pub struct Instance {
frontend: Frontend,
flownode: FlownodeInstance,
procedure_manager: ProcedureManagerRef,
- wal_provider: WalProviderRef,
leader_services_controller: Box,
+ leader_services_context: LeaderServicesContext,
// Keep the logging guard to prevent the worker from being dropped.
_guard: Vec,
}
@@ -159,11 +161,7 @@ impl App for Instance {
self.datanode.start_telemetry();
self.leader_services_controller
- .start(
- self.procedure_manager.clone(),
- self.wal_provider.clone(),
- self.datanode.region_server(),
- )
+ .start(self.leader_services_context.clone())
.await?;
plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
@@ -379,6 +377,8 @@ impl StartCommand {
opts.grpc.detect_server_addr();
let fe_opts = opts.frontend_options();
let dn_opts = opts.datanode_options();
+ let node_id = dn_opts.node_id;
+ let init_regions_parallelism = dn_opts.init_regions_parallelism;
plugins::setup_frontend_plugins(&mut plugins, &plugin_opts, &fe_opts)
.await
@@ -491,21 +491,18 @@ impl StartCommand {
.await
.map_err(BoxedError::new)
.context(error::OtherSnafu)?;
+ let flow_engine = flownode.flow_engine();
// set the ref to query for the local flow state
{
information_extension
- .set_flow_engine(flownode.flow_engine())
+ .set_flow_engine(flow_engine.clone())
.await;
}
let node_manager = creator
.node_manager_creator
- .create(
- &kv_backend,
- datanode.region_server(),
- flownode.flow_engine(),
- )
+ .create(&kv_backend, datanode.region_server(), flow_engine.clone())
.await?;
let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend);
@@ -596,7 +593,7 @@ impl StartCommand {
.await;
// set the frontend invoker for flownode
- let flow_streaming_engine = flownode.flow_engine().streaming_engine();
+ let flow_streaming_engine = flow_engine.streaming_engine();
// flow server need to be able to use frontend to write insert requests back
let invoker = FrontendInvoker::build_from(
flow_streaming_engine.clone(),
@@ -620,14 +617,27 @@ impl StartCommand {
servers,
heartbeat_task: None,
};
+ let leader_services_context = LeaderServicesContext {
+ procedure_manager: procedure_manager.clone(),
+ wal_provider: wal_provider.clone(),
+ region_server: datanode.region_server(),
+ kv_backend: kv_backend.clone(),
+ cache_registry: layered_cache_registry,
+ catalog_manager,
+ flow_engine,
+ frontend_client,
+ node_id,
+ init_regions_parallelism,
+ plugin_options: plugin_opts,
+ };
let instance = Instance {
datanode,
frontend,
flownode,
procedure_manager,
- wal_provider,
leader_services_controller: creator.leader_services_controller,
+ leader_services_context,
_guard: vec![],
};
let result = InstanceCreatorResult {
@@ -743,16 +753,11 @@ impl ProcedureExecutorCreator for DefaultProcedureExecutorCreator {
#[async_trait]
pub trait StandaloneLeaderServicesController: Send + Sync {
- /// Starts services that manage standalone metadata or WAL state.
+ /// Starts leader services that manage standalone metadata or WAL state.
///
/// The default implementation starts the procedure manager and WAL provider
/// during instance startup.
- async fn start(
- &self,
- procedure_manager: ProcedureManagerRef,
- wal_provider: WalProviderRef,
- region_server: RegionServer,
- ) -> Result<()>;
+ async fn start(&self, context: LeaderServicesContext) -> Result<()>;
/// Stops services started by [`StandaloneLeaderServicesController::start`].
async fn stop(
@@ -762,21 +767,42 @@ pub trait StandaloneLeaderServicesController: Send + Sync {
) -> Result<()>;
}
+#[derive(Clone)]
+/// Additional runtime handles for custom leader-service controllers.
+///
+/// The default standalone startup only needs to start/stop the procedure
+/// manager and WAL provider. Some embedders need to do more work around
+/// leader-service startup, for example reconciling metadata-backed runtime
+/// state before publishing writable leadership. Grouping those handles here
+/// keeps `Instance` small and avoids expanding
+/// [`StandaloneLeaderServicesController::start`] every time a custom lifecycle
+/// needs one more standalone component.
+pub struct LeaderServicesContext {
+ // Started first by the default controller.
+ pub procedure_manager: ProcedureManagerRef,
+ // Started by the default controller after the procedure manager.
+ pub wal_provider: WalProviderRef,
+ // Region server handle taken from the standalone datanode.
+ pub region_server: RegionServer,
+ pub kv_backend: KvBackendRef,
+ pub cache_registry: LayeredCacheRegistryRef,
+ pub catalog_manager: CatalogManagerRef,
+ pub flow_engine: FlowDualEngineRef,
+ pub frontend_client: Arc,
+ // `node_id` and `init_regions_parallelism` are copied from the datanode
+ // options when the instance is built.
+ pub node_id: Option,
+ pub init_regions_parallelism: usize,
+ // Plugin options the standalone instance was started with.
+ pub plugin_options: Vec,
+}
+
pub struct DefaultStandaloneLeaderServicesController;
#[async_trait]
impl StandaloneLeaderServicesController for DefaultStandaloneLeaderServicesController {
- async fn start(
- &self,
- procedure_manager: ProcedureManagerRef,
- wal_provider: WalProviderRef,
- _region_server: RegionServer,
- ) -> Result<()> {
- procedure_manager
+ async fn start(&self, context: LeaderServicesContext) -> Result<()> {
+ context
+ .procedure_manager
.start()
.await
.context(error::StartProcedureManagerSnafu)?;
- wal_provider
+ context
+ .wal_provider
.start()
.await
.context(error::StartWalProviderSnafu)
diff --git a/src/common/catalog/src/consts.rs b/src/common/catalog/src/consts.rs
index 1cd5db8a0c..dd09893177 100644
--- a/src/common/catalog/src/consts.rs
+++ b/src/common/catalog/src/consts.rs
@@ -112,6 +112,8 @@ pub const INFORMATION_SCHEMA_SSTS_STORAGE_TABLE_ID: u32 = 38;
pub const INFORMATION_SCHEMA_SSTS_INDEX_META_TABLE_ID: u32 = 39;
/// id for information_schema.alerts
pub const INFORMATION_SCHEMA_ALERTS_TABLE_ID: u32 = 40;
+/// id for information_schema.region_info
+pub const INFORMATION_SCHEMA_REGION_INFO_TABLE_ID: u32 = 41;
// ----- End of information_schema tables -----
diff --git a/src/common/function/src/scalars/json/json_get_rewriter.rs b/src/common/function/src/scalars/json/json_get_rewriter.rs
index 137b307412..0143ee05d5 100644
--- a/src/common/function/src/scalars/json/json_get_rewriter.rs
+++ b/src/common/function/src/scalars/json/json_get_rewriter.rs
@@ -59,7 +59,10 @@ impl FunctionRewrite for JsonGetRewriter {
// json_get(column, path, )
// )
fn inject_type_from_cast_expr(cast: Cast) -> Result> {
- let Cast { expr, data_type } = cast;
+ let Cast {
+ expr,
+ mut data_type,
+ } = cast;
let mut json_get = match *expr {
Expr::ScalarFunction(f)
@@ -75,6 +78,9 @@ fn inject_type_from_cast_expr(cast: Cast) -> Result> {
}
};
+ if data_type.is_string() {
+ data_type = DataType::Utf8View;
+ }
let with_type = ScalarValue::try_new_null(&data_type).map(|x| Expr::Literal(x, None))?;
json_get.args.push(with_type);
Ok(Transformed::yes(Expr::ScalarFunction(json_get)))
diff --git a/src/common/meta/src/cache/container.rs b/src/common/meta/src/cache/container.rs
index e3a3e13a76..e3a1a50adc 100644
--- a/src/common/meta/src/cache/container.rs
+++ b/src/common/meta/src/cache/container.rs
@@ -196,8 +196,8 @@ where
#[async_trait::async_trait]
impl CacheInvalidator for CacheContainer
where
- K: Send + Sync,
- V: Send + Sync,
+ K: Hash + Eq + Send + Sync + 'static,
+ V: Clone + Send + Sync + 'static,
{
async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
let idents = caches
@@ -211,6 +211,12 @@ where
Ok(())
}
+
+ /// Clears the whole container: bumps the container version, then
+ /// invalidates every entry of the underlying cache. Always returns `Ok`.
+ fn invalidate_all(&self) -> Result<()> {
+ // NOTE(review): the version is bumped before clearing, presumably so
+ // loads already in flight are not kept as fresh entries — confirm
+ // against how `inc_version` is used elsewhere in this container.
+ self.inc_version();
+ self.cache.invalidate_all();
+ Ok(())
+ }
}
impl CacheContainer
diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs
index ebe3664202..4d3513a21d 100644
--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -210,7 +210,7 @@ mod tests {
use crate::cache::flow::table_flownode::{FlowIdent, new_table_flownode_set_cache};
use crate::instruction::{CacheIdent, CreateFlow, DropFlow};
use crate::key::flow::FlowMetadataManager;
- use crate::key::flow::flow_info::FlowInfoValue;
+ use crate::key::flow::flow_info::{FlowInfoValue, FlowStatus};
use crate::key::flow::flow_route::FlowRouteValue;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::peer::Peer;
@@ -242,11 +242,14 @@ mod tests {
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
query_context: None,
flow_name: "my_flow".to_string(),
+ all_source_table_names: vec![],
+ unresolved_source_table_names: vec![],
raw_sql: "sql".to_string(),
expire_after: Some(300),
eval_interval_secs: None,
comment: "comment".to_string(),
options: Default::default(),
+ status: FlowStatus::Active,
created_time: chrono::Utc::now(),
updated_time: chrono::Utc::now(),
},
diff --git a/src/common/meta/src/cache/registry.rs b/src/common/meta/src/cache/registry.rs
index d541525f98..b7ee82b6e5 100644
--- a/src/common/meta/src/cache/registry.rs
+++ b/src/common/meta/src/cache/registry.rs
@@ -67,6 +67,13 @@ impl CacheInvalidator for LayeredCacheRegistry {
}
results.into_iter().collect::>>().map(|_| ())
}
+
+ /// Invalidates every layer in iteration order, stopping at the first
+ /// layer that reports an error.
+ fn invalidate_all(&self) -> Result<()> {
+     (&self.layers)
+         .into_iter()
+         .try_for_each(|layer| layer.invalidate_all())
+ }
}
impl LayeredCacheRegistry {
@@ -124,6 +131,13 @@ impl CacheInvalidator for CacheRegistry {
.collect::>>()?;
Ok(())
}
+
+ /// Invalidates every registered cache in iteration order, stopping at
+ /// the first one that reports an error.
+ fn invalidate_all(&self) -> Result<()> {
+     (&self.indexes)
+         .into_iter()
+         .try_for_each(|cache| cache.invalidate_all())
+ }
}
impl CacheRegistry {
@@ -149,6 +163,8 @@ mod tests {
use crate::cache::registry::CacheRegistryBuilder;
use crate::cache::*;
+ use crate::cache_invalidator::{CacheInvalidator, Context};
+ use crate::error::Result;
use crate::instruction::CacheIdent;
fn always_true_filter(_: &CacheIdent) -> bool {
@@ -259,4 +275,91 @@ mod tests {
.unwrap();
assert_eq!(cache.name(), "string_cache");
}
+
+ /// `CacheRegistry::invalidate_all` must empty every cache added to the registry.
+ #[tokio::test]
+ async fn test_registry_invalidate_all() {
+ let invalidator: Invalidator<_, String, CacheIdent> =
+ Box::new(|_, _| Box::pin(async { Ok(()) }));
+ let i32_cache = Arc::new(test_i32_cache("i32_cache", invalidator));
+ let invalidator: Invalidator<_, String, CacheIdent> =
+ Box::new(|_, _| Box::pin(async { Ok(()) }));
+ let string_cache = Arc::new(test_cache("string_cache", invalidator));
+
+ // Populate one entry in each cache and confirm it is present.
+ i32_cache.get(1).await.unwrap();
+ string_cache.get_by_ref("foo").await.unwrap();
+ assert!(i32_cache.contains_key(&1));
+ assert!(string_cache.contains_key("foo"));
+
+ let registry = CacheRegistryBuilder::default()
+ .add_cache(i32_cache.clone())
+ .add_cache(string_cache.clone())
+ .build();
+
+ registry.invalidate_all().unwrap();
+
+ // Both entries must be gone after the registry-wide invalidation.
+ assert!(!i32_cache.contains_key(&1));
+ assert!(!string_cache.contains_key("foo"));
+ }
+
+ /// Test-only invalidator that asserts the position at which its
+ /// `invalidate_all` is called, using a counter shared between instances.
+ struct LayerOrderInvalidator {
+ // Counter value this instance expects to observe when it is invalidated.
+ expected_order: i32,
+ // Shared call counter, incremented by every `invalidate_all` call.
+ order: Arc,
+ }
+
+ #[async_trait::async_trait]
+ impl CacheInvalidator for LayerOrderInvalidator {
+ // Targeted invalidation is not under test; it always succeeds.
+ async fn invalidate(&self, _ctx: &Context, _caches: &[CacheIdent]) -> Result<()> {
+ Ok(())
+ }
+
+ fn invalidate_all(&self) -> Result<()> {
+ // `fetch_add` returns the value before the increment, i.e. how many
+ // invalidators sharing this counter ran before this one.
+ let previous = self.order.fetch_add(1, Ordering::Relaxed);
+ assert_eq!(self.expected_order, previous);
+ Ok(())
+ }
+ }
+
+ /// `LayeredCacheRegistry::invalidate_all` must clear the caches of every
+ /// layer and visit the layers in the order they were added.
+ #[tokio::test]
+ async fn test_layered_registry_invalidate_all() {
+ let order = Arc::new(AtomicI32::new(0));
+ let invalidator: Invalidator<_, String, CacheIdent> =
+ Box::new(|_, _| Box::pin(async { Ok(()) }));
+ let first_layer_cache = Arc::new(test_cache("first_layer_cache", invalidator));
+ // Expects to be the first invalidator to touch the shared counter.
+ let first_layer_order = Arc::new(LayerOrderInvalidator {
+ expected_order: 0,
+ order: order.clone(),
+ });
+ let first_layer = CacheRegistryBuilder::default()
+ .add_cache(first_layer_order)
+ .add_cache(first_layer_cache.clone())
+ .build();
+
+ let invalidator: Invalidator<_, String, CacheIdent> =
+ Box::new(|_, _| Box::pin(async { Ok(()) }));
+ let second_layer_cache = Arc::new(test_i32_cache("second_layer_cache", invalidator));
+ // Expects exactly one earlier invalidation, i.e. the first layer's.
+ let second_layer_order = Arc::new(LayerOrderInvalidator {
+ expected_order: 1,
+ order: order.clone(),
+ });
+ let second_layer = CacheRegistryBuilder::default()
+ .add_cache(second_layer_order)
+ .add_cache(second_layer_cache.clone())
+ .build();
+
+ // Populate one entry per layer and confirm it is present.
+ first_layer_cache.get_by_ref("foo").await.unwrap();
+ second_layer_cache.get(1).await.unwrap();
+ assert!(first_layer_cache.contains_key("foo"));
+ assert!(second_layer_cache.contains_key(&1));
+
+ let registry = LayeredCacheRegistryBuilder::default()
+ .add_cache_registry(first_layer)
+ .add_cache_registry(second_layer)
+ .build();
+
+ registry.invalidate_all().unwrap();
+
+ // Both order-checking invalidators ran, and both caches were emptied.
+ assert_eq!(2, order.load(Ordering::Relaxed));
+ assert!(!first_layer_cache.contains_key("foo"));
+ assert!(!second_layer_cache.contains_key(&1));
+ }
}
diff --git a/src/common/meta/src/cache_invalidator.rs b/src/common/meta/src/cache_invalidator.rs
index ffc3dd1c9a..4fe0699ba5 100644
--- a/src/common/meta/src/cache_invalidator.rs
+++ b/src/common/meta/src/cache_invalidator.rs
@@ -55,6 +55,13 @@ pub struct Context {
pub trait CacheInvalidator: Send + Sync {
async fn invalidate(&self, ctx: &Context, caches: &[CacheIdent]) -> Result<()>;
+ /// Invalidates every cache entry owned by this invalidator.
+ ///
+ /// This method is required so each implementer explicitly decides how
+ /// full-cache invalidation should behave. Implementations that intentionally
+ /// do nothing must document why a no-op is safe.
+ fn invalidate_all(&self) -> Result<()>;
+
fn name(&self) -> &'static str {
std::any::type_name::()
}
@@ -69,6 +76,11 @@ impl CacheInvalidator for DummyCacheInvalidator {
async fn invalidate(&self, _ctx: &Context, _caches: &[CacheIdent]) -> Result<()> {
Ok(())
}
+
+ fn invalidate_all(&self) -> Result<()> {
+ // Dummy invalidator owns no cache state, so there is nothing to clear.
+ Ok(())
+ }
}
#[async_trait::async_trait]
@@ -157,4 +169,11 @@ where
}
Ok(())
}
+
+ fn invalidate_all(&self) -> Result<()> {
+ // KvCacheInvalidator only knows how to invalidate explicit metadata
+ // keys. There is no safe generic way to enumerate or clear the backend
+ // keyspace, so full invalidation is intentionally a no-op here.
+ Ok(())
+ }
}
diff --git a/src/common/meta/src/ddl/create_flow.rs b/src/common/meta/src/ddl/create_flow.rs
index 7120e50425..ddfb0c0759 100644
--- a/src/common/meta/src/ddl/create_flow.rs
+++ b/src/common/meta/src/ddl/create_flow.rs
@@ -14,7 +14,7 @@
mod metadata;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
use std::fmt;
use api::v1::ExpireAfter;
@@ -34,13 +34,14 @@ use serde::{Deserialize, Serialize};
use snafu::{ResultExt, ensure};
use strum::AsRefStr;
use table::metadata::TableId;
+use table::table_name::TableName;
use crate::cache_invalidator::Context;
use crate::ddl::DdlContext;
use crate::ddl::utils::{add_peer_context_if_needed, map_to_procedure_error};
use crate::error::{self, Result, UnexpectedSnafu};
use crate::instruction::{CacheIdent, CreateFlow, DropFlow};
-use crate::key::flow::flow_info::FlowInfoValue;
+use crate::key::flow::flow_info::{FlowInfoValue, FlowStatus};
use crate::key::flow::flow_route::FlowRouteValue;
use crate::key::table_name::TableNameKey;
use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId};
@@ -67,6 +68,7 @@ impl CreateFlowProcedure {
flow_id: None,
peers: vec![],
source_table_ids: vec![],
+ unresolved_source_table_names: vec![],
flow_context: query_context.into(), // Convert to FlowQueryContext
state: CreateFlowState::Prepare,
prev_flow_info_value: None,
@@ -89,6 +91,8 @@ impl CreateFlowProcedure {
let create_if_not_exists = self.data.task.create_if_not_exists;
let or_replace = self.data.task.or_replace;
+ validate_flow_options(&self.data.task)?;
+
let flow_name_value = self
.context
.flow_metadata_manager
@@ -167,6 +171,21 @@ impl CreateFlowProcedure {
}
self.collect_source_tables().await?;
+ ensure!(
+ self.data.unresolved_source_table_names.is_empty()
+ || defer_on_missing_source(&self.data.task)?,
+ error::UnsupportedSnafu {
+ operation: format!(
+ "Create flow with missing source tables requires WITH ('{DEFER_ON_MISSING_SOURCE_KEY}'='true'): {}",
+ self.data
+ .unresolved_source_table_names
+ .iter()
+ .map(ToString::to_string)
+ .join(", ")
+ )
+ }
+ );
+ self.ensure_supported_replace_transition()?;
// Validate that source and sink tables are not the same
let sink_table_name = &self.data.task.sink_table_name;
@@ -189,13 +208,38 @@ impl CreateFlowProcedure {
if self.data.flow_id.is_none() {
self.allocate_flow_id().await?;
}
- self.data.state = CreateFlowState::CreateFlows;
- // determine flow type
self.data.flow_type = Some(get_flow_type_from_options(&self.data.task)?);
+ self.data.state = if self.data.is_pending() {
+ self.data.peers.clear();
+ CreateFlowState::CreateMetadata
+ } else {
+ CreateFlowState::CreateFlows
+ };
+
Ok(Status::executing(true))
}
+ /// Rejects an `OR REPLACE` that would switch a flow between the pending
+ /// and active states.
+ ///
+ /// Passes when the task is not a replace, when no previous flow info was
+ /// recorded, or when the previous and new flow agree on being pending.
+ fn ensure_supported_replace_transition(&self) -> Result<()> {
+     let previous = match (
+         self.data.task.or_replace,
+         self.data.prev_flow_info_value.as_ref(),
+     ) {
+         (true, Some(previous)) => previous,
+         // Not a replace, or nothing to replace: no transition to validate.
+         _ => return Ok(()),
+     };
+
+     let was_pending = previous.get_inner_ref().is_pending();
+     ensure!(
+         was_pending == self.data.is_pending(),
+         error::UnsupportedSnafu {
+             operation: "Replacing between pending and active flow states is not supported yet"
+         }
+     );
+
+     Ok(())
+ }
+
async fn on_flownode_create_flows(&mut self) -> Result {
// Safety: must be allocated.
let mut create_flow = Vec::with_capacity(self.data.peers.len());
@@ -365,6 +409,61 @@ pub fn get_flow_type_from_options(flow_task: &CreateFlowTask) -> Result Result {
+ flow_task
+ .flow_options
+ .get(DEFER_ON_MISSING_SOURCE_KEY)
+ .map(|value| {
+ value
+ .trim()
+ .to_ascii_lowercase()
+ .parse::()
+ .map_err(|_| {
+ error::UnexpectedSnafu {
+ err_msg: format!(
+ "Invalid flow option '{DEFER_ON_MISSING_SOURCE_KEY}': {value}"
+ ),
+ }
+ .build()
+ })
+ })
+ .transpose()
+ .map(|value| value.unwrap_or(false))
+}
+
+/// Validates the options attached to a create-flow task.
+///
+/// Fails on the first option key that is neither `DEFER_ON_MISSING_SOURCE_KEY`
+/// nor `FlowType::FLOW_TYPE_KEY`, and otherwise propagates any error from
+/// parsing the defer option or the flow type.
+pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> {
+    let unknown_key = flow_task
+        .flow_options
+        .keys()
+        .map(|key| key.as_str())
+        .find(|key| {
+            !matches!(
+                *key,
+                DEFER_ON_MISSING_SOURCE_KEY | FlowType::FLOW_TYPE_KEY
+            )
+        });
+
+    if let Some(unknown) = unknown_key {
+        return UnexpectedSnafu {
+            err_msg: format!(
+                "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}"
+            ),
+        }
+        .fail();
+    }
+
+    // Keys are known; now make sure their values parse.
+    defer_on_missing_source(flow_task)?;
+    get_flow_type_from_options(flow_task)?;
+    Ok(())
+}
+
+/// Returns a copy of `options` with the `DEFER_ON_MISSING_SOURCE_KEY` entry
+/// removed; used to build the `flow_options` of the `CreateRequest`.
+fn user_runtime_flow_options(options: &HashMap) -> HashMap {
+ let mut options = options.clone();
+ options.remove(DEFER_ON_MISSING_SOURCE_KEY);
+ options
+}
+
+/// Returns an unmodified copy of `options`; used as the starting point for
+/// the options stored in `FlowInfoValue`.
+fn metadata_flow_options(options: &HashMap) -> HashMap {
+ options.clone()
+}
+
/// The state of [CreateFlowProcedure].
#[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
pub enum CreateFlowState {
@@ -411,6 +510,8 @@ pub struct CreateFlowData {
pub(crate) flow_id: Option,
pub(crate) peers: Vec,
pub(crate) source_table_ids: Vec,
+ #[serde(default)]
+ pub(crate) unresolved_source_table_names: Vec,
/// Use alias for backward compatibility with QueryContext serialized data
#[serde(alias = "query_context")]
pub(crate) flow_context: FlowQueryContext,
@@ -424,6 +525,16 @@ pub struct CreateFlowData {
pub(crate) flow_type: Option,
}
+impl CreateFlowData {
+ /// Returns `true` while at least one source table name is still unresolved.
+ pub(crate) fn is_pending(&self) -> bool {
+ !self.unresolved_source_table_names.is_empty()
+ }
+
+ /// Returns `true` when no source table name is unresolved, i.e. the
+ /// negation of [`Self::is_pending`].
+ pub(crate) fn is_active(&self) -> bool {
+ !self.is_pending()
+ }
+}
+
impl From<&CreateFlowData> for CreateRequest {
fn from(value: &CreateFlowData) -> Self {
let flow_id = value.flow_id.unwrap();
@@ -446,7 +557,7 @@ impl From<&CreateFlowData> for CreateRequest {
.map(|seconds| api::v1::EvalInterval { seconds }),
comment: value.task.comment.clone(),
sql: value.task.sql.clone(),
- flow_options: value.task.flow_options.clone(),
+ flow_options: user_runtime_flow_options(&value.task.flow_options),
};
let flow_type = value.flow_type.unwrap_or_default().to_string();
@@ -466,9 +577,9 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
eval_interval_secs: eval_interval,
comment,
sql,
- flow_options: mut options,
..
} = value.task.clone();
+ let mut options = metadata_flow_options(&value.task.flow_options);
let flownode_ids = value
.peers
@@ -484,7 +595,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
.collect::>();
let flow_type = value.flow_type.unwrap_or_default().to_string();
- options.insert("flow_type".to_string(), flow_type);
+ options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
let mut create_time = chrono::Utc::now();
if let Some(prev_flow_value) = value.prev_flow_info_value.as_ref()
@@ -495,6 +606,8 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
let flow_info: FlowInfoValue = FlowInfoValue {
source_table_ids: value.source_table_ids.clone(),
+ all_source_table_names: value.task.source_table_names.clone(),
+ unresolved_source_table_names: value.unresolved_source_table_names.clone(),
sink_table_name,
flownode_ids,
catalog_name,
@@ -506,6 +619,11 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
eval_interval_secs: eval_interval,
comment,
options,
+ status: if value.is_active() {
+ FlowStatus::Active
+ } else {
+ FlowStatus::PendingSources
+ },
created_time: create_time,
updated_time: chrono::Utc::now(),
};
diff --git a/src/common/meta/src/ddl/create_flow/metadata.rs b/src/common/meta/src/ddl/create_flow/metadata.rs
index 27b85b7946..f97ecfdf4a 100644
--- a/src/common/meta/src/ddl/create_flow/metadata.rs
+++ b/src/common/meta/src/ddl/create_flow/metadata.rs
@@ -12,10 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use snafu::OptionExt;
-
use crate::ddl::create_flow::CreateFlowProcedure;
-use crate::error::{self, Result};
+use crate::error::Result;
use crate::key::table_name::TableNameKey;
impl CreateFlowProcedure {
@@ -34,9 +32,8 @@ impl CreateFlowProcedure {
Ok(())
}
- /// Ensures all source tables exist and collects source table ids
+ /// Collects source table ids and keeps track of missing tables.
pub(crate) async fn collect_source_tables(&mut self) -> Result<()> {
- // Ensures all source tables exist.
let keys = self
.data
.task
@@ -52,22 +49,24 @@ impl CreateFlowProcedure {
.batch_get(keys)
.await?;
- let source_table_ids = self
+ let mut resolved = Vec::with_capacity(self.data.task.source_table_names.len());
+ let mut unresolved = Vec::new();
+
+ for (name, table_id) in self
.data
.task
.source_table_names
.iter()
.zip(source_table_ids)
- .map(|(name, table_id)| {
- Ok(table_id
- .with_context(|| error::TableNotFoundSnafu {
- table_name: name.to_string(),
- })?
- .table_id())
- })
- .collect::>>()?;
+ {
+ match table_id {
+ Some(table_id) => resolved.push(table_id.table_id()),
+ None => unresolved.push(name.clone()),
+ }
+ }
- self.data.source_table_ids = source_table_ids;
+ self.data.source_table_ids = resolved;
+ self.data.unresolved_source_table_names = unresolved;
Ok(())
}
}
diff --git a/src/common/meta/src/ddl/drop_flow/metadata.rs b/src/common/meta/src/ddl/drop_flow/metadata.rs
index 0437098be3..7afd00f9d5 100644
--- a/src/common/meta/src/ddl/drop_flow/metadata.rs
+++ b/src/common/meta/src/ddl/drop_flow/metadata.rs
@@ -43,7 +43,7 @@ impl DropFlowProcedure {
.map(|(_, value)| value)
.collect::>();
ensure!(
- !flow_route_values.is_empty(),
+ flow_info_value.is_pending() || !flow_route_values.is_empty(),
error::FlowRouteNotFoundSnafu {
flow_name: format_full_flow_name(catalog_name, flow_name),
}
diff --git a/src/common/meta/src/ddl/tests/create_flow.rs b/src/common/meta/src/ddl/tests/create_flow.rs
index 344fc05024..a1a6c040f1 100644
--- a/src/common/meta/src/ddl/tests/create_flow.rs
+++ b/src/common/meta/src/ddl/tests/create_flow.rs
@@ -16,12 +16,17 @@ use std::assert_matches;
use std::collections::HashMap;
use std::sync::Arc;
+use api::v1::flow::CreateRequest;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use common_procedure::Status;
use common_procedure_test::execute_procedure_until_done;
use table::table_name::TableName;
use crate::ddl::DdlContext;
-use crate::ddl::create_flow::{CreateFlowData, CreateFlowProcedure, CreateFlowState, FlowType};
+use crate::ddl::create_flow::{
+ CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType,
+ defer_on_missing_source,
+};
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
use crate::error;
@@ -63,6 +68,11 @@ pub(crate) fn test_create_flow_task(
}
}
+/// Test helper: sets the `DEFER_ON_MISSING_SOURCE_KEY` flow option of `task`
+/// to `"true"`, overwriting any existing value.
+fn enable_defer_on_missing_source(task: &mut CreateFlowTask) {
+ task.flow_options
+ .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string());
+}
+
#[tokio::test]
async fn test_create_flow_source_table_not_found() {
let source_table_names = vec![TableName::new(
@@ -78,7 +88,261 @@ async fn test_create_flow_source_table_not_found() {
let query_ctx = test_query_context();
let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
let err = procedure.on_prepare().await.unwrap_err();
- assert_matches!(err, error::Error::TableNotFound { .. });
+ assert_matches!(err, error::Error::Unsupported { .. });
+ assert!(
+ err.to_string()
+ .contains("requires WITH ('defer_on_missing_source'='true')")
+ );
+}
+
+/// With `defer_on_missing_source` enabled, creating a flow over a missing source table
+/// succeeds: prepare records the source as unresolved, and the stored flow info keeps the
+/// option.
+#[tokio::test]
+async fn test_create_pending_flow_source_table_not_found_with_defer() {
+    let source_table_names = vec![TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "my_table",
+    )];
+    let sink_table_name =
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", source_table_names, sink_table_name, false);
+    enable_defer_on_missing_source(&mut task);
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let status = procedure.on_prepare().await.unwrap();
+    assert_matches!(status, Status::Executing { persist: true, .. });
+    assert_eq!(procedure.data.unresolved_source_table_names.len(), 1);
+    // The only source table is missing, so no table id can have been resolved.
+    assert!(procedure.data.source_table_ids.is_empty());
+
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(flow_info.source_table_ids().is_empty());
+    assert_eq!(
+        flow_info
+            .options()
+            .get(DEFER_ON_MISSING_SOURCE_KEY)
+            .map(String::as_str),
+        Some("true")
+    );
+}
+
+/// Setting `defer_on_missing_source` to `"false"` explicitly still rejects a missing source
+/// table during prepare with an `Unsupported` error.
+#[tokio::test]
+async fn test_create_pending_flow_source_table_not_found_with_defer_false() {
+    let sources = vec![TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "my_table",
+    )];
+    let sink = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", sources, sink, false);
+    task.flow_options.insert(
+        DEFER_ON_MISSING_SOURCE_KEY.to_string(),
+        "false".to_string(),
+    );
+
+    let ddl_context = new_ddl_context(Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler)));
+    let mut procedure = CreateFlowProcedure::new(task, test_query_context(), ddl_context);
+
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    let message = err.to_string();
+    assert!(message.contains("requires WITH ('defer_on_missing_source'='true')"));
+}
+
+/// When only some source tables exist, the pending flow records the resolved table ids and
+/// the unresolved table names side by side, and is not placed on any flownode.
+#[tokio::test]
+async fn test_create_pending_flow_records_partial_source_resolution() {
+    let existing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_existing_source_table",
+    );
+    let missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    // Register metadata for exactly one of the two source tables.
+    let existing_table_id = 3026;
+    let create_table_task =
+        test_create_table_task("partial_existing_source_table", existing_table_id);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let mut task = test_create_flow_task(
+        "partial_pending_flow",
+        vec![existing_source.clone(), missing_source.clone()],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut task);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let status = procedure.on_prepare().await.unwrap();
+    assert_matches!(status, Status::Executing { persist: true, .. });
+    assert_eq!(procedure.data.source_table_ids, vec![existing_table_id]);
+    assert_eq!(
+        procedure.data.unresolved_source_table_names,
+        vec![missing_source.clone()]
+    );
+
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(flow_info.is_pending());
+    assert_eq!(flow_info.source_table_ids(), &[existing_table_id]);
+    let expected_all_sources = vec![existing_source, missing_source.clone()];
+    assert_eq!(
+        flow_info.all_source_table_names(),
+        expected_all_sources.as_slice()
+    );
+    assert_eq!(flow_info.unresolved_source_table_names(), &[missing_source]);
+    assert!(flow_info.flownode_ids().is_empty());
+}
+
+/// A task built by `test_create_flow_task` without further changes resolves
+/// `defer_on_missing_source` to `false`.
+#[test]
+fn test_defer_on_missing_source_defaults_false() {
+    let sink = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let task = test_create_flow_task("my_flow", vec![], sink, false);
+
+    let deferred = defer_on_missing_source(&task).unwrap();
+    assert!(!deferred);
+}
+
+/// Setting the option to `"true"` makes `defer_on_missing_source` resolve to `true`.
+#[test]
+fn test_defer_on_missing_source_true() {
+    let sink = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", vec![], sink, false);
+    // Inserts `DEFER_ON_MISSING_SOURCE_KEY` -> "true" into the flow options.
+    enable_defer_on_missing_source(&mut task);
+
+    let deferred = defer_on_missing_source(&task).unwrap();
+    assert!(deferred);
+}
+
+/// The value `"invalid"` is rejected with an error that names both the option and the value.
+#[test]
+fn test_defer_on_missing_source_invalid_value() {
+    let sink = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", vec![], sink, false);
+    let key = DEFER_ON_MISSING_SOURCE_KEY.to_string();
+    task.flow_options.insert(key, "invalid".to_string());
+
+    let message = defer_on_missing_source(&task).unwrap_err().to_string();
+    assert!(message.contains("Invalid flow option 'defer_on_missing_source': invalid"));
+}
+
+/// A flow option key that is not recognized makes prepare fail with an `Unexpected` error
+/// naming the offending key.
+#[tokio::test]
+async fn test_create_flow_rejects_unknown_option_in_meta_task() {
+    let sink = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", vec![], sink, false);
+    let (key, value) = ("unknown_option".to_string(), "value".to_string());
+    task.flow_options.insert(key, value);
+
+    let ddl_context = new_ddl_context(Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler)));
+    let mut procedure = CreateFlowProcedure::new(task, test_query_context(), ddl_context);
+
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unexpected { .. });
+    let message = err.to_string();
+    assert!(message.contains("Unknown flow option 'unknown_option'"));
+}
+
+/// The `CreateRequest` converted from `CreateFlowData` must not carry the
+/// `defer_on_missing_source` option, while the flow type option is still present.
+#[test]
+fn test_create_request_strips_defer_on_missing_source_runtime_option() {
+ let mut task = test_create_flow_task(
+ "my_flow",
+ vec![],
+ TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+ false,
+ );
+ enable_defer_on_missing_source(&mut task);
+
+ // Build the procedure data by hand; no procedure is executed in this test.
+ let data = CreateFlowData {
+ state: CreateFlowState::CreateFlows,
+ task,
+ flow_id: Some(1024),
+ peers: vec![],
+ source_table_ids: vec![],
+ unresolved_source_table_names: vec![],
+ flow_context: FlowQueryContext {
+ catalog: DEFAULT_CATALOG_NAME.to_string(),
+ schema: DEFAULT_SCHEMA_NAME.to_string(),
+ timezone: "UTC".to_string(),
+ extensions: HashMap::new(),
+ channel: 0,
+ snapshot_seqs: HashMap::new(),
+ sst_min_sequences: HashMap::new(),
+ },
+ prev_flow_info_value: None,
+ did_replace: false,
+ flow_type: Some(FlowType::Batching),
+ };
+
+ let request: CreateRequest = (&data).into();
+
+ // The deferral option was set on the task but must be absent from the request.
+ assert!(
+ !request
+ .flow_options
+ .contains_key(DEFER_ON_MISSING_SOURCE_KEY)
+ );
+ // The flow type chosen in `data.flow_type` is exposed through the request options.
+ assert_eq!(
+ request
+ .flow_options
+ .get(FlowType::FLOW_TYPE_KEY)
+ .map(String::as_str),
+ Some(FlowType::BATCHING)
+ );
 }
pub(crate) async fn create_test_flow(
@@ -101,6 +365,27 @@ pub(crate) async fn create_test_flow(
*flow_id
}
+/// Runs a [`CreateFlowProcedure`] with `defer_on_missing_source` enabled until it is done
+/// and returns the id of the created flow.
+///
+/// The flow only ends up pending if at least one of `source_table_names` has no table
+/// metadata in `ddl_context`; callers are expected to arrange that.
+///
+/// # Panics
+/// Panics if the procedure fails or its output is not a [`FlowId`].
+pub(crate) async fn create_test_pending_flow(
+    ddl_context: &DdlContext,
+    flow_name: &str,
+    source_table_names: Vec<TableName>,
+    sink_table_name: TableName,
+) -> FlowId {
+    // Both name arguments are owned and not used again, so they are moved into the task.
+    let mut task = test_create_flow_task(flow_name, source_table_names, sink_table_name, false);
+    enable_defer_on_missing_source(&mut task);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+
+    *output.downcast_ref::<FlowId>().unwrap()
+}
+
#[tokio::test]
async fn test_create_flow() {
let table_id = 1024;
@@ -154,6 +439,201 @@ async fn test_create_flow() {
assert_matches!(err, error::Error::FlowAlreadyExists { .. });
}
+/// `CREATE OR REPLACE` of a pending flow by a task whose source table now exists (and which
+/// does not enable deferral) is rejected during prepare with an `Unsupported` error.
+#[tokio::test]
+async fn test_replace_pending_flow_with_active_flow_is_unsupported() {
+ let source_table_name = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "replace_pending_source_table",
+ );
+ let sink_table_name = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "replace_pending_sink_table",
+ );
+ let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+ let ddl_context = new_ddl_context(node_manager);
+
+ // The source table has no metadata yet, so this creates a pending flow.
+ let pending_flow_id = create_test_pending_flow(
+ &ddl_context,
+ "replace_pending_flow",
+ vec![source_table_name.clone()],
+ sink_table_name.clone(),
+ )
+ .await;
+
+ let pending_flow = ddl_context
+ .flow_metadata_manager
+ .flow_info_manager()
+ .get(pending_flow_id)
+ .await
+ .unwrap()
+ .unwrap();
+ assert!(pending_flow.is_pending());
+ assert!(pending_flow.flownode_ids().is_empty());
+
+ // Create the source table metadata only after the pending flow exists.
+ let create_table_task = test_create_table_task("replace_pending_source_table", 1026);
+ ddl_context
+ .table_metadata_manager
+ .create_table_metadata(
+ create_table_task.table_info.clone(),
+ TableRouteValue::physical(vec![]),
+ HashMap::new(),
+ )
+ .await
+ .unwrap();
+
+ // The replacement task reuses the flow name and does not enable deferral.
+ let mut replace_task = test_create_flow_task(
+ "replace_pending_flow",
+ vec![source_table_name],
+ sink_table_name,
+ false,
+ );
+ replace_task.or_replace = true;
+ let query_ctx = test_query_context();
+ let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+ let err = procedure.on_prepare().await.unwrap_err();
+ assert_matches!(err, error::Error::Unsupported { .. });
+ assert!(
+ err.to_string()
+ .contains("Replacing between pending and active flow states")
+ );
+}
+
+/// `CREATE OR REPLACE` of a flow created via `create_test_flow` by a deferral-enabled task
+/// whose source table is missing is rejected during prepare with an `Unsupported` error.
+#[tokio::test]
+async fn test_replace_active_flow_with_pending_flow_is_unsupported() {
+ let existing_source_table = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "replace_active_source_table",
+ );
+ let missing_source_table = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "replace_missing_source_table",
+ );
+ let sink_table_name = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "replace_active_sink_table",
+ );
+
+ let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+ let ddl_context = new_ddl_context(node_manager);
+
+ // Register metadata for the source table of the original flow.
+ let create_table_task = test_create_table_task("replace_active_source_table", 2026);
+ ddl_context
+ .table_metadata_manager
+ .create_table_metadata(
+ create_table_task.table_info.clone(),
+ TableRouteValue::physical(vec![]),
+ HashMap::new(),
+ )
+ .await
+ .unwrap();
+
+ let _flow_id = create_test_flow(
+ &ddl_context,
+ "replace_active_flow_to_pending",
+ vec![existing_source_table],
+ sink_table_name.clone(),
+ )
+ .await;
+
+ // The replacement points at a table without metadata and enables deferral.
+ let mut replace_task = test_create_flow_task(
+ "replace_active_flow_to_pending",
+ vec![missing_source_table],
+ sink_table_name,
+ false,
+ );
+ enable_defer_on_missing_source(&mut replace_task);
+ replace_task.or_replace = true;
+ let query_ctx = test_query_context();
+ let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+ let err = procedure.on_prepare().await.unwrap_err();
+ assert_matches!(err, error::Error::Unsupported { .. });
+ assert!(
+ err.to_string()
+ .contains("Replacing between pending and active flow states")
+ );
+}
+
+/// Replacing a pending flow with another pending flow keeps the flow id and rewrites the
+/// recorded source table names to those of the replacement task.
+#[tokio::test]
+async fn test_replace_pending_flow_with_pending_flow_updates_metadata() {
+    let first_missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_first_missing_source",
+    );
+    let second_missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_second_missing_source",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_to_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let original_flow_id = create_test_pending_flow(
+        &ddl_context,
+        "replace_pending_to_pending_flow",
+        vec![first_missing_source.clone()],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let original_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(original_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(original_flow.is_pending());
+    assert_eq!(
+        original_flow.unresolved_source_table_names(),
+        &[first_missing_source]
+    );
+    assert!(original_flow.flownode_ids().is_empty());
+
+    // Replace with a task that references a different, still missing, source table.
+    let mut replace_task = test_create_flow_task(
+        "replace_pending_to_pending_flow",
+        vec![second_missing_source.clone()],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut replace_task);
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let replaced_flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    assert_eq!(replaced_flow_id, original_flow_id);
+
+    let replaced_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(replaced_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(replaced_flow.is_pending());
+    // Neither source table exists, so no table id is recorded.
+    assert!(replaced_flow.source_table_ids().is_empty());
+    assert_eq!(
+        replaced_flow.unresolved_source_table_names(),
+        std::slice::from_ref(&second_missing_source)
+    );
+    assert_eq!(
+        replaced_flow.all_source_table_names(),
+        &[second_missing_source]
+    );
+    assert!(replaced_flow.flownode_ids().is_empty());
+}
+
#[tokio::test]
async fn test_create_flow_same_source_and_sink_table() {
let table_id = 1024;
@@ -228,6 +708,7 @@ fn test_create_flow_data_serialization_backward_compatibility() {
"flow_id": null,
"peers": [],
"source_table_ids": [],
+ "unresolved_source_table_names": [],
"query_context": {
"current_catalog": "old_catalog",
"current_schema": "old_schema",
@@ -265,6 +746,7 @@ fn test_create_flow_data_new_format_serialization() {
flow_id: None,
peers: vec![],
source_table_ids: vec![],
+ unresolved_source_table_names: vec![],
flow_context,
prev_flow_info_value: None,
did_replace: false,
@@ -327,6 +809,7 @@ fn test_flow_info_conversion_with_flow_context() {
flow_id: Some(123),
peers: vec![],
source_table_ids: vec![456, 789],
+ unresolved_source_table_names: vec![],
flow_context,
prev_flow_info_value: None,
did_replace: false,
diff --git a/src/common/meta/src/ddl/tests/drop_flow.rs b/src/common/meta/src/ddl/tests/drop_flow.rs
index af34da4809..400fd2e118 100644
--- a/src/common/meta/src/ddl/tests/drop_flow.rs
+++ b/src/common/meta/src/ddl/tests/drop_flow.rs
@@ -23,7 +23,7 @@ use table::table_name::TableName;
use crate::ddl::drop_flow::DropFlowProcedure;
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
-use crate::ddl::tests::create_flow::create_test_flow;
+use crate::ddl::tests::create_flow::{create_test_flow, create_test_pending_flow};
use crate::error;
use crate::key::table_route::TableRouteValue;
use crate::rpc::ddl::DropFlowTask;
@@ -91,3 +91,45 @@ async fn test_drop_flow() {
let err = procedure.on_prepare().await.unwrap_err();
assert_matches!(err, error::Error::FlowNotFound { .. });
}
+
+/// A pending flow, which has no flownode assignments, can be dropped; preparing a second
+/// drop of the same flow then fails with `FlowNotFound`.
+#[tokio::test]
+async fn test_drop_pending_flow_without_routes() {
+ let source_table_name = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "drop_pending_missing_source_table",
+ );
+ let sink_table_name = TableName::new(
+ DEFAULT_CATALOG_NAME,
+ DEFAULT_SCHEMA_NAME,
+ "drop_pending_sink_table",
+ );
+ let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+ let ddl_context = new_ddl_context(node_manager);
+
+ // No metadata is created for the source table, so the flow is created as pending.
+ let flow_id = create_test_pending_flow(
+ &ddl_context,
+ "drop_pending_flow",
+ vec![source_table_name],
+ sink_table_name,
+ )
+ .await;
+ let flow_info = ddl_context
+ .flow_metadata_manager
+ .flow_info_manager()
+ .get(flow_id)
+ .await
+ .unwrap()
+ .unwrap();
+ assert!(flow_info.is_pending());
+ assert!(flow_info.flownode_ids().is_empty());
+
+ // First drop: run the whole procedure.
+ let task = test_drop_flow_task("drop_pending_flow", flow_id, false);
+ let mut procedure = DropFlowProcedure::new(task, ddl_context.clone());
+ execute_procedure_until_done(&mut procedure).await;
+
+ // Second drop: the flow is gone, so prepare must fail.
+ let task = test_drop_flow_task("drop_pending_flow", flow_id, false);
+ let mut procedure = DropFlowProcedure::new(task, ddl_context);
+ let err = procedure.on_prepare().await.unwrap_err();
+ assert_matches!(err, error::Error::FlowNotFound { .. });
+}
diff --git a/src/common/meta/src/ddl_manager.rs b/src/common/meta/src/ddl_manager.rs
index 52af4a36af..8dceeb2e5a 100644
--- a/src/common/meta/src/ddl_manager.rs
+++ b/src/common/meta/src/ddl_manager.rs
@@ -15,8 +15,9 @@
use std::sync::Arc;
use std::time::Duration;
-use api::v1::Repartition;
use api::v1::alter_table_expr::Kind;
+use api::v1::repartition::Source as PbRepartitionSource;
+use api::v1::{PartitionExprs, Repartition};
use common_error::ext::BoxedError;
use common_procedure::{
BoxedProcedure, BoxedProcedureLoader, Output, ProcedureId, ProcedureManagerRef,
@@ -151,13 +152,18 @@ macro_rules! procedure_loader {
pub type RepartitionProcedureFactoryRef = Arc;
+/// The source side of a repartition request, as handed to
+/// [`RepartitionProcedureFactory::create`].
+pub enum RepartitionSource {
+ /// The source is described by partition expressions.
+ Partitioned { exprs: Vec },
+ /// The source is marked as unpartitioned and carries a list of partition columns.
+ Unpartitioned { partition_columns: Vec },
+}
+
pub trait RepartitionProcedureFactory: Send + Sync {
fn create(
&self,
ddl_ctx: &DdlContext,
table_name: TableName,
table_id: TableId,
- from_exprs: Vec,
+ source: RepartitionSource,
to_exprs: Vec,
timeout: Option,
) -> std::result::Result;
@@ -280,22 +286,38 @@ impl DdlManager {
&self,
table_id: TableId,
table_name: TableName,
- Repartition {
- from_partition_exprs,
- into_partition_exprs,
- }: Repartition,
+ repartition: Repartition,
wait: bool,
timeout: Duration,
) -> Result<(ProcedureId, Option)> {
let context = self.create_context();
+ let into_partition_exprs = repartition.into_partition_exprs;
+ let source = repartition.source;
+
+ let source = match source {
+ Some(PbRepartitionSource::PartitionExprs(PartitionExprs { exprs })) => {
+ RepartitionSource::Partitioned { exprs }
+ }
+ Some(PbRepartitionSource::Unpartitioned(source)) => RepartitionSource::Unpartitioned {
+ partition_columns: source.partition_columns,
+ },
+ None => {
+ // Reads the deprecated field for backward compatibility with old persisted DDL tasks.
+ #[allow(deprecated)]
+ RepartitionSource::Partitioned {
+ exprs: repartition.from_partition_exprs,
+ }
+ }
+ };
+
let procedure = self
.repartition_procedure_factory
.create(
&context,
table_name,
table_id,
- from_partition_exprs,
+ source,
into_partition_exprs,
Some(timeout),
)
@@ -1108,7 +1130,7 @@ mod tests {
use crate::ddl::table_meta::TableMetadataAllocator;
use crate::ddl::truncate_table::TruncateTableProcedure;
use crate::ddl::{DdlContext, NoopRegionFailureDetectorControl};
- use crate::ddl_manager::RepartitionProcedureFactory;
+ use crate::ddl_manager::{RepartitionProcedureFactory, RepartitionSource};
use crate::key::TableMetadataManager;
use crate::key::flow::FlowMetadataManager;
use crate::kv_backend::memory::MemoryKvBackend;
@@ -1146,7 +1168,7 @@ mod tests {
_ddl_ctx: &DdlContext,
_table_name: TableName,
_table_id: TableId,
- _from_exprs: Vec,
+ _source: RepartitionSource,
_to_exprs: Vec,
_timeout: Option,
) -> std::result::Result {
diff --git a/src/common/meta/src/key/flow.rs b/src/common/meta/src/key/flow.rs
index d581b92685..bc9aaaa6b3 100644
--- a/src/common/meta/src/key/flow.rs
+++ b/src/common/meta/src/key/flow.rs
@@ -459,6 +459,7 @@ mod tests {
use super::*;
use crate::FlownodeId;
+ use crate::key::flow::flow_info::FlowStatus;
use crate::key::flow::table_flow::TableFlowKey;
use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
use crate::key::{FlowPartitionId, MetadataValue};
@@ -522,6 +523,8 @@ mod tests {
query_context: None,
flow_name: flow_name.to_string(),
source_table_ids,
+ all_source_table_names: vec![],
+ unresolved_source_table_names: vec![],
sink_table_name,
flownode_ids,
raw_sql: "raw".to_string(),
@@ -529,6 +532,7 @@ mod tests {
eval_interval_secs: None,
comment: "hi".to_string(),
options: Default::default(),
+ status: FlowStatus::Active,
created_time: chrono::Utc::now(),
updated_time: chrono::Utc::now(),
}
@@ -774,6 +778,8 @@ mod tests {
query_context: None,
flow_name: "flow".to_string(),
source_table_ids: vec![1024, 1025, 1026],
+ all_source_table_names: vec![],
+ unresolved_source_table_names: vec![],
sink_table_name: another_sink_table_name,
flownode_ids: [(0, 1u64)].into(),
raw_sql: "raw".to_string(),
@@ -781,6 +787,7 @@ mod tests {
eval_interval_secs: None,
comment: "hi".to_string(),
options: Default::default(),
+ status: FlowStatus::Active,
created_time: chrono::Utc::now(),
updated_time: chrono::Utc::now(),
};
@@ -1151,6 +1158,8 @@ mod tests {
query_context: None,
flow_name: "flow".to_string(),
source_table_ids: vec![1024, 1025, 1026],
+ all_source_table_names: vec![],
+ unresolved_source_table_names: vec![],
sink_table_name: another_sink_table_name,
flownode_ids: [(0, 1u64)].into(),
raw_sql: "raw".to_string(),
@@ -1158,6 +1167,7 @@ mod tests {
eval_interval_secs: None,
comment: "hi".to_string(),
options: Default::default(),
+ status: FlowStatus::Active,
created_time: chrono::Utc::now(),
updated_time: chrono::Utc::now(),
};
diff --git a/src/common/meta/src/key/flow/flow_info.rs b/src/common/meta/src/key/flow/flow_info.rs
index d501822c3c..b1056902da 100644
--- a/src/common/meta/src/key/flow/flow_info.rs
+++ b/src/common/meta/src/key/flow/flow_info.rs
@@ -16,6 +16,8 @@ use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use chrono::{DateTime, Utc};
+use futures::TryStreamExt;
+use futures::stream::BoxStream;
use lazy_static::lazy_static;
use regex::Regex;
use serde::{Deserialize, Serialize};
@@ -27,12 +29,27 @@ use crate::FlownodeId;
use crate::error::{self, Result};
use crate::key::flow::FlowScoped;
use crate::key::txn_helper::TxnOpGetResponseSet;
-use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
+use crate::key::{
+ BytesAdapter, DeserializedValueWithBytes, FlowId, FlowPartitionId, MetadataKey, MetadataValue,
+};
use crate::kv_backend::KvBackendRef;
use crate::kv_backend::txn::{Compare, CompareOp, Txn, TxnOp};
+use crate::range_stream::{DEFAULT_PAGE_SIZE, PaginationStream};
+use crate::rpc::KeyValue;
+use crate::rpc::store::RangeRequest;
pub const FLOW_INFO_KEY_PREFIX: &str = "info";
+/// The lifecycle status of a flow stored in metadata.
+///
+/// `Active` is the `Default` variant. `FlowInfoValue::status` is annotated with
+/// `#[serde(default)]`, so a serialized value without a status field deserializes as `Active`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub enum FlowStatus {
+ /// The flow metadata exists, but at least one source table did not exist at create time.
+ PendingSources,
+ /// The flow has resolved source tables and can be scheduled on flownodes.
+ #[default]
+ Active,
+}
+
lazy_static! {
static ref FLOW_INFO_KEY_PATTERN: Regex =
Regex::new(&format!("^{FLOW_INFO_KEY_PREFIX}/([0-9]+)$")).unwrap();
@@ -114,7 +131,12 @@ impl<'a> MetadataKey<'a, FlowInfoKeyInner> for FlowInfoKeyInner {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FlowInfoValue {
/// The source tables used by the flow.
+ #[serde(default)]
pub source_table_ids: Vec,
+ #[serde(default)]
+ pub all_source_table_names: Vec,
+ #[serde(default)]
+ pub unresolved_source_table_names: Vec,
/// The sink table used by the flow.
pub sink_table_name: TableName,
/// Which flow nodes this flow is running on.
@@ -145,6 +167,8 @@ pub struct FlowInfoValue {
pub comment: String,
/// The options.
pub options: HashMap,
+ #[serde(default)]
+ pub status: FlowStatus,
/// The created time
#[serde(default)]
pub created_time: DateTime,
@@ -154,6 +178,14 @@ pub struct FlowInfoValue {
}
impl FlowInfoValue {
+ /// Returns `true` if the status is [`FlowStatus::PendingSources`].
+ pub fn is_pending(&self) -> bool {
+     matches!(self.status, FlowStatus::PendingSources)
+ }
+
+ /// Returns `true` if the status is [`FlowStatus::Active`].
+ pub fn is_active(&self) -> bool {
+     matches!(self.status, FlowStatus::Active)
+ }
+
/// Returns the `flownode_id`.
pub fn flownode_ids(&self) -> &BTreeMap {
&self.flownode_ids
@@ -173,6 +205,14 @@ impl FlowInfoValue {
&self.source_table_ids
}
+ /// Returns the `all_source_table_names` recorded for the flow.
+ pub fn all_source_table_names(&self) -> &[TableName] {
+     self.all_source_table_names.as_slice()
+ }
+
+ /// Returns the `unresolved_source_table_names` recorded for the flow.
+ pub fn unresolved_source_table_names(&self) -> &[TableName] {
+     self.unresolved_source_table_names.as_slice()
+ }
+
pub fn catalog_name(&self) -> &String {
&self.catalog_name
}
@@ -209,6 +249,10 @@ impl FlowInfoValue {
&self.options
}
+ /// Returns the [`FlowStatus`] of the flow.
+ pub fn status(&self) -> &FlowStatus {
+ &self.status
+ }
+
pub fn created_time(&self) -> &DateTime {
&self.created_time
}
@@ -225,6 +269,12 @@ pub struct FlowInfoManager {
kv_backend: KvBackendRef,
}
+/// Decodes a raw [`KeyValue`] into its [`FlowInfoKey`] and [`FlowInfoValue`].
+///
+/// # Errors
+/// Fails if the key bytes or the value bytes cannot be decoded; the key is decoded first.
+pub fn flow_info_decoder(kv: KeyValue) -> Result<(FlowInfoKey, FlowInfoValue)> {
+    Ok((
+        FlowInfoKey::from_bytes(&kv.key)?,
+        FlowInfoValue::try_from_raw_value(&kv.value)?,
+    ))
+}
+
impl FlowInfoManager {
/// Returns a new [FlowInfoManager].
pub fn new(kv_backend: KvBackendRef) -> Self {
@@ -254,6 +304,23 @@ impl FlowInfoManager {
.transpose()
}
+ /// Returns a stream of `(FlowId, FlowInfoValue)` for every key under the flow-scoped
+ /// `{FLOW_INFO_KEY_PREFIX}/` prefix.
+ ///
+ /// The range is read through a [`PaginationStream`] with [`DEFAULT_PAGE_SIZE`]; each
+ /// entry is decoded with [`flow_info_decoder`], so decode failures surface as `Err` items.
+ pub fn flow_infos(&self) -> BoxStream<'static, Result<(FlowId, FlowInfoValue)>> {
+ let start_key = FlowScoped::new(BytesAdapter::from(
+ format!("{FLOW_INFO_KEY_PREFIX}/").into_bytes(),
+ ))
+ .to_bytes();
+ let req = RangeRequest::new().with_prefix(start_key);
+ let stream = PaginationStream::new(
+ self.kv_backend.clone(),
+ req,
+ DEFAULT_PAGE_SIZE,
+ flow_info_decoder,
+ )
+ .into_stream();
+
+ // Callers only need the numeric flow id, not the full key.
+ Box::pin(stream.map_ok(|(key, value)| (key.flow_id(), value)))
+ }
+
/// Builds a create flow transaction.
/// It is expected that the `__flow/info/{flow_id}` wasn't occupied.
/// Otherwise, the transaction will retrieve existing value.
diff --git a/src/common/procedure/src/local.rs b/src/common/procedure/src/local.rs
index 9e8536308c..5e8717a53a 100644
--- a/src/common/procedure/src/local.rs
+++ b/src/common/procedure/src/local.rs
@@ -24,7 +24,7 @@ use async_trait::async_trait;
use backon::ExponentialBuilder;
use common_error::ext::BoxedError;
use common_event_recorder::EventRecorderRef;
-use common_runtime::{RepeatedTask, TaskFunction};
+use common_runtime::{JoinHandle, RepeatedTask, TaskFunction};
use common_telemetry::tracing_context::{FutureExt, TracingContext};
use common_telemetry::{error, info, tracing};
use snafu::{OptionExt, ResultExt, ensure};
@@ -254,6 +254,8 @@ pub(crate) struct ManagerContext {
running_procedures: Mutex>,
/// Ids and finished time of finished procedures.
finished_procedures: Mutex>,
+ /// Runner tasks of procedures.
+ runner_tasks: Mutex>>,
/// Running flag.
running: Arc,
/// Poison manager.
@@ -310,6 +312,7 @@ impl ManagerContext {
procedures: RwLock::new(HashMap::new()),
running_procedures: Mutex::new(HashSet::new()),
finished_procedures: Mutex::new(VecDeque::new()),
+ runner_tasks: Mutex::new(HashMap::new()),
running: Arc::new(AtomicBool::new(false)),
poison_manager,
}
@@ -329,6 +332,76 @@ impl ManagerContext {
self.running.store(false, Ordering::Relaxed);
}
+ /// Clears the in-memory runtime state: the procedure table, the running and finished
+ /// procedure sets, all tracked runner tasks (each one is aborted), and both key locks.
+ fn reset_runtime_state(&self) {
+     self.procedures.write().unwrap().clear();
+     self.running_procedures.lock().unwrap().clear();
+     self.finished_procedures.lock().unwrap().clear();
+     {
+         // The lock is held while the handles are drained and aborted.
+         let mut runner_tasks = self.runner_tasks.lock().unwrap();
+         runner_tasks
+             .drain()
+             .for_each(|(_, handle)| handle.abort());
+     }
+     self.key_lock.clear();
+     self.dynamic_key_lock.clear();
+ }
+
+ /// Spawns the runner task for `procedure_id` by calling `spawn` and records the
+ /// returned handle.
+ ///
+ /// Returns `false`, without calling `spawn`, when the manager is not running. The
+ /// running check and the insertion both happen while the `runner_tasks` lock is held.
+ fn spawn_runner_task<F>(&self, procedure_id: ProcedureId, spawn: F) -> bool
+ where
+     F: FnOnce() -> JoinHandle<()>,
+ {
+     let mut tasks = self.runner_tasks.lock().unwrap();
+     if !self.running() {
+         return false;
+     }
+
+     let handle = spawn();
+     let _ = tasks.insert(procedure_id, handle);
+     true
+ }
+
+ /// Removes `procedure_id` from the procedure table and from the running set.
+ fn remove_procedure(&self, procedure_id: ProcedureId) {
+     {
+         let mut procedures = self.procedures.write().unwrap();
+         procedures.remove(&procedure_id);
+     }
+     let mut running = self.running_procedures.lock().unwrap();
+     running.remove(&procedure_id);
+ }
+
+ /// Stops tracking the runner task of `procedure_id`; the handle, if any, is dropped.
+ pub(crate) fn remove_runner_task(&self, procedure_id: ProcedureId) {
+     let mut tasks = self.runner_tasks.lock().unwrap();
+     let _ = tasks.remove(&procedure_id);
+ }
+
+ /// Removes every tracked runner task and returns the handles, leaving the map empty.
+ fn take_runner_tasks(&self) -> Vec<JoinHandle<()>> {
+     self.runner_tasks
+         .lock()
+         .unwrap()
+         .drain()
+         .map(|(_, handle)| handle)
+         .collect()
+ }
+
+ /// Aborts every tracked runner task and waits for each of them to finish.
+ ///
+ /// A join error that is not a cancellation is logged.
+ async fn abort_runner_tasks(&self) {
+ let handles = self.take_runner_tasks();
+
+ // Request cancellation of all tasks before awaiting any of them.
+ for handle in &handles {
+ handle.abort();
+ }
+
+ for handle in handles {
+ if let Err(e) = handle.await
+ && !e.is_cancelled()
+ {
+ error!(
+ e; "Procedure runner task exits unexpectedly during stop",
+ );
+ }
+ }
+ }
+
/// Return `ProcedureManager` is running.
pub(crate) fn running(&self) -> bool {
self.running.load(Ordering::Relaxed)
@@ -675,17 +748,25 @@ impl LocalManager {
let tracing_context = TracingContext::from_current_span();
- let _handle = common_runtime::spawn_global(async move {
- let span = tracing_context.attach(tracing::info_span!(
- "LocalManager::submit_root_procedure",
- procedure_name = %runner.meta.type_name,
- procedure_id = %runner.meta.id,
- ));
- // Run the root procedure.
- // The task was moved to another runtime for execution.
- // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
- runner.run().trace(span).await;
- });
+ ensure!(
+ self.manager_ctx.spawn_runner_task(procedure_id, || {
+ common_runtime::spawn_global(async move {
+ let span = tracing_context.attach(tracing::info_span!(
+ "LocalManager::submit_root_procedure",
+ procedure_name = %runner.meta.type_name,
+ procedure_id = %runner.meta.id,
+ ));
+ // Run the root procedure.
+ // The task was moved to another runtime for execution.
+ // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
+ runner.run().trace(span).await;
+ })
+ }),
+ {
+ self.manager_ctx.remove_procedure(procedure_id);
+ ManagerNotStartSnafu
+ }
+ );
Ok(watcher)
}
@@ -822,6 +903,7 @@ impl ProcedureManager for LocalManager {
*task = Some(task_inner);
+ self.manager_ctx.reset_runtime_state();
self.manager_ctx.start();
info!("LocalManager is start.");
@@ -830,14 +912,18 @@ impl ProcedureManager for LocalManager {
}
async fn stop(&self) -> Result<()> {
- let mut task = self.remove_outdated_meta_task.lock().await;
-
- if let Some(task) = task.take() {
- task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)?;
- }
-
self.manager_ctx.stop();
+ let mut task = self.remove_outdated_meta_task.lock().await;
+ if let Some(task) = task.take()
+ && let Err(e) = task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)
+ {
+ error!(e; "Failed to stop remove outdated meta task");
+ };
+
+ self.manager_ctx.abort_runner_tasks().await;
+ self.manager_ctx.reset_runtime_state();
+
info!("LocalManager is stopped.");
Ok(())
@@ -921,10 +1007,12 @@ pub(crate) mod test_util {
#[cfg(test)]
mod tests {
use std::assert_matches;
+ use std::sync::atomic::{AtomicBool, Ordering as AtomicOrdering};
use common_error::mock::MockError;
use common_error::status_code::StatusCode;
use common_test_util::temp_dir::create_temp_dir;
+ use tokio::sync::oneshot;
use tokio::time::timeout;
use super::*;
@@ -954,6 +1042,67 @@ mod tests {
assert!(ctx.state(meta.id).unwrap().is_done());
}
+ #[test]
+ fn test_reset_runtime_state() {
+ let ctx = new_test_manager_context();
+ ctx.set_running();
+ let mut meta = test_util::procedure_meta_for_test();
+ meta.lock_key = LockKey::single_exclusive("test.reset_runtime_state");
+ let meta = Arc::new(meta);
+ let procedure_id = meta.id;
+
+ assert!(ctx.try_insert_procedure(meta.clone()));
+ ctx.finished_procedures
+ .lock()
+ .unwrap()
+ .push_back((procedure_id, Instant::now()));
+ ctx.spawn_runner_task(procedure_id, || {
+ common_runtime::spawn_global(std::future::pending::<()>())
+ });
+
+ drop(
+ ctx.key_lock
+ .try_write("test.reset_runtime_state".to_string()),
+ );
+ drop(
+ ctx.dynamic_key_lock
+ .try_write("test.reset_runtime_state.dynamic".to_string()),
+ );
+ assert!(ctx.contains_procedure(procedure_id));
+ assert_eq!(1, ctx.running_procedures.lock().unwrap().len());
+ assert_eq!(1, ctx.finished_procedures.lock().unwrap().len());
+ assert_eq!(1, ctx.runner_tasks.lock().unwrap().len());
+ assert_eq!(1, ctx.key_lock.len());
+ assert_eq!(1, ctx.dynamic_key_lock.len());
+
+ ctx.reset_runtime_state();
+
+ assert!(!ctx.contains_procedure(procedure_id));
+ assert!(ctx.running_procedures.lock().unwrap().is_empty());
+ assert!(ctx.finished_procedures.lock().unwrap().is_empty());
+ assert!(ctx.runner_tasks.lock().unwrap().is_empty());
+ assert!(ctx.key_lock.is_empty());
+ assert!(ctx.dynamic_key_lock.is_empty());
+ }
+
+ #[test]
+ fn test_spawn_runner_task_not_started_after_stop() {
+ let ctx = new_test_manager_context();
+ let procedure_id = ProcedureId::random();
+
+ let spawned = Arc::new(AtomicBool::new(false));
+ let spawned_in_task = spawned.clone();
+ let started = ctx.spawn_runner_task(procedure_id, || {
+ common_runtime::spawn_global(async move {
+ spawned_in_task.store(true, AtomicOrdering::Relaxed);
+ })
+ });
+
+ assert!(!started);
+ assert!(!spawned.load(AtomicOrdering::Relaxed));
+ assert!(ctx.runner_tasks.lock().unwrap().is_empty());
+ }
+
#[test]
fn test_manager_context_insert_duplicate() {
let ctx = new_test_manager_context();
@@ -1046,6 +1195,105 @@ mod tests {
}
}
+ #[derive(Debug)]
+ struct BlockingProcedure {
+ started_tx: Option>,
+ dropped: Arc,
+ lock_key: LockKey,
+ }
+
+ impl Drop for BlockingProcedure {
+ fn drop(&mut self) {
+ self.dropped.store(true, AtomicOrdering::Relaxed);
+ }
+ }
+
+ #[async_trait]
+ impl Procedure for BlockingProcedure {
+ fn type_name(&self) -> &str {
+ "BlockingProcedure"
+ }
+
+ async fn execute(&mut self, _ctx: &Context) -> Result {
+ if let Some(tx) = self.started_tx.take() {
+ let _ = tx.send(());
+ }
+ std::future::pending::>().await
+ }
+
+ fn dump(&self) -> Result {
+ Ok(String::new())
+ }
+
+ fn lock_key(&self) -> LockKey {
+ self.lock_key.clone()
+ }
+ }
+
+ #[tokio::test]
+ async fn test_stop_aborts_runner_and_resets_runtime_state() {
+ let dir = create_temp_dir("stop_aborts_runner_and_resets_runtime_state");
+ let config = ManagerConfig::default();
+ let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
+ let poison_manager = Arc::new(InMemoryPoisonStore::new());
+ let manager = LocalManager::new(config, state_store, poison_manager, None, None);
+ manager.start().await.unwrap();
+
+ let procedure_id = ProcedureId::random();
+ let (started_tx, started_rx) = oneshot::channel();
+ let dropped = Arc::new(AtomicBool::new(false));
+ let procedure = BlockingProcedure {
+ started_tx: Some(started_tx),
+ dropped: dropped.clone(),
+ lock_key: LockKey::single_exclusive("test.stop_aborts_runner"),
+ };
+
+ manager
+ .submit(ProcedureWithId {
+ id: procedure_id,
+ procedure: Box::new(procedure),
+ })
+ .await
+ .unwrap();
+ timeout(Duration::from_secs(5), started_rx)
+ .await
+ .unwrap()
+ .unwrap();
+
+ assert!(manager.manager_ctx.contains_procedure(procedure_id));
+ assert_eq!(
+ 1,
+ manager.manager_ctx.running_procedures.lock().unwrap().len()
+ );
+ assert_eq!(1, manager.manager_ctx.runner_tasks.lock().unwrap().len());
+ assert_eq!(1, manager.manager_ctx.key_lock.len());
+
+ manager.stop().await.unwrap();
+
+ assert!(dropped.load(AtomicOrdering::Relaxed));
+ assert!(!manager.manager_ctx.running());
+ assert!(!manager.manager_ctx.contains_procedure(procedure_id));
+ assert!(
+ manager
+ .manager_ctx
+ .running_procedures
+ .lock()
+ .unwrap()
+ .is_empty()
+ );
+ assert!(
+ manager
+ .manager_ctx
+ .finished_procedures
+ .lock()
+ .unwrap()
+ .is_empty()
+ );
+ assert!(manager.manager_ctx.runner_tasks.lock().unwrap().is_empty());
+ assert!(manager.manager_ctx.key_lock.is_empty());
+ assert!(manager.manager_ctx.dynamic_key_lock.is_empty());
+ }
+
#[test]
fn test_register_loader() {
let dir = create_temp_dir("register");
@@ -1439,7 +1687,7 @@ mod tests {
let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
let poison_manager = Arc::new(InMemoryPoisonStore::new());
let manager = LocalManager::new(config, state_store, poison_manager, None, None);
- manager.manager_ctx.set_running();
+ manager.start().await.unwrap();
manager
.manager_ctx
@@ -1447,7 +1695,6 @@ mod tests {
.lock()
.unwrap()
.insert(ProcedureId::random());
- manager.start().await.unwrap();
// Submit a new procedure should fail.
let mut procedure = ProcedureToLoad::new("submit");
diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs
index ca3e221f43..509b3a7756 100644
--- a/src/common/procedure/src/local/runner.rs
+++ b/src/common/procedure/src/local/runner.rs
@@ -20,6 +20,7 @@ use backon::{BackoffBuilder, ExponentialBuilder};
use common_error::ext::PlainError;
use common_error::status_code::StatusCode;
use common_event_recorder::EventRecorderRef;
+use common_telemetry::tracing::warn;
use common_telemetry::tracing_context::{FutureExt, TracingContext};
use common_telemetry::{debug, error, info, tracing};
use rand::Rng;
@@ -480,6 +481,15 @@ impl Runner {
procedure_state: ProcedureState,
procedure: BoxedProcedure,
) {
+ if !self.running() {
+ warn!(
+ "ProcedureManager is not running, skip submitting subprocedure {}-{}",
+ procedure.type_name(),
+ procedure_id
+ );
+ return;
+ }
+
if self.manager_ctx.contains_procedure(procedure_id) {
// If the parent has already submitted this procedure, don't submit it again.
return;
@@ -520,23 +530,29 @@ impl Runner {
procedure_id,
);
- // Add the id of the subprocedure to the metadata.
- self.meta.push_child(procedure_id);
let parent_id = self.meta.id;
let tracing_context = TracingContext::from_current_span();
- let _handle = common_runtime::spawn_global(async move {
- let span = tracing_context.attach(tracing::info_span!(
- "LocalManager::submit_subprocedure",
- procedure_name = %runner.meta.type_name,
- procedure_id = %runner.meta.id,
- parent_id = %parent_id,
- ));
- // Run the root procedure.
- // The task was moved to another runtime for execution.
- // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
- runner.run().trace(span).await
- });
+ if !self.manager_ctx.spawn_runner_task(procedure_id, || {
+ common_runtime::spawn_global(async move {
+ let span = tracing_context.attach(tracing::info_span!(
+ "LocalManager::submit_subprocedure",
+ procedure_name = %runner.meta.type_name,
+ procedure_id = %runner.meta.id,
+ parent_id = %parent_id,
+ ));
+ // Run the subprocedure.
+ // The task was moved to another runtime for execution.
+ // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
+ runner.run().trace(span).await
+ })
+ }) {
+ self.manager_ctx.remove_procedure(procedure_id);
+ return;
+ }
+
+ // Add the id of the subprocedure to the metadata.
+ self.meta.push_child(procedure_id);
}
/// Extend the retry time to wait for the next retry.
@@ -702,6 +718,12 @@ impl Runner {
}
}
+impl Drop for Runner {
+ fn drop(&mut self) {
+ self.manager_ctx.remove_runner_task(self.meta.id);
+ }
+}
+
#[cfg(test)]
mod tests {
use std::assert_matches;
diff --git a/src/common/procedure/src/rwlock.rs b/src/common/procedure/src/rwlock.rs
index cbdfe30977..c4807cf2f7 100644
--- a/src/common/procedure/src/rwlock.rs
+++ b/src/common/procedure/src/rwlock.rs
@@ -106,6 +106,13 @@ where
locks.remove(key);
}
}
+
+ /// Clears all key locks.
+ ///
+ /// Callers must ensure no tasks are holding or waiting for these locks.
+ pub fn clear(&self) {
+ self.inner.lock().unwrap().clear();
+ }
}
#[cfg(test)]
diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs
index aa2e627ca2..d5711e1761 100644
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -314,6 +314,7 @@ impl RegionServer {
let ctx = request.header.as_ref().map(|h| h.into());
let query_ctx = Arc::new(ctx.unwrap_or_else(|| QueryContextBuilder::default().build()));
+ let region_id = request.region_id;
let injector_builder = NameAwareDataSourceInjectorBuilder::from_plan(&request.plan)
.context(DataFusionSnafu)?;
let mut injector = injector_builder
@@ -326,7 +327,6 @@ impl RegionServer {
.context(DataFusionSnafu)?
.data;
- let region_id = request.region_id;
let stream = self
.inner
.handle_read(QueryRequest { plan, ..request }, query_ctx.clone())
@@ -837,14 +837,13 @@ fn wrap_flow_region_watermark_stream(
region_id: RegionId,
query_ctx: &QueryContextRef,
) -> SendableRecordBatchStream {
- let Some(seq) = should_collect_region_watermark_from_extensions(&query_ctx.extensions())
- .then(|| query_ctx.get_snapshot(region_id.as_u64()))
- .flatten()
- else {
- return stream;
- };
-
- Box::pin(RegionWatermarkStream::new(stream, region_id, seq))
+ if should_collect_region_watermark_from_extensions(&query_ctx.extensions())
+ && let Some(seq) = query_ctx.get_snapshot(region_id.as_u64())
+ {
+ Box::pin(RegionWatermarkStream::new(stream, region_id, seq)) as SendableRecordBatchStream
+ } else {
+ stream
+ }
}
/// Wraps a region read stream so terminal metrics can carry the scan-open watermark.
diff --git a/src/datanode/src/region_server/catalog.rs b/src/datanode/src/region_server/catalog.rs
index 1c0f48951f..a4df422b75 100644
--- a/src/datanode/src/region_server/catalog.rs
+++ b/src/datanode/src/region_server/catalog.rs
@@ -27,6 +27,7 @@ use datafusion_expr::{LogicalPlan, TableSource};
use futures::TryStreamExt;
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
+use store_api::region_info::RegionInfoEntry;
use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry};
use store_api::storage::RegionId;
@@ -41,6 +42,7 @@ enum InternalTableKind {
InspectSstManifest,
InspectSstStorage,
InspectSstIndexMeta,
+ InspectRegionInfo,
}
impl InternalTableKind {
@@ -55,6 +57,9 @@ impl InternalTableKind {
if name.eq_ignore_ascii_case(PuffinIndexMetaEntry::reserved_table_name_for_inspection()) {
return Some(Self::InspectSstIndexMeta);
}
+ if name.eq_ignore_ascii_case(RegionInfoEntry::reserved_table_name_for_inspection()) {
+ return Some(Self::InspectRegionInfo);
+ }
None
}
@@ -64,6 +69,7 @@ impl InternalTableKind {
Self::InspectSstManifest => server.inspect_sst_manifest_provider().await,
Self::InspectSstStorage => server.inspect_sst_storage_provider().await,
Self::InspectSstIndexMeta => server.inspect_sst_index_meta_provider().await,
+ Self::InspectRegionInfo => server.inspect_region_info_provider().await,
}
}
}
@@ -128,6 +134,25 @@ impl RegionServer {
let table = MemTable::try_new(schema, vec![vec![batch]]).context(DataFusionSnafu)?;
Ok(Arc::new(table))
}
+
+ /// Expose region info across the engine as an in-memory table.
+ pub async fn inspect_region_info_provider(&self) -> Result> {
+ let mito = {
+ let guard = self.inner.mito_engine.read().unwrap();
+ guard.as_ref().cloned().context(UnexpectedSnafu {
+ violated: "mito engine not available",
+ })?
+ };
+
+ let entries = mito.all_region_infos().await;
+ let schema = RegionInfoEntry::schema().arrow_schema().clone();
+ let batch = RegionInfoEntry::to_record_batch(&entries)
+ .map_err(DataFusionError::from)
+ .context(DataFusionSnafu)?;
+
+ let table = MemTable::try_new(schema, vec![vec![batch]]).context(DataFusionSnafu)?;
+ Ok(Arc::new(table))
+ }
}
/// A catalog list that resolves `TableProvider` by table name:
@@ -347,6 +372,7 @@ mod tests {
use datatypes::arrow::array::Int32Array;
use datatypes::arrow::datatypes::{DataType, Field, Schema};
use datatypes::arrow::record_batch::RecordBatch;
+ use store_api::region_info::RegionInfoEntry;
use super::*; // bring rewrite() into scope
@@ -409,6 +435,18 @@ mod tests {
b3.reserved_table_needed,
vec![InternalTableKind::InspectSstManifest]
);
+
+ let region_info = RegionInfoEntry::reserved_table_name_for_inspection();
+ let plan4 = table_scan(Some(region_info), &schema, None)
+ .unwrap()
+ .build()
+ .unwrap();
+ let b4 = NameAwareDataSourceInjectorBuilder::from_plan(&plan4).unwrap();
+ assert!(!b4.need_region_provider);
+ assert_eq!(
+ b4.reserved_table_needed,
+ vec![InternalTableKind::InspectRegionInfo]
+ );
}
#[test]
@@ -445,6 +483,39 @@ mod tests {
}
}
+ #[test]
+ fn test_rewriter_replaces_with_region_info_reserved_source() {
+ let schema = test_schema();
+ let table_name = RegionInfoEntry::reserved_table_name_for_inspection();
+ let plan = table_scan(Some(table_name), &schema, None)
+ .unwrap()
+ .build()
+ .unwrap();
+
+ let provider = empty_mem_table();
+ let source = provider_as_source(provider);
+
+ let mut injector = NameAwareDataSourceInjector {
+ reserved_sources: {
+ let mut m = HashMap::new();
+ m.insert(InternalTableKind::InspectRegionInfo, source.clone());
+ m
+ },
+ region_source: None,
+ };
+
+ let transformed = plan.rewrite(&mut injector).unwrap();
+ let new_plan = transformed.data;
+
+ if let LogicalPlan::TableScan(scan) = new_plan {
+ let src_ptr = Arc::as_ptr(&scan.source);
+ let want_ptr = Arc::as_ptr(&source);
+ assert!(std::ptr::eq(src_ptr, want_ptr));
+ } else {
+ panic!("expected TableScan after rewrite");
+ }
+ }
+
#[test]
fn test_rewriter_replaces_with_region_source_for_normal() {
let schema = test_schema();
diff --git a/src/datanode/src/utils.rs b/src/datanode/src/utils.rs
index 488ddacdf0..c5cd008c28 100644
--- a/src/datanode/src/utils.rs
+++ b/src/datanode/src/utils.rs
@@ -29,10 +29,28 @@ use tracing::info;
use crate::error::{GetMetadataSnafu, Result};
/// The requests to open regions.
-pub(crate) struct RegionOpenRequests {
- pub leader_regions: Vec<(RegionId, RegionOpenRequest)>,
+pub struct RegionOpenRequests {
+ pub(crate) leader_regions: Vec<(RegionId, RegionOpenRequest)>,
#[cfg(feature = "enterprise")]
- pub follower_regions: Vec<(RegionId, RegionOpenRequest)>,
+ pub(crate) follower_regions: Vec<(RegionId, RegionOpenRequest)>,
+}
+
+impl RegionOpenRequests {
+ /// Splits the request set into leader and follower regions.
+ #[allow(clippy::type_complexity)]
+ pub fn into_parts(
+ self,
+ ) -> (
+ Vec<(RegionId, RegionOpenRequest)>,
+ Vec<(RegionId, RegionOpenRequest)>,
+ ) {
+ let leader_regions = self.leader_regions;
+ #[cfg(feature = "enterprise")]
+ let follower_regions = self.follower_regions;
+ #[cfg(not(feature = "enterprise"))]
+ let follower_regions = Vec::new();
+ (leader_regions, follower_regions)
+ }
}
fn group_region_by_topic(
@@ -58,7 +76,8 @@ fn get_replay_checkpoint(
})
}
-pub(crate) async fn build_region_open_requests(
+/// Builds region-open requests from persisted metadata.
+pub async fn build_region_open_requests(
node_id: DatanodeId,
kv_backend: KvBackendRef,
) -> Result {
diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs
index db657abbcb..33104084ad 100644
--- a/src/datatypes/src/json.rs
+++ b/src/datatypes/src/json.rs
@@ -26,12 +26,12 @@ use std::sync::Arc;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value as Json};
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ResultExt};
use crate::error::{self, InvalidJsonSnafu, Result, SerializeSnafu};
use crate::json::value::{JsonValue, JsonVariant};
use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType};
-use crate::types::{StructField, StructType};
+use crate::types::{JsonType, StructField, StructType};
use crate::value::{ListValue, StructValue, Value};
/// The configuration of JSON encoding
@@ -305,33 +305,47 @@ fn encode_json_array_with_context<'a>(
) -> Result {
let json_array_len = json_array.len();
let mut items = Vec::with_capacity(json_array_len);
- let mut element_type = item_type.cloned();
for (index, value) in json_array.into_iter().enumerate() {
let array_context = context.with_key(&index.to_string());
- let item_value =
- encode_json_value_with_context(value, element_type.as_ref(), &array_context)?;
- let item_type = item_value.json_type().native_type().clone();
- items.push(item_value.into_variant());
-
- // Determine the common type for the list
- if let Some(current_type) = &element_type {
- // It's valid for json array to have different types of items, for example,
- // ["a string", 1]. However, the `JsonValue` will be converted to Arrow list array,
- // which requires all items have exactly same type. So we forbid the different types
- // case here. Besides, it's not common for items in a json array to differ. So I think
- // we are good here.
- ensure!(
- item_type == *current_type,
- error::InvalidJsonSnafu {
- value: "all items in json array must have the same type"
- }
- );
- } else {
- element_type = Some(item_type);
- }
+ let item_value = encode_json_value_with_context(value, None, &array_context)?;
+ items.push(item_value);
}
+ // Per the JSON specification, an array may contain items of different types, for example,
+ // ["a string", 1]. However, in this implementation the `JsonValue` is converted to an Arrow list
+ // array, which requires all items to have exactly the same type. So we merge the possibly
+ // different item types into one unified type, and align all the item values to it.
+
+ let provided_item_type = item_type.map(|x| JsonType::new_json2(x.clone()));
+ let merged_item_type = if let Some((first, rests)) = items.split_first() {
+ let mut merged = first.json_type().clone();
+ for rest in rests.iter().map(|x| x.json_type()) {
+ if matches!(merged.native_type(), JsonNativeType::Variant) {
+ break;
+ }
+ merged.merge(rest)?;
+ }
+ Some(merged)
+ } else {
+ None
+ };
+ let unified_item_type = match (provided_item_type, merged_item_type) {
+ (Some(mut x), Some(y)) => {
+ x.merge(&y)?;
+ Some(x)
+ }
+ (x, y) => x.or(y),
+ };
+ if let Some(unified_item_type) = unified_item_type {
+ for item in &mut items {
+ item.try_align(&unified_item_type)?;
+ }
+ }
+ let items = items
+ .into_iter()
+ .map(|x| x.into_variant())
+ .collect::>();
Ok(JsonValue::new(JsonVariant::Array(items)))
}
@@ -1050,11 +1064,8 @@ mod tests {
fn test_encode_json_array_mixed_types() {
let json = json!([1, "hello", true, 3.15]);
let settings = JsonStructureSettings::Structured(None);
- let result = settings.encode_with_type(json, None);
- assert_eq!(
- result.unwrap_err().to_string(),
- "Invalid JSON: all items in json array must have the same type"
- );
+ let value = settings.encode_with_type(json, None).unwrap();
+ assert_eq!(value.data_type().to_string(), r#"Json2[""]"#);
}
#[test]
@@ -1276,12 +1287,12 @@ mod tests {
#[test]
fn test_encode_json_array_with_item_type() {
let json = json!([1, 2, 3]);
- let item_type = Arc::new(ConcreteDataType::uint64_datatype());
+ let item_type = Arc::new(ConcreteDataType::int64_datatype());
let settings = JsonStructureSettings::Structured(None);
let result = settings
.encode_with_type(
json,
- Some(&JsonNativeType::Array(Box::new(JsonNativeType::u64()))),
+ Some(&JsonNativeType::Array(Box::new(JsonNativeType::i64()))),
)
.unwrap()
.into_json_inner()
@@ -1289,9 +1300,9 @@ mod tests {
if let Value::List(list_value) = result {
assert_eq!(list_value.items().len(), 3);
- assert_eq!(list_value.items()[0], Value::UInt64(1));
- assert_eq!(list_value.items()[1], Value::UInt64(2));
- assert_eq!(list_value.items()[2], Value::UInt64(3));
+ assert_eq!(list_value.items()[0], Value::Int64(1));
+ assert_eq!(list_value.items()[1], Value::Int64(2));
+ assert_eq!(list_value.items()[2], Value::Int64(3));
assert_eq!(list_value.datatype(), item_type);
} else {
panic!("Expected List value");
@@ -2249,10 +2260,10 @@ mod tests {
)])),
);
- let decoded_struct = settings.decode_struct(array_struct);
+ let decoded_struct = settings.decode_struct(array_struct).unwrap();
assert_eq!(
- decoded_struct.unwrap_err().to_string(),
- "Invalid JSON: all items in json array must have the same type"
+ format!("{decoded_struct:?}"),
+ r#"StructValue { items: [List(ListValue { items: [Binary(Bytes(b"1")), Binary(Bytes(b"\"hello\"")), Binary(Bytes(b"true")), Binary(Bytes(b"3.15"))], datatype: Binary(BinaryType { repr_type: Binary }) })], fields: StructType { fields: [StructField { name: "value", data_type: List(ListType { item_type: Binary(BinaryType { repr_type: Binary }) }), nullable: true, metadata: {} }] } }"#
);
}
diff --git a/src/datatypes/src/json/value.rs b/src/datatypes/src/json/value.rs
index f3b652a549..4350630003 100644
--- a/src/datatypes/src/json/value.rs
+++ b/src/datatypes/src/json/value.rs
@@ -65,6 +65,14 @@ impl JsonNumber {
JsonNumber::Float(n) => n.0,
}
}
+
+ fn native_type(&self) -> JsonNativeType {
+ match self {
+ JsonNumber::PosInt(_) => JsonNativeType::u64(),
+ JsonNumber::NegInt(_) => JsonNativeType::i64(),
+ JsonNumber::Float(_) => JsonNativeType::f64(),
+ }
+ }
}
impl From for JsonNumber {
@@ -147,26 +155,14 @@ impl JsonVariant {
match self {
JsonVariant::Null => JsonNativeType::Null,
JsonVariant::Bool(_) => JsonNativeType::Bool,
- JsonVariant::Number(n) => match n {
- JsonNumber::PosInt(_) => JsonNativeType::u64(),
- JsonNumber::NegInt(_) => JsonNativeType::i64(),
- JsonNumber::Float(_) => JsonNativeType::f64(),
- },
+ JsonVariant::Number(n) => n.native_type(),
JsonVariant::String(_) => JsonNativeType::String,
JsonVariant::Array(array) => {
- let item_type = if let Some(first) = array.first() {
- first.native_type()
- } else {
- JsonNativeType::Null
- };
- JsonNativeType::Array(Box::new(item_type))
+ json_array_native_type(array.iter().map(JsonVariant::native_type))
+ }
+ JsonVariant::Object(object) => {
+ json_object_native_type(object.iter().map(|(k, v)| (k, v.native_type())))
}
- JsonVariant::Object(object) => JsonNativeType::Object(
- object
- .iter()
- .map(|(k, v)| (k.clone(), v.native_type()))
- .collect(),
- ),
JsonVariant::Variant(_) => JsonNativeType::Variant,
}
}
@@ -469,6 +465,7 @@ impl JsonValue {
.collect::>()?,
),
+ (JsonVariant::Object(kvs), _) if kvs.is_empty() => JsonVariant::Null,
(JsonVariant::Object(mut kvs), JsonNativeType::Object(expected)) => {
ensure!(
expected.keys().len() >= kvs.keys().len()
@@ -517,7 +514,7 @@ impl JsonValue {
let x = std::mem::take(&mut self.json_variant);
self.json_variant = helper(x, expected.native_type())?;
- self.json_type = OnceLock::from(expected.clone());
+ self.json_type = OnceLock::new();
Ok(())
}
}
@@ -623,35 +620,55 @@ pub enum JsonVariantRef<'a> {
}
impl JsonVariantRef<'_> {
- fn json_type(&self) -> JsonType {
- fn native_type(v: &JsonVariantRef<'_>) -> JsonNativeType {
- match v {
- JsonVariantRef::Null => JsonNativeType::Null,
- JsonVariantRef::Bool(_) => JsonNativeType::Bool,
- JsonVariantRef::Number(n) => match n {
- JsonNumber::PosInt(_) => JsonNativeType::u64(),
- JsonNumber::NegInt(_) => JsonNativeType::i64(),
- JsonNumber::Float(_) => JsonNativeType::f64(),
- },
- JsonVariantRef::String(_) => JsonNativeType::String,
- JsonVariantRef::Array(array) => {
- let item_type = if let Some(first) = array.first() {
- native_type(first)
- } else {
- JsonNativeType::Null
- };
- JsonNativeType::Array(Box::new(item_type))
- }
- JsonVariantRef::Object(object) => JsonNativeType::Object(
- object
- .iter()
- .map(|(k, v)| (k.to_string(), native_type(v)))
- .collect(),
- ),
- JsonVariantRef::Variant(_) => JsonNativeType::Variant,
+ fn native_type(&self) -> JsonNativeType {
+ match self {
+ JsonVariantRef::Null => JsonNativeType::Null,
+ JsonVariantRef::Bool(_) => JsonNativeType::Bool,
+ JsonVariantRef::Number(n) => n.native_type(),
+ JsonVariantRef::String(_) => JsonNativeType::String,
+ JsonVariantRef::Array(array) => {
+ json_array_native_type(array.iter().map(JsonVariantRef::native_type))
}
+ JsonVariantRef::Object(object) => {
+ json_object_native_type(object.iter().map(|(k, v)| (*k, v.native_type())))
+ }
+ JsonVariantRef::Variant(_) => JsonNativeType::Variant,
}
- JsonType::new_json2(native_type(self))
+ }
+
+ fn json_type(&self) -> JsonType {
+ JsonType::new_json2(self.native_type())
+ }
+}
+
+fn json_array_native_type(items: I) -> JsonNativeType
+where
+ I: IntoIterator- ,
+{
+ let mut iter = items.into_iter();
+ let mut item_type = match iter.next() {
+ Some(t) => t,
+ None => return JsonNativeType::Array(Box::new(JsonNativeType::Null)),
+ };
+ for x in iter {
+ if matches!(item_type, JsonNativeType::Variant) {
+ break;
+ }
+ item_type.merge(&x);
+ }
+ JsonNativeType::Array(Box::new(item_type))
+}
+
+fn json_object_native_type
(fields: I) -> JsonNativeType
+where
+ I: IntoIterator- ,
+ K: Into
,
+{
+ let mut fields = fields.into_iter().peekable();
+ if fields.peek().is_none() {
+ JsonNativeType::Null
+ } else {
+ JsonNativeType::Object(fields.map(|(k, v)| (k.into(), v)).collect())
}
}
@@ -941,7 +958,6 @@ mod tests {
("name".to_string(), JsonVariant::Null),
])))
);
- assert_eq!(value.json_type(), &expected);
// Object alignment should fail if the expected type misses any field from the value.
let expected = JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([(
diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs
index 362357c5e6..652847da43 100644
--- a/src/datatypes/src/types/json_type.rs
+++ b/src/datatypes/src/types/json_type.rs
@@ -115,6 +115,14 @@ impl JsonNativeType {
(JsonNativeType::Null, that) => that.clone(),
(this, JsonNativeType::Null) => this,
(this, that) if this == *that => this,
+
+ (JsonNativeType::Number(x), JsonNativeType::Number(y)) => {
+ JsonNativeType::Number(match (x, y) {
+ (x, y) if x == *y => x,
+ (JsonNumberType::F64, _) | (_, JsonNumberType::F64) => JsonNumberType::F64,
+ _ => JsonNumberType::I64,
+ })
+ }
_ => JsonNativeType::Variant,
};
}
@@ -128,7 +136,7 @@ impl JsonNativeType {
JsonNumberType::I64 => ArrowDataType::Int64,
JsonNumberType::F64 => ArrowDataType::Float64,
},
- JsonNativeType::String => ArrowDataType::Utf8,
+ JsonNativeType::String => ArrowDataType::Utf8View,
JsonNativeType::Array(array) => {
ArrowDataType::List(Arc::new(Field::new("item", array.as_arrow_type(), true)))
}
@@ -822,7 +830,7 @@ mod tests {
test(
"1.5",
&mut JsonType::new_json2(JsonNativeType::i64()),
- Ok(r#""""#),
+ Ok(r#""""#),
)?;
// Object merge should preserve existing fields and append missing fields.
diff --git a/src/datatypes/src/vectors/json/array.rs b/src/datatypes/src/vectors/json/array.rs
index 75779821c5..b3bd24cd98 100644
--- a/src/datatypes/src/vectors/json/array.rs
+++ b/src/datatypes/src/vectors/json/array.rs
@@ -17,16 +17,24 @@ use std::sync::Arc;
use arrow::compute;
use arrow::util::display::{ArrayFormatter, FormatOptions};
+use arrow_array::builder::{
+ ArrayBuilder, BooleanBuilder, Float64Builder, Int64Builder, NullBuilder, StringViewBuilder,
+ make_builder,
+};
use arrow_array::cast::AsArray;
use arrow_array::types::{Float64Type, Int64Type, UInt64Type};
use arrow_array::{Array, ArrayRef, GenericListArray, ListArray, StructArray, new_null_array};
use arrow_schema::{DataType, FieldRef};
+use common_telemetry::debug;
use serde_json::Value;
use snafu::{OptionExt, ResultExt};
-use crate::arrow_array::{StringArray, binary_array_value, string_array_value};
+use crate::arrow_array::{
+ MutableBinaryArray, StringViewArray, binary_array_value, string_array_value,
+};
use crate::error::{
- AlignJsonArraySnafu, ArrowComputeSnafu, DeserializeSnafu, InvalidJsonSnafu, Result,
+ AlignJsonArraySnafu, ArrowComputeSnafu, CastTypeSnafu, DeserializeSnafu, InvalidJsonSnafu,
+ Result, SerializeSnafu,
};
pub struct JsonArray<'a> {
@@ -101,6 +109,12 @@ impl JsonArray<'_> {
return Ok(self.inner.clone());
}
+ debug!(
+ "Try aligning JSON array {} to data type {}",
+ self.inner.data_type(),
+ expect
+ );
+
let struct_array = self.inner.as_struct_opt().context(AlignJsonArraySnafu {
reason: "expect struct array",
})?;
@@ -178,11 +192,23 @@ impl JsonArray<'_> {
}
fn try_cast(&self, to_type: &DataType) -> Result {
- if compute::can_cast_types(self.inner.data_type(), to_type) {
+ let from_type = self.inner.data_type();
+ if from_type == to_type {
+ return Ok(self.inner.clone());
+ }
+
+ if from_type.is_binary() && !to_type.is_binary() {
+ return self.decode_variant(to_type);
+ }
+
+ if !from_type.is_binary() && to_type.is_binary() {
+ return self.encode_variant();
+ }
+
+ if compute::can_cast_types(from_type, to_type) {
return compute::cast(&self.inner, to_type).context(ArrowComputeSnafu);
}
- // TODO(LFC): Cast according to `to_type` instead of formatting to String here.
let formatter = ArrayFormatter::try_new(&self.inner, &FormatOptions::default())
.context(ArrowComputeSnafu)?;
let values = (0..self.inner.len())
@@ -192,7 +218,91 @@ impl JsonArray<'_> {
.then(|| formatter.value(i).to_string())
})
.collect::>();
- Ok(Arc::new(StringArray::from(values)))
+ Ok(Arc::new(StringViewArray::from(values)))
+ }
+
+ fn encode_variant(&self) -> Result {
+ let len = self.inner.len();
+ let mut encoded = Vec::with_capacity(len);
+ let mut total_bytes = 0;
+
+ for i in 0..len {
+ let value = self.try_get_value(i)?;
+ if value.is_null() {
+ encoded.push(None);
+ } else {
+ let bytes = serde_json::to_vec(&value).context(SerializeSnafu)?;
+ total_bytes += bytes.len();
+ encoded.push(Some(bytes));
+ }
+ }
+
+ let mut builder = MutableBinaryArray::with_capacity(len, total_bytes);
+ for value in encoded {
+ builder.append_option(value);
+ }
+ Ok(Arc::new(builder.finish()))
+ }
+
+ fn decode_variant(&self, to_type: &DataType) -> Result {
+ fn downcast_builder<'a, T: ArrayBuilder>(
+ builder: &'a mut dyn ArrayBuilder,
+ to_type: &DataType,
+ ) -> Result<&'a mut T> {
+ builder
+ .as_any_mut()
+ .downcast_mut::()
+ .with_context(|| CastTypeSnafu {
+ msg: format!("Expect ArrayBuilder is of type {to_type}"),
+ })
+ }
+
+ let mut builder = make_builder(to_type, self.inner.len());
+ if to_type.is_null() {
+ downcast_builder::(builder.as_mut(), to_type)?
+ .append_nulls(self.inner.len());
+ } else {
+ match to_type {
+ DataType::Boolean => {
+ let b = downcast_builder::(builder.as_mut(), to_type)?;
+ for i in 0..self.inner.len() {
+ b.append_option(self.try_get_value(i)?.as_bool());
+ }
+ }
+ DataType::Int64 => {
+ let b = downcast_builder::(builder.as_mut(), to_type)?;
+ for i in 0..self.inner.len() {
+ b.append_option(self.try_get_value(i)?.as_i64());
+ }
+ }
+ DataType::Float64 => {
+ let b = downcast_builder::(builder.as_mut(), to_type)?;
+ for i in 0..self.inner.len() {
+ b.append_option(self.try_get_value(i)?.as_f64());
+ }
+ }
+ DataType::Utf8View => {
+ let b = downcast_builder::(builder.as_mut(), to_type)?;
+ for i in 0..self.inner.len() {
+ let v = self.try_get_value(i)?;
+ if v.is_null() {
+ b.append_null();
+ } else if let Some(s) = v.as_str() {
+ b.append_value(s);
+ } else {
+ b.append_value(v.to_string());
+ }
+ }
+ }
+ _ => {
+ return CastTypeSnafu {
+ msg: format!("Cannot cast JSON value to {to_type}"),
+ }
+ .fail();
+ }
+ }
+ }
+ Ok(builder.finish())
}
}
@@ -231,7 +341,9 @@ impl<'a> From<&'a ArrayRef> for JsonArray<'a> {
#[cfg(test)]
mod test {
use arrow_array::types::Int64Type;
- use arrow_array::{BinaryArray, BooleanArray, Float64Array, Int32Array, Int64Array, ListArray};
+ use arrow_array::{
+ BinaryArray, BooleanArray, Float64Array, Int32Array, Int64Array, ListArray, StringArray,
+ };
use arrow_schema::{Field, Fields};
use serde_json::json;
diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs
index be79a921c7..7ca1ff2f6a 100644
--- a/src/datatypes/src/vectors/json/builder.rs
+++ b/src/datatypes/src/vectors/json/builder.rs
@@ -89,7 +89,9 @@ impl MutableVector for JsonVectorBuilder {
.fail();
};
let json_type = value.json_type();
- self.merged_type.merge(json_type)?;
+ if !self.merged_type.is_include(json_type) {
+ self.merged_type.merge(json_type)?;
+ }
let value = JsonValue::new(JsonVariant::from(value.variant().clone()));
self.values.push(value);
diff --git a/src/flow/src/adapter/flownode_impl.rs b/src/flow/src/adapter/flownode_impl.rs
index 53a3265d7d..f4ca149f1a 100644
--- a/src/flow/src/adapter/flownode_impl.rs
+++ b/src/flow/src/adapter/flownode_impl.rs
@@ -465,6 +465,11 @@ impl FlowDualEngine {
Ok(())
}
+ /// Reconciles in-memory flow tasks from persisted metadata.
+ ///
+ /// Thin public entry point: it forwards to `check_flow_consistent(true, true)`
+ /// and returns that call's result unchanged.
+ // NOTE(review): the meaning of the two `true` flags is defined by
+ // `check_flow_consistent`, which is not visible in this hunk — confirm there.
+ pub async fn reconcile_flows_from_metadata(&self) -> Result<(), Error> {
+ self.check_flow_consistent(true, true).await
+ }
+
/// TODO(discord9): also add a `exists` api using flow metadata manager's `exists` method
async fn flow_exist_in_metadata(&self, flow_id: FlowId) -> Result {
self.flow_metadata_manager
diff --git a/src/flow/src/batching_mode.rs b/src/flow/src/batching_mode.rs
index 4162daa20c..580762a142 100644
--- a/src/flow/src/batching_mode.rs
+++ b/src/flow/src/batching_mode.rs
@@ -20,12 +20,15 @@ use common_grpc::channel_manager::ClientTlsOption;
use serde::{Deserialize, Serialize};
use session::ReadPreference;
+mod checkpoint;
pub(crate) mod engine;
pub(crate) mod frontend_client;
+mod incremental_filter;
mod state;
+mod table_creator;
mod task;
mod time_window;
-mod utils;
+pub(crate) mod utils;
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BatchingModeOptions {
diff --git a/src/flow/src/batching_mode/checkpoint.rs b/src/flow/src/batching_mode/checkpoint.rs
new file mode 100644
index 0000000000..7341d3d9e7
--- /dev/null
+++ b/src/flow/src/batching_mode/checkpoint.rs
@@ -0,0 +1,127 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::batching_mode::state::CheckpointMode;
+
+/// Decision label returned by `FlowCheckpointDecision::decision_label` for the
+/// two "advanced" decisions.
+pub(super) const CHECKPOINT_DECISION_ADVANCE: &str = "advance";
+/// Decision label returned by `FlowCheckpointDecision::decision_label` for a
+/// fallback to full snapshot.
+pub(super) const CHECKPOINT_DECISION_FALLBACK: &str = "fallback";
+/// Reason label returned by `FlowCheckpointDecision::reason_label` for
+/// decisions that carry no fallback reason.
+pub(super) const CHECKPOINT_REASON_NONE: &str = "none";
+
+/// Reason a Flow task dropped back to full snapshot mode.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum FlowQueryFallbackReason {
+    /// No region-watermark map was present in the query result.
+    MissingRegionWatermark,
+    /// At least one participating region could not prove safe advancement
+    /// against both the returned watermarks and the checkpoint map.
+    IncompleteRegionWatermark,
+    /// Only part of the dirty backlog was covered by the query, so global
+    /// checkpoints must not advance yet. Incremental SQL drains every dirty
+    /// window before advancing checkpoints; this mainly guards scoped
+    /// full-snapshot runs that are capped by the per-query dirty-window limit.
+    DirtyBacklogPending,
+    /// A stale incremental cursor was detected by the datanode, so the Flow
+    /// has to recompute from scratch.
+    StaleCursor,
+    /// The query failed for a reason other than a stale cursor; the Flow
+    /// resets to full snapshot to avoid cascading errors.
+    IncrementalQueryFailure,
+    /// Incremental mode is permanently off for this Flow (for example because
+    /// the query shape is not incrementally safe).
+    IncrementalDisabled,
+}
+
+impl FlowQueryFallbackReason {
+    /// Returns the snake_case label string for this reason.
+    pub(super) fn as_label(self) -> &'static str {
+        let label = match self {
+            Self::DirtyBacklogPending => "dirty_backlog_pending",
+            Self::IncompleteRegionWatermark => "incomplete_region_watermark",
+            Self::IncrementalDisabled => "incremental_disabled",
+            Self::IncrementalQueryFailure => "incremental_query_failure",
+            Self::MissingRegionWatermark => "missing_region_watermark",
+            Self::StaleCursor => "stale_cursor",
+        };
+        label
+    }
+}
+
+/// Outcome recorded by `BatchingTask::apply_query_result_to_state` after every
+/// Flow query execution: either the checkpoint state moved forward, or the
+/// task dropped back to full snapshot (together with the reason).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum FlowCheckpointDecision {
+    /// FullSnapshot → Incremental transition.
+    ///
+    /// Every participating region was exercised by the query and returned a
+    /// valid watermark, and the checkpoint map was built from scratch. Later
+    /// executions will use incremental after-seqs.
+    AdvancedFromFullSnapshot {
+        participating_regions: usize,
+        watermarks: usize,
+    },
+    /// Incremental → Incremental (advancement in place).
+    ///
+    /// Some of the participating regions moved their watermarks forward. The
+    /// task remains in incremental mode with an updated checkpoint map.
+    AdvancedIncremental {
+        participating_regions: usize,
+        watermarks: usize,
+    },
+    /// Any mode → FullSnapshot.
+    ///
+    /// Raised when watermark information was incomplete, a participating
+    /// region was missing from the existing checkpoint map, incremental mode
+    /// is permanently disabled for the task, or the query itself failed. The
+    /// next execution runs with full snapshot semantics.
+    FallbackToFullSnapshot {
+        previous_mode: CheckpointMode,
+        reason: FlowQueryFallbackReason,
+    },
+}
+
+impl FlowCheckpointDecision {
+    /// Returns the checkpoint-mode label associated with this decision:
+    /// full snapshot for `AdvancedFromFullSnapshot`, incremental for
+    /// `AdvancedIncremental`, and the recorded `previous_mode` for a fallback.
+    pub(super) fn mode_label(self) -> &'static str {
+        let mode = match self {
+            Self::AdvancedFromFullSnapshot { .. } => CheckpointMode::FullSnapshot,
+            Self::AdvancedIncremental { .. } => CheckpointMode::Incremental,
+            Self::FallbackToFullSnapshot { previous_mode, .. } => previous_mode,
+        };
+        checkpoint_mode_label(mode)
+    }
+
+    /// Returns `CHECKPOINT_DECISION_ADVANCE` for both "advanced" variants and
+    /// `CHECKPOINT_DECISION_FALLBACK` for a fallback.
+    pub(super) fn decision_label(self) -> &'static str {
+        match self {
+            Self::FallbackToFullSnapshot { .. } => CHECKPOINT_DECISION_FALLBACK,
+            Self::AdvancedFromFullSnapshot { .. } => CHECKPOINT_DECISION_ADVANCE,
+            Self::AdvancedIncremental { .. } => CHECKPOINT_DECISION_ADVANCE,
+        }
+    }
+
+    /// Returns the fallback reason's label, or `CHECKPOINT_REASON_NONE` when
+    /// the decision is not a fallback.
+    pub(super) fn reason_label(self) -> &'static str {
+        if let Self::FallbackToFullSnapshot { reason, .. } = self {
+            reason.as_label()
+        } else {
+            CHECKPOINT_REASON_NONE
+        }
+    }
+}
+
+/// Maps a `CheckpointMode` to its snake_case label string.
+pub(super) fn checkpoint_mode_label(mode: CheckpointMode) -> &'static str {
+    let label = match mode {
+        CheckpointMode::Incremental => "incremental",
+        CheckpointMode::FullSnapshot => "full_snapshot",
+    };
+    label
+}
diff --git a/src/flow/src/batching_mode/engine.rs b/src/flow/src/batching_mode/engine.rs
index 054f5db9d6..f37e54d80b 100644
--- a/src/flow/src/batching_mode/engine.rs
+++ b/src/flow/src/batching_mode/engine.rs
@@ -59,8 +59,7 @@ use crate::{CreateFlowArgs, Error, FlowId, TableName};
///
/// TODO(discord9): determine how to configure refresh rate
pub struct BatchingEngine {
- tasks: RwLock>,
- shutdown_txs: RwLock>>,
+ runtime: RwLock,
/// frontend client for insert request
pub(crate) frontend_client: Arc,
flow_metadata_manager: FlowMetadataManagerRef,
@@ -72,6 +71,51 @@ pub struct BatchingEngine {
pub(crate) batch_opts: Arc,
}
+/// In-memory registry of the batching flows known to the engine.
+///
+/// Each flow's task and its shutdown sender live side by side so that both can
+/// be inserted and removed through a single `&mut` access to the registry.
+#[derive(Default)]
+struct FlowRuntimeRegistry {
+    /// Registered task, keyed by flow id.
+    tasks: BTreeMap<FlowId, BatchingTask>,
+    /// Sender used to signal shutdown to the flow's task, keyed by flow id.
+    shutdown_txs: BTreeMap<FlowId, oneshot::Sender<()>>,
+}
+
+impl FlowRuntimeRegistry {
+    /// Registers `task` and `shutdown_tx` under `flow_id`.
+    ///
+    /// Returns whatever task and shutdown sender were previously stored for
+    /// that id, so the caller can shut the replaced flow down.
+    fn insert(
+        &mut self,
+        flow_id: FlowId,
+        task: BatchingTask,
+        shutdown_tx: oneshot::Sender<()>,
+    ) -> (Option<BatchingTask>, Option<oneshot::Sender<()>>) {
+        (
+            self.tasks.insert(flow_id, task),
+            self.shutdown_txs.insert(flow_id, shutdown_tx),
+        )
+    }
+
+    /// Removes the task registered under `flow_id` together with its shutdown
+    /// sender.
+    ///
+    /// Returns `None` when no task is registered; in that case the shutdown
+    /// sender map is not touched. The sender half of the result is `None` when
+    /// a task existed without a matching sender.
+    fn remove(&mut self, flow_id: FlowId) -> Option<(BatchingTask, Option<oneshot::Sender<()>>)> {
+        let task = self.tasks.remove(&flow_id)?;
+        let shutdown_tx = self.shutdown_txs.remove(&flow_id);
+        Some((task, shutdown_tx))
+    }
+
+    /// Removes the entry for `flow_id` only if the registered task shares its
+    /// `state` allocation with `task` (checked with `Arc::ptr_eq`).
+    ///
+    /// Returns `(None, None)` when nothing is registered or when a different
+    /// task instance currently owns the id.
+    fn remove_if_current(
+        &mut self,
+        flow_id: FlowId,
+        task: &BatchingTask,
+    ) -> (Option<BatchingTask>, Option<oneshot::Sender<()>>) {
+        let is_current = self
+            .tasks
+            .get(&flow_id)
+            .is_some_and(|current| Arc::ptr_eq(&current.state, &task.state));
+        if !is_current {
+            return (None, None);
+        }
+
+        match self.remove(flow_id) {
+            Some((removed_task, removed_shutdown_tx)) => (Some(removed_task), removed_shutdown_tx),
+            None => (None, None),
+        }
+    }
+}
+
impl BatchingEngine {
pub fn new(
frontend_client: Arc,
@@ -82,8 +126,7 @@ impl BatchingEngine {
batch_opts: BatchingModeOptions,
) -> Self {
Self {
- tasks: Default::default(),
- shutdown_txs: Default::default(),
+ runtime: Default::default(),
frontend_client,
flow_metadata_manager,
table_meta,
@@ -95,8 +138,9 @@ impl BatchingEngine {
/// Returns last execution timestamps (millisecond) for all batching flows.
pub async fn get_last_exec_time_map(&self) -> BTreeMap {
- let tasks = self.tasks.read().await;
- tasks
+ let runtime = self.runtime.read().await;
+ runtime
+ .tasks
.iter()
.filter_map(|(flow_id, task)| {
task.last_execution_time_millis()
@@ -151,10 +195,17 @@ impl BatchingEngine {
let group_by_table_name = Arc::new(group_by_table_name);
+ let tasks = self
+ .runtime
+ .read()
+ .await
+ .tasks
+ .values()
+ .cloned()
+ .collect::>();
let mut handles = Vec::new();
- let tasks = self.tasks.read().await;
- for (_flow_id, task) in tasks.iter() {
+ for task in tasks {
let src_table_names = &task.config.source_table_names;
if src_table_names
@@ -204,7 +255,6 @@ impl BatchingEngine {
});
handles.push(handle);
}
- drop(tasks);
for handle in handles {
match handle.await {
Err(e) => {
@@ -274,9 +324,16 @@ impl BatchingEngine {
let group_by_table_name = Arc::new(group_by_table_name);
+ let tasks = self
+ .runtime
+ .read()
+ .await
+ .tasks
+ .values()
+ .cloned()
+ .collect::>();
let mut handles = Vec::new();
- let tasks = self.tasks.read().await;
- for (_flow_id, task) in tasks.iter() {
+ for task in tasks {
let src_table_names = &task.config.source_table_names;
if src_table_names
@@ -327,8 +384,6 @@ impl BatchingEngine {
}
}
}
- drop(tasks);
-
Ok(())
}
}
@@ -390,7 +445,7 @@ impl BatchingEngine {
// or replace logic
{
- let is_exist = self.tasks.read().await.contains_key(&flow_id);
+ let is_exist = self.runtime.read().await.tasks.contains_key(&flow_id);
match (create_if_not_exists, or_replace, is_exist) {
// if replace, ignore that old flow exists
(_, true, true) => {
@@ -521,17 +576,60 @@ impl BatchingEngine {
// check execute once first to detect any error early
task.check_or_create_sink_table(&engine, &frontend).await?;
+ let (start_tx, start_rx) = oneshot::channel();
+
// TODO(discord9): use time wheel or what for better
let handle = common_runtime::spawn_global(async move {
- task_inner.start_executing_loop(engine, frontend).await;
+ if start_rx.await.is_ok() {
+ task_inner.start_executing_loop(engine, frontend).await;
+ }
});
task.state.write().unwrap().task_handle = Some(handle);
+ let task_for_rollback = task.clone();
- // only replace here not earlier because we want the old one intact if something went wrong before this line
- let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
- drop(replaced_old_task_opt);
+ // Only replace here, not earlier, because we want the old one intact if
+ // something went wrong before this line. Keep the task and shutdown
+ // sender in one registry lock so create/remove can't observe one
+ // without the other.
+ let (replaced_old_task_opt, replaced_old_shutdown_tx) = {
+ let mut runtime = self.runtime.write().await;
- self.shutdown_txs.write().await.insert(flow_id, tx);
+ let is_exist = runtime.tasks.contains_key(&flow_id);
+ match (create_if_not_exists, or_replace, is_exist) {
+ (_, true, true) => {
+ info!(
+ "Replacing flow with id={} after final registry check",
+ flow_id
+ );
+ }
+ (false, false, true) => {
+ abort_flow_task(flow_id, Some(task), "unregistered");
+ return FlowAlreadyExistSnafu { id: flow_id }.fail();
+ }
+ (true, false, true) => {
+ info!(
+ "Flow with id={} already exists at final registry check, do nothing",
+ flow_id
+ );
+ abort_flow_task(flow_id, Some(task), "unregistered");
+ return Ok(None);
+ }
+ (_, _, false) => (),
+ }
+
+ runtime.insert(flow_id, task, tx)
+ };
+
+ notify_flow_shutdown(flow_id, replaced_old_shutdown_tx, "replaced");
+ abort_flow_task(flow_id, replaced_old_task_opt, "replaced");
+ if start_tx.send(()).is_err() {
+ self.rollback_flow_runtime_if_current(flow_id, &task_for_rollback)
+ .await;
+ UnexpectedSnafu {
+ reason: format!("Failed to start flow {flow_id} due to task already dropped"),
+ }
+ .fail()?;
+ }
Ok(Some(flow_id))
}
@@ -662,21 +760,25 @@ impl BatchingEngine {
}
+ /// Removes the flow `flow_id` from the runtime registry, signals its shutdown
+ /// channel and aborts its task handle.
+ ///
+ /// The task and its shutdown sender are taken out under one write lock; the
+ /// lock is released before the signal is sent and the handle is aborted.
+ ///
+ /// # Errors
+ /// - `FlowNotFoundSnafu` when no task is registered for `flow_id`.
+ /// - `UnexpectedSnafu` when the removed task had no shutdown sender; the task
+ ///   has already been removed and aborted by then.
pub async fn remove_flow_inner(&self, flow_id: FlowId) -> Result<(), Error> {
- if self.tasks.write().await.remove(&flow_id).is_none() {
- warn!("Flow {flow_id} not found in tasks");
- FlowNotFoundSnafu { id: flow_id }.fail()?;
- }
- let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
+ let (task, shutdown_tx) = {
+ let mut runtime = self.runtime.write().await;
+ let Some((task, shutdown_tx)) = runtime.remove(flow_id) else {
+ warn!("Flow {flow_id} not found in tasks");
+ FlowNotFoundSnafu { id: flow_id }.fail()?
+ };
+ (task, shutdown_tx)
+ };
+
+ let had_shutdown_tx = notify_flow_shutdown(flow_id, shutdown_tx, "removed");
+ abort_flow_task(flow_id, Some(task), "removed");
+
+ if !had_shutdown_tx {
UnexpectedSnafu {
reason: format!("Can't found shutdown tx for flow {flow_id}"),
}
.fail()?
- };
- if tx.send(()).is_err() {
- warn!(
- "Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?"
- )
}
+
Ok(())
}
@@ -688,7 +790,7 @@ impl BatchingEngine {
// this is only useful for the case when we are flushing the flow right after inserting data into it
// TODO(discord9): find a better way to ensure the data is ready, maybe inform flownode from frontend?
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
- let task = self.tasks.read().await.get(&flow_id).cloned();
+ let task = self.runtime.read().await.tasks.get(&flow_id).cloned();
let task = task.with_context(|| FlowNotFoundSnafu { id: flow_id })?;
let time_window_size = task
@@ -713,7 +815,7 @@ impl BatchingEngine {
)
.await?;
- let affected_rows = res.map(|(r, _)| r).unwrap_or_default() as usize;
+ let affected_rows = res.map(|(r, _)| r).unwrap_or_default();
debug!(
"Successfully flush flow {flow_id}, affected rows={}",
affected_rows
@@ -723,8 +825,46 @@ impl BatchingEngine {
/// Determine if the batching mode flow task exists with given flow id
+ ///
+ /// Only looks at the tasks held in the in-memory runtime registry.
pub async fn flow_exist_inner(&self, flow_id: FlowId) -> bool {
- self.tasks.read().await.contains_key(&flow_id)
+ self.runtime.read().await.tasks.contains_key(&flow_id)
}
+
+ /// Undoes the registration of `task` under `flow_id`, but only when `task`
+ /// is still the registered instance; whatever was removed is then signalled
+ /// to shut down and its task handle is aborted.
+ async fn rollback_flow_runtime_if_current(&self, flow_id: FlowId, task: &BatchingTask) {
+     // The write guard is a temporary of this statement, so the registry lock
+     // is released before the shutdown signal is sent and the task is aborted.
+     let (stale_task, stale_shutdown_tx) = self
+         .runtime
+         .write()
+         .await
+         .remove_if_current(flow_id, task);
+
+     notify_flow_shutdown(flow_id, stale_shutdown_tx, "rolled back");
+     abort_flow_task(flow_id, stale_task, "rolled back");
+ }
+}
+
+/// Sends the shutdown signal for flow `flow_id` when a sender is available.
+///
+/// `action` is only used to label the warning logged when the receiving side
+/// has already been dropped.
+///
+/// Returns `true` when a sender was supplied (even if the send itself failed)
+/// and `false` when `tx` is `None`.
+fn notify_flow_shutdown(flow_id: FlowId, tx: Option<oneshot::Sender<()>>, action: &str) -> bool {
+    let Some(tx) = tx else {
+        return false;
+    };
+
+    // A failed send is best-effort: it only means the receiver is gone.
+    if tx.send(()).is_err() {
+        warn!(
+            "Fail to shutdown {action} flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?"
+        );
+    }
+
+    true
+}
+
+/// Aborts the spawned handle of `task`, if the task still owns one.
+///
+/// `action` is only used to label the debug log line.
+///
+/// Returns `true` when a handle was taken and aborted, `false` when `task` is
+/// `None` or its `task_handle` was already empty.
+fn abort_flow_task(flow_id: FlowId, task: Option<BatchingTask>, action: &str) -> bool {
+    let Some(task) = task else {
+        return false;
+    };
+
+    // Take the handle in its own statement so the state write lock is released
+    // before the handle is aborted and the log line is emitted.
+    let handle = task.state.write().unwrap().task_handle.take();
+    match handle {
+        Some(handle) => {
+            handle.abort();
+            debug!("Aborted {action} flow task {flow_id}");
+            true
+        }
+        None => false,
+    }
}
impl FlowEngine for BatchingEngine {
@@ -741,7 +881,14 @@ impl FlowEngine for BatchingEngine {
Ok(self.flow_exist_inner(flow_id).await)
}
async fn list_flows(&self) -> Result, Error> {
- Ok(self.tasks.read().await.keys().cloned().collect::>())
+ // Collects the ids of all tasks currently held in the runtime registry;
+ // the read guard is a temporary that is dropped with this expression.
+ Ok(self
+ .runtime
+ .read()
+ .await
+ .tasks
+ .keys()
+ .cloned()
+ .collect::>())
}
async fn handle_flow_inserts(
&self,
@@ -756,3 +903,241 @@ impl FlowEngine for BatchingEngine {
self.handle_mark_dirty_time_window(req).await
}
}
+
+#[cfg(test)]
+mod tests {
+    use catalog::memory::new_memory_catalog_manager;
+    use common_meta::key::TableMetadataManager;
+    use common_meta::key::flow::FlowMetadataManager;
+    use common_meta::kv_backend::memory::MemoryKvBackend;
+    use query::options::QueryOptions;
+    use session::context::QueryContext;
+
+    use super::*;
+    use crate::test_utils::create_test_query_engine;
+
+    /// Fires its sender when dropped, so a test can observe that the future
+    /// owning the guard was dropped (e.g. because its task was aborted).
+    struct DropNotify(Option<oneshot::Sender<()>>);
+
+    impl Drop for DropNotify {
+        fn drop(&mut self) {
+            if let Some(tx) = self.0.take() {
+                // The receiver may already be gone; nothing to do in that case.
+                let _ = tx.send(());
+            }
+        }
+    }
+
+    /// Builds a `BatchingEngine` backed by in-memory metadata and an empty
+    /// standalone frontend handler.
+    async fn new_test_engine() -> BatchingEngine {
+        let kv_backend = Arc::new(MemoryKvBackend::new());
+        let table_meta = Arc::new(TableMetadataManager::new(kv_backend.clone()));
+        table_meta.init().await.unwrap();
+        let flow_meta = Arc::new(FlowMetadataManager::new(kv_backend));
+        let catalog_manager = new_memory_catalog_manager().unwrap();
+        let query_engine = create_test_query_engine();
+        let (frontend_client, _handler) =
+            FrontendClient::from_empty_grpc_handler(QueryOptions::default());
+
+        BatchingEngine::new(
+            Arc::new(frontend_client),
+            query_engine,
+            flow_meta,
+            table_meta,
+            catalog_manager,
+            BatchingModeOptions::default(),
+        )
+    }
+
+    /// Builds a `BatchingTask` for `flow_id` over `numbers_with_ts` and returns
+    /// it together with the sender half of its shutdown channel.
+    async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) {
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+        let plan = sql_to_df_plan(
+            ctx.clone(),
+            query_engine.clone(),
+            "SELECT number, ts FROM numbers_with_ts",
+            true,
+        )
+        .await
+        .unwrap();
+        let (tx, rx) = oneshot::channel();
+
+        let task = BatchingTask::try_new(TaskArgs {
+            flow_id,
+            query: "SELECT number, ts FROM numbers_with_ts",
+            plan,
+            time_window_expr: None,
+            expire_after: None,
+            sink_table_name: [
+                "greptime".to_string(),
+                "public".to_string(),
+                "sink".to_string(),
+            ],
+            source_table_names: vec![[
+                "greptime".to_string(),
+                "public".to_string(),
+                "numbers_with_ts".to_string(),
+            ]],
+            query_ctx: ctx,
+            catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+            shutdown_rx: rx,
+            batch_opts: Arc::new(BatchingModeOptions::default()),
+            flow_eval_interval: None,
+        })
+        .unwrap();
+
+        (task, tx)
+    }
+
+    /// Installs a never-finishing task handle on `task` and waits until it has
+    /// started. The returned receiver resolves once that spawned future is
+    /// dropped, which is how the tests observe an abort.
+    async fn install_abort_observed_handle(task: &BatchingTask) -> oneshot::Receiver<()> {
+        let (drop_tx, drop_rx) = oneshot::channel();
+        let (entered_tx, entered_rx) = oneshot::channel();
+        let handle = tokio::spawn(async move {
+            let _guard = DropNotify(Some(drop_tx));
+            let _ = entered_tx.send(());
+            std::future::pending::<()>().await;
+        });
+        task.state.write().unwrap().task_handle = Some(handle);
+        tokio::time::timeout(Duration::from_secs(1), entered_rx)
+            .await
+            .expect("test task handle should start")
+            .expect("test task handle should report start");
+        drop_rx
+    }
+
+    #[tokio::test]
+    async fn test_notify_flow_shutdown_sends_signal() {
+        let (tx, rx) = oneshot::channel();
+
+        assert!(notify_flow_shutdown(42, Some(tx), "test"));
+
+        rx.await.expect("replaced flow should receive shutdown");
+    }
+
+    #[test]
+    fn test_notify_flow_shutdown_accepts_missing_sender() {
+        assert!(!notify_flow_shutdown(42, None, "test"));
+    }
+
+    #[tokio::test]
+    async fn test_abort_flow_task_aborts_handle() {
+        let (task, _shutdown_tx) = new_test_task(42).await;
+        let drop_rx = install_abort_observed_handle(&task).await;
+
+        assert!(abort_flow_task(42, Some(task), "test"));
+
+        tokio::time::timeout(Duration::from_secs(1), drop_rx)
+            .await
+            .expect("aborted task should be dropped")
+            .expect("drop notifier should fire");
+    }
+
+    #[tokio::test]
+    async fn test_remove_flow_inner_aborts_registered_task() {
+        let engine = new_test_engine().await;
+        let (task, shutdown_tx) = new_test_task(42).await;
+        let drop_rx = install_abort_observed_handle(&task).await;
+
+        engine.runtime.write().await.insert(42, task, shutdown_tx);
+
+        engine.remove_flow_inner(42).await.unwrap();
+
+        tokio::time::timeout(Duration::from_secs(1), drop_rx)
+            .await
+            .expect("removed task should be dropped")
+            .expect("drop notifier should fire");
+        assert!(!engine.flow_exist_inner(42).await);
+        assert!(!engine.runtime.read().await.shutdown_txs.contains_key(&42));
+    }
+
+    #[tokio::test]
+    async fn test_or_replace_flow_runtime_replaces_old_handles_and_keeps_new_task() {
+        let engine = new_test_engine().await;
+        let (old_task, old_shutdown_tx) = new_test_task(42).await;
+        let old_task_identity = old_task.clone();
+        let old_drop_rx = install_abort_observed_handle(&old_task).await;
+        let (new_task, new_shutdown_tx) = new_test_task(42).await;
+        let new_task_identity = new_task.clone();
+
+        engine
+            .runtime
+            .write()
+            .await
+            .insert(42, old_task, old_shutdown_tx);
+        let (replaced_old_task, replaced_old_shutdown_tx) =
+            engine
+                .runtime
+                .write()
+                .await
+                .insert(42, new_task, new_shutdown_tx);
+
+        // The second insert must hand back exactly the first task instance.
+        let replaced_old_task = replaced_old_task.expect("old task should be returned");
+        assert!(Arc::ptr_eq(
+            &replaced_old_task.state,
+            &old_task_identity.state
+        ));
+        assert!(notify_flow_shutdown(
+            42,
+            replaced_old_shutdown_tx,
+            "replaced"
+        ));
+        old_task_identity
+            .state
+            .write()
+            .unwrap()
+            .shutdown_rx
+            .try_recv()
+            .expect("old shutdown receiver should receive signal");
+        assert!(abort_flow_task(42, Some(replaced_old_task), "replaced"));
+
+        tokio::time::timeout(Duration::from_secs(1), old_drop_rx)
+            .await
+            .expect("replaced task should be dropped")
+            .expect("drop notifier should fire");
+
+        // Only the new task and its (unsignalled) shutdown sender remain.
+        let runtime = engine.runtime.read().await;
+        assert_eq!(1, runtime.tasks.len());
+        assert_eq!(1, runtime.shutdown_txs.len());
+        let registered_task = runtime.tasks.get(&42).expect("new task should remain");
+        assert!(Arc::ptr_eq(
+            &registered_task.state,
+            &new_task_identity.state
+        ));
+        assert!(runtime.shutdown_txs.contains_key(&42));
+        assert!(matches!(
+            new_task_identity
+                .state
+                .write()
+                .unwrap()
+                .shutdown_rx
+                .try_recv(),
+            Err(oneshot::error::TryRecvError::Empty)
+        ));
+    }
+
+    #[tokio::test]
+    async fn test_rollback_flow_runtime_if_current_removes_matching_task_only() {
+        let engine = new_test_engine().await;
+        let (old_task, _old_shutdown_tx) = new_test_task(42).await;
+        let (current_task, current_shutdown_tx) = new_test_task(42).await;
+        let current_task_identity = current_task.clone();
+
+        engine
+            .runtime
+            .write()
+            .await
+            .insert(42, current_task, current_shutdown_tx);
+
+        // Rolling back with a task that is not the registered one is a no-op.
+        engine.rollback_flow_runtime_if_current(42, &old_task).await;
+
+        let registered_task = engine.runtime.read().await.tasks.get(&42).cloned().unwrap();
+        assert!(Arc::ptr_eq(
+            &registered_task.state,
+            &current_task_identity.state
+        ));
+        assert!(engine.runtime.read().await.shutdown_txs.contains_key(&42));
+
+        // Rolling back with the registered task removes both map entries.
+        engine
+            .rollback_flow_runtime_if_current(42, &current_task_identity)
+            .await;
+        assert!(!engine.flow_exist_inner(42).await);
+        assert!(!engine.runtime.read().await.shutdown_txs.contains_key(&42));
+    }
+}
diff --git a/src/flow/src/batching_mode/frontend_client.rs b/src/flow/src/batching_mode/frontend_client.rs
index 7382f214e5..c6194d96b3 100644
--- a/src/flow/src/batching_mode/frontend_client.rs
+++ b/src/flow/src/batching_mode/frontend_client.rs
@@ -20,15 +20,17 @@ use std::sync::{Arc, Mutex, Weak};
use api::v1::greptime_request::Request;
use api::v1::query_request::Query;
use api::v1::{CreateTableExpr, QueryRequest};
-use client::{Client, Database};
+use client::{Client, Database, OutputWithMetrics};
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_client_tls_config};
use common_meta::peer::{Peer, PeerDiscovery};
-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
use common_telemetry::warn;
use meta_client::client::MetaClient;
use query::datafusion::QUERY_PARALLELISM_HINT;
-use query::options::QueryOptions;
+use query::metrics::terminal_recordbatch_metrics_from_plan;
+use query::options::{FlowQueryExtensions, QueryOptions};
use rand::rng;
use rand::seq::SliceRandom;
use servers::query_handler::grpc::GrpcQueryHandler;
@@ -196,9 +198,6 @@ impl DatabaseWithPeer {
}
impl FrontendClient {
- // TODO: support more fine-grained load balancing strategies for frontend
- // selection, such as AZ (availability zone) awareness, to prefer frontends
- // in the same zone as the flownode and reduce cross-AZ latency.
/// scan for available frontend from metadata
pub(crate) async fn scan_for_frontend(&self) -> Result, Error> {
let Self::Distributed { meta_client, .. } = self else {
@@ -341,6 +340,83 @@ impl FrontendClient {
}
}
+ /// Runs `request` against a frontend and returns the output together with
+ /// its terminal metrics handle.
+ ///
+ /// `extensions` are first validated through `build_flow_extensions`, so an
+ /// invalid extension set fails before any frontend is contacted.
+ ///
+ /// - Distributed: picks a random active frontend, records it in `peer_desc`,
+ ///   and sends the query with the parallelism and read-preference hints plus
+ ///   the raw `extensions`.
+ /// - Standalone: records `PeerDesc::Standalone`, builds a query context whose
+ ///   extensions hold the parallelism hint and `extensions`, runs the query on
+ ///   the local handler and wraps the output with
+ ///   `wrap_standalone_output_with_terminal_metrics`.
+ ///
+ /// `peer_desc` is left untouched when the call fails before a peer is chosen.
+ ///
+ /// # Errors
+ /// Returns an error when the extensions cannot be parsed, no frontend can be
+ /// selected, the standalone handler is unset or already dropped, or the query
+ /// itself fails.
+ pub(crate) async fn query_with_terminal_metrics(
+     &self,
+     catalog: &str,
+     schema: &str,
+     request: QueryRequest,
+     extensions: &[(&str, &str)],
+     peer_desc: &mut Option<PeerDesc>,
+ ) -> Result<OutputWithMetrics, Error> {
+     let flow_extensions = build_flow_extensions(extensions)?;
+     match self {
+         FrontendClient::Distributed {
+             query, batch_opts, ..
+         } => {
+             let query_parallelism = query.parallelism.to_string();
+             let hints = vec![
+                 (QUERY_PARALLELISM_HINT, query_parallelism.as_str()),
+                 (READ_PREFERENCE_HINT, batch_opts.read_preference.as_ref()),
+             ];
+             let db = self.get_random_active_frontend(catalog, schema).await?;
+             *peer_desc = Some(PeerDesc::Dist {
+                 peer: db.peer.clone(),
+             });
+             db.database
+                 .query_with_terminal_metrics_and_flow_extensions(request, &hints, extensions)
+                 .await
+                 .map_err(BoxedError::new)
+                 .context(ExternalSnafu)
+         }
+         FrontendClient::Standalone {
+             database_client,
+             query,
+         } => {
+             *peer_desc = Some(PeerDesc::Standalone);
+             let mut extensions_map = HashMap::from([(
+                 QUERY_PARALLELISM_HINT.to_string(),
+                 query.parallelism.to_string(),
+             )]);
+             // Caller-supplied extensions are inserted last, so they win over
+             // the parallelism hint on a key clash.
+             for (key, value) in extensions {
+                 extensions_map.insert((*key).to_string(), (*value).to_string());
+             }
+             let ctx = QueryContextBuilder::default()
+                 .current_catalog(catalog.to_string())
+                 .current_schema(schema.to_string())
+                 .extensions(extensions_map)
+                 .build();
+             let ctx = Arc::new(ctx);
+             // Upgrade the weak handler inside its own block so the mutex guard
+             // is released before the query is awaited.
+             let database_client = {
+                 database_client
+                     .handler
+                     .lock()
+                     .map_err(|e| {
+                         UnexpectedSnafu {
+                             reason: format!("Failed to lock database client: {e}"),
+                         }
+                         .build()
+                     })?
+                     .as_ref()
+                     .context(UnexpectedSnafu {
+                         reason: "Standalone's frontend instance is not set",
+                     })?
+                     .upgrade()
+                     .context(UnexpectedSnafu {
+                         reason: "Failed to upgrade database client",
+                     })?
+             };
+             database_client
+                 .do_query(Request::Query(request), ctx.clone())
+                 .await
+                 .map(|output| {
+                     wrap_standalone_output_with_terminal_metrics(output, &flow_extensions, &ctx)
+                 })
+                 .map_err(BoxedError::new)
+                 .context(ExternalSnafu)
+         }
+     }
+ }
+
/// Handle a request to frontend
pub(crate) async fn handle(
&self,
@@ -426,22 +502,83 @@ impl FrontendClient {
}
}
+/// Parses the raw `(key, value)` extension pairs into `FlowQueryExtensions`.
+///
+/// The pairs are copied into an owned map and handed to
+/// `FlowQueryExtensions::parse_flow_extensions`; when that yields no
+/// extensions, the default value is returned instead.
+///
+/// # Errors
+/// Wraps a parse failure in the crate's external error.
+fn build_flow_extensions(extensions: &[(&str, &str)]) -> Result<FlowQueryExtensions, Error> {
+    let flow_extensions = HashMap::from_iter(
+        extensions
+            .iter()
+            .map(|(key, value)| ((*key).to_string(), (*value).to_string())),
+    );
+    FlowQueryExtensions::parse_flow_extensions(&flow_extensions)
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)
+        .map(|extensions| extensions.unwrap_or_default())
+}
+
+/// Wraps a standalone query `Output` into `OutputWithMetrics`.
+///
+/// When the flow extensions ask for region watermarks and the output is not a
+/// stream, terminal metrics are taken from the output's plan, falling back to
+/// the snapshots recorded on `query_ctx`, and stored on the wrapped result.
+/// In every other case the wrapped result is returned as built.
+fn wrap_standalone_output_with_terminal_metrics(
+    output: Output,
+    flow_extensions: &FlowQueryExtensions,
+    query_ctx: &QueryContextRef,
+) -> OutputWithMetrics {
+    let wants_watermark = flow_extensions.should_collect_region_watermark();
+    let is_stream = matches!(&output.data, OutputData::Stream(_));
+
+    // Metrics are only pre-filled here for non-stream outputs.
+    let terminal_metrics = match (wants_watermark, is_stream) {
+        (true, false) => output
+            .meta
+            .plan
+            .clone()
+            .and_then(terminal_recordbatch_metrics_from_plan)
+            .or_else(|| terminal_recordbatch_metrics_from_snapshots(query_ctx)),
+        _ => None,
+    };
+
+    let wrapped = OutputWithMetrics::from_output(output);
+    if terminal_metrics.is_some() {
+        wrapped.metrics.update(terminal_metrics);
+    }
+    wrapped
+}
+
+/// Builds terminal metrics from the snapshots recorded on `query_ctx`.
+///
+/// Every `(region_id, watermark)` snapshot becomes one `RegionWatermarkEntry`;
+/// the entries are sorted by region id. Returns `None` when the context holds
+/// no snapshots. All other metric fields keep their default values.
+fn terminal_recordbatch_metrics_from_snapshots(
+    query_ctx: &QueryContextRef,
+) -> Option<RecordBatchMetrics> {
+    let mut region_watermarks = query_ctx
+        .snapshots()
+        .into_iter()
+        .map(|(region_id, watermark)| RegionWatermarkEntry {
+            region_id,
+            watermark: Some(watermark),
+        })
+        .collect::<Vec<_>>();
+    if region_watermarks.is_empty() {
+        return None;
+    }
+
+    region_watermarks.sort_by_key(|entry| entry.region_id);
+    Some(RecordBatchMetrics {
+        region_watermarks,
+        ..Default::default()
+    })
+}
+
/// Describe a peer of frontend
+ ///
+ /// The `Default` value is `PeerDesc::Unknown`.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
pub(crate) enum PeerDesc {
+ /// The query failed before a frontend peer was selected.
+ #[default]
+ Unknown,
/// Distributed mode's frontend peer address
Dist {
/// frontend peer address
peer: Peer,
},
/// Standalone mode
- #[default]
Standalone,
}
impl std::fmt::Display for PeerDesc {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
+ PeerDesc::Unknown => write!(f, "unknown"),
PeerDesc::Dist { peer } => write!(f, "{}", peer.addr),
PeerDesc::Standalone => write!(f, "standalone"),
}
@@ -450,9 +587,17 @@ impl std::fmt::Display for PeerDesc {
#[cfg(test)]
mod tests {
+ use std::pin::Pin;
+ use std::task::{Context, Poll};
use std::time::Duration;
- use common_query::Output;
+ use common_query::{Output, OutputData};
+ use common_recordbatch::adapter::RecordBatchMetrics;
+ use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream};
+ use datatypes::prelude::{ConcreteDataType, VectorRef};
+ use datatypes::schema::{ColumnSchema, Schema};
+ use datatypes::vectors::Int32Vector;
+ use futures::StreamExt;
use tokio::time::timeout;
use super::*;
@@ -460,6 +605,58 @@ mod tests {
#[derive(Debug)]
struct NoopHandler;
+ /// Test stream that yields at most one record batch and reports fixed metrics.
+ struct MockMetricsStream {
+     /// Schema reported by `RecordBatchStream::schema`.
+     schema: datatypes::schema::SchemaRef,
+     /// The single batch to yield; `None` once it has been taken.
+     batch: Option<RecordBatch>,
+     /// Metrics handed out by `RecordBatchStream::metrics`.
+     metrics: RecordBatchMetrics,
+     /// When `true`, metrics are withheld until the batch has been yielded.
+     terminal_metrics_only: bool,
+ }
+
+ impl futures::Stream for MockMetricsStream {
+     type Item = common_recordbatch::error::Result<RecordBatch>;
+
+     /// Yields the stored batch once, then reports the end of the stream.
+     /// Never returns `Poll::Pending`.
+     fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+         Poll::Ready(self.batch.take().map(Ok))
+     }
+
+     /// Exact hint: one item while the batch is still stored, zero afterwards.
+     fn size_hint(&self) -> (usize, Option<usize>) {
+         let remaining = usize::from(self.batch.is_some());
+         (remaining, Some(remaining))
+     }
+ }
+
+ impl RecordBatchStream for MockMetricsStream {
+     fn name(&self) -> &str {
+         "MockMetricsStream"
+     }
+
+     fn schema(&self) -> datatypes::schema::SchemaRef {
+         self.schema.clone()
+     }
+
+     fn output_ordering(&self) -> Option<&[OrderOption]> {
+         None
+     }
+
+     /// Returns a copy of the configured metrics. With `terminal_metrics_only`
+     /// set, `None` is returned for as long as the batch has not been yielded.
+     fn metrics(&self) -> Option<RecordBatchMetrics> {
+         if self.terminal_metrics_only && self.batch.is_some() {
+             return None;
+         }
+         Some(self.metrics.clone())
+     }
+ }
+
+ /// Test handler whose `do_query` returns a `MockMetricsStream` carrying a
+ /// region watermark in its metrics.
+ #[derive(Debug)]
+ struct MetricsHandler;
+
+ /// Test handler whose `do_query` asserts that the `flow.return_region_seq`
+ /// extension reached the query context, then reports one affected row.
+ #[derive(Debug)]
+ struct ExtensionAwareHandler;
+
+ /// Test handler whose `do_query` asserts the `flow.return_region_seq`
+ /// extension, records a snapshot on the query context, then reports one
+ /// affected row.
+ #[derive(Debug)]
+ struct SnapshotBindingHandler;
+
#[async_trait::async_trait]
impl GrpcQueryHandlerWithBoxedError for NoopHandler {
async fn do_query(
@@ -471,6 +668,63 @@ mod tests {
}
}
+ #[async_trait::async_trait]
+ impl GrpcQueryHandlerWithBoxedError for MetricsHandler {
+ /// Ignores the request and returns a stream output made of one two-row
+ /// `Int32` batch (column `v`) whose metrics report watermark 99 for region
+ /// 42; the stream withholds those metrics until the batch has been yielded.
+ // NOTE(review): the generic arguments of the return type below appear to
+ // have been lost in extraction — presumably `Result<Output, BoxedError>`;
+ // confirm against the trait definition.
+ async fn do_query(
+ &self,
+ _query: Request,
+ _ctx: QueryContextRef,
+ ) -> std::result::Result {
+ let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
+ "v",
+ ConcreteDataType::int32_datatype(),
+ false,
+ )]));
+ let batch = RecordBatch::new(
+ schema.clone(),
+ vec![Arc::new(Int32Vector::from_slice([1, 2])) as VectorRef],
+ )
+ .unwrap();
+ Ok(Output::new_with_stream(Box::pin(MockMetricsStream {
+ schema,
+ batch: Some(batch),
+ metrics: RecordBatchMetrics {
+ region_watermarks: vec![common_recordbatch::adapter::RegionWatermarkEntry {
+ region_id: 42,
+ watermark: Some(99),
+ }],
+ ..Default::default()
+ },
+ terminal_metrics_only: true,
+ })))
+ }
+ }
+
+ #[async_trait::async_trait]
+ impl GrpcQueryHandlerWithBoxedError for ExtensionAwareHandler {
+ /// Asserts that the context carries `flow.return_region_seq = "true"` and
+ /// answers with an affected-rows output of 1.
+ // NOTE(review): the generic arguments of the return type below appear to
+ // have been lost in extraction — presumably `Result<Output, BoxedError>`;
+ // confirm against the trait definition.
+ async fn do_query(
+ &self,
+ _query: Request,
+ ctx: QueryContextRef,
+ ) -> std::result::Result {
+ assert_eq!(ctx.extension("flow.return_region_seq"), Some("true"));
+ Ok(Output::new_with_affected_rows(1))
+ }
+ }
+
+ #[async_trait::async_trait]
+ impl GrpcQueryHandlerWithBoxedError for SnapshotBindingHandler {
+ /// Asserts that the context carries `flow.return_region_seq = "true"`,
+ /// records snapshot 99 for region 42 on the context, and answers with an
+ /// affected-rows output of 1.
+ // NOTE(review): the generic arguments of the return type below appear to
+ // have been lost in extraction — presumably `Result<Output, BoxedError>`;
+ // confirm against the trait definition.
+ async fn do_query(
+ &self,
+ _query: Request,
+ ctx: QueryContextRef,
+ ) -> std::result::Result {
+ assert_eq!(ctx.extension("flow.return_region_seq"), Some("true"));
+ ctx.set_snapshot(42, 99);
+ Ok(Output::new_with_affected_rows(1))
+ }
+ }
+
#[tokio::test]
async fn wait_initialized() {
let (client, handler_mut) =
@@ -516,4 +770,117 @@ mod tests {
.is_ok()
);
}
+
+ /// Terminal metrics of a streamed result must stay unavailable until the
+ /// stream has been drained, and must then expose the region watermark
+ /// produced by `MetricsHandler`.
+ #[tokio::test]
+ async fn test_query_with_terminal_metrics_tracks_watermark_in_standalone_mode() {
+ let handler: Arc = Arc::new(MetricsHandler);
+ let client =
+ FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+ let mut peer_desc = None;
+
+ let result = client
+ .query_with_terminal_metrics(
+ "greptime",
+ "public",
+ QueryRequest {
+ query: Some(Query::Sql("select 1".to_string())),
+ },
+ &[],
+ &mut peer_desc,
+ )
+ .await
+ .unwrap();
+ assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+ // Before the stream is consumed, no terminal metrics are visible.
+ let terminal_metrics = result.metrics.clone();
+ assert!(!result.metrics.is_ready());
+ assert!(terminal_metrics.get().is_none());
+
+ let OutputData::Stream(mut stream) = result.output.data else {
+ panic!("expected stream output");
+ };
+ // Drain the stream to its end.
+ while stream.next().await.is_some() {}
+
+ // The cloned handle observes the metrics recorded at end of stream.
+ assert!(terminal_metrics.is_ready());
+ assert_eq!(
+ terminal_metrics.region_watermark_map(),
+ Some(HashMap::from([(42_u64, 99_u64)]))
+ );
+ }
+
+ /// Extensions passed to `query_with_terminal_metrics` must reach the handler's
+ /// query context (asserted inside `ExtensionAwareHandler`); an affected-rows
+ /// result is ready immediately and carries no watermark map.
+ #[tokio::test]
+ async fn test_query_with_terminal_metrics_forwards_flow_extensions_in_standalone_mode() {
+ let handler: Arc = Arc::new(ExtensionAwareHandler);
+ let client =
+ FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+ let mut peer_desc = None;
+
+ let result = client
+ .query_with_terminal_metrics(
+ "greptime",
+ "public",
+ QueryRequest {
+ query: Some(Query::Sql("insert into t select 1".to_string())),
+ },
+ &[("flow.return_region_seq", "true")],
+ &mut peer_desc,
+ )
+ .await
+ .unwrap();
+ assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+ assert!(result.metrics.is_ready());
+ assert!(result.region_watermark_map().is_none());
+ }
+
+ /// When the handler binds a snapshot on the query context
+ /// (`SnapshotBindingHandler` calls `set_snapshot(42, 99)`), the result must
+ /// report it through `region_watermark_map`.
+ #[tokio::test]
+ async fn test_query_with_terminal_metrics_uses_standalone_snapshot_bounds() {
+ let handler: Arc = Arc::new(SnapshotBindingHandler);
+ let client =
+ FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+ let mut peer_desc = None;
+
+ let result = client
+ .query_with_terminal_metrics(
+ "greptime",
+ "public",
+ QueryRequest {
+ query: Some(Query::Sql("insert into t select * from src".to_string())),
+ },
+ &[("flow.return_region_seq", "true")],
+ &mut peer_desc,
+ )
+ .await
+ .unwrap();
+ assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+ assert!(result.metrics.is_ready());
+ assert_eq!(
+ result.region_watermark_map(),
+ Some(HashMap::from([(42, 99)]))
+ );
+ }
+
+ /// A non-boolean value for `flow.return_region_seq` must make the call fail
+ /// with an error whose debug output names the offending extension.
+ #[tokio::test]
+ async fn test_query_with_terminal_metrics_rejects_invalid_flow_extensions() {
+ let handler: Arc = Arc::new(NoopHandler);
+ let client =
+ FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+ let mut peer_desc = None;
+
+ let err = client
+ .query_with_terminal_metrics(
+ "greptime",
+ "public",
+ QueryRequest {
+ query: Some(Query::Sql("select 1".to_string())),
+ },
+ &[("flow.return_region_seq", "not-a-bool")],
+ &mut peer_desc,
+ )
+ .await
+ .unwrap_err();
+
+ assert!(format!("{err:?}").contains("Invalid value for flow.return_region_seq"));
+ }
}
diff --git a/src/flow/src/batching_mode/incremental_filter.rs b/src/flow/src/batching_mode/incremental_filter.rs
new file mode 100644
index 0000000000..ddc58d0378
--- /dev/null
+++ b/src/flow/src/batching_mode/incremental_filter.rs
@@ -0,0 +1,222 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use common_telemetry::tracing::debug;
+use datafusion_expr::Expr;
+use datatypes::schema::Schema;
+
+use crate::batching_mode::state::FilterExprInfo;
+use crate::batching_mode::utils::IncrementalAggregateAnalysis;
+use crate::{Error, FlowId};
+
+/// Builds the dirty-time-window predicate to apply on the sink table.
+///
+/// Returns `Ok(None)` when no `dirty_filter` is supplied, or when
+/// `infer_sink_time_window_filter_col` cannot pick a sink column for it.
+/// Otherwise the predicate is produced by `FilterExprInfo::predicate_for_col`
+/// for the inferred column, and that call's result (including any error) is
+/// returned unchanged.
+pub(super) fn build_sink_dirty_time_window_filter_expr(
+    flow_id: FlowId,
+    analysis: &IncrementalAggregateAnalysis,
+    sink_schema: &Schema,
+    dirty_filter: Option<&FilterExprInfo>,
+) -> Result<Option<Expr>, Error> {
+    let Some(dirty_filter) = dirty_filter else {
+        return Ok(None);
+    };
+
+    match infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, dirty_filter) {
+        Some(sink_filter_col) => dirty_filter.predicate_for_col(&sink_filter_col),
+        None => Ok(None),
+    }
+}
+
+/// Picks the sink column that the dirty-window filter should be applied to.
+///
+/// Resolution order:
+/// 1. `None` when the analysis has no group keys.
+/// 2. The dirty filter's own column, if it is a group key and the sink schema
+///    holds a timestamp-typed column of that name.
+/// 3. Otherwise, the single group key that maps to a timestamp-typed sink
+///    column.
+///
+/// When step 3 finds no candidate, or more than one, a debug message is logged
+/// and `None` is returned.
+fn infer_sink_time_window_filter_col(
+    flow_id: FlowId,
+    analysis: &IncrementalAggregateAnalysis,
+    sink_schema: &Schema,
+    dirty_filter: &FilterExprInfo,
+) -> Option<String> {
+    if analysis.group_key_names.is_empty() {
+        return None;
+    }
+
+    // A usable column must be both a group key and a timestamp column in the sink.
+    let is_timestamp_group_key = |name: &str| {
+        analysis.group_key_names.iter().any(|key| key == name)
+            && sink_schema
+                .column_schema_by_name(name)
+                .is_some_and(|col| col.data_type.is_timestamp())
+    };
+
+    if is_timestamp_group_key(&dirty_filter.col_name) {
+        return Some(dirty_filter.col_name.clone());
+    }
+
+    let candidates = analysis
+        .group_key_names
+        .iter()
+        .filter(|name| is_timestamp_group_key(name))
+        .cloned()
+        .collect::<Vec<_>>();
+
+    match candidates.as_slice() {
+        [name] => Some(name.clone()),
+        [] => {
+            debug!(
+                "Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}",
+                flow_id, analysis.group_key_names
+            );
+            None
+        }
+        _ => {
+            debug!(
+                "Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}",
+                flow_id, candidates
+            );
+            None
+        }
+    }
+}
+
+// Unit tests for `infer_sink_time_window_filter_col`.
+#[cfg(test)]
+mod test {
+ use datatypes::prelude::ConcreteDataType;
+ use datatypes::schema::ColumnSchema;
+ use pretty_assertions::assert_eq;
+
+ use super::*;
+ use crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL;
+ use crate::batching_mode::state::FilterExprInfo;
+ use crate::batching_mode::utils::IncrementalAggregateAnalysis;
+
+ /// Builds an analysis that only carries the given group keys; every other
+ /// field is left empty.
+ fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis {
+ IncrementalAggregateAnalysis {
+ group_key_names: group_key_names
+ .into_iter()
+ .map(|name| name.to_string())
+ .collect(),
+ merge_columns: vec![],
+ literal_columns: vec![],
+ output_field_names: vec![],
+ unsupported_exprs: vec![],
+ }
+ }
+
+ /// Builds a dirty filter on `col_name` with no time ranges and a one-second
+ /// window size.
+ fn test_dirty_filter(col_name: &str) -> FilterExprInfo {
+ FilterExprInfo {
+ expr: datafusion_expr::col(col_name),
+ col_name: col_name.to_string(),
+ time_ranges: vec![],
+ window_size: chrono::Duration::seconds(1),
+ }
+ }
+
+ /// Builds a sink schema from `(name, data type)` pairs.
+ fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema {
+ Schema::new(
+ columns
+ .into_iter()
+ .map(|(name, data_type)| ColumnSchema::new(name, data_type, true))
+ .collect(),
+ )
+ }
+
+ /// The dirty filter's own column wins when it is a timestamp group key.
+ #[test]
+ fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() {
+ let analysis = test_analysis_with_group_keys(vec!["ts", "host"]);
+ let sink_schema = test_sink_schema(vec![
+ ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
+ ("host", ConcreteDataType::string_datatype()),
+ ]);
+ let dirty_filter = test_dirty_filter("ts");
+
+ assert_eq!(
+ Some("ts".to_string()),
+ infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+ );
+ }
+
+ /// Falls back to the only timestamp group key; the auto-created update-at
+ /// column is a timestamp too, but it is not a group key and is ignored.
+ #[test]
+ fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() {
+ let analysis = test_analysis_with_group_keys(vec!["host", "time_window"]);
+ let sink_schema = test_sink_schema(vec![
+ ("host", ConcreteDataType::string_datatype()),
+ (
+ "time_window",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ ),
+ (
+ AUTO_CREATED_UPDATE_AT_TS_COL,
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ ),
+ ]);
+ let dirty_filter = test_dirty_filter("ts");
+
+ assert_eq!(
+ Some("time_window".to_string()),
+ infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+ );
+ }
+
+ /// Without any group key no column is inferred, even if the sink has a
+ /// timestamp column.
+ #[test]
+ fn test_infer_sink_time_window_filter_col_skips_global_aggregate() {
+ let analysis = test_analysis_with_group_keys(vec![]);
+ let sink_schema = test_sink_schema(vec![
+ ("number", ConcreteDataType::uint32_datatype()),
+ (
+ "time_window",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ ),
+ ]);
+ let dirty_filter = test_dirty_filter("ts");
+
+ assert_eq!(
+ None,
+ infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+ );
+ }
+
+ /// Group keys that are all non-timestamp columns yield no filter column.
+ #[test]
+ fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() {
+ let analysis = test_analysis_with_group_keys(vec!["host", "device"]);
+ let sink_schema = test_sink_schema(vec![
+ ("host", ConcreteDataType::string_datatype()),
+ ("device", ConcreteDataType::string_datatype()),
+ (
+ AUTO_CREATED_UPDATE_AT_TS_COL,
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ ),
+ ]);
+ let dirty_filter = test_dirty_filter("ts");
+
+ assert_eq!(
+ None,
+ infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+ );
+ }
+
+ /// Two timestamp group keys, neither matching the dirty filter's column,
+ /// are ambiguous and yield no filter column.
+ #[test]
+ fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() {
+ let analysis = test_analysis_with_group_keys(vec!["ts", "time_window"]);
+ let sink_schema = test_sink_schema(vec![
+ ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
+ (
+ "time_window",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ ),
+ ]);
+ let dirty_filter = test_dirty_filter("source_ts");
+
+ assert_eq!(
+ None,
+ infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+ );
+ }
+}
diff --git a/src/flow/src/batching_mode/state.rs b/src/flow/src/batching_mode/state.rs
index d90023ae46..42b71a4ec7 100644
--- a/src/flow/src/batching_mode/state.rs
+++ b/src/flow/src/batching_mode/state.rs
@@ -13,8 +13,9 @@
// limitations under the License.
//! Batching mode task state, which changes frequently
+//!
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::time::Duration;
use common_telemetry::debug;
@@ -49,6 +50,14 @@ pub struct TaskState {
/// Dirty Time windows need to be updated
/// mapping of `start -> end` and non-overlapping
pub(crate) dirty_time_windows: DirtyTimeWindows,
+ checkpoint_mode: CheckpointMode,
+ /// Region id -> last consumed watermark sequence. Incremental scans use
+ /// this as the next lower sequence bound for each source region.
+ checkpoints: BTreeMap,
+ /// Once set, the task will never attempt incremental mode again.
+ /// Set when the flow's query shape is deterministically incompatible
+ /// with incremental execution (e.g. unsupported aggregate expressions).
+ incremental_disabled: bool,
exec_state: ExecState,
/// Shutdown receiver
pub(crate) shutdown_rx: oneshot::Receiver<()>,
@@ -63,6 +72,9 @@ impl TaskState {
last_query_duration: Duration::from_secs(0),
last_exec_time_millis: None,
dirty_time_windows: Default::default(),
+ checkpoint_mode: CheckpointMode::FullSnapshot,
+ checkpoints: Default::default(),
+ incremental_disabled: false,
exec_state: ExecState::Idle,
shutdown_rx,
task_handle: None,
@@ -84,6 +96,84 @@ impl TaskState {
self.last_exec_time_millis
}
+ /// Returns the checkpoint mode the task is currently in.
+ pub fn checkpoint_mode(&self) -> CheckpointMode {
+ self.checkpoint_mode
+ }
+
+ /// Returns the stored checkpoints, keyed by region id, each holding the last
+ /// consumed watermark sequence of that region.
+ pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
+     &self.checkpoints
+ }
+
+ /// Returns `true` once incremental mode has been disabled via
+ /// `disable_incremental`.
+ pub fn is_incremental_disabled(&self) -> bool {
+ self.incremental_disabled
+ }
+
+ /// Permanently disable incremental mode for this task and
+ /// immediately fall back to full snapshot for the current cycle.
+ ///
+ /// No method of `TaskState` visible in this change clears the flag again;
+ /// while it is set, the `advance_*` methods leave the checkpoint mode
+ /// untouched.
+ pub fn disable_incremental(&mut self) {
+ self.incremental_disabled = true;
+ self.mark_full_snapshot();
+ }
+
+ /// Switches the checkpoint mode to `FullSnapshot`. Stored checkpoints and the
+ /// `incremental_disabled` flag are left unchanged.
+ pub fn mark_full_snapshot(&mut self) {
+ self.checkpoint_mode = CheckpointMode::FullSnapshot;
+ }
+
+ /// Replaces all stored checkpoints with the entries of `watermark_map`
+ /// (region id -> watermark sequence).
+ ///
+ /// Afterwards the task switches to `Incremental` mode, unless incremental
+ /// execution has been disabled, in which case the mode is left as is.
+ pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
+     self.checkpoints = watermark_map.into_iter().collect();
+     if !self.incremental_disabled {
+         self.checkpoint_mode = CheckpointMode::Incremental;
+     }
+ }
+
+ /// Advances the checkpoints of the regions that took part in a query.
+ ///
+ /// For every region in `participating_regions` that has an entry in
+ /// `watermark_map`, the stored checkpoint is inserted or overwritten with
+ /// that sequence. Checkpoints of all other regions are kept as they are.
+ /// Afterwards the task switches to `Incremental` mode, unless incremental
+ /// execution has been disabled.
+ pub fn advance_incremental_checkpoints_with_participation(
+     &mut self,
+     participating_regions: &BTreeSet<u64>,
+     watermark_map: HashMap<u64, u64>,
+ ) {
+     let updates = participating_regions
+         .iter()
+         .filter_map(|region_id| watermark_map.get(region_id).map(|seq| (*region_id, *seq)));
+     self.checkpoints.extend(updates);
+
+     if !self.incremental_disabled {
+         self.checkpoint_mode = CheckpointMode::Incremental;
+     }
+ }
+
+ /// Checks whether a full-snapshot result may be used to advance checkpoints.
+ ///
+ /// Returns `true` only when `participating_regions` is non-empty and
+ /// `watermark_map` contains exactly one entry per participating region
+ /// (same size, and every participating region present as a key).
+ pub fn can_advance_full_snapshot_checkpoints(
+     &self,
+     participating_regions: &BTreeSet<u64>,
+     watermark_map: &HashMap<u64, u64>,
+ ) -> bool {
+     !participating_regions.is_empty()
+         && participating_regions.len() == watermark_map.len()
+         && participating_regions
+             .iter()
+             .all(|region_id| watermark_map.contains_key(region_id))
+ }
+
+ /// Checks whether an incremental result may be used to advance checkpoints.
+ ///
+ /// Returns `true` only when all of the following hold:
+ /// - incremental mode has not been disabled,
+ /// - some checkpoints are already stored,
+ /// - `participating_regions` is non-empty and has as many entries as
+ ///   `watermark_map`,
+ /// - every participating region has both a stored checkpoint and a reported
+ ///   watermark, and the watermark is not behind the checkpoint.
+ pub fn can_advance_incremental_checkpoints_with_participation(
+     &self,
+     participating_regions: &BTreeSet<u64>,
+     watermark_map: &HashMap<u64, u64>,
+ ) -> bool {
+     if self.incremental_disabled
+         || self.checkpoints.is_empty()
+         || participating_regions.is_empty()
+         || participating_regions.len() != watermark_map.len()
+     {
+         return false;
+     }
+
+     // A region missing from either map fails the check, which also covers
+     // the "participating region has no checkpoint yet" case.
+     participating_regions.iter().all(|region_id| {
+         match (
+             watermark_map.get(region_id),
+             self.checkpoints.get(region_id),
+         ) {
+             (Some(seq), Some(checkpoint)) => seq >= checkpoint,
+             _ => false,
+         }
+     })
+ }
+
/// Compute the next query delay based on the time window size or the last query duration.
/// Aiming to avoid too frequent queries. But also not too long delay.
///
@@ -94,6 +184,10 @@ impl TaskState {
/// if current the dirty time range is longer than one query can handle,
/// execute immediately to faster clean up dirty time windows.
///
+ /// If `prefer_short_incremental_cadence` is true, run incremental queries
+ /// more often when there is no large dirty backlog. This only reduces the
+ /// chance of hitting a stale cursor after flush; it is not required for
+ /// correctness.
pub fn get_next_start_query_time(
&self,
flow_id: FlowId,
@@ -101,6 +195,7 @@ impl TaskState {
min_refresh_duration: Duration,
max_timeout: Option,
max_filter_num_per_query: usize,
+ prefer_short_incremental_cadence: bool,
) -> Instant {
// = last query duration, capped by [max(min_run_interval, time_window_size), max_timeout], note at most `max_timeout`
let lower = time_window_size.unwrap_or(min_refresh_duration);
@@ -119,7 +214,20 @@ impl TaskState {
// if dirty time range is more than one query can handle, execute immediately
// to faster clean up dirty time windows
if cur_dirty_window_size < max_query_update_range {
- self.last_update_time + next_duration
+ if prefer_short_incremental_cadence {
+ // Run incremental queries sooner than the normal time-window
+ // cadence, while still backing off by at least the previous
+ // query duration and respecting the max-timeout cap.
+ let next_duration = self.last_query_duration.max(min_refresh_duration);
+ let next_duration = if let Some(max_timeout) = max_timeout {
+ next_duration.min(max_timeout)
+ } else {
+ next_duration
+ };
+ self.last_update_time + next_duration
+ } else {
+ self.last_update_time + next_duration
+ }
} else {
// if dirty time windows can't be clean up in one query, execute immediately to faster
// clean up dirty time windows
@@ -199,12 +307,42 @@ impl DirtyTimeWindows {
}
pub fn add_window(&mut self, start: Timestamp, end: Option) {
- self.windows.insert(start, end);
+ self.add_or_merge_window(start, end);
}
pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
for (start, end) in time_ranges {
- self.windows.insert(start, Some(end));
+ self.add_or_merge_window(start, Some(end));
+ }
+ }
+
+ /// Merges every dirty marker of `dirty_windows` into this set, going through
+ /// `add_or_merge_window` for each `(start, end)` entry.
+ pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
+     dirty_windows
+         .windows
+         .iter()
+         .for_each(|(&start, &end)| self.add_or_merge_window(start, end));
+ }
+
+ /// Inserts a dirty window keyed by `start`.
+ ///
+ /// If a window with the same start already exists, its end is replaced by the
+ /// union of the existing and incoming ends (see `union_window_end`) instead
+ /// of being overwritten.
+ fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
+     self.windows
+         .entry(start)
+         .and_modify(|current_end| {
+             *current_end = Self::union_window_end(*current_end, end);
+         })
+         .or_insert(end);
+ }
+
+ /// Combines the end markers of two windows that share the same start.
+ ///
+ /// Two concrete ends yield the larger one; a concrete end always wins over
+ /// `None`; `None` is returned only when both sides are `None`.
+ fn union_window_end(
+     current_end: Option<Timestamp>,
+     incoming_end: Option<Timestamp>,
+ ) -> Option<Timestamp> {
+     match (current_end, incoming_end) {
+         (Some(current), Some(incoming)) => Some(current.max(incoming)),
+         // `None` is a dirty marker without a known upper bound. When one
+         // side has a concrete end, keep it so merging a restored snapshot
+         // never shrinks an already-known dirty range with the same start.
+         (Some(end), None) | (None, Some(end)) => Some(end),
+         (None, None) => None,
}
}
@@ -216,7 +354,7 @@ impl DirtyTimeWindows {
/// Set windows to be dirty, only useful for full aggr without time window
/// to mark some new data is inserted
pub fn set_dirty(&mut self) {
- self.windows.insert(Timestamp::new_second(0), None);
+ self.add_or_merge_window(Timestamp::new_second(0), None);
}
/// Number of dirty windows.
@@ -283,7 +421,7 @@ impl DirtyTimeWindows {
);
self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
- if self.windows.len() > self.max_filter_num_per_query {
+ if self.windows.len() > window_cnt {
let first_time_window = self.windows.first_key_value();
let last_time_window = self.windows.last_key_value();
@@ -292,7 +430,7 @@ impl DirtyTimeWindows {
"Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
task_ctx.config.flow_id,
self.windows.len(),
- self.max_filter_num_per_query,
+ window_cnt,
task_ctx.config.time_window_expr,
task_ctx.config.expire_after,
first_time_window,
@@ -304,7 +442,7 @@ impl DirtyTimeWindows {
"Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
flow_id,
self.windows.len(),
- self.max_filter_num_per_query,
+ window_cnt,
first_time_window,
last_time_window
)
@@ -559,6 +697,12 @@ enum ExecState {
Executing,
}
+/// Checkpoint mode tracked by `TaskState`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CheckpointMode {
+ /// Mode a new `TaskState` starts in, and the one restored by
+ /// `mark_full_snapshot` / `disable_incremental`.
+ FullSnapshot,
+ /// Mode entered after checkpoints have been advanced, provided incremental
+ /// execution has not been disabled.
+ Incremental,
+}
+
/// Filter Expression's information
#[derive(Debug, Clone)]
pub struct FilterExprInfo {
@@ -576,6 +720,28 @@ impl FilterExprInfo {
acc + end.sub(start).unwrap_or(chrono::Duration::zero())
})
}
+
+ /// Builds a predicate over `col_name` that covers this filter's time ranges.
+ ///
+ /// Each `(start, end)` range becomes `col_name >= start AND col_name < end`,
+ /// and the per-range predicates are OR-ed together in order. The column is
+ /// referenced unqualified.
+ ///
+ /// Returns `Ok(None)` when there are no time ranges.
+ ///
+ /// # Errors
+ ///
+ /// Propagates the error of `to_df_literal` if a range bound cannot be
+ /// converted into a literal.
+ pub fn predicate_for_col(
+     &self,
+     col_name: &str,
+ ) -> Result<Option<datafusion_expr::Expr>, Error> {
+     use datafusion_common::Column;
+     use datafusion_expr::{Expr, lit};
+
+     let mut expr_lst = Vec::with_capacity(self.time_ranges.len());
+     for (start, end) in &self.time_ranges {
+         let lower = to_df_literal(*start)?;
+         let upper = to_df_literal(*end)?;
+         let filter_col = || Expr::Column(Column::new_unqualified(col_name));
+         expr_lst.push(
+             filter_col()
+                 .gt_eq(lit(lower))
+                 .and(filter_col().lt(lit(upper))),
+         );
+     }
+
+     Ok(expr_lst.into_iter().reduce(|a, b| a.or(b)))
+ }
}
#[cfg(test)]
@@ -820,4 +986,370 @@ mod test {
}
}
}
+
+ /// A fresh state starts in full-snapshot mode without checkpoints; advancing
+ /// switches to incremental mode, and `mark_full_snapshot` switches back
+ /// while keeping the stored checkpoints.
+ #[test]
+ fn test_task_state_checkpoint_mode_and_advancement() {
+ let query_ctx = QueryContext::arc();
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+ let mut state = TaskState::new(query_ctx, rx);
+
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+ assert!(state.checkpoints().is_empty());
+
+ state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+ assert_eq!(
+ state.checkpoints(),
+ &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+ );
+
+ state.mark_full_snapshot();
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+ assert_eq!(
+ state.checkpoints(),
+ &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+ );
+ }
+
+ /// Once incremental mode is disabled, checkpoints can still be stored but
+ /// the mode never leaves `FullSnapshot`.
+ #[test]
+ fn test_disable_incremental_persists_full_snapshot_mode() {
+ let query_ctx = QueryContext::arc();
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+ let mut state = TaskState::new(query_ctx, rx);
+
+ assert!(!state.is_incremental_disabled());
+
+ // After disable, mode becomes FullSnapshot and flag is set.
+ state.disable_incremental();
+ assert!(state.is_incremental_disabled());
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+ // `advance_checkpoints` will NOT transition to Incremental when disabled.
+ state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+ assert_eq!(
+ state.checkpoints(),
+ &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+ );
+
+ // `mark_full_snapshot` does not re-enable incremental.
+ state.mark_full_snapshot();
+ assert!(state.is_incremental_disabled());
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+ }
+
+ /// Full-snapshot advancement is rejected for an empty participation set and
+ /// for a watermark map that misses a participating region; it is accepted
+ /// when both cover the same regions.
+ #[test]
+ fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
+ let query_ctx = QueryContext::arc();
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+ let state = TaskState::new(query_ctx, rx);
+
+ assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
+ assert!(!state.can_advance_full_snapshot_checkpoints(
+ &BTreeSet::from([1_u64, 2_u64]),
+ &HashMap::from([(1_u64, 10_u64)]),
+ ));
+ assert!(state.can_advance_full_snapshot_checkpoints(
+ &BTreeSet::from([1_u64, 2_u64]),
+ &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
+ ));
+ }
+
+ /// Exercises each acceptance/rejection condition of
+ /// `can_advance_incremental_checkpoints_with_participation`, starting from
+ /// checkpoints `{1: 10, 2: 20}`.
+ #[test]
+ fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
+ let query_ctx = QueryContext::arc();
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+ let mut state = TaskState::new(query_ctx, rx);
+ state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+
+ // A subset of known regions with an advanced watermark is accepted.
+ assert!(
+ state.can_advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([1_u64]),
+ &HashMap::from([(1_u64, 11_u64)]),
+ )
+ );
+ // A participating region without a reported watermark is rejected.
+ assert!(
+ !state.can_advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([1_u64, 2_u64]),
+ &HashMap::from([(1_u64, 11_u64)]),
+ )
+ );
+ // A region that has no stored checkpoint is rejected.
+ assert!(
+ !state.can_advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([3_u64]),
+ &HashMap::from([(3_u64, 11_u64)]),
+ )
+ );
+ // A watermark behind the stored checkpoint is rejected.
+ assert!(
+ !state.can_advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([1_u64]),
+ &HashMap::from([(1_u64, 9_u64)]),
+ )
+ );
+ // All known regions with advanced watermarks are accepted.
+ assert!(
+ state.can_advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([1_u64, 2_u64]),
+ &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
+ )
+ );
+
+ // Once incremental mode is disabled, nothing is accepted any more.
+ state.disable_incremental();
+ assert!(
+ !state.can_advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([1_u64, 2_u64]),
+ &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
+ )
+ );
+ }
+
+ /// Advancing with a participating subset updates only those regions and
+ /// keeps the checkpoint of the non-participating region (2) untouched.
+ #[test]
+ fn test_incremental_checkpoint_advancement_merges_participating_subset() {
+ let query_ctx = QueryContext::arc();
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+ let mut state = TaskState::new(query_ctx, rx);
+ state.advance_checkpoints(HashMap::from([
+ (1_u64, 10_u64),
+ (2_u64, 20_u64),
+ (3_u64, 30_u64),
+ ]));
+
+ state.advance_incremental_checkpoints_with_participation(
+ &BTreeSet::from([1_u64, 3_u64]),
+ HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
+ );
+
+ assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+ assert_eq!(
+ state.checkpoints(),
+ &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
+ );
+ }
+
+ /// With no time ranges, `predicate_for_col` yields no predicate.
+ #[test]
+ fn test_filter_expr_info_predicate_for_col_empty_ranges() {
+ let filter = FilterExprInfo {
+ expr: datafusion_expr::col("ts"),
+ col_name: "ts".to_string(),
+ time_ranges: vec![],
+ window_size: chrono::Duration::seconds(1),
+ };
+
+ assert!(filter.predicate_for_col("time_window").unwrap().is_none());
+ }
+
+ /// A single range becomes one `>= start AND < end` predicate on the
+ /// requested column (not on the filter's own `col_name`).
+ #[test]
+ fn test_filter_expr_info_predicate_for_col_single_range() {
+ let filter = FilterExprInfo {
+ expr: datafusion_expr::col("ts"),
+ col_name: "ts".to_string(),
+ time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
+ window_size: chrono::Duration::seconds(1),
+ };
+
+ let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
+ let unparser = datafusion::sql::unparser::Unparser::default();
+ assert_eq!(
+ "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
+ unparser.expr_to_sql(&predicate).unwrap().to_string()
+ );
+ }
+
+ /// Several ranges are OR-ed together in the order they are stored.
+ #[test]
+ fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
+ let filter = FilterExprInfo {
+ expr: datafusion_expr::col("ts"),
+ col_name: "ts".to_string(),
+ time_ranges: vec![
+ (Timestamp::new_second(0), Timestamp::new_second(1)),
+ (Timestamp::new_second(10), Timestamp::new_second(11)),
+ ],
+ window_size: chrono::Duration::seconds(1),
+ };
+
+ let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
+ let unparser = datafusion::sql::unparser::Unparser::default();
+ assert_eq!(
+ "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
+ unparser.expr_to_sql(&predicate).unwrap().to_string()
+ );
+ }
+
+ /// Helper: builds a fresh `TaskState` and moves its `last_update_time` back
+ /// by `age` from the current instant.
+ fn state_with_past_update(age: Duration) -> TaskState {
+     let (_tx, rx) = tokio::sync::oneshot::channel();
+     let mut task_state = TaskState::new(QueryContext::arc(), rx);
+     task_state.last_update_time = Instant::now() - age;
+     task_state
+ }
+
+ /// Short cadence with a fresh state: `last_query_duration` is still zero
+ /// (the value set by `TaskState::new`), so `min_refresh` determines the delay.
+ #[test]
+ fn test_short_incremental_cadence_uses_min_refresh() {
+ // When prefer_short_incremental_cadence is true and dirty backlog is manageable,
+ // the next start time should be last_update_time + min_refresh (short cadence),
+ // ignoring the longer time_window_size.
+ let state = state_with_past_update(Duration::from_secs(10));
+
+ let time_window_size = Some(Duration::from_secs(60)); // large window
+ let min_refresh = Duration::from_secs(5);
+ let flow_id = 1;
+
+ let result = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ None,
+ 20,
+ true, // prefer_short_incremental_cadence
+ );
+
+ // With short cadence, result should be last_update_time + min_refresh.
+ let expected = state.last_update_time + min_refresh;
+ assert_eq!(result, expected);
+ }
+
+ /// Short cadence still backs off by the previous query duration when that is
+ /// longer than `min_refresh` (20s vs. 5s here).
+ #[test]
+ fn test_short_incremental_cadence_respects_last_query_duration() {
+ let mut state = state_with_past_update(Duration::from_secs(10));
+ state.last_query_duration = Duration::from_secs(20);
+
+ let time_window_size = Some(Duration::from_secs(60));
+ let min_refresh = Duration::from_secs(5);
+ let flow_id = 1;
+
+ let result = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ None,
+ 20,
+ true,
+ );
+
+ assert_eq!(result, state.last_update_time + state.last_query_duration);
+ }
+
+ /// Short cadence is capped by `max_timeout`: both `min_refresh` (30s) and
+ /// the last query duration (20s) exceed the 5s cap, so the cap is used.
+ #[test]
+ fn test_short_incremental_cadence_respects_max_timeout() {
+ let mut state = state_with_past_update(Duration::from_secs(10));
+ state.last_query_duration = Duration::from_secs(20);
+
+ let time_window_size = Some(Duration::from_secs(60));
+ let min_refresh = Duration::from_secs(30);
+ let max_timeout = Duration::from_secs(5);
+ let flow_id = 1;
+
+ let result = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ Some(max_timeout),
+ 20,
+ true,
+ );
+
+ assert_eq!(result, state.last_update_time + max_timeout);
+ }
+
+ /// Without the short-cadence flag the delay follows the time window size.
+ #[test]
+ fn test_full_snapshot_ignores_short_cadence() {
+ // When prefer_short_incremental_cadence is false (full snapshot mode),
+ // the normal long-cadence based on time_window_size applies.
+ let mut state = state_with_past_update(Duration::from_secs(10));
+ // Make last_query_duration small so the lower bound (time_window_size) dominates.
+ state.last_query_duration = Duration::from_secs(1);
+
+ let time_window_size = Some(Duration::from_secs(60)); // large window
+ let min_refresh = Duration::from_secs(5);
+ let flow_id = 1;
+
+ let result = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ None,
+ 20,
+ false, // prefer_short_incremental_cadence = false
+ );
+
+ // With normal cadence, result should be last_update_time + time_window_size
+ // (since last_query_duration < time_window_size).
+ let expected = state.last_update_time + Duration::from_secs(60);
+ assert_eq!(result, expected);
+ }
+
+ /// A dirty backlog larger than one query can cover forces immediate
+ /// scheduling, with or without the short-cadence flag.
+ #[test]
+ fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
+ // Dirty-window overflow must always schedule immediately,
+ // regardless of prefer_short_incremental_cadence.
+ let mut state = state_with_past_update(Duration::from_secs(10));
+ // Create a very large dirty backlog.
+ state
+ .dirty_time_windows
+ .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(3600)));
+
+ let time_window_size = Some(Duration::from_secs(1)); // tiny window => overflow
+ let min_refresh = Duration::from_secs(5);
+ let flow_id = 1;
+
+ // With short cadence flag.
+ let result = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ None,
+ 1, // max 1 filter => tiny capacity
+ true,
+ );
+ assert!(
+ result <= Instant::now(),
+ "dirty overflow should schedule immediately"
+ );
+
+ // Without short cadence flag — same behavior.
+ let result2 = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ None,
+ 1,
+ false,
+ );
+ assert!(
+ result2 <= Instant::now(),
+ "dirty overflow should schedule immediately"
+ );
+ }
+
+ /// Long cadence is used whenever the caller passes
+ /// `prefer_short_incremental_cadence = false`.
+ #[test]
+ fn test_incremental_disabled_ignores_short_cadence() {
+ // `get_next_start_query_time` does not look at `incremental_disabled`
+ // itself; whether short cadence is requested is decided by the caller
+ // (checkpoint_mode + !is_incremental_disabled).
+ //
+ // This test simulates a caller that computed
+ // prefer_short_incremental_cadence = false (e.g. incremental disabled
+ // or FullSnapshot mode), so the long cadence is used. Note that the
+ // state below is not actually put into the disabled state; only the
+ // flag value passed in matters here.
+ let mut state = state_with_past_update(Duration::from_secs(10));
+ state.last_query_duration = Duration::from_secs(1);
+
+ let time_window_size = Some(Duration::from_secs(60));
+ let min_refresh = Duration::from_secs(5);
+ let flow_id = 1;
+
+ let result = state.get_next_start_query_time(
+ flow_id,
+ &time_window_size,
+ min_refresh,
+ None,
+ 20,
+ false, // prefer_short_incremental_cadence = false
+ );
+
+ // With normal cadence, result should be last_update_time + time_window_size.
+ let expected = state.last_update_time + Duration::from_secs(60);
+ assert_eq!(result, expected);
+ }
}
diff --git a/src/flow/src/batching_mode/table_creator.rs b/src/flow/src/batching_mode/table_creator.rs
new file mode 100644
index 0000000000..05da055a40
--- /dev/null
+++ b/src/flow/src/batching_mode/table_creator.rs
@@ -0,0 +1,381 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use api::v1::CreateTableExpr;
+use datafusion_common::tree_node::TreeNode;
+use datafusion_expr::LogicalPlan;
+use datatypes::prelude::ConcreteDataType;
+use datatypes::schema::ColumnSchema;
+use operator::expr_helper::column_schemas_to_defs;
+use snafu::ResultExt;
+
+use crate::Error;
+use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+use crate::batching_mode::utils::FindGroupByFinalName;
+use crate::error::{ConvertColumnSchemaSnafu, DatafusionSnafu};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum QueryType {
+ /// The flow's query is a TQL query.
+ Tql,
+ /// The flow's query is a SQL query.
+ Sql,
+}
+
+/// Builds the `CreateTableExpr` of an auto-created sink table: the plan's output columns, an extra `AUTO_CREATED_UPDATE_AT_TS_COL` column for SQL flows, and an `AUTO_CREATED_PLACEHOLDER_TS_COL` time index when no timestamp column is available.
+// TODO(discord9): for now no default value is set for auto added column for compatibility reason with streaming mode, but this might change in favor of simpler code?
+pub(super) fn create_table_with_expr(
+ plan: &LogicalPlan,
+ sink_table_name: &[String; 3],
+ query_type: &QueryType,
+) -> Result<CreateTableExpr, Error> {
+ let table_def = match query_type {
+ QueryType::Sql => {
+ if let Some(def) = build_pk_from_aggr(plan)? {
+ def
+ } else {
+ build_by_sql_schema(plan)?
+ }
+ }
+ QueryType::Tql => {
+ // first try build from aggr, then from tql schema because tql query might not have aggr node
+ if let Some(table_def) = build_pk_from_aggr(plan)? {
+ table_def
+ } else {
+ build_by_tql_schema(plan)?
+ }
+ }
+ };
+ let first_time_stamp = table_def.ts_col;
+ let primary_keys = table_def.pks;
+
+ let mut column_schemas = Vec::new();
+ for field in plan.schema().fields() {
+ let name = field.name();
+ let ty = ConcreteDataType::from_arrow_type(field.data_type());
+ let col_schema = if first_time_stamp.as_ref() == Some(name) {
+ ColumnSchema::new(name, ty, false).with_time_index(true)
+ } else {
+ ColumnSchema::new(name, ty, true)
+ };
+
+ match query_type {
+ QueryType::Sql => {
+ column_schemas.push(col_schema);
+ }
+ QueryType::Tql => {
+ // a value column (neither tag nor time index) is stored as DOUBLE NULL
+ // a tag column (primary key) is stored as STRING NULL
+ let is_tag_column = primary_keys.contains(name);
+ let is_val_column = !is_tag_column && first_time_stamp.as_ref() != Some(name);
+ if is_val_column {
+ let col_schema =
+ ColumnSchema::new(name, ConcreteDataType::float64_datatype(), true);
+ column_schemas.push(col_schema);
+ } else if is_tag_column {
+ let col_schema =
+ ColumnSchema::new(name, ConcreteDataType::string_datatype(), true);
+ column_schemas.push(col_schema);
+ } else {
+ // time index column
+ column_schemas.push(col_schema);
+ }
+ }
+ }
+ }
+
+ if query_type == &QueryType::Sql {
+ let update_at_schema = ColumnSchema::new(
+ AUTO_CREATED_UPDATE_AT_TS_COL,
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ true,
+ );
+ column_schemas.push(update_at_schema);
+ }
+
+ let time_index = if let Some(time_index) = first_time_stamp {
+ time_index
+ } else {
+ column_schemas.push(
+ ColumnSchema::new(
+ AUTO_CREATED_PLACEHOLDER_TS_COL,
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ false,
+ )
+ .with_time_index(true),
+ );
+ AUTO_CREATED_PLACEHOLDER_TS_COL.to_string()
+ };
+
+ let column_defs =
+ column_schemas_to_defs(column_schemas, &primary_keys).context(ConvertColumnSchemaSnafu)?;
+ Ok(CreateTableExpr {
+ catalog_name: sink_table_name[0].clone(),
+ schema_name: sink_table_name[1].clone(),
+ table_name: sink_table_name[2].clone(),
+ desc: "Auto created table by flow engine".to_string(),
+ column_defs,
+ time_index,
+ primary_keys,
+ create_if_not_exists: true,
+ table_options: Default::default(),
+ table_id: None,
+ engine: "mito".to_string(),
+ })
+}
+
+/// Builds a `TableDef` from the plan's output schema alone: the first timestamp column (if any) is the time index and there is no primary key.
+fn build_by_sql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
+ let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
+ if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
+ Some(f.name().clone())
+ } else {
+ None
+ }
+ });
+ Ok(TableDef {
+ ts_col: first_time_stamp,
+ pks: vec![],
+ })
+}
+
+/// Builds a `TableDef` from the plan's output schema: the first timestamp column (if any) is the time index and every string column is a primary key.
+fn build_by_tql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
+ let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
+ if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
+ Some(f.name().clone())
+ } else {
+ None
+ }
+ });
+ let string_columns = plan
+ .schema()
+ .fields()
+ .iter()
+ .filter_map(|f| {
+ if ConcreteDataType::from_arrow_type(f.data_type()).is_string() {
+ Some(f.name().clone())
+ } else {
+ None
+ }
+ })
+ .collect::<Vec<_>>();
+
+ Ok(TableDef {
+ ts_col: first_time_stamp,
+ pks: string_columns,
+ })
+}
+
+struct TableDef {
+ ts_col: Option<String>, // name of the column used as time index, if the plan provides one
+ pks: Vec<String>, // names of the primary key columns
+}
+
+/// Derive the time index (`ts_col`) and primary keys (`pks`) of an auto-created table from the plan's group-by keys.
+///
+/// # Returns
+///
+/// * `Ok(None)` - no aggregation found in the plan
+/// * `Ok(Some(_))` with group-by keys - `ts_col` is the first timestamp group-by column, `pks` the other group-by columns
+/// * `Ok(Some(_))` with an empty group-by - `ts_col` is the first timestamp output column, `pks` is empty
+/// * `Err(_)` - visiting the plan failed
+fn build_pk_from_aggr(plan: &LogicalPlan) -> Result<Option<TableDef>, Error> {
+ let fields = plan.schema().fields();
+ let mut pk_names = FindGroupByFinalName::default();
+
+ plan.visit(&mut pk_names)
+ .with_context(|_| DatafusionSnafu {
+ context: format!("Can't find aggr expr in plan {plan:?}"),
+ })?;
+
+ // no aggregation: return None; empty group by: first timestamp column of the output schema and no primary key
+ let Some(pk_final_names) = pk_names.get_group_expr_names() else {
+ return Ok(None);
+ };
+ if pk_final_names.is_empty() {
+ let first_ts_col = fields
+ .iter()
+ .find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
+ .map(|f| f.name().clone());
+ return Ok(Some(TableDef {
+ ts_col: first_ts_col,
+ pks: vec![],
+ }));
+ }
+
+ let all_pk_cols: Vec<_> = fields
+ .iter()
+ .filter(|f| pk_final_names.contains(f.name()))
+ .map(|f| f.name().clone())
+ .collect();
+ // Auto-created tables use the first timestamp column in the group-by keys
+ // as the time index. It is possible that timestamp columns appear only as
+ // aggregate outputs (for example `max(ts)`) and are not group-by keys; in
+ // that case `first_time_stamp` stays `None` and the caller falls back to a
+ // placeholder time index column.
+ let first_time_stamp = fields
+ .iter()
+ .find(|f| {
+ all_pk_cols.contains(f.name())
+ && ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp()
+ })
+ .map(|f| f.name().clone());
+
+ let all_pk_cols: Vec<_> = all_pk_cols
+ .into_iter()
+ .filter(|col| first_time_stamp.as_ref() != Some(col))
+ .collect();
+
+ Ok(Some(TableDef {
+ ts_col: first_time_stamp,
+ pks: all_pk_cols,
+ }))
+}
+
+#[cfg(test)]
+mod test {
+ use api::v1::column_def::try_as_column_schema;
+ use datatypes::prelude::ConcreteDataType;
+ use datatypes::schema::ColumnSchema;
+ use pretty_assertions::assert_eq;
+ use session::context::QueryContext;
+
+ use super::*;
+ use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+ use crate::batching_mode::utils::sql_to_df_plan;
+ use crate::test_utils::create_test_query_engine;
+
+ #[tokio::test]
+ async fn test_gen_create_table_sql() {
+ let query_engine = create_test_query_engine();
+ let ctx = QueryContext::arc();
+ struct TestCase {
+ sql: String,
+ sink_table_name: String,
+ column_schemas: Vec<ColumnSchema>,
+ primary_keys: Vec<String>,
+ time_index: String,
+ }
+
+ let update_at_schema = ColumnSchema::new(
+ AUTO_CREATED_UPDATE_AT_TS_COL,
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ true,
+ );
+
+ let ts_placeholder_schema = ColumnSchema::new(
+ AUTO_CREATED_PLACEHOLDER_TS_COL,
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ false,
+ )
+ .with_time_index(true);
+
+ let testcases = vec![
+ TestCase {
+ sql: "SELECT number, ts FROM numbers_with_ts".to_string(),
+ sink_table_name: "new_table".to_string(),
+ column_schemas: vec![
+ ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+ ColumnSchema::new(
+ "ts",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ false,
+ )
+ .with_time_index(true),
+ update_at_schema.clone(),
+ ],
+ primary_keys: vec![],
+ time_index: "ts".to_string(),
+ },
+ TestCase {
+ sql: "SELECT number, max(ts) FROM numbers_with_ts GROUP BY number".to_string(),
+ sink_table_name: "new_table".to_string(),
+ column_schemas: vec![
+ ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+ ColumnSchema::new(
+ "max(numbers_with_ts.ts)",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ true,
+ ),
+ update_at_schema.clone(),
+ ts_placeholder_schema.clone(),
+ ],
+ primary_keys: vec!["number".to_string()],
+ time_index: AUTO_CREATED_PLACEHOLDER_TS_COL.to_string(),
+ },
+ TestCase {
+ sql: "SELECT max(number), ts FROM numbers_with_ts GROUP BY ts".to_string(),
+ sink_table_name: "new_table".to_string(),
+ column_schemas: vec![
+ ColumnSchema::new(
+ "max(numbers_with_ts.number)",
+ ConcreteDataType::uint32_datatype(),
+ true,
+ ),
+ ColumnSchema::new(
+ "ts",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ false,
+ )
+ .with_time_index(true),
+ update_at_schema.clone(),
+ ],
+ primary_keys: vec![],
+ time_index: "ts".to_string(),
+ },
+ TestCase {
+ sql: "SELECT number, ts FROM numbers_with_ts GROUP BY ts, number".to_string(),
+ sink_table_name: "new_table".to_string(),
+ column_schemas: vec![
+ ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+ ColumnSchema::new(
+ "ts",
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ false,
+ )
+ .with_time_index(true),
+ update_at_schema.clone(),
+ ],
+ primary_keys: vec!["number".to_string()],
+ time_index: "ts".to_string(),
+ },
+ ];
+
+ for tc in testcases {
+ let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &tc.sql, true)
+ .await
+ .unwrap();
+ let expr = create_table_with_expr(
+ &plan,
+ &[
+ "greptime".to_string(),
+ "public".to_string(),
+ tc.sink_table_name.clone(),
+ ],
+ &QueryType::Sql,
+ )
+ .unwrap();
+ // TODO(discord9): assert the remaining fields of expr (names, options, engine)
+ let column_schemas = expr
+ .column_defs
+ .iter()
+ .map(|c| try_as_column_schema(c).unwrap())
+ .collect::<Vec<_>>();
+ assert_eq!(tc.column_schemas, column_schemas, "{:?}", tc.sql);
+ assert_eq!(tc.primary_keys, expr.primary_keys, "{:?}", tc.sql);
+ assert_eq!(tc.time_index, expr.time_index, "{:?}", tc.sql);
+ }
+ }
+}
diff --git a/src/flow/src/batching_mode/task.rs b/src/flow/src/batching_mode/task.rs
index 84c96cc7cd..3cdf7899a6 100644
--- a/src/flow/src/batching_mode/task.rs
+++ b/src/flow/src/batching_mode/task.rs
@@ -28,13 +28,12 @@ use datafusion::sql::unparser::expr_to_sql;
use datafusion_common::DFSchemaRef;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp};
-use datatypes::prelude::ConcreteDataType;
-use datatypes::schema::{ColumnSchema, Schema};
-use operator::expr_helper::column_schemas_to_defs;
+use datatypes::schema::Schema;
use query::QueryEngineRef;
+use query::options::FLOW_INCREMENTAL_MODE;
use query::query_engine::DefaultSerializer;
use session::context::QueryContextRef;
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ResultExt};
use sql::parsers::utils::is_tql;
use store_api::mito_engine_options::MERGE_MODE_KEY;
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
@@ -43,19 +42,20 @@ use tokio::sync::oneshot;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::time::Instant;
-use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
use crate::batching_mode::BatchingModeOptions;
-use crate::batching_mode::frontend_client::FrontendClient;
-use crate::batching_mode::state::{FilterExprInfo, TaskState};
+use crate::batching_mode::checkpoint::checkpoint_mode_label;
+use crate::batching_mode::frontend_client::{FrontendClient, PeerDesc};
+use crate::batching_mode::state::{CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState};
+use crate::batching_mode::table_creator::{QueryType, create_table_with_expr};
use crate::batching_mode::time_window::TimeWindowExpr;
use crate::batching_mode::utils::{
- AddFilterRewriter, ColumnMatcherRewriter, FindGroupByFinalName, gen_plan_with_matching_schema,
+ AddFilterRewriter, ColumnMatcherRewriter, gen_plan_with_matching_schema,
get_table_info_df_schema, sql_to_df_plan,
};
use crate::df_optimizer::apply_df_optimizer;
use crate::error::{
- ConvertColumnSchemaSnafu, DatafusionSnafu, ExternalSnafu, InvalidQuerySnafu,
- SubstraitEncodeLogicalPlanSnafu, UnexpectedSnafu,
+ DatafusionSnafu, ExternalSnafu, InvalidQuerySnafu, SubstraitEncodeLogicalPlanSnafu,
+ UnexpectedSnafu,
};
use crate::metrics::{
METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME,
@@ -64,6 +64,15 @@ use crate::metrics::{
};
use crate::{Error, FlowId};
+mod ckpt;
+mod inc;
+
+/// Maximum number of dirty time-window predicates attached to one incremental
+/// SQL query. This keeps generated OR filters bounded so Substrait encoding and
+/// downstream planning remain predictable; if the backlog is larger, the flow
+/// drains one capped batch and postpones checkpoint advancement to a later run.
+const MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS: usize = 4096;
+
/// The task's config, immutable once created
#[derive(Clone)]
pub struct TaskConfig {
@@ -100,14 +109,6 @@ fn is_merge_mode_last_non_null(options: &HashMap) -> bool {
.unwrap_or(false)
}
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum QueryType {
- /// query is a tql query
- Tql,
- /// query is a sql query
- Sql,
-}
-
#[derive(Clone)]
pub struct BatchingTask {
pub config: Arc,
@@ -132,7 +133,21 @@ pub struct TaskArgs<'a> {
pub struct PlanInfo {
pub plan: LogicalPlan,
- pub filter: Option,
+ pub dirty_restore: DirtyRestore,
+ pub can_advance_checkpoints: bool,
+}
+
+pub enum DirtyRestore {
+ /// The query was scoped to dirty time ranges; restore those ranges if the
+ /// run fails.
+ Scoped(FilterExprInfo),
+ /// The query could not be scoped to dirty time ranges, so the dirty-window
+ /// state is only a dirty signal. Restore the consumed signal if the full
+ /// run fails.
+ ///
+ /// TODO(discord9): Full-query runs only need a dirty bool flag. Refactor
+ /// the unscoped path to stop reusing `DirtyTimeWindows` for this signal.
+ Unscoped(DirtyTimeWindows),
}
impl BatchingTask {
@@ -210,7 +225,7 @@ impl BatchingTask {
&self,
engine: &QueryEngineRef,
frontend_client: &Arc,
- ) -> Result, Error> {
+ ) -> Result , Error> {
if !self.is_table_exist(&self.config.sink_table_name).await? {
let create_table = self.gen_create_table_expr(engine.clone()).await?;
info!(
@@ -241,11 +256,28 @@ impl BatchingTask {
engine: &QueryEngineRef,
frontend_client: &Arc,
max_window_cnt: Option,
- ) -> Result, Error> {
+ ) -> Result , Error> {
if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? {
debug!("Generate new query: {}", new_query.plan);
- self.execute_logical_plan(frontend_client, &new_query.plan)
+ let dirty_filter = match &new_query.dirty_restore {
+ DirtyRestore::Scoped(f) => Some(f),
+ _ => None,
+ };
+ match self
+ .execute_logical_plan(
+ frontend_client,
+ &new_query.plan,
+ dirty_filter,
+ new_query.can_advance_checkpoints,
+ )
.await
+ {
+ Ok(result) => Ok(result),
+ Err(err) => {
+ self.handle_executed_query_failure(Some(&new_query));
+ Err(err)
+ }
+ }
} else {
debug!("Generate no query");
Ok(None)
@@ -278,57 +310,68 @@ impl BatchingTask {
)
.await?;
- let insert_into_info = if let Some(new_query) = new_query {
- // first check if all columns in input query exists in sink table
- // since insert into ref to names in record batch generate by given query
- let table_columns = df_schema
- .columns()
- .into_iter()
- .map(|c| c.name)
- .collect::>();
- for column in new_query.plan.schema().columns() {
- ensure!(
- table_columns.contains(column.name()),
- InvalidQuerySnafu {
- reason: format!(
- "Column {} not found in sink table with columns {:?}",
- column, table_columns
- ),
- }
- );
- }
-
- let table_provider = Arc::new(DfTableProviderAdapter::new(table));
- let table_source = Arc::new(DefaultTableSource::new(table_provider));
-
- // update_at& time index placeholder (if exists) should have default value
- let plan = LogicalPlan::Dml(DmlStatement::new(
- datafusion_common::TableReference::Full {
- catalog: self.config.sink_table_name[0].clone().into(),
- schema: self.config.sink_table_name[1].clone().into(),
- table: self.config.sink_table_name[2].clone().into(),
- },
- table_source,
- WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
- Arc::new(new_query.plan),
- ));
- PlanInfo {
- plan,
- filter: new_query.filter,
- }
- } else {
+ let Some(new_query) = new_query else {
return Ok(None);
};
- let insert_into = insert_into_info
- .plan
- .recompute_schema()
- .context(DatafusionSnafu {
- context: "Failed to recompute schema",
- })?;
+
+ // first check if all columns in input query exists in sink table
+ // since insert into ref to names in record batch generate by given query
+ let table_columns = df_schema
+ .columns()
+ .into_iter()
+ .map(|c| c.name)
+ .collect::>();
+ for column in new_query.plan.schema().columns() {
+ if !table_columns.contains(column.name()) {
+ self.restore_dirty_windows_after_failure(&new_query);
+ return InvalidQuerySnafu {
+ reason: format!(
+ "Column {} not found in sink table with columns {:?}",
+ column, table_columns
+ ),
+ }
+ .fail();
+ }
+ }
+
+ let table_provider = Arc::new(DfTableProviderAdapter::new(table));
+ let table_source = Arc::new(DefaultTableSource::new(table_provider));
+
+ // update_at& time index placeholder (if exists) should have default value
+ let plan = LogicalPlan::Dml(DmlStatement::new(
+ datafusion_common::TableReference::Full {
+ catalog: self.config.sink_table_name[0].clone().into(),
+ schema: self.config.sink_table_name[1].clone().into(),
+ table: self.config.sink_table_name[2].clone().into(),
+ },
+ table_source,
+ WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+ Arc::new(new_query.plan.clone()),
+ ));
+ let insert_into_info = PlanInfo {
+ plan,
+ dirty_restore: new_query.dirty_restore,
+ can_advance_checkpoints: new_query.can_advance_checkpoints,
+ };
+ let insert_into =
+ match insert_into_info
+ .plan
+ .clone()
+ .recompute_schema()
+ .context(DatafusionSnafu {
+ context: "Failed to recompute schema",
+ }) {
+ Ok(insert_into) => insert_into,
+ Err(err) => {
+ self.restore_dirty_windows_after_failure(&insert_into_info);
+ return Err(err);
+ }
+ };
Ok(Some(PlanInfo {
plan: insert_into,
- filter: insert_into_info.filter,
+ dirty_restore: insert_into_info.dirty_restore,
+ can_advance_checkpoints: insert_into_info.can_advance_checkpoints,
}))
}
@@ -349,7 +392,9 @@ impl BatchingTask {
&self,
frontend_client: &Arc,
plan: &LogicalPlan,
- ) -> Result, Error> {
+ dirty_filter: Option<&FilterExprInfo>,
+ can_advance_checkpoints: bool,
+ ) -> Result , Error> {
let instant = Instant::now();
let flow_id = self.config.flow_id;
@@ -378,81 +423,167 @@ impl BatchingTask {
})?
.data;
- let mut peer_desc = None;
+ // For incremental-mode SQL queries, attempt to rewrite the delta aggregate
+ // plan into a safe delta-LEFT-JOIN-sink form before deciding on extensions.
+ let incremental_plan = if can_advance_checkpoints {
+ self.prepare_plan_for_incremental(&plan, dirty_filter)
+ .await?
+ } else {
+ None
+ };
+ let incremental_safe = incremental_plan.is_some();
+ let plan = incremental_plan.unwrap_or_else(|| plan.clone());
+ let extensions = self
+ .build_flow_query_extensions(incremental_safe, can_advance_checkpoints)
+ .await?;
+ let extension_refs = extensions
+ .iter()
+ .map(|(key, value)| (*key, value.as_str()))
+ .collect::>();
+ let query_mode = if extensions
+ .iter()
+ .any(|(key, _)| *key == FLOW_INCREMENTAL_MODE)
+ {
+ CheckpointMode::Incremental
+ } else {
+ CheckpointMode::FullSnapshot
+ };
+ Self::record_query_mode(flow_id, query_mode);
+ debug!(
+ "Flow {flow_id} executing batching query with checkpoint_mode={}, extension_count={}",
+ checkpoint_mode_label(query_mode),
+ extensions.len()
+ );
+
+ let mut peer_desc = None;
let res = {
let _timer = METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME
.with_label_values(&[flow_id.to_string().as_str()])
.start_timer();
- // hack and special handling the insert logical plan
let req = if let Some((insert_to, insert_plan)) =
breakup_insert_plan(&plan, catalog, schema)
{
let message = DFLogicalSubstraitConvertor {}
.encode(&insert_plan, DefaultSerializer)
.context(SubstraitEncodeLogicalPlanSnafu)?;
- api::v1::greptime_request::Request::Query(api::v1::QueryRequest {
+ api::v1::QueryRequest {
query: Some(api::v1::query_request::Query::InsertIntoPlan(
api::v1::InsertIntoPlan {
table_name: Some(insert_to),
logical_plan: message.to_vec(),
},
)),
- })
+ }
} else {
let message = DFLogicalSubstraitConvertor {}
.encode(&plan, DefaultSerializer)
.context(SubstraitEncodeLogicalPlanSnafu)?;
- api::v1::greptime_request::Request::Query(api::v1::QueryRequest {
+ api::v1::QueryRequest {
query: Some(api::v1::query_request::Query::LogicalPlan(message.to_vec())),
- })
+ }
};
frontend_client
- .handle(req, catalog, schema, &mut peer_desc)
+ .query_with_terminal_metrics(catalog, schema, req, &extension_refs, &mut peer_desc)
.await
};
let elapsed = instant.elapsed();
- if let Ok(affected_rows) = &res {
- debug!(
- "Flow {flow_id} executed, affected_rows: {affected_rows:?}, elapsed: {:?}",
- elapsed
- );
- METRIC_FLOW_ROWS
- .with_label_values(&[format!("{}-out-batching", flow_id).as_str()])
- .inc_by(*affected_rows as _);
- } else if let Err(err) = &res {
+ let peer_label = peer_desc
+ .as_ref()
+ .map(ToString::to_string)
+ .unwrap_or_else(|| PeerDesc::default().to_string());
+ if let Err(err) = &res {
warn!(
- "Failed to execute Flow {flow_id} on frontend {:?}, result: {err:?}, elapsed: {:?} with query: {}",
- peer_desc, elapsed, &plan
+ "Failed to execute Flow {flow_id} on frontend {peer_label}, result: {err:?}, elapsed: {:?} with query: {}",
+ elapsed, &plan
);
+ let decision = {
+ let mut state = self.state.write().unwrap();
+ let reason = Self::query_failure_reason(err);
+ Self::apply_query_failure_to_state(&mut state, elapsed, reason)
+ };
+ if let Some(decision) = decision {
+ Self::record_checkpoint_decision(flow_id, decision);
+ }
}
// record slow query
if elapsed >= self.config.batch_opts.slow_query_threshold {
warn!(
- "Flow {flow_id} on frontend {:?} executed for {:?} before complete, query: {}",
- peer_desc, elapsed, &plan
+ "Flow {flow_id} on frontend {peer_label} executed for {:?} before complete, query: {}",
+ elapsed, &plan
);
+ let flow_id = flow_id.to_string();
METRIC_FLOW_BATCHING_ENGINE_SLOW_QUERY
- .with_label_values(&[
- flow_id.to_string().as_str(),
- &peer_desc.unwrap_or_default().to_string(),
- ])
+ .with_label_values(&[flow_id.as_str(), peer_label.as_str()])
.observe(elapsed.as_secs_f64());
}
+ let res = res?;
+ let (affected_rows, _) = res.output.extract_rows_and_cost();
+ debug!(
+ "Flow {flow_id} executed, affected_rows: {affected_rows:?}, elapsed: {:?}, watermark: {:?}",
+ elapsed,
+ res.region_watermark_map()
+ );
+ METRIC_FLOW_ROWS
+ .with_label_values(&[format!("{}-out-batching", flow_id).as_str()])
+ .inc_by(affected_rows as _);
+ {
+ let mut state = self.state.write().unwrap();
+ let decision = Self::apply_query_result_to_state(
+ &mut state,
+ &res,
+ elapsed,
+ can_advance_checkpoints,
+ );
+ Self::record_checkpoint_decision(flow_id, decision);
+ }
+
+ Ok(Some((affected_rows, elapsed)))
+ }
+
+ /// Restore dirty windows consumed by a failed query so they are retried on
+ /// the next execution: a scoped query re-adds the time ranges of its filter,
+ /// an unscoped query re-adds the dirty windows carried in `dirty_restore`.
+ fn restore_dirty_windows_after_failure(&self, query: &PlanInfo) {
+ match &query.dirty_restore {
+ DirtyRestore::Scoped(filter) => self.restore_scoped_dirty_windows(filter),
+ DirtyRestore::Unscoped(dirty_windows) => self
+ .state
+ .write()
+ .unwrap()
+ .dirty_time_windows
+ .add_dirty_windows(dirty_windows),
+ }
+ }
+
+ fn restore_scoped_dirty_windows(&self, filter: &FilterExprInfo) {
self.state
.write()
.unwrap()
- .after_query_exec(elapsed, res.is_ok());
+ .dirty_time_windows
+ .add_windows(filter.time_ranges.clone());
+ }
- let res = res?;
+ /// Passes `result` through unchanged; on `Err`, first re-adds the time ranges
+ /// of `filter` to the dirty windows so the failed scope is retried later.
+ fn restore_scoped_dirty_windows_on_err<T>(
+ &self,
+ filter: &FilterExprInfo,
+ result: Result<T, Error>,
+ ) -> Result<T, Error> {
+ result.inspect_err(|_| self.restore_scoped_dirty_windows(filter))
+ }
- Ok(Some((res, elapsed)))
+ fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) {
+ if let Some(query) = query {
+ self.restore_dirty_windows_after_failure(query);
+ }
}
/// start executing query in a loop, break when receive shutdown signal
@@ -506,8 +637,17 @@ impl BatchingTask {
};
let res = if let Some(new_query) = &new_query {
- self.execute_logical_plan(&frontend_client, &new_query.plan)
- .await
+ let dirty_filter = match &new_query.dirty_restore {
+ DirtyRestore::Scoped(f) => Some(f),
+ _ => None,
+ };
+ self.execute_logical_plan(
+ &frontend_client,
+ &new_query.plan,
+ dirty_filter,
+ new_query.can_advance_checkpoints,
+ )
+ .await
} else {
Ok(None)
};
@@ -535,12 +675,17 @@ impl BatchingTask {
.as_ref()
.and_then(|t| *t.time_window_size());
+ let prefer_short_incremental_cadence = state.checkpoint_mode()
+ == CheckpointMode::Incremental
+ && !state.is_incremental_disabled();
+
state.get_next_start_query_time(
self.config.flow_id,
&time_window_size,
min_refresh,
Some(self.config.batch_opts.query_timeout),
self.config.batch_opts.experimental_max_filter_num_per_query,
+ prefer_short_incremental_cadence,
)
};
@@ -558,16 +703,13 @@ impl BatchingTask {
}
// TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed
Err(err) => {
+ self.handle_executed_query_failure(new_query.as_ref());
METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT
.with_label_values(&[&flow_id_str])
.inc();
match new_query {
Some(query) => {
common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan);
- // Re-add dirty windows back since query failed
- self.state.write().unwrap().dirty_time_windows.add_windows(
- query.filter.map(|f| f.time_ranges).unwrap_or_default(),
- );
// TODO(discord9): add some backoff here? half the query time window or what
// backoff meaning use smaller `max_window_cnt` for next query
@@ -641,8 +783,13 @@ impl BatchingTask {
self.config.flow_id
);
// clean dirty time window too, this could be from create flow's check_execute
- let is_dirty = !self.state.read().unwrap().dirty_time_windows.is_empty();
- self.state.write().unwrap().dirty_time_windows.clean();
+ let (is_dirty, dirty_windows_to_restore) = {
+ let mut state = self.state.write().unwrap();
+ let dirty_windows_to_restore = state.dirty_time_windows.clone();
+ let is_dirty = !dirty_windows_to_restore.is_empty();
+ state.dirty_time_windows.clean();
+ (is_dirty, dirty_windows_to_restore)
+ };
if !is_dirty {
// no dirty data, hence no need to update
@@ -650,7 +797,7 @@ impl BatchingTask {
return Ok(None);
}
- let plan = gen_plan_with_matching_schema(
+ let plan = match gen_plan_with_matching_schema(
&self.config.query,
query_ctx,
engine,
@@ -658,15 +805,36 @@ impl BatchingTask {
primary_key_indices,
allow_partial,
)
- .await?;
+ .await
+ {
+ Ok(plan) => plan,
+ Err(err) => {
+ self.state
+ .write()
+ .unwrap()
+ .dirty_time_windows
+ .add_dirty_windows(&dirty_windows_to_restore);
+ return Err(err);
+ }
+ };
- return Ok(Some(PlanInfo { plan, filter: None }));
+ return Ok(Some(PlanInfo {
+ plan,
+ dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
+ can_advance_checkpoints: true,
+ }));
}
_ => {
- // clean for tql have no use for time window
- self.state.write().unwrap().dirty_time_windows.clean();
+ // Clean dirty windows for full-query/non-scoped paths,
+ // such as TQL, that cannot use a time-window filter.
+ let dirty_windows_to_restore = {
+ let mut state = self.state.write().unwrap();
+ let dirty_windows_to_restore = state.dirty_time_windows.clone();
+ state.dirty_time_windows.clean();
+ dirty_windows_to_restore
+ };
- let plan = gen_plan_with_matching_schema(
+ let plan = match gen_plan_with_matching_schema(
&self.config.query,
query_ctx,
engine,
@@ -674,9 +842,24 @@ impl BatchingTask {
primary_key_indices,
allow_partial,
)
- .await?;
+ .await
+ {
+ Ok(plan) => plan,
+ Err(err) => {
+ self.state
+ .write()
+ .unwrap()
+ .dirty_time_windows
+ .add_dirty_windows(&dirty_windows_to_restore);
+ return Err(err);
+ }
+ };
- return Ok(Some(PlanInfo { plan, filter: None }));
+ return Ok(Some(PlanInfo {
+ plan,
+ dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
+ can_advance_checkpoints: true,
+ }));
}
};
@@ -706,33 +889,33 @@ impl BatchingTask {
),
})?;
- let expr = self
- .state
- .write()
- .unwrap()
- .dirty_time_windows
- .gen_filter_exprs(
+ let (expr, can_advance_checkpoints) = {
+ let mut state = self.state.write().unwrap();
+ let window_cnt = if state.checkpoint_mode() == CheckpointMode::Incremental
+ && !state.is_incremental_disabled()
+ && matches!(self.config.query_type, QueryType::Sql)
+ {
+ // Incremental scans are bounded by region sequence checkpoints,
+ // so the dirty-window filter only narrows sink-side/time-window
+ // work. Drain more windows than normal, but keep a hard cap to
+ // avoid building a huge OR filter after a long downtime. If
+ // windows remain, checkpoints won't advance this round.
+ MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS
+ } else {
+ max_window_cnt
+ .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query)
+ };
+ let expr = state.dirty_time_windows.gen_filter_exprs(
&col_name,
Some(expire_lower_bound),
window_size,
- max_window_cnt
- .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query),
+ window_cnt,
self.config.flow_id,
Some(self),
)?;
-
- debug!(
- "Flow id={:?}, Generated filter expr: {:?}",
- self.config.flow_id,
- expr.as_ref()
- .map(
- |expr| expr_to_sql(&expr.expr).with_context(|_| DatafusionSnafu {
- context: format!("Failed to generate filter expr from {expr:?}"),
- })
- )
- .transpose()?
- .map(|s| s.to_string())
- );
+ let can_advance_checkpoints = state.dirty_time_windows.is_empty();
+ (expr, can_advance_checkpoints)
+ };
let Some(expr) = expr else {
// no new data, hence no need to update
@@ -740,6 +923,15 @@ impl BatchingTask {
return Ok(None);
};
+ let filter_sql = expr_to_sql(&expr.expr)
+ .map(|sql| sql.to_string())
+ .unwrap_or_else(|err| format!(""));
+
+ debug!(
+ "Flow id={:?}, Generated filter expr: {:?}",
+ self.config.flow_id, filter_sql
+ );
+
let mut add_filter = AddFilterRewriter::new(expr.expr.clone());
let mut add_auto_column = ColumnMatcherRewriter::new(
sink_table_schema.clone(),
@@ -747,363 +939,35 @@ impl BatchingTask {
allow_partial,
);
- let plan =
- sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false).await?;
- let rewrite = plan
- .clone()
- .rewrite(&mut add_filter)
- .and_then(|p| p.data.rewrite(&mut add_auto_column))
- .with_context(|_| DatafusionSnafu {
- context: format!("Failed to rewrite plan:\n {}\n", plan),
- })?
- .data;
+ let plan = self.restore_scoped_dirty_windows_on_err(
+ &expr,
+ sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false).await,
+ )?;
+ let rewrite = self.restore_scoped_dirty_windows_on_err(
+ &expr,
+ plan.clone()
+ .rewrite(&mut add_filter)
+ .and_then(|p| p.data.rewrite(&mut add_auto_column))
+ .with_context(|_| DatafusionSnafu {
+ context: format!("Failed to rewrite plan:\n {}\n", plan),
+ })
+ .map(|rewrite| rewrite.data),
+ )?;
// only apply optimize after complex rewrite is done
- let new_plan = apply_df_optimizer(rewrite, &query_ctx).await?;
+ let new_plan = self.restore_scoped_dirty_windows_on_err(
+ &expr,
+ apply_df_optimizer(rewrite, &query_ctx).await,
+ )?;
let info = PlanInfo {
plan: new_plan.clone(),
- filter: Some(expr),
+ dirty_restore: DirtyRestore::Scoped(expr),
+ can_advance_checkpoints,
};
Ok(Some(info))
}
}
-// auto created table have a auto added column `update_at`, and optional have a `AUTO_CREATED_PLACEHOLDER_TS_COL` column for time index placeholder if no timestamp column is specified
-// TODO(discord9): for now no default value is set for auto added column for compatibility reason with streaming mode, but this might change in favor of simpler code?
-fn create_table_with_expr(
- plan: &LogicalPlan,
- sink_table_name: &[String; 3],
- query_type: &QueryType,
-) -> Result<CreateTableExpr, Error> {
- let table_def = match query_type {
- &QueryType::Sql => {
- if let Some(def) = build_pk_from_aggr(plan)? {
- def
- } else {
- build_by_sql_schema(plan)?
- }
- }
- QueryType::Tql => {
- // first try build from aggr, then from tql schema because tql query might not have aggr node
- if let Some(table_def) = build_pk_from_aggr(plan)? {
- table_def
- } else {
- build_by_tql_schema(plan)?
- }
- }
- };
- let first_time_stamp = table_def.ts_col;
- let primary_keys = table_def.pks;
-
- let mut column_schemas = Vec::new();
- for field in plan.schema().fields() {
- let name = field.name();
- let ty = ConcreteDataType::from_arrow_type(field.data_type());
- let col_schema = if first_time_stamp == Some(name.clone()) {
- ColumnSchema::new(name, ty, false).with_time_index(true)
- } else {
- ColumnSchema::new(name, ty, true)
- };
-
- match query_type {
- QueryType::Sql => {
- column_schemas.push(col_schema);
- }
- QueryType::Tql => {
- // if is val column, need to rename as val DOUBLE NULL
- // if is tag column, need to cast type as STRING NULL
- let is_tag_column = primary_keys.contains(name);
- let is_val_column = !is_tag_column && first_time_stamp.as_ref() != Some(name);
- if is_val_column {
- let col_schema =
- ColumnSchema::new(name, ConcreteDataType::float64_datatype(), true);
- column_schemas.push(col_schema);
- } else if is_tag_column {
- let col_schema =
- ColumnSchema::new(name, ConcreteDataType::string_datatype(), true);
- column_schemas.push(col_schema);
- } else {
- // time index column
- column_schemas.push(col_schema);
- }
- }
- }
- }
-
- if query_type == &QueryType::Sql {
- let update_at_schema = ColumnSchema::new(
- AUTO_CREATED_UPDATE_AT_TS_COL,
- ConcreteDataType::timestamp_millisecond_datatype(),
- true,
- );
- column_schemas.push(update_at_schema);
- }
-
- let time_index = if let Some(time_index) = first_time_stamp {
- time_index
- } else {
- column_schemas.push(
- ColumnSchema::new(
- AUTO_CREATED_PLACEHOLDER_TS_COL,
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- )
- .with_time_index(true),
- );
- AUTO_CREATED_PLACEHOLDER_TS_COL.to_string()
- };
-
- let column_defs =
- column_schemas_to_defs(column_schemas, &primary_keys).context(ConvertColumnSchemaSnafu)?;
- Ok(CreateTableExpr {
- catalog_name: sink_table_name[0].clone(),
- schema_name: sink_table_name[1].clone(),
- table_name: sink_table_name[2].clone(),
- desc: "Auto created table by flow engine".to_string(),
- column_defs,
- time_index,
- primary_keys,
- create_if_not_exists: true,
- table_options: Default::default(),
- table_id: None,
- engine: "mito".to_string(),
- })
-}
-
-/// simply build by schema, return first timestamp column and no primary key
-fn build_by_sql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
- let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
- if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
- Some(f.name().clone())
- } else {
- None
- }
- });
- Ok(TableDef {
- ts_col: first_time_stamp,
- pks: vec![],
- })
-}
-
-/// Return first timestamp column found in output schema and all string columns
-fn build_by_tql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
- let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
- if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
- Some(f.name().clone())
- } else {
- None
- }
- });
- let string_columns = plan
- .schema()
- .fields()
- .iter()
- .filter_map(|f| {
- if ConcreteDataType::from_arrow_type(f.data_type()).is_string() {
- Some(f.name().clone())
- } else {
- None
- }
- })
- .collect::<Vec<_>>();
-
- Ok(TableDef {
- ts_col: first_time_stamp,
- pks: string_columns,
- })
-}
-
-struct TableDef {
- ts_col: Option<String>,
- pks: Vec<String>,
-}
-
-/// Return first timestamp column which is in group by clause and other columns which are also in group by clause
-///
-/// # Returns
-///
-/// * `Option<String>` - first timestamp column which is in group by clause
-/// * `Vec<String>` - other columns which are also in group by clause
-///
-/// if no aggregation found, return None
-fn build_pk_from_aggr(plan: &LogicalPlan) -> Result<Option<TableDef>, Error> {
- let fields = plan.schema().fields();
- let mut pk_names = FindGroupByFinalName::default();
-
- plan.visit(&mut pk_names)
- .with_context(|_| DatafusionSnafu {
- context: format!("Can't find aggr expr in plan {plan:?}"),
- })?;
-
- // if no group by clause, return empty with first timestamp column found in output schema
- let Some(pk_final_names) = pk_names.get_group_expr_names() else {
- return Ok(None);
- };
- if pk_final_names.is_empty() {
- let first_ts_col = fields
- .iter()
- .find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
- .map(|f| f.name().clone());
- return Ok(Some(TableDef {
- ts_col: first_ts_col,
- pks: vec![],
- }));
- }
-
- let all_pk_cols: Vec<_> = fields
- .iter()
- .filter(|f| pk_final_names.contains(f.name()))
- .map(|f| f.name().clone())
- .collect();
- // auto create table use first timestamp column in group by clause as time index
- let first_time_stamp = fields
- .iter()
- .find(|f| {
- all_pk_cols.contains(&f.name().clone())
- && ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp()
- })
- .map(|f| f.name().clone());
-
- let all_pk_cols: Vec<_> = all_pk_cols
- .into_iter()
- .filter(|col| first_time_stamp.as_ref() != Some(col))
- .collect();
-
- Ok(Some(TableDef {
- ts_col: first_time_stamp,
- pks: all_pk_cols,
- }))
-}
-
#[cfg(test)]
-mod test {
- use api::v1::column_def::try_as_column_schema;
- use pretty_assertions::assert_eq;
- use session::context::QueryContext;
-
- use super::*;
- use crate::test_utils::create_test_query_engine;
-
- #[tokio::test]
- async fn test_gen_create_table_sql() {
- let query_engine = create_test_query_engine();
- let ctx = QueryContext::arc();
- struct TestCase {
- sql: String,
- sink_table_name: String,
- column_schemas: Vec<ColumnSchema>,
- primary_keys: Vec<String>,
- time_index: String,
- }
-
- let update_at_schema = ColumnSchema::new(
- AUTO_CREATED_UPDATE_AT_TS_COL,
- ConcreteDataType::timestamp_millisecond_datatype(),
- true,
- );
-
- let ts_placeholder_schema = ColumnSchema::new(
- AUTO_CREATED_PLACEHOLDER_TS_COL,
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- )
- .with_time_index(true);
-
- let testcases = vec![
- TestCase {
- sql: "SELECT number, ts FROM numbers_with_ts".to_string(),
- sink_table_name: "new_table".to_string(),
- column_schemas: vec![
- ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
- ColumnSchema::new(
- "ts",
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- )
- .with_time_index(true),
- update_at_schema.clone(),
- ],
- primary_keys: vec![],
- time_index: "ts".to_string(),
- },
- TestCase {
- sql: "SELECT number, max(ts) FROM numbers_with_ts GROUP BY number".to_string(),
- sink_table_name: "new_table".to_string(),
- column_schemas: vec![
- ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
- ColumnSchema::new(
- "max(numbers_with_ts.ts)",
- ConcreteDataType::timestamp_millisecond_datatype(),
- true,
- ),
- update_at_schema.clone(),
- ts_placeholder_schema.clone(),
- ],
- primary_keys: vec!["number".to_string()],
- time_index: AUTO_CREATED_PLACEHOLDER_TS_COL.to_string(),
- },
- TestCase {
- sql: "SELECT max(number), ts FROM numbers_with_ts GROUP BY ts".to_string(),
- sink_table_name: "new_table".to_string(),
- column_schemas: vec![
- ColumnSchema::new(
- "max(numbers_with_ts.number)",
- ConcreteDataType::uint32_datatype(),
- true,
- ),
- ColumnSchema::new(
- "ts",
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- )
- .with_time_index(true),
- update_at_schema.clone(),
- ],
- primary_keys: vec![],
- time_index: "ts".to_string(),
- },
- TestCase {
- sql: "SELECT number, ts FROM numbers_with_ts GROUP BY ts, number".to_string(),
- sink_table_name: "new_table".to_string(),
- column_schemas: vec![
- ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
- ColumnSchema::new(
- "ts",
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- )
- .with_time_index(true),
- update_at_schema.clone(),
- ],
- primary_keys: vec!["number".to_string()],
- time_index: "ts".to_string(),
- },
- ];
-
- for tc in testcases {
- let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &tc.sql, true)
- .await
- .unwrap();
- let expr = create_table_with_expr(
- &plan,
- &[
- "greptime".to_string(),
- "public".to_string(),
- tc.sink_table_name.clone(),
- ],
- &QueryType::Sql,
- )
- .unwrap();
- // TODO(discord9): assert expr
- let column_schemas = expr
- .column_defs
- .iter()
- .map(|c| try_as_column_schema(c).unwrap())
- .collect::<Vec<_>>();
- assert_eq!(tc.column_schemas, column_schemas, "{:?}", tc.sql);
- assert_eq!(tc.primary_keys, expr.primary_keys, "{:?}", tc.sql);
- assert_eq!(tc.time_index, expr.time_index, "{:?}", tc.sql);
- }
- }
-}
+mod test;
diff --git a/src/flow/src/batching_mode/task/ckpt.rs b/src/flow/src/batching_mode/task/ckpt.rs
new file mode 100644
index 0000000000..035d30a079
--- /dev/null
+++ b/src/flow/src/batching_mode/task/ckpt.rs
@@ -0,0 +1,181 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use client::OutputWithMetrics;
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_telemetry::tracing::warn;
+use common_telemetry::{debug, info};
+
+use crate::batching_mode::checkpoint::{
+ FlowCheckpointDecision, FlowQueryFallbackReason, checkpoint_mode_label,
+};
+use crate::batching_mode::state::{CheckpointMode, TaskState};
+use crate::batching_mode::task::BatchingTask;
+use crate::metrics::{
+ METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT,
+};
+use crate::{Error, FlowId};
+
+impl BatchingTask {
+ pub(super) fn query_failure_reason(err: &Error) -> FlowQueryFallbackReason {
+ if err.status_code() == StatusCode::RequestOutdated {
+ FlowQueryFallbackReason::StaleCursor
+ } else {
+ FlowQueryFallbackReason::IncrementalQueryFailure
+ }
+ }
+
+ pub(super) fn apply_query_failure_to_state(
+ state: &mut TaskState,
+ elapsed: Duration,
+ reason: FlowQueryFallbackReason,
+ ) -> Option<FlowCheckpointDecision> {
+ state.after_query_exec(elapsed, false);
+ let checkpoint_mode = state.checkpoint_mode();
+ if checkpoint_mode == CheckpointMode::Incremental {
+ state.mark_full_snapshot();
+ Some(FlowCheckpointDecision::FallbackToFullSnapshot {
+ previous_mode: checkpoint_mode,
+ reason,
+ })
+ } else {
+ None
+ }
+ }
+
+ pub(super) fn apply_query_result_to_state(
+ state: &mut TaskState,
+ res: &OutputWithMetrics,
+ elapsed: Duration,
+ can_advance_checkpoints: bool,
+ ) -> FlowCheckpointDecision {
+ state.after_query_exec(elapsed, true);
+ let checkpoint_mode = state.checkpoint_mode();
+ if !can_advance_checkpoints {
+ state.mark_full_snapshot();
+ return FlowCheckpointDecision::FallbackToFullSnapshot {
+ previous_mode: checkpoint_mode,
+ reason: FlowQueryFallbackReason::DirtyBacklogPending,
+ };
+ }
+
+ if let (Some(participating_regions), Some(watermark_map)) =
+ (res.participating_regions(), res.region_watermark_map())
+ {
+ let can_advance = match checkpoint_mode {
+ CheckpointMode::FullSnapshot => state
+ .can_advance_full_snapshot_checkpoints(&participating_regions, &watermark_map),
+ CheckpointMode::Incremental => state
+ .can_advance_incremental_checkpoints_with_participation(
+ &participating_regions,
+ &watermark_map,
+ ),
+ };
+
+ if can_advance {
+ let participating_region_count = participating_regions.len();
+ let watermark_count = watermark_map.len();
+ match checkpoint_mode {
+ CheckpointMode::FullSnapshot => {
+ state.advance_checkpoints(watermark_map);
+ if state.is_incremental_disabled() {
+ FlowCheckpointDecision::FallbackToFullSnapshot {
+ previous_mode: CheckpointMode::FullSnapshot,
+ reason: FlowQueryFallbackReason::IncrementalDisabled,
+ }
+ } else {
+ FlowCheckpointDecision::AdvancedFromFullSnapshot {
+ participating_regions: participating_region_count,
+ watermarks: watermark_count,
+ }
+ }
+ }
+ CheckpointMode::Incremental => {
+ state.advance_incremental_checkpoints_with_participation(
+ &participating_regions,
+ watermark_map,
+ );
+ FlowCheckpointDecision::AdvancedIncremental {
+ participating_regions: participating_region_count,
+ watermarks: watermark_count,
+ }
+ }
+ }
+ } else {
+ state.mark_full_snapshot();
+ FlowCheckpointDecision::FallbackToFullSnapshot {
+ previous_mode: checkpoint_mode,
+ reason: FlowQueryFallbackReason::IncompleteRegionWatermark,
+ }
+ }
+ } else {
+ state.mark_full_snapshot();
+ FlowCheckpointDecision::FallbackToFullSnapshot {
+ previous_mode: checkpoint_mode,
+ reason: FlowQueryFallbackReason::MissingRegionWatermark,
+ }
+ }
+ }
+
+ pub(super) fn record_checkpoint_decision(flow_id: FlowId, decision: FlowCheckpointDecision) {
+ let flow_id = flow_id.to_string();
+ METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT
+ .with_label_values(&[
+ flow_id.as_str(),
+ decision.mode_label(),
+ decision.decision_label(),
+ decision.reason_label(),
+ ])
+ .inc();
+
+ match decision {
+ FlowCheckpointDecision::AdvancedFromFullSnapshot {
+ participating_regions,
+ watermarks,
+ } => {
+ info!(
+ "Flow {flow_id} switched to incremental mode after full snapshot, participating_regions={participating_regions}, watermarks={watermarks}"
+ );
+ }
+ FlowCheckpointDecision::AdvancedIncremental {
+ participating_regions,
+ watermarks,
+ } => {
+ debug!(
+ "Flow {flow_id} advanced incremental checkpoints, participating_regions={participating_regions}, watermarks={watermarks}"
+ );
+ }
+ FlowCheckpointDecision::FallbackToFullSnapshot {
+ previous_mode,
+ reason,
+ } => {
+ warn!(
+ "Flow {flow_id} switched to full snapshot mode, previous_mode={}, reason={}",
+ checkpoint_mode_label(previous_mode),
+ reason.as_label()
+ );
+ }
+ }
+ }
+
+ pub(super) fn record_query_mode(flow_id: FlowId, mode: CheckpointMode) {
+ let flow_id = flow_id.to_string();
+ METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT
+ .with_label_values(&[flow_id.as_str(), checkpoint_mode_label(mode)])
+ .inc();
+ }
+}
diff --git a/src/flow/src/batching_mode/task/inc.rs b/src/flow/src/batching_mode/task/inc.rs
new file mode 100644
index 0000000000..4fb64a676e
--- /dev/null
+++ b/src/flow/src/batching_mode/task/inc.rs
@@ -0,0 +1,252 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_error::ext::BoxedError;
+use common_telemetry::debug;
+use common_telemetry::tracing::warn;
+use datafusion_expr::{DmlStatement, LogicalPlan};
+use query::options::{
+ FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY,
+ FLOW_SINK_TABLE_ID,
+};
+use snafu::ResultExt;
+use table::metadata::TableId;
+
+use crate::Error;
+use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr;
+use crate::batching_mode::state::{CheckpointMode, FilterExprInfo};
+use crate::batching_mode::table_creator::QueryType;
+use crate::batching_mode::task::BatchingTask;
+use crate::batching_mode::utils::{
+ analyze_incremental_aggregate_plan, get_table_info_df_schema,
+ rewrite_incremental_aggregate_with_sink_merge,
+};
+use crate::error::{ExternalSnafu, UnexpectedSnafu};
+
+impl BatchingTask {
+ async fn sink_table_id(&self) -> Result<TableId, Error> {
+ let table = self
+ .config
+ .catalog_manager
+ .table(
+ &self.config.sink_table_name[0],
+ &self.config.sink_table_name[1],
+ &self.config.sink_table_name[2],
+ None,
+ )
+ .await
+ .map_err(BoxedError::new)
+ .context(ExternalSnafu)?
+ .ok_or_else(|| {
+ UnexpectedSnafu {
+ reason: format!(
+ "Flow {} cannot build incremental extensions because sink table {:?} was not found",
+ self.config.flow_id, self.config.sink_table_name
+ ),
+ }
+ .build()
+ })?;
+ Ok(table.table_info().table_id())
+ }
+
+ /// For incremental-mode SQL queries, attempt to prepare an executable plan
+ /// that is safe for incremental scan extensions.
+ ///
+ /// Returns `Some(plan)` when incremental extensions are safe, and `None`
+ /// when the caller should execute the original plan without incremental
+ /// extensions. The returned plan may be either a rewritten
+ /// delta-LEFT-JOIN-sink merge plan or the original plan. In particular,
+ /// plain GROUP BY queries with no aggregate merge columns are incremental
+ /// safe without a rewrite, so they return `Some(original_plan)`.
+ pub(super) async fn prepare_plan_for_incremental(
+ &self,
+ plan: &LogicalPlan,
+ dirty_filter: Option<&FilterExprInfo>,
+ ) -> Result<Option<LogicalPlan>, Error> {
+ let is_incremental_sql = {
+ let state = self.state.read().unwrap();
+ if state.is_incremental_disabled() {
+ return Ok(None);
+ }
+ state.checkpoint_mode() == CheckpointMode::Incremental
+ && matches!(self.config.query_type, QueryType::Sql)
+ };
+
+ if !is_incremental_sql {
+ return Ok(None);
+ }
+
+ // Extract inner query plan from the DML wrapper.
+ // Non-DML or non-SQL plans bypass the rewrite and keep checkpoint mode;
+ // non-aggregate TQL or non-INSERT plans do not need incremental scan extensions.
+ let inner_plan = match plan {
+ LogicalPlan::Dml(dml) => dml.input.as_ref().clone(),
+ _ => return Ok(None),
+ };
+
+ // Analyze the plan for incremental rewritability.
+ // Incremental reads currently require aggregate / group-by plans that
+ // can be rewritten into a delta-left-join-sink merge. Non-aggregate SQL
+ // (projection, filter, or other non-aggregate shapes) stays full-snapshot
+ // until separately supported, and incremental mode is permanently
+ // disabled for this flow.
+ let Some(analysis) = analyze_incremental_aggregate_plan(&inner_plan)? else {
+ warn!(
+ "Flow {} incremental mode but plan is not an aggregate query; \
+ permanently disabling incremental for this flow",
+ self.config.flow_id
+ );
+ self.state.write().unwrap().disable_incremental();
+ return Ok(None);
+ };
+
+ if !analysis.unsupported_exprs.is_empty() {
+ warn!(
+ "Flow {} incremental aggregate contains unsupported expressions {:?}; \
+ permanently disabling incremental for this flow",
+ self.config.flow_id, analysis.unsupported_exprs
+ );
+ self.state.write().unwrap().disable_incremental();
+ return Ok(None);
+ }
+
+ // Plain GROUP BY without aggregate expressions has no values to
+ // merge between delta and sink. The incremental delta scan emits
+ // changed groups, and sink primary-key write semantics make this
+ // idempotent; no explicit left-join rewrite is needed.
+ if analysis.merge_columns.is_empty() {
+ return Ok(Some(plan.clone()));
+ }
+
+ // Fetch sink table for the merge rewrite.
+ // Transient errors (catalog, schema, filter, or rewrite) should not
+ // permanently disable incremental mode. Instead, we fall back to a
+ // full-snapshot plan for this round while keeping incremental retryable.
+ let sink_table = match get_table_info_df_schema(
+ self.config.catalog_manager.clone(),
+ self.config.sink_table_name.clone(),
+ )
+ .await
+ {
+ Ok((table, _)) => table,
+ Err(err) => {
+ warn!(
+ "Flow {} failed to fetch sink table for incremental rewrite; \
+ falling back to full snapshot for this round: {:?}",
+ self.config.flow_id, err
+ );
+ self.state.write().unwrap().mark_full_snapshot();
+ return Ok(None);
+ }
+ };
+ let sink_schema = sink_table.table_info().meta.schema.clone();
+ let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr(
+ self.config.flow_id,
+ &analysis,
+ &sink_schema,
+ dirty_filter,
+ ) {
+ Ok(filter) => filter,
+ Err(err) => {
+ warn!(
+ "Flow {} failed to build sink dirty time window filter; \
+ falling back to full snapshot for this round: {:?}",
+ self.config.flow_id, err
+ );
+ self.state.write().unwrap().mark_full_snapshot();
+ return Ok(None);
+ }
+ };
+
+ let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge(
+ &inner_plan,
+ &analysis,
+ sink_table,
+ &self.config.sink_table_name,
+ sink_dirty_filter,
+ )
+ .await
+ {
+ Ok(plan) => plan,
+ Err(err) => {
+ warn!(
+ "Flow {} failed to rewrite incremental aggregate with sink merge; \
+ falling back to full snapshot for this round: {:?}",
+ self.config.flow_id, err
+ );
+ self.state.write().unwrap().mark_full_snapshot();
+ return Ok(None);
+ }
+ };
+
+ // Reconstruct DML plan with the rewritten inner plan
+ let rewritten = match plan {
+ LogicalPlan::Dml(dml) => LogicalPlan::Dml(DmlStatement::new(
+ dml.table_name.clone(),
+ dml.target.clone(),
+ dml.op.clone(),
+ Arc::new(rewritten_inner),
+ )),
+ _ => unreachable!("already matched Dml above"),
+ };
+
+ debug!(
+ "Flow {} rewrote incremental SQL aggregate query with sink merge",
+ self.config.flow_id
+ );
+
+ Ok(Some(rewritten))
+ }
+
+ pub(super) async fn build_flow_query_extensions(
+ &self,
+ incremental_safe: bool,
+ can_advance_checkpoints: bool,
+ ) -> Result<Vec<(&'static str, String)>, Error> {
+ let mut extensions = vec![("flow.return_region_seq", "true".to_string())];
+
+ let incremental_checkpoints_json = {
+ let state = self.state.read().unwrap();
+ if incremental_safe
+ && can_advance_checkpoints
+ && !state.is_incremental_disabled()
+ && state.checkpoint_mode() == CheckpointMode::Incremental
+ && !state.checkpoints().is_empty()
+ {
+ Some(serde_json::to_string(state.checkpoints()).map_err(|err| {
+ UnexpectedSnafu {
+ reason: format!("Failed to serialize checkpoint map: {err}"),
+ }
+ .build()
+ })?)
+ } else {
+ None
+ }
+ };
+
+ if let Some(checkpoints_json) = incremental_checkpoints_json {
+ let sink_table_id = self.sink_table_id().await?;
+ extensions.push((FLOW_SINK_TABLE_ID, sink_table_id.to_string()));
+ extensions.push((
+ FLOW_INCREMENTAL_MODE,
+ FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(),
+ ));
+ extensions.push((FLOW_INCREMENTAL_AFTER_SEQS, checkpoints_json));
+ }
+
+ Ok(extensions)
+ }
+}
diff --git a/src/flow/src/batching_mode/task/test.rs b/src/flow/src/batching_mode/task/test.rs
new file mode 100644
index 0000000000..959aeb00c9
--- /dev/null
+++ b/src/flow/src/batching_mode/task/test.rs
@@ -0,0 +1,1094 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::BTreeMap;
+
+use catalog::RegisterTableRequest;
+use catalog::memory::MemoryCatalogManager;
+use client::OutputWithMetrics;
+use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use common_query::Output;
+use common_recordbatch::RecordBatch;
+use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
+use datatypes::data_type::ConcreteDataType as CDT;
+use datatypes::schema::ColumnSchema;
+use datatypes::vectors::{TimestampMillisecondVector, UInt32Vector, VectorRef};
+use pretty_assertions::assert_eq;
+use query::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY};
+use session::context::QueryContext;
+use table::test_util::MemTable;
+
+use super::*;
+use crate::batching_mode::checkpoint::{
+ CHECKPOINT_DECISION_ADVANCE, CHECKPOINT_DECISION_FALLBACK, CHECKPOINT_REASON_NONE,
+ FlowCheckpointDecision, FlowQueryFallbackReason,
+};
+use crate::batching_mode::state::CheckpointMode;
+use crate::batching_mode::time_window::find_time_window_expr;
+use crate::test_utils::create_test_query_engine;
+
+async fn new_test_task_and_plan_with_missing_sink() -> (BatchingTask, LogicalPlan) {
+ new_test_task_engine_and_plan_with_query(
+ "SELECT number, ts FROM numbers_with_ts",
+ "missing_sink",
+ )
+ .await
+ .into_task_and_plan()
+}
+
+struct TestTaskParts {
+ task: BatchingTask,
+ query_engine: QueryEngineRef,
+ plan: LogicalPlan,
+}
+
+impl TestTaskParts {
+ fn into_task_and_plan(self) -> (BatchingTask, LogicalPlan) {
+ (self.task, self.plan)
+ }
+}
+
+async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) -> TestTaskParts {
+ let query_engine = create_test_query_engine();
+ let ctx = QueryContext::arc();
+ let plan = sql_to_df_plan(
+ ctx.clone(),
+ query_engine.clone(),
+ "SELECT number, ts FROM numbers_with_ts",
+ true,
+ )
+ .await
+ .unwrap();
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+
+ let task = BatchingTask::try_new(TaskArgs {
+ flow_id: 1,
+ query,
+ plan: plan.clone(),
+ time_window_expr: None,
+ expire_after: None,
+ sink_table_name: [
+ "greptime".to_string(),
+ "public".to_string(),
+ sink_table.to_string(),
+ ],
+ source_table_names: vec![[
+ "greptime".to_string(),
+ "public".to_string(),
+ "numbers_with_ts".to_string(),
+ ]],
+ query_ctx: ctx,
+ catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+ shutdown_rx: rx,
+ batch_opts: Arc::new(BatchingModeOptions::default()),
+ flow_eval_interval: None,
+ })
+ .unwrap();
+
+ TestTaskParts {
+ task,
+ query_engine,
+ plan,
+ }
+}
+
+async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
+ let query_engine = create_test_query_engine();
+ let ctx = QueryContext::arc();
+ let plan_query = "SELECT number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, number";
+ let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), plan_query, true)
+ .await
+ .unwrap();
+ let (column_name, time_window_expr, _, df_schema) = find_time_window_expr(
+ &plan,
+ query_engine.engine_state().catalog_manager().clone(),
+ ctx.clone(),
+ )
+ .await
+ .unwrap();
+ let time_window_expr = time_window_expr.map(|expr| {
+ TimeWindowExpr::from_expr(
+ &expr,
+ &column_name,
+ &df_schema,
+ &query_engine.engine_state().session_state(),
+ )
+ .unwrap()
+ });
+ let (_tx, rx) = tokio::sync::oneshot::channel();
+
+ let task = BatchingTask::try_new(TaskArgs {
+ flow_id: 1,
+ query,
+ plan: plan.clone(),
+ time_window_expr,
+ expire_after: None,
+ sink_table_name: [
+ "greptime".to_string(),
+ "public".to_string(),
+ "missing_sink".to_string(),
+ ],
+ source_table_names: vec![[
+ "greptime".to_string(),
+ "public".to_string(),
+ "numbers_with_ts".to_string(),
+ ]],
+ query_ctx: ctx,
+ catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+ shutdown_rx: rx,
+ batch_opts: Arc::new(BatchingModeOptions::default()),
+ flow_eval_interval: None,
+ })
+ .unwrap();
+
+ TestTaskParts {
+ task,
+ query_engine,
+ plan,
+ }
+}
+
+fn register_number_only_sink(query_engine: &QueryEngineRef, table_name: &str) {
+ let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
+ "number",
+ CDT::uint32_datatype(),
+ false,
+ )]));
+ let columns: Vec<VectorRef> = vec![Arc::new(UInt32Vector::from_slice([1_u32]))];
+ let recordbatch = RecordBatch::new(schema, columns).unwrap();
+ let table = MemTable::table(table_name, recordbatch);
+ let request = RegisterTableRequest {
+ catalog: DEFAULT_CATALOG_NAME.to_string(),
+ schema: DEFAULT_SCHEMA_NAME.to_string(),
+ table_name: table_name.to_string(),
+ table_id: 9001,
+ table,
+ };
+ let catalog_manager = query_engine.engine_state().catalog_manager();
+ let memory_catalog = catalog_manager
+ .as_any()
+ .downcast_ref::<MemoryCatalogManager>()
+ .unwrap();
+ memory_catalog.register_table_sync(request).unwrap();
+}
+
+fn register_auto_created_aggregate_sink(query_engine: &QueryEngineRef, table_name: &str) {
+ let schema = Arc::new(Schema::new(vec![
+ ColumnSchema::new("number", CDT::uint32_datatype(), true),
+ ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true),
+ ColumnSchema::new("update_at", CDT::timestamp_millisecond_datatype(), true),
+ ]));
+ let columns: Vec<VectorRef> = vec![
+ Arc::new(UInt32Vector::from_slice([1_u32])),
+ Arc::new(TimestampMillisecondVector::from_slice([0_i64])),
+ Arc::new(TimestampMillisecondVector::from_slice([0_i64])),
+ ];
+ let recordbatch = RecordBatch::new(schema, columns).unwrap();
+ let table = MemTable::table(table_name, recordbatch);
+ let request = RegisterTableRequest {
+ catalog: DEFAULT_CATALOG_NAME.to_string(),
+ schema: DEFAULT_SCHEMA_NAME.to_string(),
+ table_name: table_name.to_string(),
+ table_id: 9002,
+ table,
+ };
+ let catalog_manager = query_engine.engine_state().catalog_manager();
+ let memory_catalog = catalog_manager
+ .as_any()
+ .downcast_ref::<MemoryCatalogManager>()
+ .unwrap();
+ memory_catalog.register_table_sync(request).unwrap();
+}
+
+fn dirty_marker() -> DirtyTimeWindows {
+ let mut dirty = DirtyTimeWindows::default();
+ dirty.set_dirty();
+ dirty
+}
+
+fn dirty_range(start: i64, end: i64) -> DirtyTimeWindows {
+ let mut dirty = DirtyTimeWindows::default();
+ dirty.add_window(
+ Timestamp::new_second(start),
+ Some(Timestamp::new_second(end)),
+ );
+ dirty
+}
+
+async fn assert_unscoped_failure_restore(
+ consumed_dirty_windows: DirtyTimeWindows,
+ current_dirty_windows: DirtyTimeWindows,
+ expected_len: usize,
+ expected_window_size_secs: u64,
+) {
+ let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
+ {
+ let mut state = task.state.write().unwrap();
+ state.dirty_time_windows.clean();
+ state
+ .dirty_time_windows
+ .add_dirty_windows(&current_dirty_windows);
+ }
+ let unscoped_query = PlanInfo {
+ plan,
+ dirty_restore: DirtyRestore::Unscoped(consumed_dirty_windows),
+ can_advance_checkpoints: true,
+ };
+
+ task.handle_executed_query_failure(Some(&unscoped_query));
+
+ let state = task.state.read().unwrap();
+ assert_eq!(state.dirty_time_windows.len(), expected_len);
+ assert_eq!(
+ state.dirty_time_windows.window_size(),
+ std::time::Duration::from_secs(expected_window_size_secs)
+ );
+}
+
+fn output_with_region_watermarks(
+ watermarks: impl IntoIterator<Item = (u64, Option<u64>)>,
+) -> OutputWithMetrics {
+ let result = OutputWithMetrics::from_output(Output::new_with_affected_rows(0));
+ result.metrics.update(Some(RecordBatchMetrics {
+ region_watermarks: watermarks
+ .into_iter()
+ .map(|(region_id, watermark)| RegionWatermarkEntry {
+ region_id,
+ watermark,
+ })
+ .collect(),
+ ..Default::default()
+ }));
+ result.metrics.mark_ready();
+ result
+}
+
+/// Complete watermarks applied to a freshly created state are expected to move it from
+/// full-snapshot to incremental mode and to be stored as checkpoints.
+#[test]
+fn test_apply_query_result_to_state_advances_full_snapshot_to_incremental() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let output = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, Some(20_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, true);
+
+    let expected_outcome = FlowCheckpointDecision::AdvancedFromFullSnapshot {
+        participating_regions: 2,
+        watermarks: 2,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+
+    let expected_checkpoints = BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    assert_eq!(state.checkpoints(), &expected_checkpoints);
+}
+
+/// Once incremental mode has been disabled, complete watermarks are still expected to be
+/// stored as checkpoints while the mode stays at full snapshot.
+#[test]
+fn test_apply_query_result_to_state_stays_full_snapshot_when_incremental_disabled() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    state.disable_incremental();
+    assert!(state.is_incremental_disabled());
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+    let output = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, Some(20_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, true);
+
+    // Should NOT claim advancement to incremental; should fallback with correct reason.
+    let expected_outcome = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::IncrementalDisabled,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.is_incremental_disabled());
+
+    // Checkpoints are still updated even if mode doesn't advance.
+    let expected_checkpoints = BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    assert_eq!(state.checkpoints(), &expected_checkpoints);
+}
+
+/// A region entry whose watermark is `None` is expected to produce an
+/// `IncompleteRegionWatermark` fallback and to leave the checkpoints empty.
+#[test]
+fn test_apply_query_result_to_state_rejects_unproved_watermark() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let output = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, None)]);
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, true);
+
+    let expected_outcome = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::IncompleteRegionWatermark,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+/// An output for which no region watermarks were recorded is expected to produce a
+/// `MissingRegionWatermark` fallback and to leave the checkpoints empty.
+#[test]
+fn test_apply_query_result_to_state_reports_missing_watermark() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    // Plain output: nothing is written into its metrics before it is applied.
+    let output = OutputWithMetrics::from_output(Output::new_with_affected_rows(0));
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, true);
+
+    let expected_outcome = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::MissingRegionWatermark,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+/// Watermarks covering only some of the checkpointed regions are expected to advance
+/// those regions and keep the remaining checkpoint unchanged.
+#[test]
+fn test_apply_query_result_to_state_advances_incremental_subset() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let initial_checkpoints = HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64), (3_u64, 30_u64)]);
+    state.advance_checkpoints(initial_checkpoints);
+
+    // Region 2 is absent from the reported watermarks.
+    let output = output_with_region_watermarks([(1_u64, Some(12_u64)), (3_u64, Some(35_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, true);
+
+    let expected_outcome = FlowCheckpointDecision::AdvancedIncremental {
+        participating_regions: 2,
+        watermarks: 2,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+
+    let expected_checkpoints = BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)]);
+    assert_eq!(state.checkpoints(), &expected_checkpoints);
+}
+
+/// Passing `false` as the last argument on a fresh state is expected to yield a
+/// `DirtyBacklogPending` fallback without recording any checkpoint.
+#[test]
+fn test_apply_query_result_to_state_blocks_full_snapshot_when_dirty_backlog_pending() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let output = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, Some(20_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, false);
+
+    let expected_outcome = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::DirtyBacklogPending,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+/// Passing `false` as the last argument on a state with checkpoints is expected to fall
+/// back to full snapshot while keeping the previously stored checkpoints.
+#[test]
+fn test_apply_query_result_to_state_blocks_incremental_when_dirty_backlog_pending() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+    let output = output_with_region_watermarks([(1_u64, Some(12_u64)), (2_u64, Some(25_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, false);
+
+    let expected_outcome = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::Incremental,
+        reason: FlowQueryFallbackReason::DirtyBacklogPending,
+    };
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+    // The newer watermarks (12 / 25) must not have replaced the stored values.
+    let expected_checkpoints = BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    assert_eq!(state.checkpoints(), &expected_checkpoints);
+}
+
+/// A query failure reported while in incremental mode is expected to return a fallback
+/// decision, switch to full snapshot, and keep the stored checkpoints.
+#[test]
+fn test_apply_query_failure_to_state_falls_back_from_incremental() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+
+    let elapsed = std::time::Duration::from_millis(1);
+    let failure_reason = FlowQueryFallbackReason::IncrementalQueryFailure;
+    let outcome = BatchingTask::apply_query_failure_to_state(&mut state, elapsed, failure_reason);
+
+    let expected_outcome = Some(FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::Incremental,
+        reason: FlowQueryFallbackReason::IncrementalQueryFailure,
+    });
+    assert_eq!(outcome, expected_outcome);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+    let expected_checkpoints = BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    assert_eq!(state.checkpoints(), &expected_checkpoints);
+}
+
+/// A query failure reported on a fresh state is expected to return no decision and to
+/// leave the state in full-snapshot mode with empty checkpoints.
+#[test]
+fn test_apply_query_failure_to_state_keeps_full_snapshot_without_decision() {
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let elapsed = std::time::Duration::from_millis(1);
+
+    let outcome = BatchingTask::apply_query_failure_to_state(
+        &mut state,
+        elapsed,
+        FlowQueryFallbackReason::StaleCursor,
+    );
+
+    assert_eq!(outcome, None);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+/// Pins the label strings returned for an advance decision, a fallback decision, and
+/// the `DirtyBacklogPending` reason.
+#[test]
+fn test_checkpoint_decision_labels_are_stable() {
+    let advanced = FlowCheckpointDecision::AdvancedIncremental {
+        participating_regions: 1,
+        watermarks: 1,
+    };
+    assert_eq!(advanced.mode_label(), "incremental");
+    assert_eq!(advanced.decision_label(), CHECKPOINT_DECISION_ADVANCE);
+    assert_eq!(advanced.reason_label(), CHECKPOINT_REASON_NONE);
+
+    let fell_back = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::Incremental,
+        reason: FlowQueryFallbackReason::StaleCursor,
+    };
+    assert_eq!(fell_back.mode_label(), "incremental");
+    assert_eq!(fell_back.decision_label(), CHECKPOINT_DECISION_FALLBACK);
+    assert_eq!(fell_back.reason_label(), "stale_cursor");
+
+    let backlog_label = FlowQueryFallbackReason::DirtyBacklogPending.as_label();
+    assert_eq!(backlog_label, "dirty_backlog_pending");
+}
+
+/// Walks `build_flow_query_extensions` through its argument and state combinations and
+/// checks when the incremental-mode keys are emitted.
+///
+/// `flow.return_region_seq` is expected in every case; the incremental keys only when
+/// checkpoints exist, both arguments are `true`, and incremental mode is not disabled.
+#[tokio::test]
+async fn test_build_flow_query_extensions_switches_with_checkpoint_mode() {
+    let (task, _) = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        "numbers_with_ts",
+    )
+    .await
+    .into_task_and_plan();
+    let return_region_seq = ("flow.return_region_seq", "true".to_string());
+
+    // No checkpoints yet: only the region-seq request is produced.
+    let extensions = task.build_flow_query_extensions(false, true).await.unwrap();
+    assert_eq!(extensions, vec![return_region_seq.clone()]);
+
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+
+    // Checkpoints exist, but the first argument is still `false`.
+    let extensions = task.build_flow_query_extensions(false, true).await.unwrap();
+    assert!(extensions.contains(&return_region_seq));
+    assert!(
+        extensions
+            .iter()
+            .all(|(key, _)| *key != FLOW_INCREMENTAL_MODE)
+    );
+    assert!(
+        extensions
+            .iter()
+            .all(|(key, _)| *key != FLOW_INCREMENTAL_AFTER_SEQS)
+    );
+
+    // Both arguments `true`: incremental keys carry the stored checkpoints.
+    let extensions = task.build_flow_query_extensions(true, true).await.unwrap();
+    let incremental_mode = (
+        FLOW_INCREMENTAL_MODE,
+        FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(),
+    );
+    let incremental_after_seqs = (
+        FLOW_INCREMENTAL_AFTER_SEQS,
+        serde_json::json!({"1": 10, "2": 20}).to_string(),
+    );
+    assert!(extensions.contains(&return_region_seq));
+    assert!(extensions.contains(&incremental_mode));
+    assert!(extensions.contains(&incremental_after_seqs));
+
+    // Second argument `false`: incremental keys are withheld again.
+    let extensions = task.build_flow_query_extensions(true, false).await.unwrap();
+    assert!(extensions.contains(&return_region_seq));
+    assert!(
+        extensions
+            .iter()
+            .all(|(key, _)| *key != FLOW_INCREMENTAL_MODE)
+    );
+    assert!(
+        extensions
+            .iter()
+            .all(|(key, _)| *key != FLOW_INCREMENTAL_AFTER_SEQS)
+    );
+
+    // Incremental disabled on the state: keys stay withheld even with both `true`.
+    task.state.write().unwrap().disable_incremental();
+    let extensions = task.build_flow_query_extensions(true, true).await.unwrap();
+    assert!(extensions.contains(&return_region_seq));
+    assert!(
+        extensions
+            .iter()
+            .all(|(key, _)| *key != FLOW_INCREMENTAL_MODE)
+    );
+    assert!(
+        extensions
+            .iter()
+            .all(|(key, _)| *key != FLOW_INCREMENTAL_AFTER_SEQS)
+    );
+}
+
+/// With two dirty windows and a limit of `Some(1)` per plan, the first generated plan is
+/// expected to leave one window behind and not allow checkpoint advancement; the second
+/// plan drains the backlog and allows it.
+#[tokio::test]
+async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after_backlog_drained() {
+    let sql = "SELECT number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, number";
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(sql).await;
+    {
+        let mut state = task.state.write().unwrap();
+        for (start, end) in [(0, 5), (30, 35)] {
+            state.dirty_time_windows.add_window(
+                Timestamp::new_second(start),
+                Some(Timestamp::new_second(end)),
+            );
+        }
+    }
+    let number_column = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_column, time_window_column]));
+
+    let first_plan = task
+        .gen_query_with_time_window(query_engine.clone(), &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(!first_plan.can_advance_checkpoints);
+    assert_eq!(task.state.read().unwrap().dirty_time_windows.len(), 1);
+
+    let second_plan = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(second_plan.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+}
+
+/// When checkpoints already exist, a single generated plan is expected to consume both
+/// dirty windows despite the `Some(1)` limit and to allow checkpoint advancement.
+#[tokio::test]
+async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_safety() {
+    let sql = "SELECT number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, number";
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(sql).await;
+    {
+        let mut state = task.state.write().unwrap();
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+        for (start, end) in [(0, 5), (30, 35)] {
+            state.dirty_time_windows.add_window(
+                Timestamp::new_second(start),
+                Some(Timestamp::new_second(end)),
+            );
+        }
+    }
+    let number_column = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_column, time_window_column]));
+
+    let plan_info = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(plan_info.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+}
+
+/// A failed query carrying `DirtyRestore::Scoped` is expected to put its time range back
+/// into the (previously cleaned) dirty windows together with its window size.
+#[tokio::test]
+async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
+    let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
+    task.state.write().unwrap().dirty_time_windows.clean();
+
+    let consumed_filter = FilterExprInfo {
+        expr: datafusion_expr::lit(true),
+        col_name: "ts".to_string(),
+        time_ranges: vec![(Timestamp::new_second(10), Timestamp::new_second(20))],
+        window_size: chrono::Duration::seconds(10),
+    };
+    let scoped_query = PlanInfo {
+        plan,
+        dirty_restore: DirtyRestore::Scoped(consumed_filter),
+        can_advance_checkpoints: true,
+    };
+
+    task.handle_executed_query_failure(Some(&scoped_query));
+
+    let state = task.state.read().unwrap();
+    assert_eq!(state.dirty_time_windows.len(), 1);
+    let expected_window_size = std::time::Duration::from_secs(10);
+    assert_eq!(state.dirty_time_windows.window_size(), expected_window_size);
+}
+
+/// For a non-aggregate query, `prepare_plan_for_incremental` is expected to return no
+/// plan, mark incremental mode as disabled, and drop back to full snapshot.
+#[tokio::test]
+async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
+    let sql = "SELECT number, ts FROM numbers_with_ts";
+    let numbers_with_ts = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "numbers_with_ts".to_string(),
+    ];
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let select_plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    // Build a DML wrapper using a real sink table from the test engine.
+    let (sink_table, _) = get_table_info_df_schema(
+        query_engine.engine_state().catalog_manager().clone(),
+        numbers_with_ts.clone(),
+    )
+    .await
+    .unwrap();
+    let table_source = Arc::new(DefaultTableSource::new(Arc::new(
+        DfTableProviderAdapter::new(sink_table),
+    )));
+    let dml_plan = LogicalPlan::Dml(DmlStatement::new(
+        datafusion_common::TableReference::bare("test"),
+        table_source,
+        WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+        Arc::new(select_plan),
+    ));
+
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let task_args = TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan: dml_plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        sink_table_name: numbers_with_ts.clone(),
+        source_table_names: vec![numbers_with_ts],
+        query_ctx: ctx,
+        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+        shutdown_rx: rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    };
+    let task = BatchingTask::try_new(task_args).unwrap();
+
+    // Put the state into Incremental mode with checkpoints.
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+    assert_eq!(
+        task.state.read().unwrap().checkpoint_mode(),
+        CheckpointMode::Incremental
+    );
+
+    let prepared = task
+        .prepare_plan_for_incremental(&dml_plan, None)
+        .await
+        .unwrap();
+    assert!(prepared.is_none());
+
+    let state = task.state.read().unwrap();
+    assert!(state.is_incremental_disabled());
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+}
+
+/// When the aggregate rewrite fails, `prepare_plan_for_incremental` is expected to
+/// return no plan and fall back to full snapshot without disabling incremental mode.
+#[tokio::test]
+async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite_error() {
+    let sql = "SELECT sum(number) AS total, ts FROM numbers_with_ts GROUP BY ts";
+    let numbers_with_ts = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "numbers_with_ts".to_string(),
+    ];
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let select_plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    let (sink_table, _) = get_table_info_df_schema(
+        query_engine.engine_state().catalog_manager().clone(),
+        numbers_with_ts.clone(),
+    )
+    .await
+    .unwrap();
+    let table_source = Arc::new(DefaultTableSource::new(Arc::new(
+        DfTableProviderAdapter::new(sink_table),
+    )));
+    let dml_plan = LogicalPlan::Dml(DmlStatement::new(
+        datafusion_common::TableReference::bare("test"),
+        table_source,
+        WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+        Arc::new(select_plan),
+    ));
+
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let task_args = TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan: dml_plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        // The sink table exists, but does not have the rewritten aggregate
+        // output column `total`, so the rewrite fails deterministically.
+        sink_table_name: numbers_with_ts.clone(),
+        source_table_names: vec![numbers_with_ts],
+        query_ctx: ctx,
+        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+        shutdown_rx: rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    };
+    let task = BatchingTask::try_new(task_args).unwrap();
+
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+    assert_eq!(
+        task.state.read().unwrap().checkpoint_mode(),
+        CheckpointMode::Incremental
+    );
+
+    let prepared = task
+        .prepare_plan_for_incremental(&dml_plan, None)
+        .await
+        .unwrap();
+    assert!(prepared.is_none());
+
+    let state = task.state.read().unwrap();
+    assert!(!state.is_incremental_disabled());
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+}
+
+/// A plain `GROUP BY` without aggregate outputs is expected to be returned unchanged by
+/// `prepare_plan_for_incremental`, leaving incremental mode enabled.
+#[tokio::test]
+async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_original_plan() {
+    let sql = "SELECT ts FROM numbers_with_ts GROUP BY ts";
+    let numbers_with_ts = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "numbers_with_ts".to_string(),
+    ];
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let select_plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    let (sink_table, _) = get_table_info_df_schema(
+        query_engine.engine_state().catalog_manager().clone(),
+        numbers_with_ts.clone(),
+    )
+    .await
+    .unwrap();
+    let table_source = Arc::new(DefaultTableSource::new(Arc::new(
+        DfTableProviderAdapter::new(sink_table),
+    )));
+    let dml_plan = LogicalPlan::Dml(DmlStatement::new(
+        datafusion_common::TableReference::bare("test"),
+        table_source,
+        WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+        Arc::new(select_plan),
+    ));
+
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let task_args = TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan: dml_plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        sink_table_name: numbers_with_ts.clone(),
+        source_table_names: vec![numbers_with_ts],
+        query_ctx: ctx,
+        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+        shutdown_rx: rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    };
+    let task = BatchingTask::try_new(task_args).unwrap();
+
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+
+    let prepared = task
+        .prepare_plan_for_incremental(&dml_plan, None)
+        .await
+        .unwrap()
+        .expect("plain GROUP BY is incremental-safe without a rewrite");
+
+    // Compare rendered plans: the prepared plan must display exactly like the input.
+    assert_eq!(format!("{prepared}"), format!("{dml_plan}"));
+    assert!(!task.state.read().unwrap().is_incremental_disabled());
+}
+
+/// An aggregate flow writing into a sink registered through
+/// `register_auto_created_aggregate_sink` is expected to yield an incremental plan and
+/// to emit the incremental query extensions.
+#[tokio::test]
+async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
+    let sink_table = "auto_created_aggregate_sink";
+    let sql = "SELECT max(number) AS number, ts FROM numbers_with_ts GROUP BY ts";
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(sql, sink_table).await;
+    register_auto_created_aggregate_sink(&query_engine, sink_table);
+    task.state.write().unwrap().dirty_time_windows.set_dirty();
+
+    let plan_info = task
+        .gen_insert_plan(&query_engine, None)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(plan_info.can_advance_checkpoints);
+
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+    let incremental_safe = task
+        .prepare_plan_for_incremental(&plan_info.plan, None)
+        .await
+        .unwrap()
+        .is_some();
+
+    assert!(incremental_safe);
+    assert!(!task.state.read().unwrap().is_incremental_disabled());
+
+    let extensions = task
+        .build_flow_query_extensions(incremental_safe, plan_info.can_advance_checkpoints)
+        .await
+        .unwrap();
+    let incremental_mode = (
+        FLOW_INCREMENTAL_MODE,
+        FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(),
+    );
+    assert!(extensions.contains(&incremental_mode));
+    assert!(
+        extensions
+            .iter()
+            .any(|(key, _)| *key == FLOW_INCREMENTAL_AFTER_SEQS)
+    );
+}
+
+/// Runs `assert_unscoped_failure_restore` over three combinations of consumed and
+/// currently present dirty windows.
+#[tokio::test]
+async fn test_unscoped_failure_restores_consumed_dirty_signal() {
+    // (consumed windows, windows present at failure time, expected len, expected size in secs)
+    let cases = [
+        (dirty_marker(), DirtyTimeWindows::default(), 1, 0),
+        (dirty_range(30, 40), dirty_range(10, 20), 2, 20),
+        (dirty_range(30, 40), dirty_range(30, 50), 1, 20),
+    ];
+
+    for (consumed, current, expected_len, expected_window_size_secs) in cases {
+        assert_unscoped_failure_restore(consumed, current, expected_len, expected_window_size_secs)
+            .await;
+    }
+}
+
+/// When plan generation fails for a query that references a missing column, the dirty
+/// marker set beforehand is expected to still be present afterwards.
+#[tokio::test]
+async fn test_unscoped_plan_generation_failure_restores_consumed_dirty_signal() {
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT missing_column FROM numbers_with_ts",
+        "missing_sink",
+    )
+    .await;
+    task.state.write().unwrap().dirty_time_windows.set_dirty();
+
+    let number_column = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let ts_column = ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
+        .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_column, ts_column]));
+
+    let generated = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
+        .await;
+    assert!(generated.is_err());
+
+    let state = task.state.read().unwrap();
+    assert_eq!(state.dirty_time_windows.len(), 1);
+    let expected_window_size = std::time::Duration::from_secs(0);
+    assert_eq!(state.dirty_time_windows.window_size(), expected_window_size);
+}
+
+/// When plan generation fails for a time-window query that references a missing column,
+/// the dirty window added beforehand is expected to still be present afterwards.
+#[tokio::test]
+async fn test_scoped_plan_generation_failure_restores_consumed_dirty_windows() {
+    let sql = "SELECT missing_column, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, missing_column";
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(sql).await;
+
+    let window_start = Timestamp::new_second(10);
+    let window_end = Timestamp::new_second(15);
+    task.state
+        .write()
+        .unwrap()
+        .dirty_time_windows
+        .add_window(window_start, Some(window_end));
+
+    let number_column = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_column, time_window_column]));
+
+    let generated = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
+        .await;
+    assert!(generated.is_err());
+
+    let state = task.state.read().unwrap();
+    assert_eq!(state.dirty_time_windows.len(), 1);
+    let expected_window_size = std::time::Duration::from_secs(5);
+    assert_eq!(state.dirty_time_windows.window_size(), expected_window_size);
+}
+
+/// When `gen_insert_plan` fails against a sink registered through
+/// `register_number_only_sink`, the dirty marker set beforehand is expected to still be
+/// present afterwards.
+#[tokio::test]
+async fn test_insert_plan_matching_failure_restores_consumed_dirty_marker() {
+    let sink_table = "partial_sink";
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        sink_table,
+    )
+    .await;
+    register_number_only_sink(&query_engine, sink_table);
+    task.state.write().unwrap().dirty_time_windows.set_dirty();
+
+    let result = task.gen_insert_plan(&query_engine, None).await;
+
+    // A single assertion carries the diagnostic; the error value itself is not inspected.
+    assert!(
+        result.is_err(),
+        "gen_insert_plan should fail with a sink column mismatch"
+    );
+    let state = task.state.read().unwrap();
+    assert_eq!(state.dirty_time_windows.len(), 1);
+    assert_eq!(
+        state.dirty_time_windows.window_size(),
+        std::time::Duration::from_secs(0)
+    );
+}
diff --git a/src/flow/src/batching_mode/utils.rs b/src/flow/src/batching_mode/utils.rs
index 7b066388ec..e86b1ee3be 100644
--- a/src/flow/src/batching_mode/utils.rs
+++ b/src/flow/src/batching_mode/utils.rs
@@ -278,7 +278,7 @@ fn collect_output_projection_info(plan: &LogicalPlan) -> OutputProjectionInfo {
let mut col_names = Vec::new();
find_column_names(&alias.expr, &mut col_names);
match col_names.len() {
- 0 if matches!(alias.expr.as_ref(), Expr::Literal(_, _)) => {
+ 0 if is_passthrough_output_column(&alias_name, alias.expr.as_ref()) => {
projection_info.literal_columns.insert(alias_name);
}
1 => {
@@ -315,10 +315,38 @@ fn collect_output_projection_info(plan: &LogicalPlan) -> OutputProjectionInfo {
}
}
+ if projection_info
+ .output_field_names
+ .iter()
+ .any(|name| name == AUTO_CREATED_PLACEHOLDER_TS_COL)
+ {
+ projection_info
+ .literal_columns
+ .insert(AUTO_CREATED_PLACEHOLDER_TS_COL.to_string());
+ }
+
projection_info.output_aliases = output_aliases;
projection_info
}
+/// Decides whether a column-free projected expression can be recorded as a literal
+/// output column.
+///
+/// Any literal qualifies. In addition, the alias `AUTO_CREATED_UPDATE_AT_TS_COL`
+/// qualifies when the expression equals `now()`, and `AUTO_CREATED_PLACEHOLDER_TS_COL`
+/// qualifies when the expression is a literal wrapped in any number of casts.
+fn is_passthrough_output_column(alias_name: &str, expr: &Expr) -> bool {
+    if matches!(expr, Expr::Literal(_, _)) {
+        return true;
+    }
+
+    match alias_name {
+        AUTO_CREATED_UPDATE_AT_TS_COL => *expr == datafusion::prelude::now(),
+        AUTO_CREATED_PLACEHOLDER_TS_COL => is_literal_or_cast_literal(expr),
+        _ => false,
+    }
+}
+
+/// Returns `true` when `expr` is a literal, optionally wrapped in any number of nested
+/// `Cast` / `TryCast` nodes.
+fn is_literal_or_cast_literal(expr: &Expr) -> bool {
+    let mut current = expr;
+    loop {
+        // Peel one cast layer per iteration until something other than a cast is reached.
+        current = match current {
+            Expr::Literal(_, _) => return true,
+            Expr::Cast(cast) => cast.expr.as_ref(),
+            Expr::TryCast(cast) => cast.expr.as_ref(),
+            _ => return false,
+        };
+    }
+}
+
fn merge_op_for_aggregate_expr(aggr_expr: &Expr) -> Result
{
let Some(aggr_func) = get_aggr_func(aggr_expr) else {
return Err(aggr_expr.to_string());
@@ -385,6 +413,11 @@ fn find_uncovered_output_fields(
!group_key_names.contains(*name)
&& !merge_column_names.contains(*name)
&& !projection_info.literal_columns.contains(*name)
+ // Auto-created sink columns injected by ColumnMatcherRewriter
+ // are not part of the original aggregate semantics and must
+ // not prevent incremental aggregate rewrites.
+ && name.as_str() != AUTO_CREATED_UPDATE_AT_TS_COL
+ && name.as_str() != AUTO_CREATED_PLACEHOLDER_TS_COL
})
.cloned()
.collect()
@@ -536,7 +569,8 @@ pub fn analyze_incremental_aggregate_plan(
///
/// ```text
/// delta = SELECT ts, number FROM <delta_plan> AS __flow_delta
-/// sink = SELECT ts, number FROM <sink_table> AS __flow_sink
+/// sink_scan = SELECT * FROM <sink_table> [WHERE <sink_dirty_filter>]
+/// sink = SELECT ts, number FROM sink_scan AS __flow_sink
/// SELECT
/// CASE
/// WHEN __flow_sink.number IS NULL THEN __flow_delta.number
@@ -548,11 +582,17 @@ pub fn analyze_incremental_aggregate_plan(
/// LEFT JOIN sink
/// ON __flow_delta.ts IS NOT DISTINCT FROM __flow_sink.ts
/// ```
+///
+/// If `sink_dirty_filter` is provided, it is applied to the sink table scan
+/// before projection, aliasing, and the left join. The predicate must reference
+/// raw sink table columns structurally (unqualified), before the `__flow_sink`
+/// alias exists.
pub async fn rewrite_incremental_aggregate_with_sink_merge(
delta_plan: &LogicalPlan,
analysis: &IncrementalAggregateAnalysis,
sink_table: TableRef,
sink_table_name: &TableName,
+ sink_dirty_filter: Option,
) -> Result {
ensure!(
analysis.unsupported_exprs.is_empty(),
@@ -637,7 +677,22 @@ pub async fn rewrite_incremental_aggregate_with_sink_merge(
.cloned()
.map(unqualified_col)
.collect::>();
- let sink_selected = LogicalPlanBuilder::from(sink_scan)
+ let sink_input = if let Some(predicate) = sink_dirty_filter {
+ LogicalPlanBuilder::from(sink_scan)
+ .filter(predicate)
+ .with_context(|_| DatafusionSnafu {
+ context: "Failed to filter sink table scan for incremental sink merge".to_string(),
+ })?
+ .build()
+ .with_context(|_| DatafusionSnafu {
+ context: "Failed to build filtered sink plan for incremental sink merge"
+ .to_string(),
+ })?
+ } else {
+ sink_scan
+ };
+
+ let sink_selected = LogicalPlanBuilder::from(sink_input)
.project(sink_selected_exprs)
.with_context(|_| DatafusionSnafu {
context: "Failed to project sink table scan for incremental sink merge".to_string(),
diff --git a/src/flow/src/batching_mode/utils/test.rs b/src/flow/src/batching_mode/utils/test.rs
index 863580b4ae..5b9cf7f507 100644
--- a/src/flow/src/batching_mode/utils/test.rs
+++ b/src/flow/src/batching_mode/utils/test.rs
@@ -15,10 +15,13 @@
use std::sync::Arc;
use common_recordbatch::RecordBatch;
+use common_time::Timestamp;
use datafusion_common::tree_node::TreeNode as _;
use datafusion_expr::GroupingSet;
-use datatypes::prelude::{ConcreteDataType, Scalar, VectorRef};
+use datatypes::prelude::{ConcreteDataType, MutableVector, Scalar, ScalarVectorBuilder, VectorRef};
use datatypes::schema::{ColumnSchema, Schema};
+use datatypes::timestamp::TimestampMillisecond;
+use datatypes::vectors::TimestampMillisecondVectorBuilder;
use pretty_assertions::assert_eq;
use query::query_engine::DefaultSerializer;
use session::context::QueryContext;
@@ -26,6 +29,7 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use table::test_util::MemTable;
use super::*;
+use crate::batching_mode::state::FilterExprInfo;
use crate::test_utils::create_test_query_engine;
fn u32_table(table_name: &str, columns: Vec<&str>, rows: usize) -> TableRef {
@@ -50,6 +54,30 @@ fn empty_u32_table(table_name: &str, columns: Vec<&str>) -> TableRef {
u32_table(table_name, columns, 0)
}
+/// Builds an in-memory table named `table_name` with a non-null millisecond
+/// `time_window` time-index column and a nullable `number` (u32) column.
+///
+/// The table holds a single row: `time_window = 0`, `number = 1`.
+fn time_window_u32_table(table_name: &str) -> TableRef {
+    let schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new(
+            "time_window",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+    ]));
+
+    let mut time_window_builder = TimestampMillisecondVectorBuilder::with_capacity(1);
+    time_window_builder.push(Some(TimestampMillisecond::new(0)));
+    let recordbatch = RecordBatch::new(
+        schema,
+        vec![
+            time_window_builder.to_vector_cloned(),
+            // The qualified path selects the vector type matching the u32 `number` column.
+            Arc::new(<u32 as Scalar>::VectorType::from_vec(vec![1])) as VectorRef,
+        ],
+    )
+    .unwrap();
+    MemTable::table(table_name, recordbatch)
+}
+
fn assert_same_logical_plan(actual: &LogicalPlan, expected: &LogicalPlan) {
assert_eq!(
format!("{}", expected.display_indent()),
@@ -84,6 +112,29 @@ fn expected_left_join_rewrite(
sink_selected_exprs: Vec,
join_keys: (Vec, Vec),
projection_exprs: Vec,
+) -> LogicalPlan {
+ expected_left_join_rewrite_with_sink_filter(
+ delta_plan,
+ sink_table,
+ sink_table_name,
+ delta_selected_exprs,
+ sink_selected_exprs,
+ None,
+ join_keys,
+ projection_exprs,
+ )
+}
+
+#[allow(clippy::too_many_arguments)]
+fn expected_left_join_rewrite_with_sink_filter(
+ delta_plan: &LogicalPlan,
+ sink_table: TableRef,
+ sink_table_name: &TableName,
+ delta_selected_exprs: Vec,
+ sink_selected_exprs: Vec,
+ sink_filter: Option,
+ join_keys: (Vec, Vec),
+ projection_exprs: Vec,
) -> LogicalPlan {
let delta_alias = "__flow_delta";
let sink_alias = "__flow_sink";
@@ -94,7 +145,17 @@ fn expected_left_join_rewrite(
.unwrap()
.build()
.unwrap();
- let sink_selected = LogicalPlanBuilder::from(test_sink_scan(sink_table, sink_table_name))
+ let sink_scan = test_sink_scan(sink_table, sink_table_name);
+ let sink_input = if let Some(predicate) = sink_filter {
+ LogicalPlanBuilder::from(sink_scan)
+ .filter(predicate)
+ .unwrap()
+ .build()
+ .unwrap()
+ } else {
+ sink_scan
+ };
+ let sink_selected = LogicalPlanBuilder::from(sink_input)
.project(sink_selected_exprs)
.unwrap()
.alias(sink_alias)
@@ -576,6 +637,44 @@ async fn test_analyze_incremental_aggregate_plan_keeps_aliases_for_multiple_aggr
}));
}
+#[tokio::test]
+async fn test_analyze_incremental_aggregate_plan_allows_auto_created_sink_columns() {
+ let query_engine = create_test_query_engine();
+ let ctx = QueryContext::arc();
+ let sql = format!(
+ "SELECT max(number) AS total, ts, now() AS {}, CAST('1970-01-01 00:00:00' AS TIMESTAMP) AS {} FROM numbers_with_ts GROUP BY ts",
+ AUTO_CREATED_UPDATE_AT_TS_COL, AUTO_CREATED_PLACEHOLDER_TS_COL
+ );
+ let plan = sql_to_df_plan(ctx, query_engine, &sql, false)
+ .await
+ .unwrap();
+
+ let analysis = analyze_incremental_aggregate_plan(&plan).unwrap().unwrap();
+ assert!(
+ analysis.unsupported_exprs.is_empty(),
+ "auto-created sink columns should not disable incremental analysis: {:?}",
+ analysis.unsupported_exprs
+ );
+ assert!(
+ analysis
+ .literal_columns
+ .iter()
+ .any(|name| name == AUTO_CREATED_UPDATE_AT_TS_COL)
+ );
+ assert!(
+ analysis
+ .literal_columns
+ .iter()
+ .any(|name| name == AUTO_CREATED_PLACEHOLDER_TS_COL)
+ );
+ assert_eq!(analysis.merge_columns.len(), 1);
+ assert_eq!(analysis.merge_columns[0].output_field_name, "total");
+ assert_eq!(
+ analysis.merge_columns[0].merge_op,
+ IncrementalAggregateMergeOp::Max
+ );
+}
+
#[tokio::test]
async fn test_analyze_incremental_aggregate_plan_allows_where_before_aggregate() {
let query_engine = create_test_query_engine();
@@ -641,6 +740,7 @@ async fn test_rewrite_incremental_aggregate_allows_alias_wrapped_scan() {
"public".to_string(),
"alias_wrapped_sink".to_string(),
],
+ None,
)
.await
.unwrap();
@@ -887,6 +987,7 @@ async fn test_analyze_incremental_aggregate_plan_allows_literal_outputs() {
&analysis,
sink_table.clone(),
&sink_table_name,
+ None,
)
.await
.unwrap();
@@ -975,6 +1076,7 @@ async fn test_rewrite_incremental_aggregate_preserves_non_identifier_aliases() {
"public".to_string(),
"non_identifier_alias_sink".to_string(),
],
+ None,
)
.await
.unwrap();
@@ -1161,6 +1263,7 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
&analysis,
sink_table.clone(),
&sink_table_name,
+ None,
)
.await
.unwrap();
@@ -1183,6 +1286,67 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
assert_same_logical_plan(&rewritten, &expected);
}
+#[tokio::test]
+async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() {
+ // This verifies the rewrite placement when callers supply an already
+ // inferred sink dirty-window predicate. The task-level inference rules are
+ // covered by `infer_sink_time_window_filter_col` tests in task.rs.
+ let query_engine = create_test_query_engine();
+ let ctx = QueryContext::arc();
+ let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
+ let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
+ .await
+ .unwrap();
+ let analysis = analyze_incremental_aggregate_plan(&plan).unwrap().unwrap();
+ let sink_table = time_window_u32_table("time_window_sink");
+ let sink_table_name = [
+ "greptime".to_string(),
+ "public".to_string(),
+ "time_window_sink".to_string(),
+ ];
+ let dirty_filter = FilterExprInfo {
+ expr: unqualified_col("ts"),
+ col_name: "ts".to_string(),
+ time_ranges: vec![(
+ Timestamp::new_millisecond(0),
+ Timestamp::new_millisecond(1000),
+ )],
+ window_size: chrono::Duration::seconds(1),
+ };
+ let sink_filter = dirty_filter
+ .predicate_for_col("time_window")
+ .unwrap()
+ .unwrap();
+
+ let rewritten = rewrite_incremental_aggregate_with_sink_merge(
+ &plan,
+ &analysis,
+ sink_table.clone(),
+ &sink_table_name,
+ Some(sink_filter.clone()),
+ )
+ .await
+ .unwrap();
+
+ let expected = expected_left_join_rewrite_with_sink_filter(
+ &plan,
+ sink_table,
+ &sink_table_name,
+ vec![unqualified_col("time_window"), unqualified_col("number")],
+ vec![unqualified_col("time_window"), unqualified_col("number")],
+ Some(sink_filter),
+ (
+ vec![qualified_column("__flow_delta", "time_window")],
+ vec![qualified_column("__flow_sink", "time_window")],
+ ),
+ vec![
+ max_merge_expr("number"),
+ qualified_col("__flow_delta", "time_window").alias("time_window"),
+ ],
+ );
+ assert_same_logical_plan(&rewritten, &expected);
+}
+
#[tokio::test]
async fn test_analyze_incremental_aggregate_plan_rejects_global_aggregate() {
let query_engine = create_test_query_engine();
@@ -1230,6 +1394,7 @@ async fn test_rewrite_incremental_aggregate_rejects_empty_group_keys() {
&analysis,
sink_table,
&sink_table_name,
+ None,
)
.await
.unwrap_err();
@@ -1261,6 +1426,7 @@ async fn test_rewrite_incremental_aggregate_preserves_raw_aggregate_field_name()
&analysis,
sink_table.clone(),
&sink_table_name,
+ None,
)
.await
.unwrap();
diff --git a/src/flow/src/metrics.rs b/src/flow/src/metrics.rs
index 58c01793cc..00f93d47ab 100644
--- a/src/flow/src/metrics.rs
+++ b/src/flow/src/metrics.rs
@@ -87,6 +87,20 @@ lazy_static! {
&["flow_id"],
)
.unwrap();
+ pub static ref METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT: IntCounterVec =
+ register_int_counter_vec!(
+ "greptime_flow_batching_checkpoint_decision_count",
+ "flow batching checkpoint state-machine decisions",
+ &["flow_id", "mode", "decision", "reason"],
+ )
+ .unwrap();
+ pub static ref METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT: IntCounterVec =
+ register_int_counter_vec!(
+ "greptime_flow_batching_query_mode_count",
+ "flow batching query attempts by checkpoint mode",
+ &["flow_id", "mode"],
+ )
+ .unwrap();
pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(
diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs
index 24075601f6..e85bc28f9a 100644
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -303,7 +303,7 @@ impl Instance {
.await
}
_ => {
- query_interceptor.pre_execute(&stmt, None, query_ctx.clone())?;
+ query_interceptor.pre_execute(Some(&stmt), None, query_ctx.clone())?;
self.statement_executor
.execute_sql(stmt, query_ctx)
.await
@@ -326,7 +326,7 @@ impl Instance {
let QueryStatement::Sql(stmt) = stmt else {
unreachable!()
};
- query_interceptor.pre_execute(&stmt, Some(&plan), query_ctx.clone())?;
+ query_interceptor.pre_execute(Some(&stmt), Some(&plan), query_ctx.clone())?;
self.statement_executor
.exec_plan(plan, query_ctx.clone())
@@ -344,7 +344,11 @@ impl Instance {
.statement_executor
.plan_tql(tql.clone(), query_ctx)
.await?;
- query_interceptor.pre_execute(&Statement::Tql(tql), Some(&plan), query_ctx.clone())?;
+ query_interceptor.pre_execute(
+ Some(&Statement::Tql(tql)),
+ Some(&plan),
+ query_ctx.clone(),
+ )?;
self.statement_executor
.exec_plan(plan, query_ctx.clone())
.await
@@ -649,9 +653,7 @@ impl Instance {
let query_interceptor_opt = self.plugins.get::>();
let query_interceptor = query_interceptor_opt.as_ref();
- if let Some(ref s) = stmt {
- query_interceptor.pre_execute(s, Some(&plan), query_ctx.clone())?;
- }
+ query_interceptor.pre_execute(stmt.as_ref(), Some(&plan), query_ctx.clone())?;
let query = stmt
.as_ref()
@@ -880,7 +882,11 @@ impl PrometheusHandler for Instance {
.map_err(BoxedError::new)
.context(ExecuteQuerySnafu)?;
- interceptor.pre_execute(query, Some(&plan), query_ctx.clone())?;
+ let QueryStatement::Promql(eval_stmt, _) = &stmt else {
+ unreachable!("query is parsed from promql");
+ };
+
+ interceptor.pre_execute(query, &eval_stmt.expr, Some(&plan), query_ctx.clone())?;
// Take the EvalStmt from the original QueryStatement and use it to create the CatalogQueryStatement.
let query_statement = if let QueryStatement::Promql(eval_stmt, alias) = stmt {
@@ -892,7 +898,7 @@ impl PrometheusHandler for Instance {
}
.fail();
};
- let query = query_statement.to_string();
+ let raw_query = query_statement.to_string();
let slow_query_timer = self
.slow_query_options
@@ -912,7 +918,7 @@ impl PrometheusHandler for Instance {
let ticket = self.process_manager.register_query(
query_ctx.current_catalog().to_string(),
vec![query_ctx.current_schema()],
- query,
+ raw_query,
query_ctx.conn_info().to_string(),
Some(query_ctx.process_id()),
slow_query_timer,
@@ -1204,14 +1210,19 @@ fn should_track_plan_process(stmt: Option<&Statement>, plan: &LogicalPlan) -> bo
#[cfg(test)]
mod tests {
use std::collections::HashMap;
+ use std::future::Future;
+ use std::pin::Pin;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier};
+ use std::task::{Context, Poll};
use std::thread;
use std::time::{Duration, Instant};
use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse};
use catalog::process_manager::ProcessManager;
use common_base::Plugins;
+ use common_error::ext::{BoxedError, PlainError};
+ use common_error::status_code::StatusCode;
use common_meta::cache::LayeredCacheRegistryBuilder;
use common_meta::kv_backend::memory::MemoryKvBackend;
use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutor};
@@ -1220,23 +1231,142 @@ mod tests {
MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
};
use common_query::Output;
+ use common_recordbatch::{
+ OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream,
+ };
use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion_expr::dml::InsertOp;
use datafusion_expr::{LogicalPlanBuilder, LogicalTableSource};
use datatypes::prelude::ConcreteDataType;
- use datatypes::schema::{ColumnSchema, Schema as GtSchema};
+ use datatypes::schema::{ColumnSchema, Schema as GtSchema, SchemaRef as GtSchemaRef};
use query::query_engine::options::QueryOptions;
use session::context::{Channel, ConnInfo, QueryContext, QueryContextBuilder};
+ use snafu::{Location, Snafu};
use sql::dialect::GreptimeDbDialect;
+ use store_api::data_source::DataSource;
+ use store_api::storage::ScanRequest;
use strfmt::Format;
- use table::metadata::{TableInfoBuilder, TableMetaBuilder};
+ use table::metadata::{FilterPushDownType, TableInfo, TableInfoBuilder, TableMetaBuilder};
use table::test_util::EmptyTable;
+ use table::{Table, TableRef};
use tokio::sync::{mpsc, oneshot};
use super::*;
use crate::frontend::FrontendOptions;
use crate::instance::builder::FrontendBuilder;
+ #[derive(Debug, Snafu)]
+ enum TestError {
+ #[snafu(display("Failed to build test cache registry"))]
+ BuildCacheRegistry {
+ source: cache::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to build test table meta for table: {table_name}"))]
+ BuildTableMeta {
+ table_name: String,
+ source: table::metadata::TableMetaBuilderError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to build test table info for table: {table_name}"))]
+ BuildTableInfo {
+ table_name: String,
+ source: table::metadata::TableInfoBuilderError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to register test table: {table_name}"))]
+ RegisterTable {
+ table_name: String,
+ source: catalog::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to build test frontend instance"))]
+ BuildFrontend {
+ source: crate::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Expected exactly one output for SQL `{sql}`, got {actual}"))]
+ UnexpectedOutputCount {
+ sql: String,
+ actual: usize,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to execute SQL `{sql}`"))]
+ ExecuteSql {
+ sql: String,
+ source: crate::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Timed out waiting for insert-select start notification"))]
+ InsertStartTimeout {
+ source: tokio::time::error::Elapsed,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Insert-select start notification channel closed"))]
+ InsertStartChannelClosed {
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to release blocking insert-select interceptor"))]
+ ReleaseBlockedInsert {
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Timed out waiting for insert-select source to be polled"))]
+ SourcePollTimeout {
+ source: tokio::time::error::Elapsed,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Insert-select source poll notification channel closed"))]
+ SourcePollChannelClosed {
+ source: oneshot::error::RecvError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Timed out waiting for insert task to finish"))]
+ InsertTaskTimeout {
+ source: tokio::time::error::Elapsed,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Insert task panicked"))]
+ InsertTaskPanic {
+ source: tokio::task::JoinError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Expected insert-select to be cancelled"))]
+ InsertSelectNotCancelled {
+ #[snafu(implicit)]
+ location: Location,
+ },
+ }
+
+ type TestResult = std::result::Result;
+
fn parse_one_sql(sql: &str) -> Statement {
parse_stmt(sql, &GreptimeDbDialect {}).unwrap().remove(0)
}
@@ -1270,11 +1400,11 @@ mod tests {
fn pre_execute(
&self,
- statement: &Statement,
+ statement: Option<&Statement>,
_plan: Option<&LogicalPlan>,
_query_ctx: QueryContextRef,
) -> Result<()> {
- let Statement::Insert(insert) = statement else {
+ let Some(Statement::Insert(insert)) = statement else {
return Ok(());
};
if !insert.has_non_values_query_source() {
@@ -1292,6 +1422,70 @@ mod tests {
}
}
+ struct PendingRecordBatchStream {
+ schema: GtSchemaRef,
+ polled_tx: Option>,
+ _finish_tx: oneshot::Sender<()>,
+ finish_rx: Pin>>,
+ }
+
+ impl RecordBatchStream for PendingRecordBatchStream {
+ fn schema(&self) -> GtSchemaRef {
+ self.schema.clone()
+ }
+
+ fn output_ordering(&self) -> Option<&[OrderOption]> {
+ None
+ }
+
+ fn metrics(&self) -> Option {
+ None
+ }
+ }
+
+ impl Stream for PendingRecordBatchStream {
+ type Item = common_recordbatch::error::Result;
+
+ fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> {
+ if let Some(polled_tx) = self.polled_tx.take() {
+ let _ = polled_tx.send(());
+ }
+
+ match self.finish_rx.as_mut().poll(cx) {
+ Poll::Ready(_) => Poll::Ready(None),
+ Poll::Pending => Poll::Pending,
+ }
+ }
+ }
+
+ impl Unpin for PendingRecordBatchStream {}
+
+ struct PendingDataSource {
+ schema: GtSchemaRef,
+ polled_tx: std::sync::Mutex >>,
+ }
+
+ impl DataSource for PendingDataSource {
+ fn get_stream(
+ &self,
+ _request: ScanRequest,
+ ) -> std::result::Result {
+ let (finish_tx, finish_rx) = oneshot::channel();
+ let mut polled_tx = self.polled_tx.lock().map_err(|_| {
+ BoxedError::new(PlainError::new(
+ "pending data source lock poisoned".to_string(),
+ StatusCode::Unexpected,
+ ))
+ })?;
+ Ok(Box::pin(PendingRecordBatchStream {
+ schema: self.schema.clone(),
+ polled_tx: polled_tx.take(),
+ _finish_tx: finish_tx,
+ finish_rx: Box::pin(finish_rx),
+ }))
+ }
+ }
+
struct NoopProcedureExecutor;
#[async_trait::async_trait]
@@ -1353,18 +1547,18 @@ mod tests {
fn test_cache_registry(
kv_backend: common_meta::kv_backend::KvBackendRef,
- ) -> common_meta::cache::LayeredCacheRegistryRef {
- Arc::new(
+ ) -> TestResult {
+ Ok(Arc::new(
cache::with_default_composite_cache_registry(
LayeredCacheRegistryBuilder::default()
.add_cache_registry(cache::build_fundamental_cache_registry(kv_backend)),
)
- .unwrap()
+ .context(BuildCacheRegistrySnafu)?
.build(),
- )
+ ))
}
- fn test_table(table_id: u32, table_name: &str) -> table::TableRef {
+ fn test_table_info(table_id: u32, table_name: &str) -> TestResult {
let schema = Arc::new(GtSchema::new(vec![
ColumnSchema::new("id", ConcreteDataType::int32_datatype(), false),
ColumnSchema::new(
@@ -1380,36 +1574,85 @@ mod tests {
.value_indices(vec![1])
.next_column_id(1024)
.build()
- .unwrap();
- let table_info = TableInfoBuilder::new(table_name, table_meta)
+ .with_context(|_| BuildTableMetaSnafu {
+ table_name: table_name.to_string(),
+ })?;
+
+ TableInfoBuilder::new(table_name, table_meta)
.table_id(table_id)
.build()
- .unwrap();
+ .with_context(|_| BuildTableInfoSnafu {
+ table_name: table_name.to_string(),
+ })
+ }
- EmptyTable::from_table_info(&table_info)
+ fn test_table(table_id: u32, table_name: &str) -> TestResult {
+ let table_info = test_table_info(table_id, table_name)?;
+ Ok(EmptyTable::from_table_info(&table_info))
+ }
+
+ fn pending_table(
+ table_id: u32,
+ table_name: &str,
+ polled_tx: oneshot::Sender<()>,
+ ) -> TestResult {
+ let table_info = test_table_info(table_id, table_name)?;
+ let data_source = Arc::new(PendingDataSource {
+ schema: table_info.meta.schema.clone(),
+ polled_tx: std::sync::Mutex::new(Some(polled_tx)),
+ });
+
+ Ok(Arc::new(Table::new(
+ Arc::new(table_info),
+ FilterPushDownType::Unsupported,
+ data_source,
+ )))
+ }
+
+ async fn test_instance_with_tables(
+ source_table: TableRef,
+ target_table: TableRef,
+ ) -> TestResult {
+ test_instance_with_plugins(source_table, target_table, Plugins::new()).await
}
async fn test_instance_with_insert_select_interceptor(
interceptor: SqlQueryInterceptorRef,
- ) -> Instance {
+ ) -> TestResult {
+ let plugins = Plugins::new();
+ plugins.insert::>(interceptor);
+
+ test_instance_with_plugins(
+ test_table(1024, "source")?,
+ test_table(1025, "target")?,
+ plugins,
+ )
+ .await
+ }
+
+ async fn test_instance_with_plugins(
+ source_table: TableRef,
+ target_table: TableRef,
+ plugins: Plugins,
+ ) -> TestResult