diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index dd9209ffcd..549d6300ea 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,23 +5,23 @@ * @GreptimeTeam/db-approver ## [Module] Database Engine -/src/index @zhongzc +/src/index @evenyag @discord9 @WenyXu /src/mito2 @evenyag @v0y4g3r @waynexia -/src/query @evenyag +/src/query @evenyag @waynexia @discord9 ## [Module] Distributed -/src/common/meta @MichaelScofield -/src/common/procedure @MichaelScofield -/src/meta-client @MichaelScofield -/src/meta-srv @MichaelScofield +/src/common/meta @MichaelScofield @WenyXu +/src/common/procedure @MichaelScofield @WenyXu +/src/meta-client @MichaelScofield @WenyXu +/src/meta-srv @MichaelScofield @WenyXu ## [Module] Write Ahead Log -/src/log-store @v0y4g3r -/src/store-api @v0y4g3r +/src/log-store @v0y4g3r @WenyXu +/src/store-api @v0y4g3r @evenyag ## [Module] Metrics Engine -/src/metric-engine @waynexia -/src/promql @waynexia +/src/metric-engine @waynexia @WenyXu +/src/promql @waynexia @evenyag @discord9 ## [Module] Flow -/src/flow @zhongzc @waynexia +/src/flow @discord9 @waynexia diff --git a/.github/actions/build-greptime-binary/action.yml b/.github/actions/build-greptime-binary/action.yml index ecbc05ed38..62ee9eb599 100644 --- a/.github/actions/build-greptime-binary/action.yml +++ b/.github/actions/build-greptime-binary/action.yml @@ -32,9 +32,23 @@ inputs: description: Image Registry required: false default: 'docker.io' + large-page-size: + description: Build GreptimeDB with large page size (65536). + required: false + default: 'false' + runs: using: composite steps: + - name: Set extra build environment variables + shell: bash + run: | + if [[ '${{ inputs.large-page-size }}' == 'true' ]]; then + echo 'EXTRA_BUILD_ENVS="JEMALLOC_SYS_WITH_LG_PAGE=16"' >> $GITHUB_ENV + else + echo 'EXTRA_BUILD_ENVS=' >> $GITHUB_ENV + fi + - name: Build greptime binary shell: bash if: ${{ inputs.build-android-artifacts == 'false' }} @@ -45,7 +59,8 @@ runs: FEATURES=${{ inputs.features }} \ BASE_IMAGE=${{ inputs.base-image }} \ IMAGE_NAMESPACE=${{ inputs.image-namespace }} \ - IMAGE_REGISTRY=${{ inputs.image-registry }} + IMAGE_REGISTRY=${{ inputs.image-registry }} \ + EXTRA_BUILD_ENVS=$EXTRA_BUILD_ENVS - name: Upload artifacts uses: ./.github/actions/upload-artifacts diff --git a/.github/actions/build-linux-artifacts/action.yml b/.github/actions/build-linux-artifacts/action.yml index 9c88b25075..3cb9c43955 100644 --- a/.github/actions/build-linux-artifacts/action.yml +++ b/.github/actions/build-linux-artifacts/action.yml @@ -27,6 +27,10 @@ inputs: description: Working directory to build the artifacts required: false default: . + large-page-size: + description: Build GreptimeDB with large page size (65536). + required: false + default: 'false' runs: using: composite steps: @@ -59,6 +63,7 @@ runs: working-dir: ${{ inputs.working-dir }} image-registry: ${{ inputs.image-registry }} image-namespace: ${{ inputs.image-namespace }} + large-page-size: ${{ inputs.large-page-size }} - name: Clean up the target directory # Clean up the target directory for the centos7 base image, or it will still use the objects of last build. 
shell: bash @@ -77,6 +82,7 @@ runs: working-dir: ${{ inputs.working-dir }} image-registry: ${{ inputs.image-registry }} image-namespace: ${{ inputs.image-namespace }} + large-page-size: ${{ inputs.large-page-size }} - name: Build greptime on android base image uses: ./.github/actions/build-greptime-binary @@ -89,3 +95,4 @@ runs: build-android-artifacts: true image-registry: ${{ inputs.image-registry }} image-namespace: ${{ inputs.image-namespace }} + large-page-size: ${{ inputs.large-page-size }} diff --git a/.github/scripts/deploy-greptimedb.sh b/.github/scripts/deploy-greptimedb.sh index fca21993b4..10831f8625 100755 --- a/.github/scripts/deploy-greptimedb.sh +++ b/.github/scripts/deploy-greptimedb.sh @@ -7,6 +7,8 @@ KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.32.0}" ENABLE_STANDALONE_MODE="${ENABLE_STANDALONE_MODE:-true}" DEFAULT_INSTALL_NAMESPACE=${DEFAULT_INSTALL_NAMESPACE:-default} GREPTIMEDB_IMAGE_TAG=${GREPTIMEDB_IMAGE_TAG:-latest} +GREPTIMEDB_OPERATOR_IMAGE_TAG=${GREPTIMEDB_OPERATOR_IMAGE_TAG:-v0.5.1} +GREPTIMEDB_INITIALIZER_IMAGE_TAG="${GREPTIMEDB_OPERATOR_IMAGE_TAG}" GREPTIME_CHART="https://greptimeteam.github.io/helm-charts/" ETCD_CHART="oci://registry-1.docker.io/bitnamicharts/etcd" ETCD_CHART_VERSION="${ETCD_CHART_VERSION:-12.0.8}" @@ -58,7 +60,7 @@ function deploy_greptimedb_operator() { # Use the latest chart and image. helm upgrade --install greptimedb-operator greptime/greptimedb-operator \ --create-namespace \ - --set image.tag=latest \ + --set image.tag="$GREPTIMEDB_OPERATOR_IMAGE_TAG" \ -n "$DEFAULT_INSTALL_NAMESPACE" # Wait for greptimedb-operator to be ready. @@ -78,6 +80,7 @@ function deploy_greptimedb_cluster() { helm upgrade --install "$cluster_name" greptime/greptimedb-cluster \ --create-namespace \ --set image.tag="$GREPTIMEDB_IMAGE_TAG" \ + --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \ --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \ --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \ -n "$install_namespace" @@ -115,6 +118,7 @@ function deploy_greptimedb_cluster_with_s3_storage() { helm upgrade --install "$cluster_name" greptime/greptimedb-cluster -n "$install_namespace" \ --create-namespace \ --set image.tag="$GREPTIMEDB_IMAGE_TAG" \ + --set initializer.tag="$GREPTIMEDB_INITIALIZER_IMAGE_TAG" \ --set meta.backendStorage.etcd.endpoints="etcd.$install_namespace:2379" \ --set meta.backendStorage.etcd.storeKeyPrefix="$cluster_name" \ --set objectStorage.s3.bucket="$AWS_CI_TEST_BUCKET" \ diff --git a/.github/scripts/update-helm-charts-version.sh b/.github/scripts/update-helm-charts-version.sh index d501ed8d02..e60e991846 100755 --- a/.github/scripts/update-helm-charts-version.sh +++ b/.github/scripts/update-helm-charts-version.sh @@ -39,8 +39,11 @@ update_helm_charts_version() { --body "This PR updates the GreptimeDB version." \ --base main \ --head $BRANCH_NAME \ - --reviewer zyy17 \ - --reviewer daviderli614 + --reviewer sunng87 \ + --reviewer daviderli614 \ + --reviewer killme2008 \ + --reviewer evenyag \ + --reviewer fengjiachun } update_helm_charts_version diff --git a/.github/scripts/update-homebrew-greptme-version.sh b/.github/scripts/update-homebrew-greptme-version.sh index 4abf4f2218..f474f19778 100755 --- a/.github/scripts/update-homebrew-greptme-version.sh +++ b/.github/scripts/update-homebrew-greptme-version.sh @@ -35,8 +35,11 @@ update_homebrew_greptime_version() { --body "This PR updates the GreptimeDB version." 
\ --base main \ --head $BRANCH_NAME \ - --reviewer zyy17 \ - --reviewer daviderli614 + --reviewer sunng87 \ + --reviewer daviderli614 \ + --reviewer killme2008 \ + --reviewer evenyag \ + --reviewer fengjiachun } update_homebrew_greptime_version diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml index fad3e316e8..021867e4ed 100644 --- a/.github/workflows/dev-build.yml +++ b/.github/workflows/dev-build.yml @@ -4,10 +4,11 @@ name: GreptimeDB Development Build on: workflow_dispatch: # Allows you to run this workflow manually. inputs: - repository: - description: The public repository to build + large-page-size: + description: Build GreptimeDB with large page size (65536). + type: boolean required: false - default: GreptimeTeam/greptimedb + default: false commit: # Note: We only pull the source code and use the current workflow to build the artifacts. description: The commit to build required: true @@ -181,6 +182,7 @@ jobs: working-dir: ${{ env.CHECKOUT_GREPTIMEDB_PATH }} image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + large-page-size: ${{ inputs.large-page-size }} build-linux-arm64-artifacts: name: Build linux-arm64 artifacts @@ -214,6 +216,7 @@ jobs: working-dir: ${{ env.CHECKOUT_GREPTIMEDB_PATH }} image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + large-page-size: ${{ inputs.large-page-size }} release-images-to-dockerhub: name: Build and push images to DockerHub diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 8dde424c8e..af5ddc5368 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -613,6 +613,9 @@ jobs: - name: "MySQL Kvbackend" opts: "--setup-mysql" kafka: false + - name: "Flat format" + opts: "--enable-flat-format" + kafka: false timeout-minutes: 60 steps: - uses: actions/checkout@v4 @@ -808,7 +811,7 @@ jobs: - name: Setup external services working-directory: tests-integration/fixtures run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait - + - name: Run nextest cases run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend env: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index fc472e2d8b..71812a35bf 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -92,5 +92,6 @@ jobs: mode: - name: "Basic" - name: "Remote WAL" + - name: "Flat format" steps: - run: 'echo "No action required"' diff --git a/.github/workflows/multi-lang-tests.yml b/.github/workflows/multi-lang-tests.yml new file mode 100644 index 0000000000..6da0a658dd --- /dev/null +++ b/.github/workflows/multi-lang-tests.yml @@ -0,0 +1,57 @@ +name: Multi-language Integration Tests + +on: + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build-greptimedb: + if: ${{ github.repository == 'GreptimeTeam/greptimedb' }} + name: Build GreptimeDB binary + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - uses: arduino/setup-protoc@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: Swatinem/rust-cache@v2 + with: + shared-key: "multi-lang-build" + cache-all-crates: "true" + save-if: ${{ github.ref == 'refs/heads/main' }} + - name: Install cargo-gc-bin + shell: bash 
+ run: cargo install cargo-gc-bin --force + - name: Build greptime binary + shell: bash + run: cargo gc -- --bin greptime --features "pg_kvbackend,mysql_kvbackend" + - name: Pack greptime binary + shell: bash + run: | + mkdir bin && \ + mv ./target/debug/greptime bin + - name: Print greptime binary info + run: ls -lh bin + - name: Upload greptime binary + uses: actions/upload-artifact@v4 + with: + name: greptime-bin + path: bin/ + retention-days: 1 + + run-multi-lang-tests: + name: Run Multi-language SDK Tests + needs: build-greptimedb + uses: ./.github/workflows/run-multi-lang-tests.yml + with: + artifact-name: greptime-bin diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 6640d1d3df..710a767334 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -174,6 +174,18 @@ jobs: image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + run-multi-lang-tests: + name: Run Multi-language SDK Tests + if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'schedule' }} + needs: [ + allocate-runners, + build-linux-amd64-artifacts, + ] + uses: ./.github/workflows/run-multi-lang-tests.yml + with: + artifact-name: greptime-linux-amd64-${{ needs.allocate-runners.outputs.version }} + artifact-is-tarball: true + release-images-to-dockerhub: name: Build and push images to DockerHub if: ${{ inputs.release_images || github.event_name == 'schedule' }} @@ -301,7 +313,8 @@ jobs: if: ${{ github.repository == 'GreptimeTeam/greptimedb' && always() }} # Not requiring successful dependent jobs, always run. name: Send notification to Greptime team needs: [ - release-images-to-dockerhub + release-images-to-dockerhub, + run-multi-lang-tests, ] runs-on: ubuntu-latest permissions: @@ -319,17 +332,17 @@ jobs: run: pnpm tsx bin/report-ci-failure.ts env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - CI_REPORT_STATUS: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' }} + CI_REPORT_STATUS: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' && (needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped') }} - name: Notify nightly build successful result uses: slackapi/slack-github-action@v1.23.0 - if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' }} + if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' && (needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped') }} with: payload: | {"text": "GreptimeDB's ${{ env.NEXT_RELEASE_VERSION }} build has completed successfully."} - name: Notify nightly build failed result uses: slackapi/slack-github-action@v1.23.0 - if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result != 'success' }} + if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result != 'success' || needs.run-multi-lang-tests.result == 'failure' }} with: payload: | {"text": "GreptimeDB's ${{ env.NEXT_RELEASE_VERSION }} build has failed, please check ${{ steps.report-ci-status.outputs.html_url }}."} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bc9da93b9c..614500fab1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -215,6 +215,18 @@ jobs: image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + run-multi-lang-tests: + name: Run Multi-language SDK Tests + if: ${{ 
inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} + needs: [ + allocate-runners, + build-linux-amd64-artifacts, + ] + uses: ./.github/workflows/run-multi-lang-tests.yml + with: + artifact-name: greptime-linux-amd64-${{ needs.allocate-runners.outputs.version }} + artifact-is-tarball: true + build-macos-artifacts: name: Build macOS artifacts strategy: @@ -303,6 +315,7 @@ jobs: allocate-runners, build-linux-amd64-artifacts, build-linux-arm64-artifacts, + run-multi-lang-tests, ] runs-on: ubuntu-latest outputs: @@ -381,6 +394,7 @@ jobs: build-macos-artifacts, build-windows-artifacts, release-images-to-dockerhub, + run-multi-lang-tests, ] runs-on: ubuntu-latest steps: diff --git a/.github/workflows/run-multi-lang-tests.yml b/.github/workflows/run-multi-lang-tests.yml new file mode 100644 index 0000000000..f744d7a644 --- /dev/null +++ b/.github/workflows/run-multi-lang-tests.yml @@ -0,0 +1,194 @@ +# Reusable workflow for running multi-language SDK tests against GreptimeDB +# Used by: multi-lang-tests.yml, release.yml, nightly-build.yml +# Supports both direct binary artifacts and tarball artifacts + +name: Run Multi-language SDK Tests + +on: + workflow_call: + inputs: + artifact-name: + required: true + type: string + description: 'Name of the artifact containing greptime binary' + http-port: + required: false + type: string + default: '4000' + description: 'HTTP server port' + mysql-port: + required: false + type: string + default: '4002' + description: 'MySQL server port' + postgres-port: + required: false + type: string + default: '4003' + description: 'PostgreSQL server port' + db-name: + required: false + type: string + default: 'test_db' + description: 'Test database name' + username: + required: false + type: string + default: 'greptime_user' + description: 'Authentication username' + password: + required: false + type: string + default: 'greptime_pwd' + description: 'Authentication password' + timeout-minutes: + required: false + type: number + default: 30 + description: 'Job timeout in minutes' + artifact-is-tarball: + required: false + type: boolean + default: false + description: 'Whether the artifact is a tarball (tar.gz) that needs to be extracted' + +jobs: + run-tests: + name: Run Multi-language SDK Tests + runs-on: ubuntu-latest + timeout-minutes: ${{ inputs.timeout-minutes }} + steps: + - name: Checkout greptimedb-tests repository + uses: actions/checkout@v4 + with: + repository: GreptimeTeam/greptimedb-tests + persist-credentials: false + + - name: Download pre-built greptime binary + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.artifact-name }} + path: artifact + + - name: Setup greptime binary + run: | + mkdir -p bin + if [ "${{ inputs.artifact-is-tarball }}" = "true" ]; then + # Extract tarball and find greptime binary + tar -xzf artifact/*.tar.gz -C artifact + find artifact -name "greptime" -type f -exec cp {} bin/greptime \; + else + # Direct binary format + if [ -f artifact/greptime ]; then + cp artifact/greptime bin/greptime + else + cp artifact/* bin/greptime + fi + fi + chmod +x ./bin/greptime + ls -lh ./bin/greptime + ./bin/greptime --version + + - name: Setup Java 17 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + cache: 'maven' + + - name: Setup Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: '3.8' + + - name: Setup Go 1.24 + uses: actions/setup-go@v5 + with: + go-version: '1.24' + cache: true + cache-dependency-path: go-tests/go.sum + + - 
name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + + - name: Install Python dependencies + run: | + pip install mysql-connector-python psycopg2-binary + python3 -c "import mysql.connector; print(f'mysql-connector-python {mysql.connector.__version__}')" + python3 -c "import psycopg2; print(f'psycopg2 {psycopg2.__version__}')" + + - name: Install Go dependencies + working-directory: go-tests + run: | + go mod download + go mod verify + go version + + - name: Kill existing GreptimeDB processes + run: | + pkill -f greptime || true + sleep 2 + + - name: Start GreptimeDB standalone + run: | + ./bin/greptime standalone start \ + --http-addr 0.0.0.0:${{ inputs.http-port }} \ + --rpc-addr 0.0.0.0:4001 \ + --mysql-addr 0.0.0.0:${{ inputs.mysql-port }} \ + --postgres-addr 0.0.0.0:${{ inputs.postgres-port }} \ + --user-provider=static_user_provider:cmd:${{ inputs.username }}=${{ inputs.password }} > /tmp/greptimedb.log 2>&1 & + + - name: Wait for GreptimeDB to be ready + run: | + echo "Waiting for GreptimeDB..." + for i in {1..60}; do + if curl -sf http://localhost:${{ inputs.http-port }}/health > /dev/null; then + echo "✅ GreptimeDB is ready" + exit 0 + fi + sleep 2 + done + echo "❌ GreptimeDB failed to start" + cat /tmp/greptimedb.log + exit 1 + + - name: Run multi-language tests + env: + DB_NAME: ${{ inputs.db-name }} + MYSQL_HOST: 127.0.0.1 + MYSQL_PORT: ${{ inputs.mysql-port }} + POSTGRES_HOST: 127.0.0.1 + POSTGRES_PORT: ${{ inputs.postgres-port }} + HTTP_HOST: 127.0.0.1 + HTTP_PORT: ${{ inputs.http-port }} + GREPTIME_USERNAME: ${{ inputs.username }} + GREPTIME_PASSWORD: ${{ inputs.password }} + run: | + chmod +x ./run_tests.sh + ./run_tests.sh + + - name: Collect logs on failure + if: failure() + run: | + echo "=== GreptimeDB Logs ===" + cat /tmp/greptimedb.log || true + + - name: Upload test logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: test-logs + path: | + /tmp/greptimedb.log + java-tests/target/surefire-reports/ + python-tests/.pytest_cache/ + go-tests/*.log + **/test-output/ + retention-days: 7 + + - name: Cleanup + if: always() + run: | + pkill -f greptime || true diff --git a/Cargo.lock b/Cargo.lock index 68872d0cd3..d166138407 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -212,8 +212,9 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ + "arrow-schema", "common-base", "common-decimal", "common-error", @@ -732,7 +733,7 @@ dependencies = [ [[package]] name = "auth" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -1264,7 +1265,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" dependencies = [ "memchr", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "serde", ] @@ -1382,7 +1383,7 @@ dependencies = [ [[package]] name = "cache" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "catalog", "common-error", @@ -1417,7 +1418,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow", @@ -1629,6 +1630,7 @@ dependencies = [ "chrono", "chrono-tz-build", "phf 0.11.3", + "uncased", ] [[package]] @@ -1639,6 +1641,8 @@ checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" dependencies = [ "parse-zoneinfo", "phf_codegen 
0.11.3", + "phf_shared 0.11.3", + "uncased", ] [[package]] @@ -1759,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "cli" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-stream", "async-trait", @@ -1812,7 +1816,7 @@ dependencies = [ [[package]] name = "client" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arc-swap", @@ -1844,8 +1848,8 @@ dependencies = [ "serde_json", "snafu 0.8.6", "store-api", - "substrait 0.18.0", "substrait 0.37.3", + "substrait 1.0.0-beta.2", "tokio", "tokio-stream", "tonic 0.13.1", @@ -1885,7 +1889,7 @@ dependencies = [ [[package]] name = "cmd" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "auth", @@ -1896,6 +1900,7 @@ dependencies = [ "clap 4.5.40", "cli", "client", + "colored", "common-base", "common-catalog", "common-config", @@ -1917,6 +1922,7 @@ dependencies = [ "common-wal", "datanode", "datatypes", + "either", "etcd-client", "file-engine", "flow", @@ -1932,7 +1938,9 @@ dependencies = [ "moka", "nu-ansi-term", "object-store", + "parquet", "plugins", + "pprof", "prometheus", "prost 0.13.5", "query", @@ -1975,6 +1983,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "comfy-table" version = "7.1.2" @@ -1994,7 +2012,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "anymap2", "async-trait", @@ -2004,9 +2022,11 @@ dependencies = [ "common-macro", "common-test-util", "futures", + "lazy_static", "paste", "pin-project", "rand 0.9.1", + "regex", "serde", "snafu 0.8.6", "tokio", @@ -2016,16 +2036,18 @@ dependencies = [ [[package]] name = "common-catalog" -version = "0.18.0" +version = "1.0.0-beta.2" +dependencies = [ + "const_format", +] [[package]] name = "common-config" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-error", "common-macro", - "common-stat", "common-telemetry", "common-test-util", "common-wal", @@ -2045,7 +2067,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "arrow", "arrow-schema", @@ -2080,7 +2102,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "bigdecimal 0.4.8", "common-error", @@ -2093,7 +2115,7 @@ dependencies = [ [[package]] name = "common-error" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-macro", "http 1.3.1", @@ -2104,7 +2126,7 @@ dependencies = [ [[package]] name = "common-event-recorder" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -2126,7 +2148,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -2148,7 +2170,7 @@ dependencies = [ [[package]] name = "common-function" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -2186,11 +2208,13 @@ dependencies = [ "hyperloglogplus", "jsonb", "memchr", + "mito-codec", 
"nalgebra", "num", "num-traits", "paste", "pretty_assertions", + "regex", "s2", "serde", "serde_json", @@ -2206,7 +2230,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "common-runtime", @@ -2223,7 +2247,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-flight", @@ -2242,11 +2266,13 @@ dependencies = [ "hyper 1.6.0", "hyper-util", "lazy_static", + "notify", "prost 0.13.5", "rand 0.9.1", "serde", "serde_json", "snafu 0.8.6", + "tempfile", "tokio", "tokio-util", "tonic 0.13.1", @@ -2256,7 +2282,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "common-base", @@ -2276,7 +2302,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "greptime-proto", "once_cell", @@ -2287,7 +2313,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "anyhow", "common-error", @@ -2303,7 +2329,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "anymap2", "api", @@ -2375,7 +2401,7 @@ dependencies = [ [[package]] name = "common-options" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-grpc", "humantime-serde", @@ -2384,11 +2410,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "0.18.0" +version = "1.0.0-beta.2" [[package]] name = "common-pprof" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-error", "common-macro", @@ -2400,7 +2426,7 @@ dependencies = [ [[package]] name = "common-procedure" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-stream", @@ -2429,7 +2455,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "common-procedure", @@ -2439,7 +2465,7 @@ dependencies = [ [[package]] name = "common-query" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -2454,6 +2480,7 @@ dependencies = [ "datafusion-expr", "datatypes", "futures-util", + "once_cell", "serde", "snafu 0.8.6", "sqlparser", @@ -2464,7 +2491,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "arc-swap", "common-base", @@ -2488,7 +2515,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "clap 4.5.40", @@ -2517,7 +2544,7 @@ dependencies = [ [[package]] name = "common-session" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "serde", "strum 0.27.1", @@ -2525,7 +2552,7 @@ dependencies = [ [[package]] name = "common-sql" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-decimal", @@ -2543,19 +2570,22 @@ dependencies = [ [[package]] name = "common-stat" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-base", + "common-runtime", + "common-telemetry", "lazy_static", "nix 0.30.1", "num_cpus", "prometheus", "sysinfo", + "tokio", ] [[package]] name = "common-telemetry" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "backtrace", "common-base", @@ -2584,7 +2614,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "0.18.0" +version = 
"1.0.0-beta.2" dependencies = [ "client", "common-grpc", @@ -2597,7 +2627,7 @@ dependencies = [ [[package]] name = "common-time" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "arrow", "chrono", @@ -2615,7 +2645,7 @@ dependencies = [ [[package]] name = "common-version" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "build-data", "cargo-manifest", @@ -2626,7 +2656,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-error", @@ -2649,7 +2679,7 @@ dependencies = [ [[package]] name = "common-workload" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "common-telemetry", "serde", @@ -3711,9 +3741,9 @@ dependencies = [ [[package]] name = "datafusion-pg-catalog" -version = "0.11.0" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f258caedd1593e7dca3bf53912249de6685fa224bcce897ede1fbb7b040ac6f6" +checksum = "755393864c0c2dd95575ceed4b25e348686028e1b83d06f8f39914209999f821" dependencies = [ "async-trait", "datafusion", @@ -3886,7 +3916,7 @@ dependencies = [ [[package]] name = "datanode" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-flight", @@ -3907,6 +3937,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-test-util", "common-time", @@ -3949,7 +3980,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4585,7 +4616,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" dependencies = [ "bit-set", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "regex-syntax 0.8.7", ] @@ -4621,7 +4652,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-engine" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -4753,7 +4784,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" [[package]] name = "flow" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow", @@ -4822,7 +4853,7 @@ dependencies = [ "sql", "store-api", "strum 0.27.1", - "substrait 0.18.0", + "substrait 1.0.0-beta.2", "table", "tokio", "tonic 0.13.1", @@ -4877,7 +4908,7 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619" [[package]] name = "frontend" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arc-swap", @@ -4904,6 +4935,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-test-util", "common-time", @@ -5319,7 +5351,7 @@ dependencies = [ [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=72a0d22e0f5f716b2ee21bca091f87a88c36e5ca#72a0d22e0f5f716b2ee21bca091f87a88c36e5ca" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0df99f09f1d6785055b2d9da96fc4ecc2bdf6803#0df99f09f1d6785055b2d9da96fc4ecc2bdf6803" dependencies = [ "prost 0.13.5", "prost-types 0.13.5", @@ -6087,7 +6119,7 @@ dependencies = [ [[package]] name = "index" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "asynchronous-codec", @@ -6114,7 +6146,7 @@ dependencies = [ "rand 0.9.1", "rand_chacha 0.9.0", "regex", - "regex-automata 0.4.9", + 
"regex-automata 0.4.13", "roaring", "serde", "serde_json", @@ -6299,17 +6331,6 @@ dependencies = [ "derive_utils", ] -[[package]] -name = "io-uring" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" -dependencies = [ - "bitflags 2.9.1", - "cfg-if", - "libc", -] - [[package]] name = "ipnet" version = "2.11.0" @@ -6731,7 +6752,7 @@ version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5baa5e9ff84f1aefd264e6869907646538a52147a755d494517a8007fb48733" dependencies = [ - "regex-automata 0.4.9", + "regex-automata 0.4.13", "rustversion", ] @@ -7027,7 +7048,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "log-query" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "chrono", "common-error", @@ -7039,7 +7060,7 @@ dependencies = [ [[package]] name = "log-store" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-stream", "async-trait", @@ -7346,7 +7367,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -7374,7 +7395,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -7398,6 +7419,7 @@ dependencies = [ "common-procedure", "common-procedure-test", "common-runtime", + "common-stat", "common-telemetry", "common-time", "common-version", @@ -7421,7 +7443,9 @@ dependencies = [ "lazy_static", "local-ip-address", "once_cell", + "ordered-float 4.6.0", "parking_lot 0.12.4", + "partition", "prometheus", "prost 0.13.5", "rand 0.9.1", @@ -7471,7 +7495,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "aquamarine", @@ -7489,6 +7513,7 @@ dependencies = [ "common-telemetry", "common-test-util", "common-time", + "common-wal", "datafusion", "datatypes", "futures-util", @@ -7565,7 +7590,7 @@ dependencies = [ [[package]] name = "mito-codec" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "bytes", @@ -7573,6 +7598,7 @@ dependencies = [ "common-decimal", "common-error", "common-macro", + "common-query", "common-recordbatch", "common-telemetry", "common-time", @@ -7589,7 +7615,7 @@ dependencies = [ [[package]] name = "mito2" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "aquamarine", @@ -8327,7 +8353,7 @@ dependencies = [ [[package]] name = "object-store" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "anyhow", "bytes", @@ -8336,6 +8362,7 @@ dependencies = [ "common-macro", "common-telemetry", "common-test-util", + "derive_builder 0.20.2", "futures", "humantime-serde", "lazy_static", @@ -8506,7 +8533,7 @@ dependencies = [ [[package]] name = "opensrv-mysql" version = "0.8.0" -source = "git+https://github.com/datafuselabs/opensrv?rev=a1fb4da215c8693c7e4f62be249a01b7fec52997#a1fb4da215c8693c7e4f62be249a01b7fec52997" +source = "git+https://github.com/datafuselabs/opensrv?tag=v0.10.0#074bd8fb81da3c9e6d6a098a482f3380478b9c0b" dependencies = [ "async-trait", "byteorder", @@ -8612,7 +8639,7 @@ dependencies = [ [[package]] name = "operator" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -8638,6 +8665,7 @@ dependencies = [ "common-recordbatch", "common-runtime", "common-sql", + "common-stat", "common-telemetry", "common-test-util", 
"common-time", @@ -8649,6 +8677,7 @@ dependencies = [ "futures", "futures-util", "humantime", + "itertools 0.14.0", "jsonb", "lazy_static", "meta-client", @@ -8670,7 +8699,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 0.18.0", + "substrait 1.0.0-beta.2", "table", "tokio", "tokio-util", @@ -8956,7 +8985,7 @@ dependencies = [ [[package]] name = "partition" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -9160,10 +9189,21 @@ dependencies = [ ] [[package]] -name = "pgwire" -version = "0.34.2" +name = "pg_interval" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f56a81b4fcc69016028f657a68f9b8e8a2a4b7d07684ca3298f2d3e7ff199ce" +checksum = "fe46640b465e284b048ef065cbed8ef17a622878d310c724578396b4cfd00df2" +dependencies = [ + "bytes", + "chrono", + "postgres-types", +] + +[[package]] +name = "pgwire" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d331bb0eef5bc83a221c0a85b1f205bccf094d4f72a26ae1d68a1b1c535123b7" dependencies = [ "async-trait", "base64 0.22.1", @@ -9179,6 +9219,7 @@ dependencies = [ "ring", "rust_decimal", "rustls-pki-types", + "ryu", "serde", "serde_json", "stringprep", @@ -9255,6 +9296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ "siphasher", + "uncased", ] [[package]] @@ -9300,7 +9342,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -9456,9 +9498,10 @@ dependencies = [ [[package]] name = "plugins" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "auth", + "catalog", "clap 4.5.40", "cli", "common-base", @@ -9467,6 +9510,7 @@ dependencies = [ "datanode", "flow", "frontend", + "meta-client", "meta-srv", "serde", "snafu 0.8.6", @@ -9756,7 +9800,7 @@ dependencies = [ [[package]] name = "promql" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "async-trait", @@ -10039,7 +10083,7 @@ dependencies = [ [[package]] name = "puffin" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-compression 0.4.19", "async-trait", @@ -10081,7 +10125,7 @@ dependencies = [ [[package]] name = "query" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -10105,6 +10149,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-time", "datafusion", @@ -10147,7 +10192,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 0.18.0", + "substrait 1.0.0-beta.2", "table", "tokio", "tokio-stream", @@ -10464,13 +10509,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.1" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.9", + "regex-automata 0.4.13", "regex-syntax 0.8.7", ] @@ -10485,9 +10530,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = 
"5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -11483,7 +11528,7 @@ dependencies = [ [[package]] name = "servers" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -11560,6 +11605,7 @@ dependencies = [ "otel-arrow-rust", "parking_lot 0.12.4", "permutation", + "pg_interval", "pgwire", "pin-project", "pipeline", @@ -11601,6 +11647,7 @@ dependencies = [ "tower 0.5.2", "tower-http 0.6.6", "tracing", + "tracing-opentelemetry", "urlencoding", "uuid", "vrl", @@ -11609,7 +11656,7 @@ dependencies = [ [[package]] name = "session" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -11943,7 +11990,7 @@ dependencies = [ [[package]] name = "sql" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-buffer", @@ -11964,6 +12011,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "datatypes", + "either", "hex", "humantime", "iso8601", @@ -12002,7 +12050,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "clap 4.5.40", @@ -12279,7 +12327,7 @@ dependencies = [ [[package]] name = "standalone" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "catalog", @@ -12320,7 +12368,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "store-api" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "aquamarine", @@ -12485,28 +12533,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "substrait" -version = "0.18.0" -dependencies = [ - "async-trait", - "bytes", - "common-error", - "common-function", - "common-macro", - "common-telemetry", - "datafusion", - "datafusion-common", - "datafusion-expr", - "datafusion-substrait", - "datatypes", - "promql", - "prost 0.13.5", - "snafu 0.8.6", - "substrait 0.37.3", - "tokio", -] - [[package]] name = "substrait" version = "0.37.3" @@ -12553,6 +12579,28 @@ dependencies = [ "walkdir", ] +[[package]] +name = "substrait" +version = "1.0.0-beta.2" +dependencies = [ + "async-trait", + "bytes", + "common-error", + "common-function", + "common-macro", + "common-telemetry", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-substrait", + "datatypes", + "promql", + "prost 0.13.5", + "snafu 0.8.6", + "substrait 0.37.3", + "tokio", +] + [[package]] name = "subtle" version = "2.6.1" @@ -12656,7 +12704,7 @@ dependencies = [ [[package]] name = "table" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -12895,7 +12943,7 @@ dependencies = [ "getrandom 0.3.3", "once_cell", "rustix 1.0.7", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -12925,7 +12973,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" [[package]] name = "tests-fuzz" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "arbitrary", "async-trait", @@ -12969,7 +13017,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "0.18.0" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-flight", @@ -12995,6 +13043,7 @@ dependencies = [ "common-query", "common-recordbatch", "common-runtime", + "common-stat", "common-telemetry", "common-test-util", "common-time", @@ -13018,6 +13067,7 @@ dependencies = [ "loki-proto", "meta-client", "meta-srv", + "mito2", "moka", "mysql_async", "object-store", @@ -13042,7 +13092,7 @@ dependencies = [ "sqlx", 
"standalone", "store-api", - "substrait 0.18.0", + "substrait 1.0.0-beta.2", "table", "tempfile", "time", @@ -13244,23 +13294,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.47.1" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "backtrace", "bytes", - "io-uring", "libc", "mio", "parking_lot 0.12.4", "pin-project-lite", "signal-hook-registry", - "slab", "socket2 0.6.0", "tokio-macros", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -13275,9 +13322,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", @@ -13955,6 +14002,15 @@ dependencies = [ "serde", ] +[[package]] +name = "uncased" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697" +dependencies = [ + "version_check", +] + [[package]] name = "unescaper" version = "0.1.6" @@ -14699,6 +14755,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", +] + [[package]] name = "windows-targets" version = "0.48.5" diff --git a/Cargo.toml b/Cargo.toml index b76c5ae1cf..d0a2f66f58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.18.0" +version = "1.0.0-beta.2" edition = "2024" license = "Apache-2.0" @@ -118,9 +118,10 @@ bitflags = "2.4.1" bytemuck = "1.12" bytes = { version = "1.7", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] } -chrono-tz = "0.10.1" +chrono-tz = { version = "0.10.1", features = ["case-insensitive"] } clap = { version = "4.4", features = ["derive"] } config = "0.13.0" +const_format = "0.2" crossbeam-utils = "0.8" dashmap = "6.1" datafusion = "50" @@ -130,7 +131,7 @@ datafusion-functions = "50" datafusion-functions-aggregate-common = "50" datafusion-optimizer = "50" datafusion-orc = "0.5" -datafusion-pg-catalog = "0.11" +datafusion-pg-catalog = "0.12.2" datafusion-physical-expr = "50" datafusion-physical-plan = "50" datafusion-sql = "50" @@ -147,7 +148,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "72a0d22e0f5f716b2ee21bca091f87a88c36e5ca" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0df99f09f1d6785055b2d9da96fc4ecc2bdf6803" } hex = "0.4" http = "1" humantime = "2.1" @@ -191,7 +192,7 @@ prost-types = "0.13" raft-engine = { version = "0.4.1", default-features = false } rand = "0.9" ratelimit = "0.10" -regex = "1.8" +regex = "1.12" regex-automata = "0.4" reqwest = { version = "0.12", default-features = false, features = [ "json", @@ -218,12 +219,7 @@ similar-asserts = 
"1.6.0" smallvec = { version = "1", features = ["serde"] } snafu = "0.8" sqlparser = { version = "0.58.0", default-features = false, features = ["std", "visitor", "serde"] } -sqlx = { version = "0.8", features = [ - "runtime-tokio-rustls", - "mysql", - "postgres", - "chrono", -] } +sqlx = { version = "0.8", default-features = false, features = ["any", "macros", "json", "runtime-tokio-rustls"] } strum = { version = "0.27", features = ["derive"] } sysinfo = "0.33" tempfile = "3" @@ -238,6 +234,7 @@ tower = "0.5" tower-http = "0.6" tracing = "0.1" tracing-appender = "0.2" +tracing-opentelemetry = "0.31.0" tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] } typetag = "0.2" uuid = { version = "1.17", features = ["serde", "v4", "fast-rng"] } diff --git a/Makefile b/Makefile index a200244029..91fb600d14 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,8 @@ CARGO_REGISTRY_CACHE ?= ${HOME}/.cargo/registry ARCH := $(shell uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/') OUTPUT_DIR := $(shell if [ "$(RELEASE)" = "true" ]; then echo "release"; elif [ ! -z "$(CARGO_PROFILE)" ]; then echo "$(CARGO_PROFILE)" ; else echo "debug"; fi) SQLNESS_OPTS ?= +EXTRA_BUILD_ENVS ?= +ASSEMBLED_EXTRA_BUILD_ENV := $(foreach var,$(EXTRA_BUILD_ENVS),-e $(var)) # The arguments for running integration tests. ETCD_VERSION ?= v3.5.9 @@ -83,6 +85,7 @@ build: ## Build debug version greptime. .PHONY: build-by-dev-builder build-by-dev-builder: ## Build greptime by dev-builder. docker run --network=host \ + ${ASSEMBLED_EXTRA_BUILD_ENV} \ -v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \ -w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \ make build \ diff --git a/README.md b/README.md index 94944c36ba..6c83582a24 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,7 @@

- GreptimeCloud | - User Guide | + User Guide | API Docs | Roadmap 2025

@@ -67,17 +66,24 @@ ## Introduction -**GreptimeDB** is an open-source, cloud-native database purpose-built for the unified collection and analysis of observability data (metrics, logs, and traces). Whether you’re operating on the edge, in the cloud, or across hybrid environments, GreptimeDB empowers real-time insights at massive scale — all in one system. +**GreptimeDB** is an open-source, cloud-native database that unifies metrics, logs, and traces, enabling real-time observability at any scale — across edge, cloud, and hybrid environments. ## Features | Feature | Description | | --------- | ----------- | -| [Unified Observability Data](https://docs.greptime.com/user-guide/concepts/why-greptimedb) | Store metrics, logs, and traces as timestamped, contextual wide events. Query via [SQL](https://docs.greptime.com/user-guide/query-data/sql), [PromQL](https://docs.greptime.com/user-guide/query-data/promql), and [streaming](https://docs.greptime.com/user-guide/flow-computation/overview). | -| [High Performance & Cost Effective](https://docs.greptime.com/user-guide/manage-data/data-index) | Written in Rust, with a distributed query engine, [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index), and optimized columnar storage, delivering sub-second responses at PB scale. | -| [Cloud-Native Architecture](https://docs.greptime.com/user-guide/concepts/architecture) | Designed for [Kubernetes](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management), with compute/storage separation, native object storage (AWS S3, Azure Blob, etc.) and seamless cross-cloud access. | -| [Developer-Friendly](https://docs.greptime.com/user-guide/protocols/overview) | Access via SQL/PromQL interfaces, REST API, MySQL/PostgreSQL protocols, and popular ingestion [protocols](https://docs.greptime.com/user-guide/protocols/overview). | -| [Flexible Deployment](https://docs.greptime.com/user-guide/deployments-administration/overview) | Deploy anywhere: edge (including ARM/[Android](https://docs.greptime.com/user-guide/deployments-administration/run-on-android)) or cloud, with unified APIs and efficient data sync. | +| [All-in-One Observability](https://docs.greptime.com/user-guide/concepts/why-greptimedb) | OpenTelemetry-native platform unifying metrics, logs, and traces. Query via [SQL](https://docs.greptime.com/user-guide/query-data/sql), [PromQL](https://docs.greptime.com/user-guide/query-data/promql), and [Flow](https://docs.greptime.com/user-guide/flow-computation/overview). | +| [High Performance](https://docs.greptime.com/user-guide/manage-data/data-index) | Written in Rust with [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index) (inverted, fulltext, skipping, vector), delivering sub-second responses at PB scale. | +| [Cost Efficiency](https://docs.greptime.com/user-guide/concepts/architecture) | 50x lower operational and storage costs with compute-storage separation and native object storage (S3, Azure Blob, etc.). | +| [Cloud-Native & Scalable](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management) | Purpose-built for [Kubernetes](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management) with unlimited cross-cloud scaling, handling hundreds of thousands of concurrent requests. 
| +| [Developer-Friendly](https://docs.greptime.com/user-guide/protocols/overview) | SQL/PromQL interfaces, built-in web dashboard, REST API, MySQL/PostgreSQL protocol compatibility, and native [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) support. | +| [Flexible Deployment](https://docs.greptime.com/user-guide/deployments-administration/overview) | Deploy anywhere from ARM-based edge devices (including [Android](https://docs.greptime.com/user-guide/deployments-administration/run-on-android)) to cloud, with unified APIs and efficient data sync. | + + ✅ **Perfect for:** + - Unified observability stack replacing Prometheus + Loki + Tempo + - Large-scale metrics with high cardinality (millions to billions of time series) + - Large-scale observability platform requiring cost efficiency and scalability + - IoT and edge computing with resource and bandwidth constraints Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb) and [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database). @@ -86,10 +92,10 @@ Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why | Feature | GreptimeDB | Traditional TSDB | Log Stores | |----------------------------------|-----------------------|--------------------|-----------------| | Data Types | Metrics, Logs, Traces | Metrics only | Logs only | -| Query Language | SQL, PromQL, Streaming| Custom/PromQL | Custom/DSL | +| Query Language | SQL, PromQL | Custom/PromQL | Custom/DSL | | Deployment | Edge + Cloud | Cloud/On-prem | Mostly central | | Indexing & Performance | PB-Scale, Sub-second | Varies | Varies | -| Integration | REST, SQL, Common protocols | Varies | Varies | +| Integration | REST API, SQL, Common protocols | Varies | Varies | **Performance:** * [GreptimeDB tops JSONBench's billion-record cold run test!](https://greptime.com/blogs/2025-03-18-jsonbench-greptimedb-performance) @@ -99,22 +105,18 @@ Read [more benchmark reports](https://docs.greptime.com/user-guide/concepts/feat ## Architecture -* Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. -* [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB: +GreptimeDB can run in two modes: +* **Standalone Mode** - Single binary for development and small deployments +* **Distributed Mode** - Separate components for production scale: + - Frontend: Query processing and protocol handling + - Datanode: Data storage and retrieval + - Metasrv: Metadata management and coordination + +Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB: GreptimeDB System Overview ## Try GreptimeDB -### 1. [Live Demo](https://greptime.com/playground) - -Experience GreptimeDB directly in your browser. - -### 2. [GreptimeCloud](https://console.greptime.cloud/) - -Start instantly with a free cluster. - -### 3. 
Docker (Local Quickstart) - ```shell docker pull greptime/greptimedb ``` @@ -130,7 +132,8 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \ --postgres-addr 0.0.0.0:4003 ``` Dashboard: [http://localhost:4000/dashboard](http://localhost:4000/dashboard) -[Full Install Guide](https://docs.greptime.com/getting-started/installation/overview) + +Read more in the [full Install Guide](https://docs.greptime.com/getting-started/installation/overview). **Troubleshooting:** * Cannot connect to the database? Ensure that ports `4000`, `4001`, `4002`, and `4003` are not blocked by a firewall or used by other services. @@ -159,21 +162,26 @@ cargo run -- standalone start ## Tools & Extensions -- **Kubernetes:** [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator) -- **Helm Charts:** [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts) -- **Dashboard:** [Web UI](https://github.com/GreptimeTeam/dashboard) -- **SDKs/Ingester:** [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust), [JS](https://github.com/GreptimeTeam/greptimedb-ingester-js) -- **Grafana**: [Official Dashboard](https://github.com/GreptimeTeam/greptimedb/blob/main/grafana/README.md) +- **Kubernetes**: [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator) +- **Helm Charts**: [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts) +- **Dashboard**: [Web UI](https://github.com/GreptimeTeam/dashboard) +- **gRPC Ingester**: [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust) +- **Grafana Data Source**: [GreptimeDB Grafana data source plugin](https://github.com/GreptimeTeam/greptimedb-grafana-datasource) +- **Grafana Dashboard**: [Official Dashboard for monitoring](https://github.com/GreptimeTeam/greptimedb/blob/main/grafana/README.md) ## Project Status -> **Status:** Beta. -> **GA (v1.0):** Targeted for mid 2025. +> **Status:** Beta — marching toward v1.0 GA! +> **GA (v1.0):** January 10, 2026 -- Being used in production by early adopters +- Deployed in production by open-source projects and commercial users - Stable, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version)) - Suitable for evaluation and pilot deployments +GreptimeDB v1.0 represents a major milestone toward maturity — marking stable APIs, production readiness, and proven performance. + +**Roadmap:** Beta1 (Nov 10) → Beta2 (Nov 24) → RC1 (Dec 8) → GA (Jan 10, 2026), please read [v1.0 highlights and release plan](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) for details. + For production use, we recommend using the latest stable release. [![Star History Chart](https://api.star-history.com/svg?repos=GreptimeTeam/GreptimeDB&type=Date)](https://www.star-history.com/#GreptimeTeam/GreptimeDB&Date) @@ -214,5 +222,5 @@ Special thanks to all contributors! 
See [AUTHORS.md](https://github.com/Greptime - Uses [Apache Arrow™](https://arrow.apache.org/) (memory model) - [Apache Parquet™](https://parquet.apache.org/) (file storage) -- [Apache Arrow DataFusion™](https://arrow.apache.org/datafusion/) (query engine) +- [Apache DataFusion™](https://arrow.apache.org/datafusion/) (query engine) - [Apache OpenDAL™](https://opendal.apache.org/) (data access abstraction) diff --git a/config/config.md b/config/config.md index 72d48b5bcb..29185c6b58 100644 --- a/config/config.md +++ b/config/config.md @@ -13,9 +13,10 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `default_timezone` | String | Unset | The default timezone of the server. | +| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | -| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. | +| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.
NOTE: This setting affects scan_memory_limit's privileged tier allocation.
When set, 70% of queries get privileged memory access (full scan_memory_limit).
The remaining 30% get standard tier access (70% of scan_memory_limit). | | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. | | `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. | | `runtime` | -- | -- | The runtime options. | @@ -103,6 +104,7 @@ | `flow.num_workers` | Integer | `0` | The number of flow worker in flownode.
Not setting(or set to 0) this value will use the number of CPU cores divided by 2. | | `query` | -- | -- | The query engine options. | | `query.parallelism` | Integer | `0` | Parallelism of the query engine.
Default to 0, which means the number of CPU cores. | +| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).
Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit (unbounded, default behavior).
When this limit is reached, queries will fail with ResourceExhausted error.
NOTE: This does NOT limit memory used by table scans. | | `storage` | -- | -- | The data storage options. | | `storage.data_home` | String | `./greptimedb_data` | The working home directory. | | `storage.type` | String | `File` | The storage type used to store the data.
- `File`: the data is stored in the local file system.
- `S3`: the data is stored in the S3 object storage.
- `Gcs`: the data is stored in the Google Cloud Storage.
- `Azblob`: the data is stored in the Azure Blob Storage.
- `Oss`: the data is stored in the Aliyun OSS. | @@ -150,10 +152,13 @@ | `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. | | `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. | +| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).
When enabled, index files are loaded into the write cache during region initialization,
which can improve query performance at the cost of longer startup times. | +| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).
The remaining capacity is used for data (parquet) files.
Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
1GiB is reserved for index files and 4GiB for data files. | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | +| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit.
NOTE: Works with max_concurrent_queries for tiered memory allocation.
- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | | `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | @@ -187,7 +192,7 @@ | `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.
Only available for `partition_tree` memtable. | | `region_engine.file` | -- | -- | Enable the file engine. | | `region_engine.metric` | -- | -- | Metric engine options. | -| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. | +| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. | | `logging` | -- | -- | The logging options. | | `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. | | `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. | @@ -205,14 +210,6 @@ | `slow_query.record_type` | String | Unset | The record type of slow queries. It can be `system_table` or `log`. | | `slow_query.threshold` | String | Unset | The threshold of slow query. | | `slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | -| `export_metrics` | -- | -- | The standalone can export its metrics and send to Prometheus compatible service (e.g. `greptimedb`) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended to collect metrics generated by itself
You must create the database before enabling it. | -| `export_metrics.self_import.db` | String | Unset | -- | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -226,6 +223,7 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `default_timezone` | String | Unset | The default timezone of the server. | +| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | | `max_in_flight_write_bytes` | String | Unset | The maximum in-flight write bytes. | | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | @@ -306,6 +304,7 @@ | `query` | -- | -- | The query engine options. | | `query.parallelism` | Integer | `0` | Parallelism of the query engine.
Default to 0, which means the number of CPU cores. | | `query.allow_query_fallback` | Bool | `false` | Whether to allow query fallback when push down optimize fails.
Default to false, meaning when push down optimize failed, return error msg | +| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).
Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%").
Setting it to 0 disables the limit (unbounded, default behavior).
When this limit is reached, queries will fail with ResourceExhausted error.
NOTE: This does NOT limit memory used by table scans (only applies to datanodes). | | `datanode` | -- | -- | Datanode options. | | `datanode.client` | -- | -- | Datanode client options. | | `datanode.client.connect_timeout` | String | `10s` | -- | @@ -328,12 +327,6 @@ | `slow_query.threshold` | String | `30s` | The threshold of slow query. It can be human readable time string, for example: `10s`, `100ms`, `1s`. | | `slow_query.sample_ratio` | Float | `1.0` | The sampling ratio of slow query log. The value should be in the range of (0, 1]. For example, `0.1` means 10% of the slow queries will be logged and `1.0` means all slow queries will be logged. | | `slow_query.ttl` | String | `90d` | The TTL of the `slow_queries` system table. Default is `90d` when `record_type` is `system_table`. | -| `export_metrics` | -- | -- | The frontend can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -347,7 +340,7 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `data_home` | String | `./greptimedb_data` | The working home directory. | -| `store_addrs` | Array | -- | Store server address default to etcd store.
For postgres store, the format is:
"password=password dbname=postgres user=postgres host=localhost port=5432"
For etcd store, the format is:
"127.0.0.1:2379" | +| `store_addrs` | Array | -- | Store server address(es). The format depends on the selected backend.

For etcd: a list of "host:port" endpoints.
e.g. ["192.168.1.1:2379", "192.168.1.2:2379"]

For PostgreSQL: a connection string in libpq format or URI.
e.g.
- "host=localhost port=5432 user=postgres password= dbname=postgres"
- "postgresql://user:password@localhost:5432/mydb?connect_timeout=10"
For details, see https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html

For MySQL: a connection URL.
e.g. "mysql://user:password@localhost:3306/greptime_meta?ssl-mode=VERIFY_CA&ssl-ca=/path/to/ca.pem" | | `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. | | `backend` | String | `etcd_store` | The datastore for meta server.
Available values:
- `etcd_store` (default value)
- `memory_store`
- `postgres_store`
- `mysql_store` | | `meta_table_name` | String | `greptime_metakv` | Table name in RDS to store metadata. Effect when using a RDS kvbackend.
**Only used when backend is `postgres_store`.** | @@ -363,12 +356,11 @@ | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | | `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. | -| `backend_tls` | -- | -- | TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends)
When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here | +| `backend_tls` | -- | -- | TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends)
When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here

Note: if TLS is configured in both this section and the `store_addrs` connection string, the
settings here will override the TLS settings in `store_addrs`. | | `backend_tls.mode` | String | `prefer` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
- "disable" - No TLS
- "prefer" (default) - Try TLS, fallback to plain
- "require" - Require TLS
- "verify_ca" - Require TLS and verify CA
- "verify_full" - Require TLS and verify hostname | | `backend_tls.cert_path` | String | `""` | Path to client certificate file (for client authentication)
Like "/path/to/client.crt" | | `backend_tls.key_path` | String | `""` | Path to client private key file (for client authentication)
Like "/path/to/client.key" | | `backend_tls.ca_cert_path` | String | `""` | Path to CA certificate file (for server certificate verification)
Required when using custom CAs or self-signed certificates
Leave empty to use system root certificates only
Like "/path/to/ca.crt" | -| `backend_tls.watch` | Bool | `false` | Watch for certificate file changes and auto reload | | `grpc` | -- | -- | The gRPC server options. | | `grpc.bind_addr` | String | `127.0.0.1:3002` | The address to bind the gRPC server. | | `grpc.server_addr` | String | `127.0.0.1:3002` | The communication server address for the frontend and datanode to connect to metasrv.
If left empty or unset, the server will automatically use the IP address of the first network interface
on the host, with the same port number as the one specified in `bind_addr`. | @@ -423,12 +415,6 @@ | `logging.otlp_headers` | -- | -- | Additional OTLP headers, only valid when using OTLP http | | `logging.tracing_sample_ratio` | -- | Unset | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | -| `export_metrics` | -- | -- | The metasrv can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -440,10 +426,11 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `node_id` | Integer | Unset | The datanode identifier and should be unique in the cluster. | +| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | | `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.
It will block the datanode start if it can't receive leases in the heartbeat from metasrv. | | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.
By default, it provides services after all regions have been initialized. | | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. | -| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited. | +| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.
NOTE: This setting affects scan_memory_limit's privileged tier allocation.
When set, 70% of queries get privileged memory access (full scan_memory_limit).
The remaining 30% get standard tier access (70% of scan_memory_limit). | | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. | | `http` | -- | -- | The HTTP server options. | | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. | @@ -497,6 +484,7 @@ | `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL.
**It's only used when the provider is `kafka`**.

This option ensures that when Kafka messages are deleted, the system
can still successfully replay memtable data without throwing an
out-of-range error.
However, enabling this option might lead to unexpected data loss,
as the system will skip over missing entries instead of treating
them as critical errors. | | `query` | -- | -- | The query engine options. | | `query.parallelism` | Integer | `0` | Parallelism of the query engine.
Default to 0, which means the number of CPU cores. | +| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).
Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit (unbounded, default behavior).
When this limit is reached, queries will fail with ResourceExhausted error.
NOTE: This does NOT limit memory used by table scans. | | `storage` | -- | -- | The data storage options. | | `storage.data_home` | String | `./greptimedb_data` | The working home directory. | | `storage.type` | String | `File` | The storage type used to store the data.
- `File`: the data is stored in the local file system.
- `S3`: the data is stored in the S3 object storage.
- `Gcs`: the data is stored in the Google Cloud Storage.
- `Azblob`: the data is stored in the Azure Blob Storage.
- `Oss`: the data is stored in the Aliyun OSS. | @@ -546,10 +534,13 @@ | `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. | | `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. | | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. | +| `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).
When enabled, index files are loaded into the write cache during region initialization,
which can improve query performance at the cost of longer startup times. | +| `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).
The remaining capacity is used for data (parquet) files.
Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
1GiB is reserved for index files and 4GiB for data files. | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | | `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. | +| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.
Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit.
NOTE: Works with max_concurrent_queries for tiered memory allocation.
- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. | | `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.
To align with the old behavior, the default value is 0 (no restrictions). | | `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. | | `region_engine.mito.index` | -- | -- | The options for index in Mito engine. | @@ -583,7 +574,7 @@ | `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.
Only available for `partition_tree` memtable. | | `region_engine.file` | -- | -- | Enable the file engine. | | `region_engine.metric` | -- | -- | Metric engine options. | -| `region_engine.metric.experimental_sparse_primary_key_encoding` | Bool | `false` | Whether to enable the experimental sparse primary key encoding. | +| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. | | `logging` | -- | -- | The logging options. | | `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. | | `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. | @@ -596,12 +587,6 @@ | `logging.otlp_headers` | -- | -- | Additional OTLP headers, only valid when using OTLP http | | `logging.tracing_sample_ratio` | -- | Unset | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | -| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -670,5 +655,6 @@ | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `query` | -- | -- | -- | | `query.parallelism` | Integer | `1` | Parallelism of the query engine for query sent by flownode.
Default to 1, so it won't use too much cpu or memory | +| `query.memory_pool_size` | String | `50%` | Memory pool size for query execution operators (aggregation, sorting, join).
Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%").
Setting it to 0 disables the limit (unbounded, default behavior).
When this limit is reached, queries will fail with ResourceExhausted error.
NOTE: This does NOT limit memory used by table scans. | | `memory` | -- | -- | The memory options. | | `memory.enable_heap_profiling` | Bool | `true` | Whether to enable heap profiling activation during startup.
When enabled, heap profiling will be activated if the `MALLOC_CONF` environment variable
is set to "prof:true,prof_active:false". The official image adds this env variable.
Default is true. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 82ee07bd84..8db6bf3d1c 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -2,6 +2,10 @@ ## @toml2docs:none-default node_id = 42 +## The default column prefix for auto-created time index and value columns. +## @toml2docs:none-default +default_column_prefix = "greptime" + ## Start services after regions have obtained leases. ## It will block the datanode start if it can't receive leases in the heartbeat from metasrv. require_lease_before_startup = false @@ -14,6 +18,9 @@ init_regions_in_background = false init_regions_parallelism = 16 ## The maximum current queries allowed to be executed. Zero means unlimited. +## NOTE: This setting affects scan_memory_limit's privileged tier allocation. +## When set, 70% of queries get privileged memory access (full scan_memory_limit). +## The remaining 30% get standard tier access (70% of scan_memory_limit). max_concurrent_queries = 0 ## Enable telemetry to collect anonymous usage data. Enabled by default. @@ -257,6 +264,13 @@ overwrite_entry_start_id = false ## Default to 0, which means the number of CPU cores. parallelism = 0 +## Memory pool size for query execution operators (aggregation, sorting, join). +## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%"). +## Setting it to 0 disables the limit (unbounded, default behavior). +## When this limit is reached, queries will fail with ResourceExhausted error. +## NOTE: This does NOT limit memory used by table scans. +memory_pool_size = "50%" + ## The data storage options. [storage] ## The working home directory. @@ -485,6 +499,17 @@ write_cache_size = "5GiB" ## @toml2docs:none-default write_cache_ttl = "8h" +## Preload index (puffin) files into cache on region open (default: true). +## When enabled, index files are loaded into the write cache during region initialization, +## which can improve query performance at the cost of longer startup times. +preload_index_cache = true + +## Percentage of write cache capacity allocated for index (puffin) files (default: 20). +## The remaining capacity is used for data (parquet) files. +## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation, +## 1GiB is reserved for index files and 4GiB for data files. +index_cache_percent = 20 + ## Buffer size for SST writing. sst_write_buffer_size = "8MB" @@ -497,6 +522,14 @@ max_concurrent_scan_files = 384 ## Whether to allow stale WAL entries read during replay. allow_stale_entries = false +## Memory limit for table scans across all queries. +## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%"). +## Setting it to 0 disables the limit. +## NOTE: Works with max_concurrent_queries for tiered memory allocation. +## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access. +## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. +scan_memory_limit = "50%" + ## Minimum time interval between two compactions. ## To align with the old behavior, the default value is 0 (no restrictions). min_compaction_interval = "0m" @@ -636,8 +669,8 @@ fork_dictionary_bytes = "1GiB" [[region_engine]] ## Metric engine options. [region_engine.metric] -## Whether to enable the experimental sparse primary key encoding. -experimental_sparse_primary_key_encoding = false +## Whether to use sparse primary key encoding. 
+sparse_primary_key_encoding = true ## The logging options. [logging] @@ -679,21 +712,6 @@ otlp_export_protocol = "http" [logging.tracing_sample_ratio] default_ratio = 1.0 -## The datanode can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/config/flownode.example.toml b/config/flownode.example.toml index 81ff25f283..4e44c1ecbb 100644 --- a/config/flownode.example.toml +++ b/config/flownode.example.toml @@ -158,6 +158,13 @@ default_ratio = 1.0 ## Default to 1, so it won't use too much cpu or memory parallelism = 1 +## Memory pool size for query execution operators (aggregation, sorting, join). +## Supports absolute size (e.g., "1GB", "2GB") or percentage of system memory (e.g., "20%"). +## Setting it to 0 disables the limit (unbounded, default behavior). +## When this limit is reached, queries will fail with ResourceExhausted error. +## NOTE: This does NOT limit memory used by table scans. +memory_pool_size = "50%" + ## The memory options. [memory] ## Whether to enable heap profiling activation during startup. diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 9ffcdad540..ecac6cff01 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -2,6 +2,10 @@ ## @toml2docs:none-default default_timezone = "UTC" +## The default column prefix for auto-created time index and value columns. +## @toml2docs:none-default +default_column_prefix = "greptime" + ## The maximum in-flight write bytes. ## @toml2docs:none-default #+ max_in_flight_write_bytes = "500MB" @@ -252,6 +256,13 @@ parallelism = 0 ## Default to false, meaning when push down optimize failed, return error msg allow_query_fallback = false +## Memory pool size for query execution operators (aggregation, sorting, join). +## Supports absolute size (e.g., "4GB", "8GB") or percentage of system memory (e.g., "30%"). +## Setting it to 0 disables the limit (unbounded, default behavior). +## When this limit is reached, queries will fail with ResourceExhausted error. +## NOTE: This does NOT limit memory used by table scans (only applies to datanodes). +memory_pool_size = "50%" + ## Datanode options. [datanode] ## Datanode client options. @@ -318,21 +329,6 @@ sample_ratio = 1.0 ## The TTL of the `slow_queries` system table. Default is `90d` when `record_type` is `system_table`. ttl = "90d" -## The frontend can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. 
The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index d7d5ace99c..7997383a52 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -1,11 +1,19 @@ ## The working home directory. data_home = "./greptimedb_data" -## Store server address default to etcd store. -## For postgres store, the format is: -## "password=password dbname=postgres user=postgres host=localhost port=5432" -## For etcd store, the format is: -## "127.0.0.1:2379" +## Store server address(es). The format depends on the selected backend. +## +## For etcd: a list of "host:port" endpoints. +## e.g. ["192.168.1.1:2379", "192.168.1.2:2379"] +## +## For PostgreSQL: a connection string in libpq format or URI. +## e.g. +## - "host=localhost port=5432 user=postgres password= dbname=postgres" +## - "postgresql://user:password@localhost:5432/mydb?connect_timeout=10" +## The detail see: https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html +## +## For mysql store, the format is a MySQL connection URL. +## e.g. "mysql://user:password@localhost:3306/greptime_meta?ssl-mode=VERIFY_CA&ssl-ca=/path/to/ca.pem" store_addrs = ["127.0.0.1:2379"] ## If it's not empty, the metasrv will store all data with this key prefix. @@ -75,6 +83,9 @@ node_max_idle_time = "24hours" ## TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends) ## When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here +## +## Note: if TLS is configured in both this section and the `store_addrs` connection string, the +## settings here will override the TLS settings in `store_addrs`. [backend_tls] ## TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html ## - "disable" - No TLS @@ -98,9 +109,6 @@ key_path = "" ## Like "/path/to/ca.crt" ca_cert_path = "" -## Watch for certificate file changes and auto reload -watch = false - ## The gRPC server options. [grpc] ## The address to bind the gRPC server. @@ -323,21 +331,6 @@ otlp_export_protocol = "http" [logging.tracing_sample_ratio] default_ratio = 1.0 -## The metasrv can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 744dbbe751..661067d2a1 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -2,6 +2,10 @@ ## @toml2docs:none-default default_timezone = "UTC" +## The default column prefix for auto-created time index and value columns. 
+## @toml2docs:none-default +default_column_prefix = "greptime" + ## Initialize all regions in the background during the startup. ## By default, it provides services after all regions have been initialized. init_regions_in_background = false @@ -10,6 +14,9 @@ init_regions_in_background = false init_regions_parallelism = 16 ## The maximum current queries allowed to be executed. Zero means unlimited. +## NOTE: This setting affects scan_memory_limit's privileged tier allocation. +## When set, 70% of queries get privileged memory access (full scan_memory_limit). +## The remaining 30% get standard tier access (70% of scan_memory_limit). max_concurrent_queries = 0 ## Enable telemetry to collect anonymous usage data. Enabled by default. @@ -361,6 +368,13 @@ max_running_procedures = 128 ## Default to 0, which means the number of CPU cores. parallelism = 0 +## Memory pool size for query execution operators (aggregation, sorting, join). +## Supports absolute size (e.g., "2GB", "4GB") or percentage of system memory (e.g., "20%"). +## Setting it to 0 disables the limit (unbounded, default behavior). +## When this limit is reached, queries will fail with ResourceExhausted error. +## NOTE: This does NOT limit memory used by table scans. +memory_pool_size = "50%" + ## The data storage options. [storage] ## The working home directory. @@ -576,6 +590,17 @@ write_cache_size = "5GiB" ## @toml2docs:none-default write_cache_ttl = "8h" +## Preload index (puffin) files into cache on region open (default: true). +## When enabled, index files are loaded into the write cache during region initialization, +## which can improve query performance at the cost of longer startup times. +preload_index_cache = true + +## Percentage of write cache capacity allocated for index (puffin) files (default: 20). +## The remaining capacity is used for data (parquet) files. +## Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation, +## 1GiB is reserved for index files and 4GiB for data files. +index_cache_percent = 20 + ## Buffer size for SST writing. sst_write_buffer_size = "8MB" @@ -588,6 +613,14 @@ max_concurrent_scan_files = 384 ## Whether to allow stale WAL entries read during replay. allow_stale_entries = false +## Memory limit for table scans across all queries. +## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%"). +## Setting it to 0 disables the limit. +## NOTE: Works with max_concurrent_queries for tiered memory allocation. +## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access. +## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. +scan_memory_limit = "50%" + ## Minimum time interval between two compactions. ## To align with the old behavior, the default value is 0 (no restrictions). min_compaction_interval = "0m" @@ -727,8 +760,8 @@ fork_dictionary_bytes = "1GiB" [[region_engine]] ## Metric engine options. [region_engine.metric] -## Whether to enable the experimental sparse primary key encoding. -experimental_sparse_primary_key_encoding = false +## Whether to use sparse primary key encoding. +sparse_primary_key_encoding = true ## The logging options. [logging] @@ -787,27 +820,6 @@ default_ratio = 1.0 ## @toml2docs:none-default #+ sample_ratio = 1.0 -## The standalone can export its metrics and send to Prometheus compatible service (e.g. `greptimedb`) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. 
It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -## For `standalone` mode, `self_import` is recommended to collect metrics generated by itself -## You must create the database before enabling it. -[export_metrics.self_import] -## @toml2docs:none-default -db = "greptime_metrics" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/docker/buildx/centos/Dockerfile b/docker/buildx/centos/Dockerfile index b7e822fac6..f5bbd15ad6 100644 --- a/docker/buildx/centos/Dockerfile +++ b/docker/buildx/centos/Dockerfile @@ -1,10 +1,10 @@ -FROM centos:7 as builder +FROM centos:7 AS builder ARG CARGO_PROFILE ARG FEATURES ARG OUTPUT_DIR -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 WORKDIR /greptimedb # Install dependencies @@ -22,7 +22,7 @@ RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ # Install Rust SHELL ["/bin/bash", "-c"] RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y -ENV PATH /usr/local/bin:/root/.cargo/bin/:$PATH +ENV PATH=/usr/local/bin:/root/.cargo/bin/:$PATH # Build the project in release mode. RUN --mount=target=.,rw \ @@ -33,7 +33,7 @@ RUN --mount=target=.,rw \ TARGET_DIR=/out/target # Export the binary to the clean image. -FROM centos:7 as base +FROM centos:7 AS base ARG OUTPUT_DIR @@ -45,7 +45,7 @@ RUN yum install -y epel-release \ WORKDIR /greptime COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV MALLOC_CONF="prof:true,prof_active:false" diff --git a/docker/buildx/distroless/Dockerfile b/docker/buildx/distroless/Dockerfile new file mode 100644 index 0000000000..b0f3af33e8 --- /dev/null +++ b/docker/buildx/distroless/Dockerfile @@ -0,0 +1,65 @@ +FROM ubuntu:22.04 AS builder + +ARG CARGO_PROFILE +ARG FEATURES +ARG OUTPUT_DIR + +ENV LANG=en_US.utf8 +WORKDIR /greptimedb + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common + +# Install dependencies. +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update && apt-get install -y \ + libssl-dev \ + protobuf-compiler \ + curl \ + git \ + build-essential \ + pkg-config + +# Install Rust. +SHELL ["/bin/bash", "-c"] +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y +ENV PATH=/root/.cargo/bin/:$PATH + +# Build the project in release mode. +RUN --mount=target=. \ + --mount=type=cache,target=/root/.cargo/registry \ + make build \ + CARGO_PROFILE=${CARGO_PROFILE} \ + FEATURES=${FEATURES} \ + TARGET_DIR=/out/target + +FROM ubuntu:22.04 AS libs + +ARG TARGETARCH + +# Copy required library dependencies based on architecture +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + cp /lib/x86_64-linux-gnu/libz.so.1.2.11 /lib/x86_64-linux-gnu/libz.so.1; \ + elif [ "$TARGETARCH" = "arm64" ]; then \ + cp /lib/aarch64-linux-gnu/libz.so.1.2.11 /lib/aarch64-linux-gnu/libz.so.1; \ + else \ + echo "Unsupported architecture: $TARGETARCH" && exit 1; \ + fi + +# Export the binary to the clean distroless image. 
+FROM gcr.io/distroless/cc-debian12:latest AS base + +ARG OUTPUT_DIR +ARG TARGETARCH + +# Copy required library dependencies +COPY --from=libs /lib /lib +COPY --from=busybox:stable /bin/busybox /bin/busybox + +WORKDIR /greptime +COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/greptime +ENV PATH=/greptime/bin/:$PATH + +ENV MALLOC_CONF="prof:true,prof_active:false" + +ENTRYPOINT ["greptime"] diff --git a/docker/buildx/ubuntu/Dockerfile b/docker/buildx/ubuntu/Dockerfile index 6306e04688..b6dc386da4 100644 --- a/docker/buildx/ubuntu/Dockerfile +++ b/docker/buildx/ubuntu/Dockerfile @@ -1,10 +1,10 @@ -FROM ubuntu:22.04 as builder +FROM ubuntu:22.04 AS builder ARG CARGO_PROFILE ARG FEATURES ARG OUTPUT_DIR -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 WORKDIR /greptimedb RUN apt-get update && \ @@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/var/cache/apt \ # Install Rust. SHELL ["/bin/bash", "-c"] RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y -ENV PATH /root/.cargo/bin/:$PATH +ENV PATH=/root/.cargo/bin/:$PATH # Build the project in release mode. RUN --mount=target=. \ @@ -35,7 +35,7 @@ RUN --mount=target=. \ # Export the binary to the clean image. # TODO(zyy17): Maybe should use the more secure container image. -FROM ubuntu:22.04 as base +FROM ubuntu:22.04 AS base ARG OUTPUT_DIR @@ -45,7 +45,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get \ WORKDIR /greptime COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV MALLOC_CONF="prof:true,prof_active:false" diff --git a/docker/ci/centos/Dockerfile b/docker/ci/centos/Dockerfile index 480f2196b2..67efadd7dc 100644 --- a/docker/ci/centos/Dockerfile +++ b/docker/ci/centos/Dockerfile @@ -13,7 +13,7 @@ ARG TARGETARCH ADD $TARGETARCH/greptime /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV MALLOC_CONF="prof:true,prof_active:false" diff --git a/docker/ci/distroless/Dockerfile b/docker/ci/distroless/Dockerfile new file mode 100644 index 0000000000..f5e7ebd88e --- /dev/null +++ b/docker/ci/distroless/Dockerfile @@ -0,0 +1,40 @@ +FROM ubuntu:22.04 AS libs + +ARG TARGETARCH + +# Copy required library dependencies based on architecture +# TARGETARCH values: amd64, arm64 +# Ubuntu library paths: x86_64-linux-gnu, aarch64-linux-gnu +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + mkdir -p /output/x86_64-linux-gnu && \ + cp /lib/x86_64-linux-gnu/libz.so.1.2.11 /output/x86_64-linux-gnu/libz.so.1; \ + elif [ "$TARGETARCH" = "arm64" ]; then \ + mkdir -p /output/aarch64-linux-gnu && \ + cp /lib/aarch64-linux-gnu/libz.so.1.2.11 /output/aarch64-linux-gnu/libz.so.1; \ + else \ + echo "Unsupported architecture: $TARGETARCH" && exit 1; \ + fi + +FROM gcr.io/distroless/cc-debian12:latest + +# The root path under which contains all the dependencies to build this Dockerfile. +ARG DOCKER_BUILD_ROOT=. +# The binary name of GreptimeDB executable. +# Defaults to "greptime", but sometimes in other projects it might be different. 
+ARG TARGET_BIN=greptime + +ARG TARGETARCH + +# Copy required library dependencies +COPY --from=libs /output /lib +COPY --from=busybox:stable /bin/busybox /bin/busybox + +ADD $TARGETARCH/$TARGET_BIN /greptime/bin/ + +ENV PATH=/greptime/bin/:$PATH + +ENV TARGET_BIN=$TARGET_BIN + +ENV MALLOC_CONF="prof:true,prof_active:false" + +ENTRYPOINT ["greptime"] diff --git a/docker/ci/ubuntu/Dockerfile b/docker/ci/ubuntu/Dockerfile index 046fd62972..c1a88e02c8 100644 --- a/docker/ci/ubuntu/Dockerfile +++ b/docker/ci/ubuntu/Dockerfile @@ -14,7 +14,7 @@ ARG TARGETARCH ADD $TARGETARCH/$TARGET_BIN /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV TARGET_BIN=$TARGET_BIN diff --git a/docs/how-to/how-to-change-log-level-on-the-fly.md b/docs/how-to/how-to-change-log-level-on-the-fly.md index 16a72bf6ae..c3bf2602a2 100644 --- a/docs/how-to/how-to-change-log-level-on-the-fly.md +++ b/docs/how-to/how-to-change-log-level-on-the-fly.md @@ -13,4 +13,19 @@ Log Level changed from Some("info") to "trace,flow=debug"% The data is a string in the format of `global_level,module1=level1,module2=level2,...` that follows the same rule of `RUST_LOG`. -The module is the module name of the log, and the level is the log level. The log level can be one of the following: `trace`, `debug`, `info`, `warn`, `error`, `off`(case insensitive). \ No newline at end of file +The module is the module name of the log, and the level is the log level. The log level can be one of the following: `trace`, `debug`, `info`, `warn`, `error`, `off`(case insensitive). + +# Enable/Disable Trace on the Fly + +## HTTP API + +example: +```bash +curl --data "true" 127.0.0.1:4000/debug/enable_trace +``` +And database will reply with something like: +``` +trace enabled% +``` + +Possible values are "true" or "false". diff --git a/docs/how-to/how-to-profile-memory.md b/docs/how-to/how-to-profile-memory.md index a860c95246..b4bc00093a 100644 --- a/docs/how-to/how-to-profile-memory.md +++ b/docs/how-to/how-to-profile-memory.md @@ -71,6 +71,15 @@ curl -X POST localhost:4000/debug/prof/mem/activate # Deactivate heap profiling curl -X POST localhost:4000/debug/prof/mem/deactivate + +# Activate gdump feature that dumps memory profiling data every time virtual memory usage exceeds previous maximum value. +curl -X POST localhost:4000/debug/prof/mem/gdump -d 'activate=true' + +# Deactivate gdump. +curl -X POST localhost:4000/debug/prof/mem/gdump -d 'activate=false' + +# Retrieve current gdump status. +curl -X GET localhost:4000/debug/prof/mem/gdump ``` ### Dump memory profiling data diff --git a/docs/rfcs/2025-07-23-global-gc-worker.md b/docs/rfcs/2025-07-23-global-gc-worker.md index 69d1e3ac34..331ed01f38 100644 --- a/docs/rfcs/2025-07-23-global-gc-worker.md +++ b/docs/rfcs/2025-07-23-global-gc-worker.md @@ -106,6 +106,37 @@ This mechanism may be too complex to implement at once. We can consider a two-ph Also the read replica shouldn't be later in manifest version for more than the lingering time of obsolete files, otherwise it might ref to files that are already deleted by the GC worker. - need to upload tmp manifest to object storage, which may introduce additional complexity and potential performance overhead. But since long-running queries are typically not frequent, the performance impact is expected to be minimal. 
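To make the lingering-time rule above more concrete, here is a minimal, hypothetical sketch (not part of this RFC or patch; `OrphanTracker` and its fields are illustrative names only): a file that is present in the region directory but referenced by no manifest only becomes deletable after it has stayed unreferenced for longer than the configured lingering time.

```rust
use std::collections::{HashMap, HashSet};
use std::time::{Duration, Instant};

/// Illustrative bookkeeping for a GC worker: remembers when each file was
/// first observed without any manifest referencing it.
struct OrphanTracker {
    first_seen_unreferenced: HashMap<String, Instant>,
    lingering_time: Duration,
}

impl OrphanTracker {
    fn new(lingering_time: Duration) -> Self {
        Self {
            first_seen_unreferenced: HashMap::new(),
            lingering_time,
        }
    }

    /// Returns files that are safe to delete: present in the region directory,
    /// referenced by no manifest, and unreferenced for longer than `lingering_time`.
    fn collect_deletable(
        &mut self,
        files_in_region_dir: &HashSet<String>,
        files_in_manifests: &HashSet<String>,
        now: Instant,
    ) -> Vec<String> {
        // A file that shows up in a manifest again is no longer an orphan.
        self.first_seen_unreferenced
            .retain(|file, _| !files_in_manifests.contains(file));

        let mut deletable = Vec::new();
        for file in files_in_region_dir.difference(files_in_manifests) {
            // Record when we first saw the file without a referencing manifest.
            let since = *self
                .first_seen_unreferenced
                .entry(file.clone())
                .or_insert(now);
            // Only delete after the lingering period has fully elapsed.
            if now.duration_since(since) >= self.lingering_time {
                deletable.push(file.clone());
            }
        }
        deletable
    }
}
```

In a real implementation the timestamps would presumably come from object-storage metadata rather than an in-process clock, and the check would run under the region lock discussed below.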
+One potential race condition with region-migration is illustrated below: + +```mermaid +sequenceDiagram + participant gc_worker as GC Worker(same dn as region 1) + participant region1 as Region 1 (Leader → Follower) + participant region2 as Region 2 (Follower → Leader) + participant region_dir as Region Directory + + gc_worker->>region1: Start GC, get region manifest + activate region1 + region1-->>gc_worker: Region 1 manifest + deactivate region1 + gc_worker->>region_dir: Scan region directory + + Note over region1,region2: Region Migration Occurs + region1-->>region2: Downgrade to Follower + region2-->>region1: Becomes Leader + + region2->>region_dir: Add new file + + gc_worker->>region_dir: Continue scanning + gc_worker-->>region_dir: Discovers new file + Note over gc_worker: New file not in Region 1's manifest + gc_worker->>gc_worker: Mark file as orphan(incorrectly) +``` +which could cause the gc worker to incorrectly mark the new file as orphan and delete it, if the configured lingering time for orphan files (files not referenced anywhere, whether in use or not) is not long enough. + +A good enough solution could be to use a lock to prevent the gc worker from running on a region while region migration is happening on that region, and vice versa. + +The race condition between the gc worker and repartition also needs to be considered carefully. For now, acquiring a lock for both region-migration and repartition during the gc worker process could be a simple solution. ## Conclusion and Rationale diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index f5826d01a7..3515b788b5 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true workspace = true [dependencies] +arrow-schema.workspace = true common-base.workspace = true common-decimal.workspace = true common-error.workspace = true diff --git a/src/api/src/helper.rs b/src/api/src/helper.rs index da5fdcfeda..f53f3f162b 100644 --- a/src/api/src/helper.rs +++ b/src/api/src/helper.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License.
-use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; use common_decimal::Decimal128; @@ -20,13 +20,12 @@ use common_decimal::decimal128::{DECIMAL128_DEFAULT_SCALE, DECIMAL128_MAX_PRECIS use common_time::time::Time; use common_time::timestamp::TimeUnit; use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp}; +use datatypes::json::value::{JsonNumber, JsonValue, JsonValueRef, JsonVariant}; use datatypes::prelude::{ConcreteDataType, ValueRef}; use datatypes::types::{ - IntervalType, JsonFormat, StructField, StructType, TimeType, TimestampType, -}; -use datatypes::value::{ - ListValue, ListValueRef, OrderedF32, OrderedF64, StructValue, StructValueRef, Value, + IntervalType, JsonFormat, JsonType, StructField, StructType, TimeType, TimestampType, }; +use datatypes::value::{ListValueRef, OrderedF32, OrderedF64, StructValueRef, Value}; use datatypes::vectors::VectorRef; use greptime_proto::v1::column_data_type_extension::TypeExt; use greptime_proto::v1::ddl_request::Expr; @@ -34,9 +33,9 @@ use greptime_proto::v1::greptime_request::Request; use greptime_proto::v1::query_request::Query; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ - self, ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, JsonNativeTypeExtension, - JsonTypeExtension, ListTypeExtension, QueryRequest, Row, SemanticType, StructTypeExtension, - VectorTypeExtension, + self, ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, DictionaryTypeExtension, + JsonList, JsonNativeTypeExtension, JsonObject, JsonTypeExtension, ListTypeExtension, + QueryRequest, Row, SemanticType, StructTypeExtension, VectorTypeExtension, json_value, }; use paste::paste; use snafu::prelude::*; @@ -81,6 +80,10 @@ impl ColumnDataTypeWrapper { pub fn to_parts(&self) -> (ColumnDataType, Option) { (self.datatype, self.datatype_ext.clone()) } + + pub fn into_parts(self) -> (ColumnDataType, Option) { + (self.datatype, self.datatype_ext) + } } impl From for ConcreteDataType { @@ -126,6 +129,7 @@ impl From for ConcreteDataType { }; ConcreteDataType::json_native_datatype(inner_type.into()) } + None => ConcreteDataType::Json(JsonType::null()), _ => { // invalid state, type extension is missing or invalid ConcreteDataType::null_datatype() @@ -215,6 +219,26 @@ impl From for ConcreteDataType { ConcreteDataType::null_datatype() } } + ColumnDataType::Dictionary => { + if let Some(TypeExt::DictionaryType(d)) = datatype_wrapper + .datatype_ext + .as_ref() + .and_then(|datatype_ext| datatype_ext.type_ext.as_ref()) + { + let key_type = ColumnDataTypeWrapper { + datatype: d.key_datatype(), + datatype_ext: d.key_datatype_extension.clone().map(|ext| *ext), + }; + let value_type = ColumnDataTypeWrapper { + datatype: d.value_datatype(), + datatype_ext: d.value_datatype_extension.clone().map(|ext| *ext), + }; + ConcreteDataType::dictionary_datatype(key_type.into(), value_type.into()) + } else { + // invalid state: type extension not found + ConcreteDataType::null_datatype() + } + } } } } @@ -338,13 +362,30 @@ impl ColumnDataTypeWrapper { }), } } + + pub fn dictionary_datatype( + key_type: ColumnDataTypeWrapper, + value_type: ColumnDataTypeWrapper, + ) -> Self { + ColumnDataTypeWrapper { + datatype: ColumnDataType::Dictionary, + datatype_ext: Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DictionaryType(Box::new(DictionaryTypeExtension { + key_datatype: key_type.datatype().into(), + key_datatype_extension: key_type.datatype_ext.map(Box::new), + value_datatype: 
value_type.datatype().into(), + value_datatype_extension: value_type.datatype_ext.map(Box::new), + }))), + }), + } + } } impl TryFrom for ColumnDataTypeWrapper { type Error = error::Error; fn try_from(datatype: ConcreteDataType) -> Result { - let column_datatype = match datatype { + let column_datatype = match &datatype { ConcreteDataType::Boolean(_) => ColumnDataType::Boolean, ConcreteDataType::Int8(_) => ColumnDataType::Int8, ConcreteDataType::Int16(_) => ColumnDataType::Int16, @@ -381,9 +422,8 @@ impl TryFrom for ColumnDataTypeWrapper { ConcreteDataType::Vector(_) => ColumnDataType::Vector, ConcreteDataType::List(_) => ColumnDataType::List, ConcreteDataType::Struct(_) => ColumnDataType::Struct, - ConcreteDataType::Null(_) - | ConcreteDataType::Dictionary(_) - | ConcreteDataType::Duration(_) => { + ConcreteDataType::Dictionary(_) => ColumnDataType::Dictionary, + ConcreteDataType::Null(_) | ConcreteDataType::Duration(_) => { return error::IntoColumnDataTypeSnafu { from: datatype }.fail(); } }; @@ -404,16 +444,22 @@ impl TryFrom for ColumnDataTypeWrapper { JsonFormat::Jsonb => Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), }), - JsonFormat::Native(inner) => { - let inner_type = ColumnDataTypeWrapper::try_from(*inner.clone())?; - Some(ColumnDataTypeExtension { - type_ext: Some(TypeExt::JsonNativeType(Box::new( - JsonNativeTypeExtension { - datatype: inner_type.datatype.into(), - datatype_extension: inner_type.datatype_ext.map(Box::new), - }, - ))), - }) + JsonFormat::Native(native_type) => { + if native_type.is_null() { + None + } else { + let native_type = ConcreteDataType::from(native_type.as_ref()); + let (datatype, datatype_extension) = + ColumnDataTypeWrapper::try_from(native_type)?.into_parts(); + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonNativeType(Box::new( + JsonNativeTypeExtension { + datatype: datatype as i32, + datatype_extension: datatype_extension.map(Box::new), + }, + ))), + }) + } } } } else { @@ -463,6 +509,25 @@ impl TryFrom for ColumnDataTypeWrapper { None } } + ColumnDataType::Dictionary => { + if let ConcreteDataType::Dictionary(dict_type) = &datatype { + let key_type = ColumnDataTypeWrapper::try_from(dict_type.key_type().clone())?; + let value_type = + ColumnDataTypeWrapper::try_from(dict_type.value_type().clone())?; + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DictionaryType(Box::new( + DictionaryTypeExtension { + key_datatype: key_type.datatype.into(), + key_datatype_extension: key_type.datatype_ext.map(Box::new), + value_datatype: value_type.datatype.into(), + value_datatype_extension: value_type.datatype_ext.map(Box::new), + }, + ))), + }) + } else { + None + } + } _ => None, }; Ok(Self { @@ -601,6 +666,9 @@ pub fn values_with_capacity(datatype: ColumnDataType, capacity: usize) -> Values struct_values: Vec::with_capacity(capacity), ..Default::default() }, + ColumnDataType::Dictionary => Values { + ..Default::default() + }, } } @@ -801,21 +869,8 @@ pub fn pb_value_to_value_ref<'a>( } ValueData::JsonValue(inner_value) => { - let json_datatype_ext = datatype_ext - .as_ref() - .and_then(|ext| { - if let Some(TypeExt::JsonNativeType(l)) = &ext.type_ext { - Some(l) - } else { - None - } - }) - .expect("json value must contain datatype ext"); - - ValueRef::Json(Box::new(pb_value_to_value_ref( - inner_value, - json_datatype_ext.datatype_extension.as_deref(), - ))) + let value = decode_json_value(inner_value); + ValueRef::Json(Box::new(value)) } } } @@ -839,125 +894,64 @@ pub fn 
is_column_type_value_eq( .unwrap_or(false) } -/// Convert value into proto's value. -pub fn to_proto_value(value: Value) -> v1::Value { - match value { - Value::Null => v1::Value { value_data: None }, - Value::Boolean(v) => v1::Value { - value_data: Some(ValueData::BoolValue(v)), - }, - Value::UInt8(v) => v1::Value { - value_data: Some(ValueData::U8Value(v.into())), - }, - Value::UInt16(v) => v1::Value { - value_data: Some(ValueData::U16Value(v.into())), - }, - Value::UInt32(v) => v1::Value { - value_data: Some(ValueData::U32Value(v)), - }, - Value::UInt64(v) => v1::Value { - value_data: Some(ValueData::U64Value(v)), - }, - Value::Int8(v) => v1::Value { - value_data: Some(ValueData::I8Value(v.into())), - }, - Value::Int16(v) => v1::Value { - value_data: Some(ValueData::I16Value(v.into())), - }, - Value::Int32(v) => v1::Value { - value_data: Some(ValueData::I32Value(v)), - }, - Value::Int64(v) => v1::Value { - value_data: Some(ValueData::I64Value(v)), - }, - Value::Float32(v) => v1::Value { - value_data: Some(ValueData::F32Value(*v)), - }, - Value::Float64(v) => v1::Value { - value_data: Some(ValueData::F64Value(*v)), - }, - Value::String(v) => v1::Value { - value_data: Some(ValueData::StringValue(v.as_utf8().to_string())), - }, - Value::Binary(v) => v1::Value { - value_data: Some(ValueData::BinaryValue(v.to_vec())), - }, - Value::Date(v) => v1::Value { - value_data: Some(ValueData::DateValue(v.val())), - }, - Value::Timestamp(v) => match v.unit() { - TimeUnit::Second => v1::Value { - value_data: Some(ValueData::TimestampSecondValue(v.value())), - }, - TimeUnit::Millisecond => v1::Value { - value_data: Some(ValueData::TimestampMillisecondValue(v.value())), - }, - TimeUnit::Microsecond => v1::Value { - value_data: Some(ValueData::TimestampMicrosecondValue(v.value())), - }, - TimeUnit::Nanosecond => v1::Value { - value_data: Some(ValueData::TimestampNanosecondValue(v.value())), - }, - }, - Value::Time(v) => match v.unit() { - TimeUnit::Second => v1::Value { - value_data: Some(ValueData::TimeSecondValue(v.value())), - }, - TimeUnit::Millisecond => v1::Value { - value_data: Some(ValueData::TimeMillisecondValue(v.value())), - }, - TimeUnit::Microsecond => v1::Value { - value_data: Some(ValueData::TimeMicrosecondValue(v.value())), - }, - TimeUnit::Nanosecond => v1::Value { - value_data: Some(ValueData::TimeNanosecondValue(v.value())), - }, - }, - Value::IntervalYearMonth(v) => v1::Value { - value_data: Some(ValueData::IntervalYearMonthValue(v.to_i32())), - }, - Value::IntervalDayTime(v) => v1::Value { - value_data: Some(ValueData::IntervalDayTimeValue(v.to_i64())), - }, - Value::IntervalMonthDayNano(v) => v1::Value { - value_data: Some(ValueData::IntervalMonthDayNanoValue( - convert_month_day_nano_to_pb(v), - )), - }, - Value::Decimal128(v) => v1::Value { - value_data: Some(ValueData::Decimal128Value(convert_to_pb_decimal128(v))), - }, - Value::List(list_value) => v1::Value { - value_data: Some(ValueData::ListValue(v1::ListValue { - items: convert_list_to_pb_values(list_value), +fn encode_json_value(value: JsonValue) -> v1::JsonValue { + fn helper(json: JsonVariant) -> v1::JsonValue { + let value = match json { + JsonVariant::Null => None, + JsonVariant::Bool(x) => Some(json_value::Value::Boolean(x)), + JsonVariant::Number(x) => Some(match x { + JsonNumber::PosInt(i) => json_value::Value::Uint(i), + JsonNumber::NegInt(i) => json_value::Value::Int(i), + JsonNumber::Float(f) => json_value::Value::Float(f.0), + }), + JsonVariant::String(x) => Some(json_value::Value::Str(x)), + JsonVariant::Array(x) 
=> Some(json_value::Value::Array(JsonList { + items: x.into_iter().map(helper).collect::>(), })), - }, - Value::Struct(struct_value) => v1::Value { - value_data: Some(ValueData::StructValue(v1::StructValue { - items: convert_struct_to_pb_values(struct_value), - })), - }, - Value::Json(v) => v1::Value { - value_data: Some(ValueData::JsonValue(Box::new(to_proto_value(*v)))), - }, - Value::Duration(_) => v1::Value { value_data: None }, + JsonVariant::Object(x) => { + let entries = x + .into_iter() + .map(|(key, v)| v1::json_object::Entry { + key, + value: Some(helper(v)), + }) + .collect::>(); + Some(json_value::Value::Object(JsonObject { entries })) + } + }; + v1::JsonValue { value } } + helper(value.into_variant()) } -fn convert_list_to_pb_values(list_value: ListValue) -> Vec { - list_value - .take_items() - .into_iter() - .map(to_proto_value) - .collect() -} - -fn convert_struct_to_pb_values(struct_value: StructValue) -> Vec { - struct_value - .take_items() - .into_iter() - .map(to_proto_value) - .collect() +fn decode_json_value(value: &v1::JsonValue) -> JsonValueRef<'_> { + let Some(value) = &value.value else { + return JsonValueRef::null(); + }; + match value { + json_value::Value::Boolean(x) => (*x).into(), + json_value::Value::Int(x) => (*x).into(), + json_value::Value::Uint(x) => (*x).into(), + json_value::Value::Float(x) => (*x).into(), + json_value::Value::Str(x) => (x.as_str()).into(), + json_value::Value::Array(array) => array + .items + .iter() + .map(|x| decode_json_value(x).into_variant()) + .collect::>() + .into(), + json_value::Value::Object(x) => x + .entries + .iter() + .filter_map(|entry| { + entry + .value + .as_ref() + .map(|v| (entry.key.as_str(), decode_json_value(v).into_variant())) + }) + .collect::>() + .into(), + } } /// Returns the [ColumnDataTypeWrapper] of the value. 
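The `encode_json_value`/`decode_json_value` pair above replaces the old encoding that reused `ValueData` recursively; it is meant to be a lossless round trip between `datatypes`' `JsonValue` and the protobuf `v1::JsonValue`, with object keys handled in sorted order (the decoder rebuilds objects through a `BTreeMap`). A minimal sketch of that invariant, assuming both module-private helpers are in scope in `src/api/src/helper.rs`:

// Sketch only: the round-trip property exercised by test_encode_decode_json_value below.
fn json_round_trip_is_lossless(json: JsonValue) -> bool {
    let proto = encode_json_value(json.clone());
    let decoded = decode_json_value(&proto);
    // Compare logical values; the decoder rebuilds objects through a BTreeMap,
    // so keys always come back sorted regardless of input order.
    json.as_ref() == decoded
}

The test at the end of this file checks exactly this property for null, scalars, arrays, and nested objects.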
@@ -1006,14 +1000,14 @@ pub fn vectors_to_rows<'a>( let mut rows = vec![Row { values: vec![] }; row_count]; for column in columns { for (row_index, row) in rows.iter_mut().enumerate() { - row.values.push(value_to_grpc_value(column.get(row_index))) + row.values.push(to_grpc_value(column.get(row_index))) } } rows } -pub fn value_to_grpc_value(value: Value) -> GrpcValue { +pub fn to_grpc_value(value: Value) -> GrpcValue { GrpcValue { value_data: match value { Value::Null => None, @@ -1053,7 +1047,7 @@ pub fn value_to_grpc_value(value: Value) -> GrpcValue { let items = list_value .take_items() .into_iter() - .map(value_to_grpc_value) + .map(to_grpc_value) .collect(); Some(ValueData::ListValue(v1::ListValue { items })) } @@ -1061,13 +1055,11 @@ pub fn value_to_grpc_value(value: Value) -> GrpcValue { let items = struct_value .take_items() .into_iter() - .map(value_to_grpc_value) + .map(to_grpc_value) .collect(); Some(ValueData::StructValue(v1::StructValue { items })) } - Value::Json(inner_value) => Some(ValueData::JsonValue(Box::new(value_to_grpc_value( - *inner_value, - )))), + Value::Json(v) => Some(ValueData::JsonValue(encode_json_value(*v))), Value::Duration(_) => unreachable!(), }, } @@ -1163,6 +1155,7 @@ mod tests { use common_time::interval::IntervalUnit; use datatypes::scalars::ScalarVector; use datatypes::types::{Int8Type, Int32Type, UInt8Type, UInt32Type}; + use datatypes::value::{ListValue, StructValue}; use datatypes::vectors::{ BooleanVector, DateVector, Float32Vector, PrimitiveVector, StringVector, }; @@ -1259,6 +1252,9 @@ mod tests { let values = values_with_capacity(ColumnDataType::Json, 2); assert_eq!(2, values.json_values.capacity()); assert_eq!(2, values.string_values.capacity()); + + let values = values_with_capacity(ColumnDataType::Dictionary, 2); + assert!(values.bool_values.is_empty()); } #[test] @@ -1355,6 +1351,17 @@ mod tests { ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::string_datatype())), ColumnDataTypeWrapper::list_datatype(ColumnDataTypeWrapper::string_datatype()).into() ); + assert_eq!( + ConcreteDataType::dictionary_datatype( + ConcreteDataType::int32_datatype(), + ConcreteDataType::string_datatype() + ), + ColumnDataTypeWrapper::dictionary_datatype( + ColumnDataTypeWrapper::int32_datatype(), + ColumnDataTypeWrapper::string_datatype() + ) + .into() + ); let struct_type = StructType::new(Arc::new(vec![ StructField::new("id".to_string(), ConcreteDataType::int64_datatype(), true), StructField::new( @@ -1525,6 +1532,18 @@ mod tests { ColumnDataTypeWrapper::vector_datatype(3), ConcreteDataType::vector_datatype(3).try_into().unwrap() ); + assert_eq!( + ColumnDataTypeWrapper::dictionary_datatype( + ColumnDataTypeWrapper::int32_datatype(), + ColumnDataTypeWrapper::string_datatype() + ), + ConcreteDataType::dictionary_datatype( + ConcreteDataType::int32_datatype(), + ConcreteDataType::string_datatype() + ) + .try_into() + .unwrap() + ); let result: Result = ConcreteDataType::null_datatype().try_into(); assert!(result.is_err()); @@ -1580,6 +1599,20 @@ mod tests { datatype_extension: Some(Box::new(ColumnDataTypeExtension { type_ext: Some(TypeExt::StructType(StructTypeExtension { fields: vec![ + v1::StructField { + name: "address".to_string(), + datatype: ColumnDataTypeWrapper::string_datatype() + .datatype() + .into(), + datatype_extension: None + }, + v1::StructField { + name: "age".to_string(), + datatype: ColumnDataTypeWrapper::int64_datatype() + .datatype() + .into(), + datatype_extension: None + }, v1::StructField { name: "id".to_string(), datatype: 
ColumnDataTypeWrapper::int64_datatype() @@ -1594,20 +1627,6 @@ mod tests { .into(), datatype_extension: None }, - v1::StructField { - name: "age".to_string(), - datatype: ColumnDataTypeWrapper::int32_datatype() - .datatype() - .into(), - datatype_extension: None - }, - v1::StructField { - name: "address".to_string(), - datatype: ColumnDataTypeWrapper::string_datatype() - .datatype() - .into(), - datatype_extension: None - } ] })) })) @@ -1740,7 +1759,7 @@ mod tests { Arc::new(ConcreteDataType::boolean_datatype()), )); - let pb_value = to_proto_value(value); + let pb_value = to_grpc_value(value); match pb_value.value_data.unwrap() { ValueData::ListValue(pb_list_value) => { @@ -1769,7 +1788,7 @@ mod tests { .unwrap(), ); - let pb_value = to_proto_value(value); + let pb_value = to_grpc_value(value); match pb_value.value_data.unwrap() { ValueData::StructValue(pb_struct_value) => { @@ -1778,4 +1797,199 @@ mod tests { _ => panic!("Unexpected value type"), } } + + #[test] + fn test_encode_decode_json_value() { + let json = JsonValue::null(); + let proto = encode_json_value(json.clone()); + assert!(proto.value.is_none()); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = true.into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Boolean(true))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = (-1i64).into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Int(-1))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = 1u64.into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Uint(1))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = 1.0f64.into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Float(1.0))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = "s".into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Str("s".to_string()))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [1i64, 2, 3].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Array(JsonList { + items: vec![ + v1::JsonValue { + value: Some(json_value::Value::Int(1)) + }, + v1::JsonValue { + value: Some(json_value::Value::Int(2)) + }, + v1::JsonValue { + value: Some(json_value::Value::Int(3)) + } + ] + })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [(); 0].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Array(JsonList { items: vec![] })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [("k3", 3i64), ("k2", 2i64), ("k1", 1i64)].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Object(JsonObject { + entries: vec![ + v1::json_object::Entry { + key: "k1".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(1)) + }), + }, + v1::json_object::Entry { + key: "k2".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(2)) + 
}), + }, + v1::json_object::Entry { + key: "k3".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(3)) + }), + }, + ] + })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [("null", ()); 0].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Object(JsonObject { entries: vec![] })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [ + ("null", JsonVariant::from(())), + ("bool", false.into()), + ("list", ["hello", "world"].into()), + ( + "object", + [ + ("positive_i", JsonVariant::from(42u64)), + ("negative_i", (-42i64).into()), + ("nested", [("what", "blah")].into()), + ] + .into(), + ), + ] + .into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Object(JsonObject { + entries: vec![ + v1::json_object::Entry { + key: "bool".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Boolean(false)) + }), + }, + v1::json_object::Entry { + key: "list".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Array(JsonList { + items: vec![ + v1::JsonValue { + value: Some(json_value::Value::Str("hello".to_string())) + }, + v1::JsonValue { + value: Some(json_value::Value::Str("world".to_string())) + }, + ] + })) + }), + }, + v1::json_object::Entry { + key: "null".to_string(), + value: Some(v1::JsonValue { value: None }), + }, + v1::json_object::Entry { + key: "object".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Object(JsonObject { + entries: vec![ + v1::json_object::Entry { + key: "negative_i".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(-42)) + }), + }, + v1::json_object::Entry { + key: "nested".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Object(JsonObject { + entries: vec![v1::json_object::Entry { + key: "what".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Str( + "blah".to_string() + )) + }), + },] + })) + }), + }, + v1::json_object::Entry { + key: "positive_i".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Uint(42)) + }), + }, + ] + })) + }), + }, + ] + })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + } } diff --git a/src/api/src/v1/column_def.rs b/src/api/src/v1/column_def.rs index 5be3d5c196..88ee0c5749 100644 --- a/src/api/src/v1/column_def.rs +++ b/src/api/src/v1/column_def.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}; use datatypes::schema::{ COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions, @@ -68,6 +69,15 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result { if let Some(skipping_index) = options.options.get(SKIPPING_INDEX_GRPC_KEY) { metadata.insert(SKIPPING_INDEX_KEY.to_string(), skipping_index.to_owned()); } + if let Some(extension_name) = options.options.get(EXTENSION_TYPE_NAME_KEY) { + metadata.insert(EXTENSION_TYPE_NAME_KEY.to_string(), extension_name.clone()); + } + if let Some(extension_metadata) = options.options.get(EXTENSION_TYPE_METADATA_KEY) { + metadata.insert( + EXTENSION_TYPE_METADATA_KEY.to_string(), + extension_metadata.clone(), + ); + } } 
ColumnSchema::new(&column_def.name, data_type.into(), column_def.is_nullable) @@ -139,6 +149,17 @@ pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option: Send + Sync { + async fn configure( + &self, + builder: KvBackendCatalogManagerBuilder, + ctx: C, + ) -> std::result::Result; +} + +pub type CatalogManagerConfiguratorRef = Arc>; + pub struct KvBackendCatalogManagerBuilder { information_extension: InformationExtensionRef, backend: KvBackendRef, cache_registry: LayeredCacheRegistryRef, procedure_manager: Option, process_manager: Option, - #[cfg(feature = "enterprise")] - extra_information_table_factories: - std::collections::HashMap, + extra_information_table_factories: HashMap, } impl KvBackendCatalogManagerBuilder { @@ -54,8 +67,7 @@ impl KvBackendCatalogManagerBuilder { cache_registry, procedure_manager: None, process_manager: None, - #[cfg(feature = "enterprise")] - extra_information_table_factories: std::collections::HashMap::new(), + extra_information_table_factories: HashMap::new(), } } @@ -70,10 +82,9 @@ impl KvBackendCatalogManagerBuilder { } /// Sets the extra information tables. - #[cfg(feature = "enterprise")] pub fn with_extra_information_table_factories( mut self, - factories: std::collections::HashMap, + factories: HashMap, ) -> Self { self.extra_information_table_factories = factories; self @@ -86,7 +97,6 @@ impl KvBackendCatalogManagerBuilder { cache_registry, procedure_manager, process_manager, - #[cfg(feature = "enterprise")] extra_information_table_factories, } = self; Arc::new_cyclic(|me| KvBackendCatalogManager { @@ -110,7 +120,6 @@ impl KvBackendCatalogManagerBuilder { process_manager.clone(), backend.clone(), ); - #[cfg(feature = "enterprise")] let provider = provider .with_extra_table_factories(extra_information_table_factories.clone()); Arc::new(provider) @@ -119,9 +128,9 @@ impl KvBackendCatalogManagerBuilder { DEFAULT_CATALOG_NAME.to_string(), me.clone(), )), + numbers_table_provider: NumbersTableProvider, backend, process_manager, - #[cfg(feature = "enterprise")] extra_information_table_factories, }, cache_registry, diff --git a/src/catalog/src/kvbackend/manager.rs b/src/catalog/src/kvbackend/manager.rs index 902f15c09e..7852142c6a 100644 --- a/src/catalog/src/kvbackend/manager.rs +++ b/src/catalog/src/kvbackend/manager.rs @@ -18,8 +18,7 @@ use std::sync::{Arc, Weak}; use async_stream::try_stream; use common_catalog::consts::{ - DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, NUMBERS_TABLE_ID, - PG_CATALOG_NAME, + DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME, }; use common_error::ext::BoxedError; use common_meta::cache::{ @@ -45,7 +44,6 @@ use table::TableRef; use table::dist_table::DistTable; use table::metadata::{TableId, TableInfoRef}; use table::table::PartitionRules; -use table::table::numbers::{NUMBERS_TABLE_NAME, NumbersTable}; use table::table_name::TableName; use tokio::sync::Semaphore; use tokio_stream::wrappers::ReceiverStream; @@ -55,12 +53,13 @@ use crate::error::{ CacheNotFoundSnafu, GetTableCacheSnafu, InvalidTableInfoInCatalogSnafu, ListCatalogsSnafu, ListSchemasSnafu, ListTablesSnafu, Result, TableMetadataManagerSnafu, }; -#[cfg(feature = "enterprise")] -use crate::information_schema::InformationSchemaTableFactoryRef; -use crate::information_schema::{InformationExtensionRef, InformationSchemaProvider}; +use crate::information_schema::{ + InformationExtensionRef, InformationSchemaProvider, InformationSchemaTableFactoryRef, +}; use crate::kvbackend::TableCacheRef; use 
crate::process_manager::ProcessManagerRef; use crate::system_schema::SystemSchemaProvider; +use crate::system_schema::numbers_table_provider::NumbersTableProvider; use crate::system_schema::pg_catalog::PGCatalogProvider; /// Access all existing catalog, schema and tables. @@ -555,9 +554,9 @@ pub(super) struct SystemCatalog { // system_schema_provider for default catalog pub(super) information_schema_provider: Arc, pub(super) pg_catalog_provider: Arc, + pub(super) numbers_table_provider: NumbersTableProvider, pub(super) backend: KvBackendRef, pub(super) process_manager: Option, - #[cfg(feature = "enterprise")] pub(super) extra_information_table_factories: std::collections::HashMap, } @@ -584,9 +583,7 @@ impl SystemCatalog { PG_CATALOG_NAME if channel == Channel::Postgres => { self.pg_catalog_provider.table_names() } - DEFAULT_SCHEMA_NAME => { - vec![NUMBERS_TABLE_NAME.to_string()] - } + DEFAULT_SCHEMA_NAME => self.numbers_table_provider.table_names(), _ => vec![], } } @@ -604,7 +601,7 @@ impl SystemCatalog { if schema == INFORMATION_SCHEMA_NAME { self.information_schema_provider.table(table).is_some() } else if schema == DEFAULT_SCHEMA_NAME { - table == NUMBERS_TABLE_NAME + self.numbers_table_provider.table_exists(table) } else if schema == PG_CATALOG_NAME && channel == Channel::Postgres { self.pg_catalog_provider.table(table).is_some() } else { @@ -630,7 +627,6 @@ impl SystemCatalog { self.process_manager.clone(), self.backend.clone(), ); - #[cfg(feature = "enterprise")] let provider = provider .with_extra_table_factories(self.extra_information_table_factories.clone()); Arc::new(provider) @@ -649,8 +645,8 @@ impl SystemCatalog { }); pg_catalog_provider.table(table_name) } - } else if schema == DEFAULT_SCHEMA_NAME && table_name == NUMBERS_TABLE_NAME { - Some(NumbersTable::table(NUMBERS_TABLE_ID)) + } else if schema == DEFAULT_SCHEMA_NAME { + self.numbers_table_provider.table(table_name) } else { None } diff --git a/src/catalog/src/system_schema.rs b/src/catalog/src/system_schema.rs index c813ab6ab7..2e1c890427 100644 --- a/src/catalog/src/system_schema.rs +++ b/src/catalog/src/system_schema.rs @@ -14,6 +14,7 @@ pub mod information_schema; mod memory_table; +pub mod numbers_table_provider; pub mod pg_catalog; pub mod predicate; mod utils; diff --git a/src/catalog/src/system_schema/information_schema.rs b/src/catalog/src/system_schema/information_schema.rs index 3ffcf73631..18384b8163 100644 --- a/src/catalog/src/system_schema/information_schema.rs +++ b/src/catalog/src/system_schema/information_schema.rs @@ -22,7 +22,6 @@ mod procedure_info; pub mod process_list; pub mod region_peers; mod region_statistics; -mod runtime_metrics; pub mod schemata; mod ssts; mod table_constraints; @@ -65,7 +64,6 @@ use crate::system_schema::information_schema::information_memory_table::get_sche use crate::system_schema::information_schema::key_column_usage::InformationSchemaKeyColumnUsage; use crate::system_schema::information_schema::partitions::InformationSchemaPartitions; use crate::system_schema::information_schema::region_peers::InformationSchemaRegionPeers; -use crate::system_schema::information_schema::runtime_metrics::InformationSchemaMetrics; use crate::system_schema::information_schema::schemata::InformationSchemaSchemata; use crate::system_schema::information_schema::ssts::{ InformationSchemaSstsIndexMeta, InformationSchemaSstsManifest, InformationSchemaSstsStorage, @@ -97,7 +95,6 @@ lazy_static! 
{ ROUTINES, SCHEMA_PRIVILEGES, TABLE_PRIVILEGES, - TRIGGERS, GLOBAL_STATUS, SESSION_STATUS, PARTITIONS, @@ -120,7 +117,6 @@ macro_rules! setup_memory_table { }; } -#[cfg(feature = "enterprise")] pub struct MakeInformationTableRequest { pub catalog_name: String, pub catalog_manager: Weak, @@ -131,12 +127,10 @@ pub struct MakeInformationTableRequest { /// /// This trait allows for extensibility of the information schema by providing /// a way to dynamically create custom information schema tables. -#[cfg(feature = "enterprise")] pub trait InformationSchemaTableFactory { fn make_information_table(&self, req: MakeInformationTableRequest) -> SystemTableRef; } -#[cfg(feature = "enterprise")] pub type InformationSchemaTableFactoryRef = Arc; /// The `information_schema` tables info provider. @@ -146,9 +140,7 @@ pub struct InformationSchemaProvider { process_manager: Option, flow_metadata_manager: Arc, tables: HashMap, - #[allow(dead_code)] kv_backend: KvBackendRef, - #[cfg(feature = "enterprise")] extra_table_factories: HashMap, } @@ -169,7 +161,6 @@ impl SystemSchemaProviderInner for InformationSchemaProvider { } fn system_table(&self, name: &str) -> Option { - #[cfg(feature = "enterprise")] if let Some(factory) = self.extra_table_factories.get(name) { let req = MakeInformationTableRequest { catalog_name: self.catalog_name.clone(), @@ -207,7 +198,6 @@ impl SystemSchemaProviderInner for InformationSchemaProvider { ROUTINES => setup_memory_table!(ROUTINES), SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES), TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES), - TRIGGERS => setup_memory_table!(TRIGGERS), GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS), SESSION_STATUS => setup_memory_table!(SESSION_STATUS), KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new( @@ -218,7 +208,6 @@ impl SystemSchemaProviderInner for InformationSchemaProvider { self.catalog_name.clone(), self.catalog_manager.clone(), )) as _), - RUNTIME_METRICS => Some(Arc::new(InformationSchemaMetrics::new())), PARTITIONS => Some(Arc::new(InformationSchemaPartitions::new( self.catalog_name.clone(), self.catalog_manager.clone(), @@ -286,7 +275,6 @@ impl InformationSchemaProvider { process_manager, tables: HashMap::new(), kv_backend, - #[cfg(feature = "enterprise")] extra_table_factories: HashMap::new(), }; @@ -295,7 +283,6 @@ impl InformationSchemaProvider { provider } - #[cfg(feature = "enterprise")] pub(crate) fn with_extra_table_factories( mut self, factories: HashMap, @@ -313,10 +300,6 @@ impl InformationSchemaProvider { // authentication details, and other critical information. // Only put these tables under `greptime` catalog to prevent info leak. 
if self.catalog_name == DEFAULT_CATALOG_NAME { - tables.insert( - RUNTIME_METRICS.to_string(), - self.build_table(RUNTIME_METRICS).unwrap(), - ); tables.insert( BUILD_INFO.to_string(), self.build_table(BUILD_INFO).unwrap(), @@ -367,7 +350,6 @@ impl InformationSchemaProvider { if let Some(process_list) = self.build_table(PROCESS_LIST) { tables.insert(PROCESS_LIST.to_string(), process_list); } - #[cfg(feature = "enterprise")] for name in self.extra_table_factories.keys() { tables.insert(name.clone(), self.build_table(name).expect(name)); } diff --git a/src/catalog/src/system_schema/information_schema/cluster_info.rs b/src/catalog/src/system_schema/information_schema/cluster_info.rs index f45dc5be06..1ba1a55fb6 100644 --- a/src/catalog/src/system_schema/information_schema/cluster_info.rs +++ b/src/catalog/src/system_schema/information_schema/cluster_info.rs @@ -33,7 +33,6 @@ use datatypes::timestamp::TimestampMillisecond; use datatypes::value::Value; use datatypes::vectors::{ Int64VectorBuilder, StringVectorBuilder, TimestampMillisecondVectorBuilder, - UInt32VectorBuilder, UInt64VectorBuilder, }; use serde::Serialize; use snafu::ResultExt; @@ -53,6 +52,8 @@ const PEER_ADDR: &str = "peer_addr"; const PEER_HOSTNAME: &str = "peer_hostname"; const TOTAL_CPU_MILLICORES: &str = "total_cpu_millicores"; const TOTAL_MEMORY_BYTES: &str = "total_memory_bytes"; +const CPU_USAGE_MILLICORES: &str = "cpu_usage_millicores"; +const MEMORY_USAGE_BYTES: &str = "memory_usage_bytes"; const VERSION: &str = "version"; const GIT_COMMIT: &str = "git_commit"; const START_TIME: &str = "start_time"; @@ -67,15 +68,17 @@ const INIT_CAPACITY: usize = 42; /// - `peer_id`: the peer server id. /// - `peer_type`: the peer type, such as `datanode`, `frontend`, `metasrv` etc. /// - `peer_addr`: the peer gRPC address. +/// - `peer_hostname`: the hostname of the peer. /// - `total_cpu_millicores`: the total CPU millicores of the peer. /// - `total_memory_bytes`: the total memory bytes of the peer. +/// - `cpu_usage_millicores`: the CPU usage millicores of the peer. +/// - `memory_usage_bytes`: the memory usage bytes of the peer. /// - `version`: the build package version of the peer. /// - `git_commit`: the build git commit hash of the peer. /// - `start_time`: the starting time of the peer. /// - `uptime`: the uptime of the peer. /// - `active_time`: the time since the last activity of the peer. /// - `node_status`: the status info of the peer. -/// - `peer_hostname`: the hostname of the peer. 
/// #[derive(Debug)] pub(super) struct InformationSchemaClusterInfo { @@ -99,12 +102,22 @@ impl InformationSchemaClusterInfo { ColumnSchema::new(PEER_HOSTNAME, ConcreteDataType::string_datatype(), true), ColumnSchema::new( TOTAL_CPU_MILLICORES, - ConcreteDataType::uint32_datatype(), + ConcreteDataType::int64_datatype(), false, ), ColumnSchema::new( TOTAL_MEMORY_BYTES, - ConcreteDataType::uint64_datatype(), + ConcreteDataType::int64_datatype(), + false, + ), + ColumnSchema::new( + CPU_USAGE_MILLICORES, + ConcreteDataType::int64_datatype(), + false, + ), + ColumnSchema::new( + MEMORY_USAGE_BYTES, + ConcreteDataType::int64_datatype(), false, ), ColumnSchema::new(VERSION, ConcreteDataType::string_datatype(), false), @@ -167,8 +180,10 @@ struct InformationSchemaClusterInfoBuilder { peer_types: StringVectorBuilder, peer_addrs: StringVectorBuilder, peer_hostnames: StringVectorBuilder, - cpus: UInt32VectorBuilder, - memory_bytes: UInt64VectorBuilder, + total_cpu_millicores: Int64VectorBuilder, + total_memory_bytes: Int64VectorBuilder, + cpu_usage_millicores: Int64VectorBuilder, + memory_usage_bytes: Int64VectorBuilder, versions: StringVectorBuilder, git_commits: StringVectorBuilder, start_times: TimestampMillisecondVectorBuilder, @@ -186,8 +201,10 @@ impl InformationSchemaClusterInfoBuilder { peer_types: StringVectorBuilder::with_capacity(INIT_CAPACITY), peer_addrs: StringVectorBuilder::with_capacity(INIT_CAPACITY), peer_hostnames: StringVectorBuilder::with_capacity(INIT_CAPACITY), - cpus: UInt32VectorBuilder::with_capacity(INIT_CAPACITY), - memory_bytes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY), + total_cpu_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY), + total_memory_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY), + cpu_usage_millicores: Int64VectorBuilder::with_capacity(INIT_CAPACITY), + memory_usage_bytes: Int64VectorBuilder::with_capacity(INIT_CAPACITY), versions: StringVectorBuilder::with_capacity(INIT_CAPACITY), git_commits: StringVectorBuilder::with_capacity(INIT_CAPACITY), start_times: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY), @@ -243,8 +260,14 @@ impl InformationSchemaClusterInfoBuilder { self.start_times.push(None); self.uptimes.push(None); } - self.cpus.push(Some(node_info.cpus)); - self.memory_bytes.push(Some(node_info.memory_bytes)); + self.total_cpu_millicores + .push(Some(node_info.total_cpu_millicores)); + self.total_memory_bytes + .push(Some(node_info.total_memory_bytes)); + self.cpu_usage_millicores + .push(Some(node_info.cpu_usage_millicores)); + self.memory_usage_bytes + .push(Some(node_info.memory_usage_bytes)); if node_info.last_activity_ts > 0 { self.active_times.push(Some( @@ -269,8 +292,10 @@ impl InformationSchemaClusterInfoBuilder { Arc::new(self.peer_types.finish()), Arc::new(self.peer_addrs.finish()), Arc::new(self.peer_hostnames.finish()), - Arc::new(self.cpus.finish()), - Arc::new(self.memory_bytes.finish()), + Arc::new(self.total_cpu_millicores.finish()), + Arc::new(self.total_memory_bytes.finish()), + Arc::new(self.cpu_usage_millicores.finish()), + Arc::new(self.memory_usage_bytes.finish()), Arc::new(self.versions.finish()), Arc::new(self.git_commits.finish()), Arc::new(self.start_times.finish()), diff --git a/src/catalog/src/system_schema/information_schema/information_memory_table.rs b/src/catalog/src/system_schema/information_schema/information_memory_table.rs index 03fbd16e13..56a84a0da1 100644 --- a/src/catalog/src/system_schema/information_schema/information_memory_table.rs +++ 
b/src/catalog/src/system_schema/information_schema/information_memory_table.rs @@ -15,8 +15,7 @@ use std::sync::Arc; use common_catalog::consts::{METRIC_ENGINE, MITO_ENGINE}; -use datatypes::data_type::ConcreteDataType; -use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; +use datatypes::schema::{Schema, SchemaRef}; use datatypes::vectors::{Int64Vector, StringVector, VectorRef}; use crate::system_schema::information_schema::table_names::*; @@ -366,16 +365,6 @@ pub(super) fn get_schema_columns(table_name: &str) -> (SchemaRef, Vec vec![], ), - TRIGGERS => ( - vec![ - string_column("TRIGGER_NAME"), - ColumnSchema::new("trigger_id", ConcreteDataType::uint64_datatype(), false), - string_column("TRIGGER_DEFINITION"), - ColumnSchema::new("flownode_id", ConcreteDataType::uint64_datatype(), true), - ], - vec![], - ), - // TODO: Considering store internal metrics in `global_status` and // `session_status` tables. GLOBAL_STATUS => ( diff --git a/src/catalog/src/system_schema/information_schema/partitions.rs b/src/catalog/src/system_schema/information_schema/partitions.rs index 68f4f83051..b9396fe554 100644 --- a/src/catalog/src/system_schema/information_schema/partitions.rs +++ b/src/catalog/src/system_schema/information_schema/partitions.rs @@ -211,6 +211,7 @@ struct InformationSchemaPartitionsBuilder { partition_names: StringVectorBuilder, partition_ordinal_positions: Int64VectorBuilder, partition_expressions: StringVectorBuilder, + partition_descriptions: StringVectorBuilder, create_times: TimestampSecondVectorBuilder, partition_ids: UInt64VectorBuilder, } @@ -231,6 +232,7 @@ impl InformationSchemaPartitionsBuilder { partition_names: StringVectorBuilder::with_capacity(INIT_CAPACITY), partition_ordinal_positions: Int64VectorBuilder::with_capacity(INIT_CAPACITY), partition_expressions: StringVectorBuilder::with_capacity(INIT_CAPACITY), + partition_descriptions: StringVectorBuilder::with_capacity(INIT_CAPACITY), create_times: TimestampSecondVectorBuilder::with_capacity(INIT_CAPACITY), partition_ids: UInt64VectorBuilder::with_capacity(INIT_CAPACITY), } @@ -319,6 +321,21 @@ impl InformationSchemaPartitionsBuilder { return; } + // Get partition column names (shared by all partitions) + // In MySQL, PARTITION_EXPRESSION is the partitioning function expression (e.g., column name) + let partition_columns: String = table_info + .meta + .partition_column_names() + .cloned() + .collect::>() + .join(", "); + + let partition_expr_str = if partition_columns.is_empty() { + None + } else { + Some(partition_columns) + }; + for (index, partition) in partitions.iter().enumerate() { let partition_name = format!("p{index}"); @@ -328,8 +345,12 @@ impl InformationSchemaPartitionsBuilder { self.partition_names.push(Some(&partition_name)); self.partition_ordinal_positions .push(Some((index + 1) as i64)); - let expression = partition.partition_expr.as_ref().map(|e| e.to_string()); - self.partition_expressions.push(expression.as_deref()); + // PARTITION_EXPRESSION: partition column names (same for all partitions) + self.partition_expressions + .push(partition_expr_str.as_deref()); + // PARTITION_DESCRIPTION: partition boundary expression (different for each partition) + let description = partition.partition_expr.as_ref().map(|e| e.to_string()); + self.partition_descriptions.push(description.as_deref()); self.create_times.push(Some(TimestampSecond::from( table_info.meta.created_on.timestamp(), ))); @@ -369,7 +390,7 @@ impl InformationSchemaPartitionsBuilder { null_string_vector.clone(), 
Arc::new(self.partition_expressions.finish()), null_string_vector.clone(), - null_string_vector.clone(), + Arc::new(self.partition_descriptions.finish()), // TODO(dennis): rows and index statistics info null_i64_vector.clone(), null_i64_vector.clone(), diff --git a/src/catalog/src/system_schema/information_schema/runtime_metrics.rs b/src/catalog/src/system_schema/information_schema/runtime_metrics.rs deleted file mode 100644 index 5ccb871321..0000000000 --- a/src/catalog/src/system_schema/information_schema/runtime_metrics.rs +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow_schema::SchemaRef as ArrowSchemaRef; -use common_catalog::consts::INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID; -use common_error::ext::BoxedError; -use common_recordbatch::adapter::RecordBatchStreamAdapter; -use common_recordbatch::{RecordBatch, SendableRecordBatchStream}; -use common_time::util::current_time_millis; -use datafusion::execution::TaskContext; -use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter; -use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream; -use datatypes::prelude::{ConcreteDataType, MutableVector}; -use datatypes::scalars::ScalarVectorBuilder; -use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use datatypes::vectors::{ - ConstantVector, Float64VectorBuilder, StringVectorBuilder, TimestampMillisecondVector, - VectorRef, -}; -use itertools::Itertools; -use snafu::ResultExt; -use store_api::storage::{ScanRequest, TableId}; - -use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result}; -use crate::system_schema::information_schema::{InformationTable, RUNTIME_METRICS}; - -#[derive(Debug)] -pub(super) struct InformationSchemaMetrics { - schema: SchemaRef, -} - -const METRIC_NAME: &str = "metric_name"; -const METRIC_VALUE: &str = "value"; -const METRIC_LABELS: &str = "labels"; -const PEER_ADDR: &str = "peer_addr"; -const PEER_TYPE: &str = "peer_type"; -const TIMESTAMP: &str = "timestamp"; - -/// The `information_schema.runtime_metrics` virtual table. -/// It provides the GreptimeDB runtime metrics for the users by SQL. 
-impl InformationSchemaMetrics { - pub(super) fn new() -> Self { - Self { - schema: Self::schema(), - } - } - - fn schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - ColumnSchema::new(METRIC_NAME, ConcreteDataType::string_datatype(), false), - ColumnSchema::new(METRIC_VALUE, ConcreteDataType::float64_datatype(), false), - ColumnSchema::new(METRIC_LABELS, ConcreteDataType::string_datatype(), true), - ColumnSchema::new(PEER_ADDR, ConcreteDataType::string_datatype(), true), - ColumnSchema::new(PEER_TYPE, ConcreteDataType::string_datatype(), false), - ColumnSchema::new( - TIMESTAMP, - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ])) - } - - fn builder(&self) -> InformationSchemaMetricsBuilder { - InformationSchemaMetricsBuilder::new(self.schema.clone()) - } -} - -impl InformationTable for InformationSchemaMetrics { - fn table_id(&self) -> TableId { - INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID - } - - fn table_name(&self) -> &'static str { - RUNTIME_METRICS - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn to_stream(&self, request: ScanRequest) -> Result { - let schema = self.schema.arrow_schema().clone(); - let mut builder = self.builder(); - let stream = Box::pin(DfRecordBatchStreamAdapter::new( - schema, - futures::stream::once(async move { - builder - .make_metrics(Some(request)) - .await - .map(|x| x.into_df_record_batch()) - .map_err(Into::into) - }), - )); - - Ok(Box::pin( - RecordBatchStreamAdapter::try_new(stream) - .map_err(BoxedError::new) - .context(InternalSnafu)?, - )) - } -} - -struct InformationSchemaMetricsBuilder { - schema: SchemaRef, - - metric_names: StringVectorBuilder, - metric_values: Float64VectorBuilder, - metric_labels: StringVectorBuilder, - peer_addrs: StringVectorBuilder, - peer_types: StringVectorBuilder, -} - -impl InformationSchemaMetricsBuilder { - fn new(schema: SchemaRef) -> Self { - Self { - schema, - metric_names: StringVectorBuilder::with_capacity(42), - metric_values: Float64VectorBuilder::with_capacity(42), - metric_labels: StringVectorBuilder::with_capacity(42), - peer_addrs: StringVectorBuilder::with_capacity(42), - peer_types: StringVectorBuilder::with_capacity(42), - } - } - - fn add_metric( - &mut self, - metric_name: &str, - labels: String, - metric_value: f64, - peer: Option<&str>, - peer_type: &str, - ) { - self.metric_names.push(Some(metric_name)); - self.metric_values.push(Some(metric_value)); - self.metric_labels.push(Some(&labels)); - self.peer_addrs.push(peer); - self.peer_types.push(Some(peer_type)); - } - - async fn make_metrics(&mut self, _request: Option) -> Result { - let metric_families = prometheus::gather(); - - let write_request = - common_telemetry::metric::convert_metric_to_write_request(metric_families, None, 0); - - for ts in write_request.timeseries { - //Safety: always has `__name__` label - let metric_name = ts - .labels - .iter() - .find_map(|label| { - if label.name == "__name__" { - Some(label.value.clone()) - } else { - None - } - }) - .unwrap(); - - self.add_metric( - &metric_name, - ts.labels - .into_iter() - .filter_map(|label| { - if label.name == "__name__" { - None - } else { - Some(format!("{}={}", label.name, label.value)) - } - }) - .join(", "), - // Safety: always has a sample - ts.samples[0].value, - // The peer column is always `None` for standalone - None, - "STANDALONE", - ); - } - - // FIXME(dennis): fetching other peers metrics - self.finish() - } - - fn finish(&mut self) -> Result { - let rows_num = self.metric_names.len(); - - let timestamps = 
Arc::new(ConstantVector::new( - Arc::new(TimestampMillisecondVector::from_slice([ - current_time_millis(), - ])), - rows_num, - )); - - let columns: Vec = vec![ - Arc::new(self.metric_names.finish()), - Arc::new(self.metric_values.finish()), - Arc::new(self.metric_labels.finish()), - Arc::new(self.peer_addrs.finish()), - Arc::new(self.peer_types.finish()), - timestamps, - ]; - - RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu) - } -} - -impl DfPartitionStream for InformationSchemaMetrics { - fn schema(&self) -> &ArrowSchemaRef { - self.schema.arrow_schema() - } - - fn execute(&self, _: Arc) -> DfSendableRecordBatchStream { - let schema = self.schema.arrow_schema().clone(); - let mut builder = self.builder(); - Box::pin(DfRecordBatchStreamAdapter::new( - schema, - futures::stream::once(async move { - builder - .make_metrics(None) - .await - .map(|x| x.into_df_record_batch()) - .map_err(Into::into) - }), - )) - } -} - -#[cfg(test)] -mod tests { - use common_recordbatch::RecordBatches; - - use super::*; - - #[tokio::test] - async fn test_make_metrics() { - let metrics = InformationSchemaMetrics::new(); - - let stream = metrics.to_stream(ScanRequest::default()).unwrap(); - - let batches = RecordBatches::try_collect(stream).await.unwrap(); - - let result_literal = batches.pretty_print().unwrap(); - - assert!(result_literal.contains(METRIC_NAME)); - assert!(result_literal.contains(METRIC_VALUE)); - assert!(result_literal.contains(METRIC_LABELS)); - assert!(result_literal.contains(PEER_ADDR)); - assert!(result_literal.contains(PEER_TYPE)); - assert!(result_literal.contains(TIMESTAMP)); - } -} diff --git a/src/catalog/src/system_schema/information_schema/table_names.rs b/src/catalog/src/system_schema/information_schema/table_names.rs index 23791425dc..2a3329fece 100644 --- a/src/catalog/src/system_schema/information_schema/table_names.rs +++ b/src/catalog/src/system_schema/information_schema/table_names.rs @@ -38,7 +38,6 @@ pub const TABLE_PRIVILEGES: &str = "table_privileges"; pub const TRIGGERS: &str = "triggers"; pub const GLOBAL_STATUS: &str = "global_status"; pub const SESSION_STATUS: &str = "session_status"; -pub const RUNTIME_METRICS: &str = "runtime_metrics"; pub const PARTITIONS: &str = "partitions"; pub const REGION_PEERS: &str = "region_peers"; pub const TABLE_CONSTRAINTS: &str = "table_constraints"; diff --git a/src/catalog/src/system_schema/information_schema/tables.rs b/src/catalog/src/system_schema/information_schema/tables.rs index 507dedc547..38a0cb1d61 100644 --- a/src/catalog/src/system_schema/information_schema/tables.rs +++ b/src/catalog/src/system_schema/information_schema/tables.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; use std::sync::{Arc, Weak}; use arrow_schema::SchemaRef as ArrowSchemaRef; @@ -255,14 +254,17 @@ impl InformationSchemaTablesBuilder { // TODO(dennis): `region_stats` API is not stable in distributed cluster because of network issue etc. // But we don't want the statements such as `show tables` fail, // so using `unwrap_or_else` here instead of `?` operator. 
- let region_stats = information_extension - .region_stats() - .await - .map_err(|e| { - error!(e; "Failed to call region_stats"); - e - }) - .unwrap_or_else(|_| vec![]); + let region_stats = { + let mut x = information_extension + .region_stats() + .await + .unwrap_or_else(|e| { + error!(e; "Failed to find region stats in information_schema, fallback to all empty"); + vec![] + }); + x.sort_unstable_by_key(|x| x.id); + x + }; for schema_name in catalog_manager.schema_names(&catalog_name, None).await? { let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None); @@ -273,16 +275,16 @@ impl InformationSchemaTablesBuilder { // TODO(dennis): make it working for metric engine let table_region_stats = if table_info.meta.engine == MITO_ENGINE || table_info.is_physical_table() { - let region_ids = table_info + table_info .meta .region_numbers .iter() .map(|n| RegionId::new(table_info.ident.table_id, *n)) - .collect::>(); - - region_stats - .iter() - .filter(|stat| region_ids.contains(&stat.id)) + .flat_map(|region_id| { + region_stats + .binary_search_by_key(®ion_id, |x| x.id) + .map(|i| ®ion_stats[i]) + }) .collect::>() } else { vec![] diff --git a/src/catalog/src/system_schema/numbers_table_provider.rs b/src/catalog/src/system_schema/numbers_table_provider.rs new file mode 100644 index 0000000000..6ea6d554b7 --- /dev/null +++ b/src/catalog/src/system_schema/numbers_table_provider.rs @@ -0,0 +1,59 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[cfg(any(test, feature = "testing", debug_assertions))] +use common_catalog::consts::NUMBERS_TABLE_ID; +use table::TableRef; +#[cfg(any(test, feature = "testing", debug_assertions))] +use table::table::numbers::NUMBERS_TABLE_NAME; +#[cfg(any(test, feature = "testing", debug_assertions))] +use table::table::numbers::NumbersTable; + +// NumbersTableProvider is a dedicated provider for feature-gating the numbers table. 
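The `information_schema.tables` hunk above drops the per-table `HashSet` of region ids in favour of sorting the region statistics once and binary-searching them for each region id. A small self-contained sketch of that lookup pattern; `RegionId` and `RegionStat` here are simplified stand-ins for the real `store-api` types, used only to illustrate the shape:

// Illustration only: simplified stand-ins, not the real store-api types.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct RegionId(u64);

#[derive(Clone, Debug)]
struct RegionStat {
    id: RegionId,
    num_rows: u64,
}

/// Sort once up front, then resolve each table's regions with O(log n) lookups
/// instead of rebuilding a HashSet of region ids per table.
fn stats_for_regions(mut stats: Vec<RegionStat>, regions: &[RegionId]) -> Vec<RegionStat> {
    stats.sort_unstable_by_key(|s| s.id);
    regions
        .iter()
        .filter_map(|id| {
            stats
                .binary_search_by_key(id, |s| s.id)
                .ok()
                .map(|i| stats[i].clone())
        })
        .collect()
}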
+#[derive(Clone)] +pub struct NumbersTableProvider; + +#[cfg(any(test, feature = "testing", debug_assertions))] +impl NumbersTableProvider { + pub(crate) fn table_exists(&self, name: &str) -> bool { + name == NUMBERS_TABLE_NAME + } + + pub(crate) fn table_names(&self) -> Vec { + vec![NUMBERS_TABLE_NAME.to_string()] + } + + pub(crate) fn table(&self, name: &str) -> Option { + if name == NUMBERS_TABLE_NAME { + Some(NumbersTable::table(NUMBERS_TABLE_ID)) + } else { + None + } + } +} + +#[cfg(not(any(test, feature = "testing", debug_assertions)))] +impl NumbersTableProvider { + pub(crate) fn table_exists(&self, _name: &str) -> bool { + false + } + + pub(crate) fn table_names(&self) -> Vec { + vec![] + } + + pub(crate) fn table(&self, _name: &str) -> Option { + None + } +} diff --git a/src/cli/src/data.rs b/src/cli/src/data.rs index bac7f3e308..be623f63a2 100644 --- a/src/cli/src/data.rs +++ b/src/cli/src/data.rs @@ -16,12 +16,15 @@ mod export; mod import; use clap::Subcommand; +use client::DEFAULT_CATALOG_NAME; use common_error::ext::BoxedError; use crate::Tool; use crate::data::export::ExportCommand; use crate::data::import::ImportCommand; +pub(crate) const COPY_PATH_PLACEHOLDER: &str = ""; + /// Command for data operations including exporting data from and importing data into GreptimeDB. #[derive(Subcommand)] pub enum DataCommand { @@ -37,3 +40,7 @@ impl DataCommand { } } } + +pub(crate) fn default_database() -> String { + format!("{DEFAULT_CATALOG_NAME}-*") +} diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs index a9f68bf9c9..007f8aa67c 100644 --- a/src/cli/src/data/export.rs +++ b/src/cli/src/data/export.rs @@ -30,6 +30,7 @@ use snafu::{OptionExt, ResultExt}; use tokio::sync::Semaphore; use tokio::time::Instant; +use crate::data::{COPY_PATH_PLACEHOLDER, default_database}; use crate::database::{DatabaseClient, parse_proxy_opts}; use crate::error::{ EmptyResultSnafu, Error, OpenDalSnafu, OutputDirNotSetSnafu, Result, S3ConfigNotSetSnafu, @@ -63,12 +64,20 @@ pub struct ExportCommand { output_dir: Option, /// The name of the catalog to export. - #[clap(long, default_value = "greptime-*")] + #[clap(long, default_value_t = default_database())] database: String, - /// Parallelism of the export. - #[clap(long, short = 'j', default_value = "1")] - export_jobs: usize, + /// The number of databases exported in parallel. + /// For example, if there are 20 databases and `db_parallelism` is 4, + /// 4 databases will be exported concurrently. + #[clap(long, short = 'j', default_value = "1", alias = "export-jobs")] + db_parallelism: usize, + + /// The number of tables exported in parallel within a single database. + /// For example, if a database has 30 tables and `parallelism` is 8, + /// 8 tables will be exported concurrently. + #[clap(long, default_value = "4")] + table_parallelism: usize, /// Max retry times for each job. 
#[clap(long, default_value = "3")] @@ -209,10 +218,11 @@ impl ExportCommand { schema, database_client, output_dir: self.output_dir.clone(), - parallelism: self.export_jobs, + export_jobs: self.db_parallelism, target: self.target.clone(), start_time: self.start_time.clone(), end_time: self.end_time.clone(), + parallelism: self.table_parallelism, s3: self.s3, ddl_local_dir: self.ddl_local_dir.clone(), s3_bucket: self.s3_bucket.clone(), @@ -250,10 +260,11 @@ pub struct Export { schema: Option, database_client: DatabaseClient, output_dir: Option, - parallelism: usize, + export_jobs: usize, target: ExportTarget, start_time: Option, end_time: Option, + parallelism: usize, s3: bool, ddl_local_dir: Option, s3_bucket: Option, @@ -463,7 +474,7 @@ impl Export { async fn export_create_table(&self) -> Result<()> { let timer = Instant::now(); - let semaphore = Arc::new(Semaphore::new(self.parallelism)); + let semaphore = Arc::new(Semaphore::new(self.export_jobs)); let db_names = self.get_db_names().await?; let db_count = db_names.len(); let operator = Arc::new(self.build_prefer_fs_operator().await?); @@ -624,13 +635,13 @@ impl Export { async fn export_database_data(&self) -> Result<()> { let timer = Instant::now(); - let semaphore = Arc::new(Semaphore::new(self.parallelism)); + let semaphore = Arc::new(Semaphore::new(self.export_jobs)); let db_names = self.get_db_names().await?; let db_count = db_names.len(); let mut tasks = Vec::with_capacity(db_count); let operator = Arc::new(self.build_operator().await?); let fs_first_operator = Arc::new(self.build_prefer_fs_operator().await?); - let with_options = build_with_options(&self.start_time, &self.end_time); + let with_options = build_with_options(&self.start_time, &self.end_time, self.parallelism); for schema in db_names { let semaphore_moved = semaphore.clone(); @@ -667,10 +678,26 @@ impl Export { ); // Create copy_from.sql file - let copy_database_from_sql = format!( - r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#, - export_self.catalog, schema, path, with_options_clone, connection_part - ); + let copy_database_from_sql = { + let command_without_connection = format!( + r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({});"#, + export_self.catalog, schema, COPY_PATH_PLACEHOLDER, with_options_clone + ); + + if connection_part.is_empty() { + command_without_connection + } else { + let command_with_connection = format!( + r#"COPY DATABASE "{}"."{}" FROM '{}' WITH ({}){};"#, + export_self.catalog, schema, path, with_options_clone, connection_part + ); + + format!( + "-- {}\n{}", + command_with_connection, command_without_connection + ) + } + }; let copy_from_path = export_self.get_file_path(&schema, "copy_from.sql"); export_self @@ -871,7 +898,11 @@ impl Tool for Export { } /// Builds the WITH options string for SQL commands, assuming consistent syntax across S3 and local exports. 
-fn build_with_options(start_time: &Option, end_time: &Option) -> String { +fn build_with_options( + start_time: &Option, + end_time: &Option, + parallelism: usize, +) -> String { let mut options = vec!["format = 'parquet'".to_string()]; if let Some(start) = start_time { options.push(format!("start_time = '{}'", start)); @@ -879,5 +910,6 @@ fn build_with_options(start_time: &Option, end_time: &Option) -> if let Some(end) = end_time { options.push(format!("end_time = '{}'", end)); } + options.push(format!("parallelism = {}", parallelism)); options.join(", ") } diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs index 102de8ac91..ffe8b62c7e 100644 --- a/src/cli/src/data/import.rs +++ b/src/cli/src/data/import.rs @@ -21,12 +21,13 @@ use clap::{Parser, ValueEnum}; use common_catalog::consts::DEFAULT_SCHEMA_NAME; use common_error::ext::BoxedError; use common_telemetry::{error, info, warn}; -use snafu::{OptionExt, ResultExt}; +use snafu::{OptionExt, ResultExt, ensure}; use tokio::sync::Semaphore; use tokio::time::Instant; +use crate::data::{COPY_PATH_PLACEHOLDER, default_database}; use crate::database::{DatabaseClient, parse_proxy_opts}; -use crate::error::{Error, FileIoSnafu, Result, SchemaNotFoundSnafu}; +use crate::error::{Error, FileIoSnafu, InvalidArgumentsSnafu, Result, SchemaNotFoundSnafu}; use crate::{Tool, database}; #[derive(Debug, Default, Clone, ValueEnum)] @@ -52,12 +53,14 @@ pub struct ImportCommand { input_dir: String, /// The name of the catalog to import. - #[clap(long, default_value = "greptime-*")] + #[clap(long, default_value_t = default_database())] database: String, - /// Parallelism of the import. - #[clap(long, short = 'j', default_value = "1")] - import_jobs: usize, + /// The number of databases imported in parallel. + /// For example, if there are 20 databases and `db_parallelism` is 4, + /// 4 databases will be imported concurrently. + #[clap(long, short = 'j', default_value = "1", alias = "import-jobs")] + db_parallelism: usize, /// Max retry times for each job. 
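The updated `build_with_options` above now always appends a `parallelism` option, so the table-level fan-out travels inside the generated `COPY DATABASE ... WITH (...)` clause, while `db_parallelism` only bounds the CLI-side semaphore over databases. A hypothetical unit test in the same module (`src/cli/src/data/export.rs`) showing the expected output; the argument values are illustrative and the start/end time parameters are assumed to be `Option<String>` as on `ExportCommand`:

#[test]
fn with_options_carries_parallelism() {
    // Hypothetical test: format is always present, end_time is omitted when None,
    // and parallelism is always appended last.
    let opts = build_with_options(&Some("2024-01-01 00:00:00".to_string()), &None, 4);
    assert_eq!(
        opts,
        "format = 'parquet', start_time = '2024-01-01 00:00:00', parallelism = 4"
    );
}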
#[clap(long, default_value = "3")] @@ -108,7 +111,7 @@ impl ImportCommand { schema, database_client, input_dir: self.input_dir.clone(), - parallelism: self.import_jobs, + parallelism: self.db_parallelism, target: self.target.clone(), })) } @@ -147,12 +150,15 @@ impl Import { let _permit = semaphore_moved.acquire().await.unwrap(); let database_input_dir = self.catalog_path().join(&schema); let sql_file = database_input_dir.join(filename); - let sql = tokio::fs::read_to_string(sql_file) + let mut sql = tokio::fs::read_to_string(sql_file) .await .context(FileIoSnafu)?; - if sql.is_empty() { + if sql.trim().is_empty() { info!("Empty `{filename}` {database_input_dir:?}"); } else { + if filename == "copy_from.sql" { + sql = self.rewrite_copy_database_sql(&schema, &sql)?; + } let db = exec_db.unwrap_or(&schema); self.database_client.sql(&sql, db).await?; info!("Imported `{filename}` for database {schema}"); @@ -225,6 +231,57 @@ impl Import { } Ok(db_names) } + + fn rewrite_copy_database_sql(&self, schema: &str, sql: &str) -> Result { + let target_location = self.build_copy_database_location(schema); + let escaped_location = target_location.replace('\'', "''"); + + let mut first_stmt_checked = false; + for line in sql.lines() { + let trimmed = line.trim_start(); + if trimmed.is_empty() || trimmed.starts_with("--") { + continue; + } + + ensure!( + trimmed.starts_with("COPY DATABASE"), + InvalidArgumentsSnafu { + msg: "Expected COPY DATABASE statement at start of copy_from.sql" + } + ); + first_stmt_checked = true; + break; + } + + ensure!( + first_stmt_checked, + InvalidArgumentsSnafu { + msg: "COPY DATABASE statement not found in copy_from.sql" + } + ); + + ensure!( + sql.contains(COPY_PATH_PLACEHOLDER), + InvalidArgumentsSnafu { + msg: format!( + "Placeholder `{}` not found in COPY DATABASE statement", + COPY_PATH_PLACEHOLDER + ) + } + ); + + Ok(sql.replacen(COPY_PATH_PLACEHOLDER, &escaped_location, 1)) + } + + fn build_copy_database_location(&self, schema: &str) -> String { + let mut path = self.catalog_path(); + path.push(schema); + let mut path_str = path.to_string_lossy().into_owned(); + if !path_str.ends_with('/') { + path_str.push('/'); + } + path_str + } } #[async_trait] @@ -240,3 +297,52 @@ impl Tool for Import { } } } + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + fn build_import(input_dir: &str) -> Import { + Import { + catalog: "catalog".to_string(), + schema: None, + database_client: DatabaseClient::new( + "127.0.0.1:4000".to_string(), + "catalog".to_string(), + None, + Duration::from_secs(0), + None, + ), + input_dir: input_dir.to_string(), + parallelism: 1, + target: ImportTarget::Data, + } + } + + #[test] + fn rewrite_copy_database_sql_replaces_placeholder() { + let import = build_import("/tmp/export-path"); + let comment = "-- COPY DATABASE \"catalog\".\"schema\" FROM 's3://bucket/demo/' WITH (format = 'parquet') CONNECTION (region = 'us-west-2')"; + let sql = format!( + "{comment}\nCOPY DATABASE \"catalog\".\"schema\" FROM '{}' WITH (format = 'parquet');", + COPY_PATH_PLACEHOLDER + ); + + let rewritten = import.rewrite_copy_database_sql("schema", &sql).unwrap(); + let expected_location = import.build_copy_database_location("schema"); + let escaped = expected_location.replace('\'', "''"); + + assert!(rewritten.starts_with(comment)); + assert!(rewritten.contains(&format!("FROM '{escaped}'"))); + assert!(!rewritten.contains(COPY_PATH_PLACEHOLDER)); + } + + #[test] + fn rewrite_copy_database_sql_requires_placeholder() { + let import = 
build_import("/tmp/export-path"); + let sql = "COPY DATABASE \"catalog\".\"schema\" FROM '/tmp/export-path/catalog/schema/' WITH (format = 'parquet');"; + assert!(import.rewrite_copy_database_sql("schema", sql).is_err()); + } +} diff --git a/src/client/src/client.rs b/src/client/src/client.rs index 1506ac5208..39cb5c30aa 100644 --- a/src/client/src/client.rs +++ b/src/client/src/client.rs @@ -20,7 +20,9 @@ use api::v1::health_check_client::HealthCheckClient; use api::v1::prometheus_gateway_client::PrometheusGatewayClient; use api::v1::region::region_client::RegionClient as PbRegionClient; use arrow_flight::flight_service_client::FlightServiceClient; -use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption}; +use common_grpc::channel_manager::{ + ChannelConfig, ChannelManager, ClientTlsOption, load_client_tls_config, +}; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; use tonic::codec::CompressionEncoding; @@ -93,9 +95,10 @@ impl Client { U: AsRef, A: AsRef<[U]>, { - let channel_config = ChannelConfig::default().client_tls_config(client_tls); - let channel_manager = ChannelManager::with_tls_config(channel_config) - .context(error::CreateTlsChannelSnafu)?; + let channel_config = ChannelConfig::default().client_tls_config(client_tls.clone()); + let tls_config = + load_client_tls_config(Some(client_tls)).context(error::CreateTlsChannelSnafu)?; + let channel_manager = ChannelManager::with_config(channel_config, tls_config); Ok(Self::with_manager_and_urls(channel_manager, urls)) } diff --git a/src/client/src/client_manager.rs b/src/client/src/client_manager.rs index 80afd2fb32..edac45a9fe 100644 --- a/src/client/src/client_manager.rs +++ b/src/client/src/client_manager.rs @@ -74,7 +74,7 @@ impl FlownodeManager for NodeClients { impl NodeClients { pub fn new(config: ChannelConfig) -> Self { Self { - channel_manager: ChannelManager::with_config(config), + channel_manager: ChannelManager::with_config(config, None), clients: CacheBuilder::new(1024) .time_to_live(Duration::from_secs(30 * 60)) .time_to_idle(Duration::from_secs(5 * 60)) diff --git a/src/client/src/database.rs b/src/client/src/database.rs index 0646c3e2a3..239f3fe3f9 100644 --- a/src/client/src/database.rs +++ b/src/client/src/database.rs @@ -435,10 +435,10 @@ impl Database { .context(ExternalSnafu)?; match flight_message { FlightMessage::RecordBatch(arrow_batch) => { - yield RecordBatch::try_from_df_record_batch( + yield Ok(RecordBatch::from_df_record_batch( schema_cloned.clone(), arrow_batch, - ) + )) } FlightMessage::Metrics(_) => {} FlightMessage::AffectedRows(_) | FlightMessage::Schema(_) => { diff --git a/src/client/src/region.rs b/src/client/src/region.rs index 6e5a286083..3e80b83cec 100644 --- a/src/client/src/region.rs +++ b/src/client/src/region.rs @@ -182,10 +182,8 @@ impl RegionRequester { match flight_message { FlightMessage::RecordBatch(record_batch) => { - let result_to_yield = RecordBatch::try_from_df_record_batch( - schema_cloned.clone(), - record_batch, - ); + let result_to_yield = + RecordBatch::from_df_record_batch(schema_cloned.clone(), record_batch); // get the next message from the stream. normally it should be a metrics message. if let Some(next_flight_message_result) = flight_message_stream.next().await @@ -219,7 +217,7 @@ impl RegionRequester { stream_ended = true; } - yield result_to_yield; + yield Ok(result_to_yield); } FlightMessage::Metrics(s) => { // just a branch in case of some metrics message comes after other things. 
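A minimal standalone sketch of the copy_from.sql rewrite added in src/cli/src/data/import.rs above. The exact value of crate::data::COPY_PATH_PLACEHOLDER is not shown in this patch, so the marker string below is only an assumed stand-in; the sketch merely illustrates the escape-then-replace-once behavior that rewrite_copy_database_sql implements.

// Illustrative sketch only. `PLACEHOLDER` stands in for the real
// crate::data::COPY_PATH_PLACEHOLDER, whose value is defined outside this patch.
const PLACEHOLDER: &str = "<COPY_DATABASE_PATH>";

// Escape single quotes for the SQL string literal, then substitute the
// placeholder exactly once, mirroring the patch's rewrite_copy_database_sql.
fn rewrite_copy_location(sql: &str, target_location: &str) -> String {
    let escaped = target_location.replace('\'', "''");
    sql.replacen(PLACEHOLDER, &escaped, 1)
}

fn main() {
    let sql = format!(
        "COPY DATABASE \"catalog\".\"schema\" FROM '{PLACEHOLDER}' WITH (format = 'parquet');"
    );
    // Prints the statement with FROM '/backup/catalog/schema/' spliced in.
    println!("{}", rewrite_copy_location(&sql, "/backup/catalog/schema/"));
}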
diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 744d13faeb..d279ddb7f0 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -16,7 +16,7 @@ default = [ "meta-srv/pg_kvbackend", "meta-srv/mysql_kvbackend", ] -enterprise = ["common-meta/enterprise", "frontend/enterprise", "meta-srv/enterprise", "catalog/enterprise"] +enterprise = ["common-meta/enterprise", "frontend/enterprise", "meta-srv/enterprise"] tokio-console = ["common-telemetry/tokio-console"] [lints] @@ -29,9 +29,11 @@ base64.workspace = true cache.workspace = true catalog.workspace = true chrono.workspace = true +either = "1.15" clap.workspace = true cli.workspace = true client.workspace = true +colored = "2.1.0" common-base.workspace = true common-catalog.workspace = true common-config.workspace = true @@ -63,9 +65,11 @@ lazy_static.workspace = true meta-client.workspace = true meta-srv.workspace = true metric-engine.workspace = true +mito2.workspace = true moka.workspace = true nu-ansi-term = "0.46" object-store.workspace = true +parquet = { workspace = true, features = ["object_store"] } plugins.workspace = true prometheus.workspace = true prost.workspace = true @@ -88,6 +92,11 @@ toml.workspace = true tonic.workspace = true tracing-appender.workspace = true +[target.'cfg(unix)'.dependencies] +pprof = { version = "0.14", features = [ + "flamegraph", +] } + [target.'cfg(not(windows))'.dependencies] tikv-jemallocator = "0.6" diff --git a/src/cmd/src/bin/greptime.rs b/src/cmd/src/bin/greptime.rs index cf72b3d32f..f6bbebf7fb 100644 --- a/src/cmd/src/bin/greptime.rs +++ b/src/cmd/src/bin/greptime.rs @@ -103,12 +103,15 @@ async fn main_body() -> Result<()> { async fn start(cli: Command) -> Result<()> { match cli.subcmd { - SubCommand::Datanode(cmd) => { - let opts = cmd.load_options(&cli.global_options)?; - let plugins = Plugins::new(); - let builder = InstanceBuilder::try_new_with_init(opts, plugins).await?; - cmd.build_with(builder).await?.run().await - } + SubCommand::Datanode(cmd) => match cmd.subcmd { + datanode::SubCommand::Start(ref start) => { + let opts = start.load_options(&cli.global_options)?; + let plugins = Plugins::new(); + let builder = InstanceBuilder::try_new_with_init(opts, plugins).await?; + cmd.build_with(builder).await?.run().await + } + datanode::SubCommand::Objbench(ref bench) => bench.run().await, + }, SubCommand::Flownode(cmd) => { cmd.build(cmd.load_options(&cli.global_options)?) .await? diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 641d3fc5fd..23ca644ffc 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -13,6 +13,8 @@ // limitations under the License. 
pub mod builder; +#[allow(clippy::print_stdout)] +mod objbench; use std::path::Path; use std::time::Duration; @@ -23,13 +25,16 @@ use common_config::Configurable; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_telemetry::{info, warn}; use common_wal::config::DatanodeWalConfig; +use datanode::config::RegionEngineConfig; use datanode::datanode::Datanode; use meta_client::MetaClientOptions; +use serde::{Deserialize, Serialize}; use snafu::{ResultExt, ensure}; use tracing_appender::non_blocking::WorkerGuard; use crate::App; use crate::datanode::builder::InstanceBuilder; +use crate::datanode::objbench::ObjbenchCommand; use crate::error::{ LoadLayeredConfigSnafu, MissingConfigSnafu, Result, ShutdownDatanodeSnafu, StartDatanodeSnafu, }; @@ -89,7 +94,7 @@ impl App for Instance { #[derive(Parser)] pub struct Command { #[clap(subcommand)] - subcmd: SubCommand, + pub subcmd: SubCommand, } impl Command { @@ -100,13 +105,26 @@ impl Command { pub fn load_options(&self, global_options: &GlobalOptions) -> Result { match &self.subcmd { SubCommand::Start(cmd) => cmd.load_options(global_options), + SubCommand::Objbench(_) => { + // For objbench command, we don't need to load DatanodeOptions + // It's a standalone utility command + let mut opts = datanode::config::DatanodeOptions::default(); + opts.sanitize(); + Ok(DatanodeOptions { + runtime: Default::default(), + plugins: Default::default(), + component: opts, + }) + } } } } #[derive(Parser)] -enum SubCommand { +pub enum SubCommand { Start(StartCommand), + /// Object storage benchmark tool + Objbench(ObjbenchCommand), } impl SubCommand { @@ -116,12 +134,33 @@ impl SubCommand { info!("Building datanode with {:#?}", cmd); builder.build().await } + SubCommand::Objbench(cmd) => { + cmd.run().await?; + std::process::exit(0); + } } } } +/// Storage engine config +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +#[serde(default)] +pub struct StorageConfig { + /// The working directory of database + pub data_home: String, + #[serde(flatten)] + pub store: object_store::config::ObjectStoreConfig, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +#[serde(default)] +struct StorageConfigWrapper { + storage: StorageConfig, + region_engine: Vec, +} + #[derive(Debug, Parser, Default)] -struct StartCommand { +pub struct StartCommand { #[clap(long)] node_id: Option, /// The address to bind the gRPC server. @@ -149,7 +188,7 @@ struct StartCommand { } impl StartCommand { - fn load_options(&self, global_options: &GlobalOptions) -> Result { + pub fn load_options(&self, global_options: &GlobalOptions) -> Result { let mut opts = DatanodeOptions::load_layered_options( self.config_file.as_deref(), self.env_prefix.as_ref(), diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs new file mode 100644 index 0000000000..a8ff8b4daf --- /dev/null +++ b/src/cmd/src/datanode/objbench.rs @@ -0,0 +1,678 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use clap::Parser; +use colored::Colorize; +use datanode::config::RegionEngineConfig; +use datanode::store; +use either::Either; +use mito2::access_layer::{ + AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType, +}; +use mito2::cache::{CacheManager, CacheManagerRef}; +use mito2::config::{FulltextIndexConfig, MitoConfig, Mode}; +use mito2::read::Source; +use mito2::sst::file::{FileHandle, FileMeta}; +use mito2::sst::file_purger::{FilePurger, FilePurgerRef}; +use mito2::sst::index::intermediate::IntermediateManager; +use mito2::sst::index::puffin_manager::PuffinManagerFactory; +use mito2::sst::parquet::reader::ParquetReaderBuilder; +use mito2::sst::parquet::{PARQUET_METADATA_KEY, WriteOptions}; +use mito2::worker::write_cache_from_config; +use object_store::ObjectStore; +use regex::Regex; +use snafu::OptionExt; +use store_api::metadata::{RegionMetadata, RegionMetadataRef}; +use store_api::path_utils::region_name; +use store_api::region_request::PathType; +use store_api::storage::FileId; + +use crate::datanode::{StorageConfig, StorageConfigWrapper}; +use crate::error; + +/// Object storage benchmark command +#[derive(Debug, Parser)] +pub struct ObjbenchCommand { + /// Path to the object-store config file (TOML). Must deserialize into object_store::config::ObjectStoreConfig. + #[clap(long, value_name = "FILE")] + pub config: PathBuf, + + /// Source SST file path in object-store (e.g. "region_dir/.parquet"). + #[clap(long, value_name = "PATH")] + pub source: String, + + /// Verbose output + #[clap(short, long, default_value_t = false)] + pub verbose: bool, + + /// Output file path for pprof flamegraph (enables profiling) + #[clap(long, value_name = "FILE")] + pub pprof_file: Option, +} + +fn parse_config(config_path: &PathBuf) -> error::Result<(StorageConfig, MitoConfig)> { + let cfg_str = std::fs::read_to_string(config_path).map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("failed to read config {}: {e}", config_path.display()), + } + .build() + })?; + + let store_cfg: StorageConfigWrapper = toml::from_str(&cfg_str).map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("failed to parse config {}: {e}", config_path.display()), + } + .build() + })?; + + let storage_config = store_cfg.storage; + let mito_engine_config = store_cfg + .region_engine + .into_iter() + .filter_map(|c| { + if let RegionEngineConfig::Mito(mito) = c { + Some(mito) + } else { + None + } + }) + .next() + .with_context(|| error::IllegalConfigSnafu { + msg: format!("Engine config not found in {:?}", config_path), + })?; + Ok((storage_config, mito_engine_config)) +} + +impl ObjbenchCommand { + pub async fn run(&self) -> error::Result<()> { + if self.verbose { + common_telemetry::init_default_ut_logging(); + } + + println!("{}", "Starting objbench with config:".cyan().bold()); + + // Build object store from config + let (store_cfg, mut mito_engine_config) = parse_config(&self.config)?; + + let object_store = build_object_store(&store_cfg).await?; + println!("{} Object store initialized", "✓".green()); + + // Prepare source identifiers + let components = parse_file_dir_components(&self.source)?; + println!( + "{} Source path parsed: {}, components: {:?}", + "✓".green(), + self.source, + components + ); + + // Load parquet metadata to extract RegionMetadata and file stats + println!("{}", "Loading parquet 
metadata...".yellow()); + let file_size = object_store + .stat(&self.source) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("stat failed: {e}"), + } + .build() + })? + .content_length(); + let parquet_meta = load_parquet_metadata(object_store.clone(), &self.source, file_size) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("read parquet metadata failed: {e}"), + } + .build() + })?; + + let region_meta = extract_region_metadata(&self.source, &parquet_meta)?; + let num_rows = parquet_meta.file_metadata().num_rows() as u64; + let num_row_groups = parquet_meta.num_row_groups() as u64; + + println!( + "{} Metadata loaded - rows: {}, size: {} bytes", + "✓".green(), + num_rows, + file_size + ); + + // Build a FileHandle for the source file + let file_meta = FileMeta { + region_id: region_meta.region_id, + file_id: components.file_id, + time_range: Default::default(), + level: 0, + file_size, + available_indexes: Default::default(), + indexes: Default::default(), + index_file_size: 0, + index_file_id: None, + num_rows, + num_row_groups, + sequence: None, + partition_expr: None, + num_series: 0, + }; + let src_handle = FileHandle::new(file_meta, new_noop_file_purger()); + + // Build the reader for a single file via ParquetReaderBuilder + let table_dir = components.table_dir(); + let (src_access_layer, cache_manager) = build_access_layer_simple( + &components, + object_store.clone(), + &mut mito_engine_config, + &store_cfg.data_home, + ) + .await?; + let reader_build_start = Instant::now(); + + let reader = ParquetReaderBuilder::new( + table_dir, + components.path_type, + src_handle.clone(), + object_store.clone(), + ) + .expected_metadata(Some(region_meta.clone())) + .build() + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("build reader failed: {e:?}"), + } + .build() + })?; + + let reader_build_elapsed = reader_build_start.elapsed(); + let total_rows = reader.parquet_metadata().file_metadata().num_rows(); + println!("{} Reader built in {:?}", "✓".green(), reader_build_elapsed); + + // Build write request + let fulltext_index_config = FulltextIndexConfig { + create_on_compaction: Mode::Disable, + ..Default::default() + }; + + let write_req = SstWriteRequest { + op_type: OperationType::Flush, + metadata: region_meta, + source: Either::Left(Source::Reader(Box::new(reader))), + cache_manager, + storage: None, + max_sequence: None, + index_options: Default::default(), + index_config: mito_engine_config.index.clone(), + inverted_index_config: MitoConfig::default().inverted_index, + fulltext_index_config, + bloom_filter_index_config: MitoConfig::default().bloom_filter_index, + }; + + // Write SST + println!("{}", "Writing SST...".yellow()); + + // Start profiling if pprof_file is specified + #[cfg(unix)] + let profiler_guard = if self.pprof_file.is_some() { + println!("{} Starting profiling...", "⚡".yellow()); + Some( + pprof::ProfilerGuardBuilder::default() + .frequency(99) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build() + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to start profiler: {e}"), + } + .build() + })?, + ) + } else { + None + }; + + #[cfg(not(unix))] + if self.pprof_file.is_some() { + eprintln!( + "{}: Profiling is not supported on this platform", + "Warning".yellow() + ); + } + + let write_start = Instant::now(); + let mut metrics = Metrics::new(WriteType::Flush); + let infos = src_access_layer + .write_sst(write_req, &WriteOptions::default(), &mut metrics) + .await + .map_err(|e| { 
+ error::IllegalConfigSnafu { + msg: format!("write_sst failed: {e:?}"), + } + .build() + })?; + + let write_elapsed = write_start.elapsed(); + + // Stop profiling and generate flamegraph if enabled + #[cfg(unix)] + if let (Some(guard), Some(pprof_file)) = (profiler_guard, &self.pprof_file) { + println!("{} Generating flamegraph...", "🔥".yellow()); + match guard.report().build() { + Ok(report) => { + let mut flamegraph_data = Vec::new(); + if let Err(e) = report.flamegraph(&mut flamegraph_data) { + println!("{}: Failed to generate flamegraph: {}", "Error".red(), e); + } else if let Err(e) = std::fs::write(pprof_file, flamegraph_data) { + println!( + "{}: Failed to write flamegraph to {}: {}", + "Error".red(), + pprof_file.display(), + e + ); + } else { + println!( + "{} Flamegraph saved to {}", + "✓".green(), + pprof_file.display().to_string().cyan() + ); + } + } + Err(e) => { + println!("{}: Failed to generate pprof report: {}", "Error".red(), e); + } + } + } + assert_eq!(infos.len(), 1); + let dst_file_id = infos[0].file_id; + let dst_file_path = format!("{}/{}.parquet", components.region_dir(), dst_file_id); + let mut dst_index_path = None; + if infos[0].index_metadata.file_size > 0 { + dst_index_path = Some(format!( + "{}/index/{}.puffin", + components.region_dir(), + dst_file_id + )); + } + + // Report results with ANSI colors + println!("\n{} {}", "Write complete!".green().bold(), "✓".green()); + println!(" {}: {}", "Destination file".bold(), dst_file_path.cyan()); + println!(" {}: {}", "Rows".bold(), total_rows.to_string().cyan()); + println!( + " {}: {}", + "File size".bold(), + format!("{} bytes", file_size).cyan() + ); + println!( + " {}: {:?}", + "Reader build time".bold(), + reader_build_elapsed + ); + println!(" {}: {:?}", "Total time".bold(), write_elapsed); + + // Print metrics in a formatted way + println!(" {}: {:?}", "Metrics".bold(), metrics,); + + // Print infos + println!(" {}: {:?}", "Index".bold(), infos[0].index_metadata); + + // Cleanup + println!("\n{}", "Cleaning up...".yellow()); + object_store.delete(&dst_file_path).await.map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to delete dest file {}: {}", dst_file_path, e), + } + .build() + })?; + println!("{} Temporary file {} deleted", "✓".green(), dst_file_path); + + if let Some(index_path) = dst_index_path { + object_store.delete(&index_path).await.map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to delete dest index file {}: {}", index_path, e), + } + .build() + })?; + println!( + "{} Temporary index file {} deleted", + "✓".green(), + index_path + ); + } + + println!("\n{}", "Benchmark completed successfully!".green().bold()); + Ok(()) + } +} + +#[derive(Debug)] +struct FileDirComponents { + catalog: String, + schema: String, + table_id: u32, + region_sequence: u32, + path_type: PathType, + file_id: FileId, +} + +impl FileDirComponents { + fn table_dir(&self) -> String { + format!("data/{}/{}/{}", self.catalog, self.schema, self.table_id) + } + + fn region_dir(&self) -> String { + let region_name = region_name(self.table_id, self.region_sequence); + match self.path_type { + PathType::Bare => { + format!( + "data/{}/{}/{}/{}", + self.catalog, self.schema, self.table_id, region_name + ) + } + PathType::Data => { + format!( + "data/{}/{}/{}/{}/data", + self.catalog, self.schema, self.table_id, region_name + ) + } + PathType::Metadata => { + format!( + "data/{}/{}/{}/{}/metadata", + self.catalog, self.schema, self.table_id, region_name + ) + } + } + } +} + +fn 
parse_file_dir_components(path: &str) -> error::Result { + // Define the regex pattern to match all three path styles + let pattern = + r"^data/([^/]+)/([^/]+)/([^/]+)/([^/]+)_([^/]+)(?:/data|/metadata)?/(.+).parquet$"; + + // Compile the regex + let re = Regex::new(pattern).expect("Invalid regex pattern"); + + // Determine the path type + let path_type = if path.contains("/data/") { + PathType::Data + } else if path.contains("/metadata/") { + PathType::Metadata + } else { + PathType::Bare + }; + + // Try to match the path + let components = (|| { + let captures = re.captures(path)?; + if captures.len() != 7 { + return None; + } + let mut components = FileDirComponents { + catalog: "".to_string(), + schema: "".to_string(), + table_id: 0, + region_sequence: 0, + path_type, + file_id: FileId::default(), + }; + // Extract the components + components.catalog = captures.get(1)?.as_str().to_string(); + components.schema = captures.get(2)?.as_str().to_string(); + components.table_id = captures[3].parse().ok()?; + components.region_sequence = captures[5].parse().ok()?; + let file_id_str = &captures[6]; + components.file_id = FileId::parse_str(file_id_str).ok()?; + Some(components) + })(); + components.context(error::IllegalConfigSnafu { + msg: format!("Expect valid source file path, got: {}", path), + }) +} + +fn extract_region_metadata( + file_path: &str, + meta: &parquet::file::metadata::ParquetMetaData, +) -> error::Result { + use parquet::format::KeyValue; + let kvs: Option<&Vec> = meta.file_metadata().key_value_metadata(); + let Some(kvs) = kvs else { + return Err(error::IllegalConfigSnafu { + msg: format!("{file_path}: missing parquet key_value metadata"), + } + .build()); + }; + let json = kvs + .iter() + .find(|kv| kv.key == PARQUET_METADATA_KEY) + .and_then(|kv| kv.value.as_ref()) + .ok_or_else(|| { + error::IllegalConfigSnafu { + msg: format!("{file_path}: key {PARQUET_METADATA_KEY} not found or empty"), + } + .build() + })?; + let region: RegionMetadata = RegionMetadata::from_json(json).map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("invalid region metadata json: {e}"), + } + .build() + })?; + Ok(Arc::new(region)) +} + +async fn build_object_store(sc: &StorageConfig) -> error::Result { + store::new_object_store(sc.store.clone(), &sc.data_home) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build object store: {e:?}"), + } + .build() + }) +} + +async fn build_access_layer_simple( + components: &FileDirComponents, + object_store: ObjectStore, + config: &mut MitoConfig, + data_home: &str, +) -> error::Result<(AccessLayerRef, CacheManagerRef)> { + let _ = config.index.sanitize(data_home, &config.inverted_index); + let puffin_manager = PuffinManagerFactory::new( + &config.index.aux_path, + config.index.staging_size.as_bytes(), + Some(config.index.write_buffer_size.as_bytes() as _), + config.index.staging_ttl, + ) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build access layer: {e:?}"), + } + .build() + })?; + + let intermediate_manager = IntermediateManager::init_fs(&config.index.aux_path) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build IntermediateManager: {e:?}"), + } + .build() + })? 
+ .with_buffer_size(Some(config.index.write_buffer_size.as_bytes() as _)); + + let cache_manager = + build_cache_manager(config, puffin_manager.clone(), intermediate_manager.clone()).await?; + let layer = AccessLayer::new( + components.table_dir(), + components.path_type, + object_store, + puffin_manager, + intermediate_manager, + ); + Ok((Arc::new(layer), cache_manager)) +} + +async fn build_cache_manager( + config: &MitoConfig, + puffin_manager: PuffinManagerFactory, + intermediate_manager: IntermediateManager, +) -> error::Result { + let write_cache = write_cache_from_config(config, puffin_manager, intermediate_manager) + .await + .map_err(|e| { + error::IllegalConfigSnafu { + msg: format!("Failed to build write cache: {e:?}"), + } + .build() + })?; + let cache_manager = Arc::new( + CacheManager::builder() + .sst_meta_cache_size(config.sst_meta_cache_size.as_bytes()) + .vector_cache_size(config.vector_cache_size.as_bytes()) + .page_cache_size(config.page_cache_size.as_bytes()) + .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) + .index_metadata_size(config.index.metadata_cache_size.as_bytes()) + .index_content_size(config.index.content_cache_size.as_bytes()) + .index_content_page_size(config.index.content_cache_page_size.as_bytes()) + .index_result_cache_size(config.index.result_cache_size.as_bytes()) + .puffin_metadata_size(config.index.metadata_cache_size.as_bytes()) + .write_cache(write_cache) + .build(), + ); + Ok(cache_manager) +} + +fn new_noop_file_purger() -> FilePurgerRef { + #[derive(Debug)] + struct Noop; + impl FilePurger for Noop { + fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {} + } + Arc::new(Noop) +} + +async fn load_parquet_metadata( + object_store: ObjectStore, + path: &str, + file_size: u64, +) -> Result> { + use parquet::file::FOOTER_SIZE; + use parquet::file::metadata::ParquetMetaDataReader; + let actual_size = if file_size == 0 { + object_store.stat(path).await?.content_length() + } else { + file_size + }; + if actual_size < FOOTER_SIZE as u64 { + return Err("file too small".into()); + } + let prefetch: u64 = 64 * 1024; + let start = actual_size.saturating_sub(prefetch); + let buffer = object_store + .read_with(path) + .range(start..actual_size) + .await? + .to_vec(); + let buffer_len = buffer.len(); + let mut footer = [0; 8]; + footer.copy_from_slice(&buffer[buffer_len - FOOTER_SIZE..]); + let footer = ParquetMetaDataReader::decode_footer_tail(&footer)?; + let metadata_len = footer.metadata_length() as u64; + if actual_size - (FOOTER_SIZE as u64) < metadata_len { + return Err("invalid footer/metadata length".into()); + } + if (metadata_len as usize) <= buffer_len - FOOTER_SIZE { + let metadata_start = buffer_len - metadata_len as usize - FOOTER_SIZE; + let meta = ParquetMetaDataReader::decode_metadata( + &buffer[metadata_start..buffer_len - FOOTER_SIZE], + )?; + Ok(meta) + } else { + let metadata_start = actual_size - metadata_len - FOOTER_SIZE as u64; + let data = object_store + .read_with(path) + .range(metadata_start..(actual_size - FOOTER_SIZE as u64)) + .await? 
+ .to_vec(); + let meta = ParquetMetaDataReader::decode_metadata(&data)?; + Ok(meta) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + use std::str::FromStr; + + use common_base::readable_size::ReadableSize; + use store_api::region_request::PathType; + + use crate::datanode::objbench::{parse_config, parse_file_dir_components}; + + #[test] + fn test_parse_dir() { + let meta_path = "data/greptime/public/1024/1024_0000000000/metadata/00020380-009c-426d-953e-b4e34c15af34.parquet"; + let c = parse_file_dir_components(meta_path).unwrap(); + assert_eq!( + c.file_id.to_string(), + "00020380-009c-426d-953e-b4e34c15af34" + ); + assert_eq!(c.catalog, "greptime"); + assert_eq!(c.schema, "public"); + assert_eq!(c.table_id, 1024); + assert_eq!(c.region_sequence, 0); + assert_eq!(c.path_type, PathType::Metadata); + + let c = parse_file_dir_components( + "data/greptime/public/1024/1024_0000000000/data/00020380-009c-426d-953e-b4e34c15af34.parquet", + ).unwrap(); + assert_eq!( + c.file_id.to_string(), + "00020380-009c-426d-953e-b4e34c15af34" + ); + assert_eq!(c.catalog, "greptime"); + assert_eq!(c.schema, "public"); + assert_eq!(c.table_id, 1024); + assert_eq!(c.region_sequence, 0); + assert_eq!(c.path_type, PathType::Data); + + let c = parse_file_dir_components( + "data/greptime/public/1024/1024_0000000000/00020380-009c-426d-953e-b4e34c15af34.parquet", + ).unwrap(); + assert_eq!( + c.file_id.to_string(), + "00020380-009c-426d-953e-b4e34c15af34" + ); + assert_eq!(c.catalog, "greptime"); + assert_eq!(c.schema, "public"); + assert_eq!(c.table_id, 1024); + assert_eq!(c.region_sequence, 0); + assert_eq!(c.path_type, PathType::Bare); + } + + #[test] + fn test_parse_config() { + let path = "../../config/datanode.example.toml"; + let (storage, engine) = parse_config(&PathBuf::from_str(path).unwrap()).unwrap(); + assert_eq!(storage.data_home, "./greptimedb_data"); + assert_eq!(engine.index.staging_size, ReadableSize::gb(2)); + } +} diff --git a/src/cmd/src/error.rs b/src/cmd/src/error.rs index 0b77dec341..fbff2d42e0 100644 --- a/src/cmd/src/error.rs +++ b/src/cmd/src/error.rs @@ -99,13 +99,6 @@ pub enum Error { source: flow::Error, }, - #[snafu(display("Servers error"))] - Servers { - #[snafu(implicit)] - location: Location, - source: servers::error::Error, - }, - #[snafu(display("Failed to start frontend"))] StartFrontend { #[snafu(implicit)] @@ -336,7 +329,6 @@ impl ErrorExt for Error { Error::ShutdownFrontend { source, .. } => source.status_code(), Error::StartMetaServer { source, .. } => source.status_code(), Error::ShutdownMetaServer { source, .. } => source.status_code(), - Error::Servers { source, .. } => source.status_code(), Error::BuildMetaServer { source, .. } => source.status_code(), Error::UnsupportedSelectorType { source, .. } => source.status_code(), Error::BuildCli { source, .. } => source.status_code(), diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index 500e9bfa89..6cefdb0f79 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::fmt::Debug; use std::path::Path; use std::sync::Arc; use std::time::Duration; @@ -30,6 +31,7 @@ use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHand use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; use common_meta::key::TableMetadataManager; use common_meta::key::flow::FlowMetadataManager; +use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; @@ -38,12 +40,14 @@ use flow::{ get_flow_auth_options, }; use meta_client::{MetaClientOptions, MetaClientType}; +use plugins::flownode::context::GrpcConfigureContext; +use servers::configurator::GrpcBuilderConfiguratorRef; use snafu::{OptionExt, ResultExt, ensure}; use tracing_appender::non_blocking::WorkerGuard; use crate::error::{ BuildCacheRegistrySnafu, InitMetadataSnafu, LoadLayeredConfigSnafu, MetaClientInitSnafu, - MissingConfigSnafu, Result, ShutdownFlownodeSnafu, StartFlownodeSnafu, + MissingConfigSnafu, OtherSnafu, Result, ShutdownFlownodeSnafu, StartFlownodeSnafu, }; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_heap_profile}; @@ -54,33 +58,14 @@ type FlownodeOptions = GreptimeOptions; pub struct Instance { flownode: FlownodeInstance, - - // The components of flownode, which make it easier to expand based - // on the components. - #[cfg(feature = "enterprise")] - components: Components, - // Keep the logging guard to prevent the worker from being dropped. _guard: Vec, } -#[cfg(feature = "enterprise")] -pub struct Components { - pub catalog_manager: catalog::CatalogManagerRef, - pub fe_client: Arc, - pub kv_backend: common_meta::kv_backend::KvBackendRef, -} - impl Instance { - pub fn new( - flownode: FlownodeInstance, - #[cfg(feature = "enterprise")] components: Components, - guard: Vec, - ) -> Self { + pub fn new(flownode: FlownodeInstance, guard: Vec) -> Self { Self { flownode, - #[cfg(feature = "enterprise")] - components, _guard: guard, } } @@ -93,11 +78,6 @@ impl Instance { pub fn flownode_mut(&mut self) -> &mut FlownodeInstance { &mut self.flownode } - - #[cfg(feature = "enterprise")] - pub fn components(&self) -> &Components { - &self.components - } } #[async_trait::async_trait] @@ -372,11 +352,15 @@ impl StartCommand { Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())), ]); + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = flow::heartbeat::HeartbeatTask::new( &opts, meta_client.clone(), opts.heartbeat.clone(), Arc::new(executor), + Arc::new(resource_stat), ); let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone())); @@ -391,7 +375,7 @@ impl StartCommand { let frontend_client = Arc::new(frontend_client); let flownode_builder = FlownodeBuilder::new( opts.clone(), - plugins, + plugins.clone(), table_metadata_manager, catalog_manager.clone(), flow_metadata_manager, @@ -400,8 +384,29 @@ impl StartCommand { .with_heartbeat_task(heartbeat_task); let mut flownode = flownode_builder.build().await.context(StartFlownodeSnafu)?; + + let builder = + FlownodeServiceBuilder::grpc_server_builder(&opts, flownode.flownode_server()); + let builder = if let Some(configurator) = + plugins.get::>() + { + let context = GrpcConfigureContext { + kv_backend: cached_meta_backend.clone(), + fe_client: frontend_client.clone(), + flownode_id: 
member_id, + catalog_manager: catalog_manager.clone(), + }; + configurator + .configure(builder, context) + .await + .context(OtherSnafu)? + } else { + builder + }; + let grpc_server = builder.build(); + let services = FlownodeServiceBuilder::new(&opts) - .with_default_grpc_server(flownode.flownode_server()) + .with_grpc_server(grpc_server) .enable_http_service() .build() .context(StartFlownodeSnafu)?; @@ -425,16 +430,6 @@ impl StartCommand { .set_frontend_invoker(invoker) .await; - #[cfg(feature = "enterprise")] - let components = Components { - catalog_manager: catalog_manager.clone(), - fe_client: frontend_client, - kv_backend: cached_meta_backend, - }; - - #[cfg(not(feature = "enterprise"))] - return Ok(Instance::new(flownode, guard)); - #[cfg(feature = "enterprise")] - Ok(Instance::new(flownode, components, guard)) + Ok(Instance::new(flownode, guard)) } } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 4c72021a47..d74b3cee5c 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Debug; use std::path::Path; use std::sync::Arc; use std::time::Duration; @@ -19,17 +20,23 @@ use std::time::Duration; use async_trait::async_trait; use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; use catalog::information_extension::DistributedInformationExtension; -use catalog::kvbackend::{CachedKvBackendBuilder, KvBackendCatalogManagerBuilder, MetaKvBackend}; +use catalog::kvbackend::{ + CachedKvBackendBuilder, CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder, + MetaKvBackend, +}; use catalog::process_manager::ProcessManager; use clap::Parser; use client::client_manager::NodeClients; use common_base::Plugins; use common_config::{Configurable, DEFAULT_DATA_HOME}; +use common_error::ext::BoxedError; use common_grpc::channel_manager::ChannelConfig; use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; use common_meta::heartbeat::handler::HandlerGroupExecutor; use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler; use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; +use common_query::prelude::set_default_prefix; +use common_stat::ResourceStatImpl; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_time::timezone::set_default_timezone; @@ -39,14 +46,16 @@ use frontend::heartbeat::HeartbeatTask; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; use meta_client::{MetaClientOptions, MetaClientType}; +use plugins::frontend::context::{ + CatalogManagerConfigureContext, DistributedCatalogManagerConfigureContext, +}; use servers::addrs; -use servers::export_metrics::ExportMetricsTask; use servers::grpc::GrpcOptions; use servers::tls::{TlsMode, TlsOption}; use snafu::{OptionExt, ResultExt}; use tracing_appender::non_blocking::WorkerGuard; -use crate::error::{self, Result}; +use crate::error::{self, OtherSnafu, Result}; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_heap_profile}; @@ -174,6 +183,8 @@ pub struct StartCommand { #[clap(long)] tls_key_path: Option, #[clap(long)] + tls_watch: bool, + #[clap(long)] user_provider: Option, #[clap(long)] disable_dashboard: Option, @@ -227,6 +238,7 @@ impl StartCommand { self.tls_mode.clone(), 
self.tls_cert_path.clone(), self.tls_key_path.clone(), + self.tls_watch, ); if let Some(addr) = &self.http_addr { @@ -332,6 +344,9 @@ impl StartCommand { .context(error::StartFrontendSnafu)?; set_default_timezone(opts.default_timezone.as_deref()).context(error::InitTimezoneSnafu)?; + set_default_prefix(opts.default_column_prefix.as_deref()) + .map_err(BoxedError::new) + .context(error::BuildCliSnafu)?; let meta_client_options = opts .meta_client @@ -408,9 +423,18 @@ impl StartCommand { layered_cache_registry.clone(), ) .with_process_manager(process_manager.clone()); - #[cfg(feature = "enterprise")] - let builder = if let Some(factories) = plugins.get() { - builder.with_extra_information_table_factories(factories) + let builder = if let Some(configurator) = + plugins.get::>() + { + let ctx = DistributedCatalogManagerConfigureContext { + meta_client: meta_client.clone(), + }; + let ctx = CatalogManagerConfigureContext::Distributed(ctx); + + configurator + .configure(builder, ctx) + .await + .context(OtherSnafu)? } else { builder }; @@ -421,11 +445,15 @@ impl StartCommand { Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())), ]); + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = HeartbeatTask::new( &opts, meta_client.clone(), opts.heartbeat.clone(), Arc::new(executor), + Arc::new(resource_stat), ); let heartbeat_task = Some(heartbeat_task); @@ -445,9 +473,6 @@ impl StartCommand { .context(error::StartFrontendSnafu)?; let instance = Arc::new(instance); - let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) - .context(error::ServersSnafu)?; - let servers = Services::new(opts, instance.clone(), plugins) .build() .context(error::StartFrontendSnafu)?; @@ -456,7 +481,6 @@ impl StartCommand { instance, servers, heartbeat_task, - export_metrics_task, }; Ok(Instance::new(frontend, guard)) diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index 4f71775e74..ee67267de3 100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::fmt; +use std::fmt::{self, Debug}; use std::path::Path; use std::time::Duration; @@ -23,7 +23,7 @@ use common_config::Configurable; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; -use meta_srv::bootstrap::MetasrvInstance; +use meta_srv::bootstrap::{MetasrvInstance, metasrv_builder}; use meta_srv::metasrv::BackendImpl; use snafu::ResultExt; use tracing_appender::non_blocking::WorkerGuard; @@ -177,7 +177,7 @@ pub struct StartCommand { backend: Option, } -impl fmt::Debug for StartCommand { +impl Debug for StartCommand { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("StartCommand") .field("rpc_bind_addr", &self.rpc_bind_addr) @@ -341,7 +341,7 @@ impl StartCommand { .await .context(StartMetaServerSnafu)?; - let builder = meta_srv::bootstrap::metasrv_builder(&opts, plugins, None) + let builder = metasrv_builder(&opts, plugins, None) .await .context(error::BuildMetaServerSnafu)?; let metasrv = builder.build().await.context(error::BuildMetaServerSnafu)?; diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 58602d0a39..1ef16a830f 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Debug; use std::net::SocketAddr; use std::path::Path; use std::sync::Arc; @@ -20,7 +21,7 @@ use std::{fs, path}; use async_trait::async_trait; use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; use catalog::information_schema::InformationExtensionRef; -use catalog::kvbackend::KvBackendCatalogManagerBuilder; +use catalog::kvbackend::{CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder}; use catalog::process_manager::ProcessManager; use clap::Parser; use common_base::Plugins; @@ -31,7 +32,7 @@ use common_meta::cache::LayeredCacheRegistryBuilder; use common_meta::ddl::flow_meta::FlowMetadataAllocator; use common_meta::ddl::table_meta::TableMetadataAllocator; use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl}; -use common_meta::ddl_manager::DdlManager; +use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef}; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; @@ -41,6 +42,7 @@ use common_meta::region_registry::LeaderRegionRegistry; use common_meta::sequence::SequenceBuilder; use common_meta::wal_options_allocator::{WalOptionsAllocatorRef, build_wal_options_allocator}; use common_procedure::ProcedureManagerRef; +use common_query::prelude::set_default_prefix; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_time::timezone::set_default_timezone; @@ -56,14 +58,17 @@ use frontend::instance::StandaloneDatanodeManager; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ}; -use servers::export_metrics::ExportMetricsTask; +use plugins::frontend::context::{ + CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext, +}; +use plugins::standalone::context::DdlManagerConfigureContext; use servers::tls::{TlsMode, TlsOption}; use snafu::ResultExt; use standalone::StandaloneInformationExtension; use standalone::options::StandaloneOptions; use tracing_appender::non_blocking::WorkerGuard; 
-use crate::error::{Result, StartFlownodeSnafu}; +use crate::error::{OtherSnafu, Result, StartFlownodeSnafu}; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, error, log_versions, maybe_activate_heap_profile}; @@ -116,34 +121,15 @@ pub struct Instance { flownode: FlownodeInstance, procedure_manager: ProcedureManagerRef, wal_options_allocator: WalOptionsAllocatorRef, - - // The components of standalone, which make it easier to expand based - // on the components. - #[cfg(feature = "enterprise")] - components: Components, - // Keep the logging guard to prevent the worker from being dropped. _guard: Vec, } -#[cfg(feature = "enterprise")] -pub struct Components { - pub plugins: Plugins, - pub kv_backend: KvBackendRef, - pub frontend_client: Arc, - pub catalog_manager: catalog::CatalogManagerRef, -} - impl Instance { /// Find the socket addr of a server by its `name`. pub fn server_addr(&self, name: &str) -> Option { self.frontend.server_handlers().addr(name) } - - #[cfg(feature = "enterprise")] - pub fn components(&self) -> &Components { - &self.components - } } #[async_trait] @@ -227,6 +213,8 @@ pub struct StartCommand { #[clap(long)] tls_key_path: Option, #[clap(long)] + tls_watch: bool, + #[clap(long)] user_provider: Option, #[clap(long, default_value = "GREPTIMEDB_STANDALONE")] pub env_prefix: String, @@ -276,6 +264,7 @@ impl StartCommand { self.tls_mode.clone(), self.tls_cert_path.clone(), self.tls_key_path.clone(), + self.tls_watch, ); if let Some(addr) = &self.http_addr { @@ -355,6 +344,10 @@ impl StartCommand { let mut plugins = Plugins::new(); let plugin_opts = opts.plugins; let mut opts = opts.component; + set_default_prefix(opts.default_column_prefix.as_deref()) + .map_err(BoxedError::new) + .context(error::BuildCliSnafu)?; + opts.grpc.detect_server_addr(); let fe_opts = opts.frontend_options(); let dn_opts = opts.datanode_options(); @@ -408,6 +401,13 @@ impl StartCommand { plugins.insert::(information_extension.clone()); let process_manager = Arc::new(ProcessManager::new(opts.grpc.server_addr.clone(), None)); + + // for standalone not use grpc, but get a handler to frontend grpc client without + // actually make a connection + let (frontend_client, frontend_instance_handler) = + FrontendClient::from_empty_grpc_handler(opts.query.clone()); + let frontend_client = Arc::new(frontend_client); + let builder = KvBackendCatalogManagerBuilder::new( information_extension.clone(), kv_backend.clone(), @@ -415,9 +415,17 @@ impl StartCommand { ) .with_procedure_manager(procedure_manager.clone()) .with_process_manager(process_manager.clone()); - #[cfg(feature = "enterprise")] - let builder = if let Some(factories) = plugins.get() { - builder.with_extra_information_table_factories(factories) + let builder = if let Some(configurator) = + plugins.get::>() + { + let ctx = StandaloneCatalogManagerConfigureContext { + fe_client: frontend_client.clone(), + }; + let ctx = CatalogManagerConfigureContext::Standalone(ctx); + configurator + .configure(builder, ctx) + .await + .context(OtherSnafu)? 
} else { builder }; @@ -432,11 +440,6 @@ impl StartCommand { ..Default::default() }; - // for standalone not use grpc, but get a handler to frontend grpc client without - // actually make a connection - let (frontend_client, frontend_instance_handler) = - FrontendClient::from_empty_grpc_handler(opts.query.clone()); - let frontend_client = Arc::new(frontend_client); let flow_builder = FlownodeBuilder::new( flownode_options, plugins.clone(), @@ -507,11 +510,21 @@ impl StartCommand { let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager.clone(), true) .context(error::InitDdlManagerSnafu)?; - #[cfg(feature = "enterprise")] - let ddl_manager = { - let trigger_ddl_manager: Option = - plugins.get(); - ddl_manager.with_trigger_ddl_manager(trigger_ddl_manager) + + let ddl_manager = if let Some(configurator) = + plugins.get::>() + { + let ctx = DdlManagerConfigureContext { + kv_backend: kv_backend.clone(), + fe_client: frontend_client.clone(), + catalog_manager: catalog_manager.clone(), + }; + configurator + .configure(ddl_manager, ctx) + .await + .context(OtherSnafu)? + } else { + ddl_manager }; let procedure_executor = Arc::new(LocalProcedureExecutor::new( @@ -557,9 +570,6 @@ impl StartCommand { .context(StartFlownodeSnafu)?; flow_streaming_engine.set_frontend_invoker(invoker).await; - let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) - .context(error::ServersSnafu)?; - let servers = Services::new(opts, fe_instance.clone(), plugins.clone()) .build() .context(error::StartFrontendSnafu)?; @@ -568,15 +578,6 @@ impl StartCommand { instance: fe_instance, servers, heartbeat_task: None, - export_metrics_task, - }; - - #[cfg(feature = "enterprise")] - let components = Components { - plugins, - kv_backend, - frontend_client, - catalog_manager, }; Ok(Instance { @@ -585,8 +586,6 @@ impl StartCommand { flownode, procedure_manager, wal_options_allocator, - #[cfg(feature = "enterprise")] - components, _guard: guard, }) } @@ -764,6 +763,9 @@ mod tests { fn test_load_log_options_from_cli() { let cmd = StartCommand { user_provider: Some("static_user_provider:cmd:test=test".to_string()), + mysql_addr: Some("127.0.0.1:4002".to_string()), + postgres_addr: Some("127.0.0.1:4003".to_string()), + tls_watch: true, ..Default::default() }; @@ -780,6 +782,8 @@ mod tests { assert_eq!("./greptimedb_data/test/logs", opts.logging.dir); assert_eq!("debug", opts.logging.level.unwrap()); + assert!(opts.mysql.tls.watch); + assert!(opts.postgres.tls.watch); } #[test] diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs index b92cf9631d..56a6caa71b 100644 --- a/src/cmd/tests/load_config_test.rs +++ b/src/cmd/tests/load_config_test.rs @@ -15,6 +15,7 @@ use std::time::Duration; use cmd::options::GreptimeOptions; +use common_base::memory_limit::MemoryLimit; use common_config::{Configurable, DEFAULT_DATA_HOME}; use common_options::datanode::{ClientOptions, DatanodeClientOptions}; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, DEFAULT_OTLP_HTTP_ENDPOINT, LoggingOptions}; @@ -30,7 +31,6 @@ use meta_srv::selector::SelectorType; use metric_engine::config::EngineConfig as MetricEngineConfig; use mito2::config::MitoConfig; use query::options::QueryOptions; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::http::HttpOptions; use servers::tls::{TlsMode, TlsOption}; @@ -48,6 +48,7 @@ fn test_load_datanode_example_config() { let expected = GreptimeOptions:: { component: DatanodeOptions { node_id: Some(42), + 
default_column_prefix: Some("greptime".to_string()), meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), @@ -73,14 +74,19 @@ fn test_load_datanode_example_config() { RegionEngineConfig::Mito(MitoConfig { auto_flush_interval: Duration::from_secs(3600), write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), + scan_memory_limit: MemoryLimit::Percentage(50), ..Default::default() }), RegionEngineConfig::File(FileEngineConfig {}), RegionEngineConfig::Metric(MetricEngineConfig { - experimental_sparse_primary_key_encoding: false, + sparse_primary_key_encoding: true, flush_metadata_region_interval: Duration::from_secs(30), }), ], + query: QueryOptions { + memory_pool_size: MemoryLimit::Percentage(50), + ..Default::default() + }, logging: LoggingOptions { level: Some("info".to_string()), dir: format!("{}/{}", DEFAULT_DATA_HOME, DEFAULT_LOGGING_DIR), @@ -88,11 +94,6 @@ fn test_load_datanode_example_config() { tracing_sample_ratio: Some(Default::default()), ..Default::default() }, - export_metrics: ExportMetricsOption { - self_import: None, - remote_write: Some(Default::default()), - ..Default::default() - }, grpc: GrpcOptions::default() .with_bind_addr("127.0.0.1:3001") .with_server_addr("127.0.0.1:3001"), @@ -113,6 +114,7 @@ fn test_load_frontend_example_config() { let expected = GreptimeOptions:: { component: FrontendOptions { default_timezone: Some("UTC".to_string()), + default_column_prefix: Some("greptime".to_string()), meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), @@ -138,11 +140,6 @@ fn test_load_frontend_example_config() { ..Default::default() }, }, - export_metrics: ExportMetricsOption { - self_import: None, - remote_write: Some(Default::default()), - ..Default::default() - }, grpc: GrpcOptions { bind_addr: "127.0.0.1:4001".to_string(), server_addr: "127.0.0.1:4001".to_string(), @@ -153,6 +150,10 @@ fn test_load_frontend_example_config() { cors_allowed_origins: vec!["https://example.com".to_string()], ..Default::default() }, + query: QueryOptions { + memory_pool_size: MemoryLimit::Percentage(50), + ..Default::default() + }, ..Default::default() }, ..Default::default() @@ -189,11 +190,6 @@ fn test_load_metasrv_example_config() { tcp_nodelay: true, }, }, - export_metrics: ExportMetricsOption { - self_import: None, - remote_write: Some(Default::default()), - ..Default::default() - }, backend_tls: Some(TlsOption { mode: TlsMode::Prefer, cert_path: String::new(), @@ -240,6 +236,7 @@ fn test_load_flownode_example_config() { query: QueryOptions { parallelism: 1, allow_query_fallback: false, + memory_pool_size: MemoryLimit::Percentage(50), }, meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], @@ -273,6 +270,7 @@ fn test_load_standalone_example_config() { let expected = GreptimeOptions:: { component: StandaloneOptions { default_timezone: Some("UTC".to_string()), + default_column_prefix: Some("greptime".to_string()), wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig { dir: Some(format!("{}/{}", DEFAULT_DATA_HOME, WAL_DIR)), sync_period: Some(Duration::from_secs(10)), @@ -283,11 +281,12 @@ fn test_load_standalone_example_config() { RegionEngineConfig::Mito(MitoConfig { auto_flush_interval: Duration::from_secs(3600), write_cache_ttl: Some(Duration::from_secs(60 * 60 * 8)), + scan_memory_limit: MemoryLimit::Percentage(50), ..Default::default() }), RegionEngineConfig::File(FileEngineConfig {}), 
RegionEngineConfig::Metric(MetricEngineConfig { - experimental_sparse_primary_key_encoding: false, + sparse_primary_key_encoding: true, flush_metadata_region_interval: Duration::from_secs(30), }), ], @@ -302,16 +301,14 @@ fn test_load_standalone_example_config() { tracing_sample_ratio: Some(Default::default()), ..Default::default() }, - export_metrics: ExportMetricsOption { - self_import: Some(Default::default()), - remote_write: Some(Default::default()), - ..Default::default() - }, http: HttpOptions { cors_allowed_origins: vec!["https://example.com".to_string()], ..Default::default() }, - + query: QueryOptions { + memory_pool_size: MemoryLimit::Percentage(50), + ..Default::default() + }, ..Default::default() }, ..Default::default() diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index ae2945b1f5..4a881990b4 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -18,9 +18,11 @@ bytes.workspace = true common-error.workspace = true common-macro.workspace = true futures.workspace = true +lazy_static.workspace = true paste.workspace = true pin-project.workspace = true rand.workspace = true +regex.workspace = true serde = { version = "1.0", features = ["derive"] } snafu.workspace = true tokio.workspace = true diff --git a/src/common/base/src/lib.rs b/src/common/base/src/lib.rs index cc5acdbf47..91ee9d3343 100644 --- a/src/common/base/src/lib.rs +++ b/src/common/base/src/lib.rs @@ -15,10 +15,12 @@ pub mod bit_vec; pub mod bytes; pub mod cancellation; +pub mod memory_limit; pub mod plugins; pub mod range_read; #[allow(clippy::all)] pub mod readable_size; +pub mod regex_pattern; pub mod secrets; pub mod serde; diff --git a/src/common/base/src/memory_limit.rs b/src/common/base/src/memory_limit.rs new file mode 100644 index 0000000000..7129a4a027 --- /dev/null +++ b/src/common/base/src/memory_limit.rs @@ -0,0 +1,265 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{self, Display}; +use std::str::FromStr; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::readable_size::ReadableSize; + +/// Memory limit configuration that supports both absolute size and percentage. +/// +/// Examples: +/// - Absolute size: "2GB", "4GiB", "512MB" +/// - Percentage: "50%", "75%" +/// - Unlimited: "unlimited", "0" +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum MemoryLimit { + /// Absolute memory size. + Size(ReadableSize), + /// Percentage of total system memory (0-100). + Percentage(u8), + /// No memory limit. + #[default] + Unlimited, +} + +impl MemoryLimit { + /// Resolve the memory limit to bytes based on total system memory. + /// Returns 0 if the limit is unlimited. + pub fn resolve(&self, total_memory_bytes: u64) -> u64 { + match self { + MemoryLimit::Size(size) => size.as_bytes(), + MemoryLimit::Percentage(pct) => total_memory_bytes * (*pct as u64) / 100, + MemoryLimit::Unlimited => 0, + } + } + + /// Returns true if this limit is unlimited. 
+ pub fn is_unlimited(&self) -> bool { + match self { + MemoryLimit::Size(size) => size.as_bytes() == 0, + MemoryLimit::Percentage(pct) => *pct == 0, + MemoryLimit::Unlimited => true, + } + } +} + +impl FromStr for MemoryLimit { + type Err = String; + + fn from_str(s: &str) -> Result { + let s = s.trim(); + + if s.eq_ignore_ascii_case("unlimited") { + return Ok(MemoryLimit::Unlimited); + } + + if let Some(pct_str) = s.strip_suffix('%') { + let pct = pct_str + .trim() + .parse::() + .map_err(|e| format!("invalid percentage value '{}': {}", pct_str, e))?; + + if pct > 100 { + return Err(format!("percentage must be between 0 and 100, got {}", pct)); + } + + if pct == 0 { + Ok(MemoryLimit::Unlimited) + } else { + Ok(MemoryLimit::Percentage(pct)) + } + } else { + let size = ReadableSize::from_str(s)?; + if size.as_bytes() == 0 { + Ok(MemoryLimit::Unlimited) + } else { + Ok(MemoryLimit::Size(size)) + } + } + } +} + +impl Display for MemoryLimit { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + MemoryLimit::Size(size) => write!(f, "{}", size), + MemoryLimit::Percentage(pct) => write!(f, "{}%", pct), + MemoryLimit::Unlimited => write!(f, "unlimited"), + } + } +} + +impl Serialize for MemoryLimit { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for MemoryLimit { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + MemoryLimit::from_str(&s).map_err(serde::de::Error::custom) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_absolute_size() { + assert_eq!( + "2GB".parse::().unwrap(), + MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024)) + ); + assert_eq!( + "512MB".parse::().unwrap(), + MemoryLimit::Size(ReadableSize(512 * 1024 * 1024)) + ); + assert_eq!("0".parse::().unwrap(), MemoryLimit::Unlimited); + } + + #[test] + fn test_parse_percentage() { + assert_eq!( + "50%".parse::().unwrap(), + MemoryLimit::Percentage(50) + ); + assert_eq!( + "75%".parse::().unwrap(), + MemoryLimit::Percentage(75) + ); + assert_eq!("0%".parse::().unwrap(), MemoryLimit::Unlimited); + } + + #[test] + fn test_parse_invalid() { + assert!("150%".parse::().is_err()); + assert!("-10%".parse::().is_err()); + assert!("invalid".parse::().is_err()); + } + + #[test] + fn test_resolve() { + let total = 8 * 1024 * 1024 * 1024; // 8GB + + assert_eq!( + MemoryLimit::Size(ReadableSize(2 * 1024 * 1024 * 1024)).resolve(total), + 2 * 1024 * 1024 * 1024 + ); + assert_eq!( + MemoryLimit::Percentage(50).resolve(total), + 4 * 1024 * 1024 * 1024 + ); + assert_eq!(MemoryLimit::Unlimited.resolve(total), 0); + } + + #[test] + fn test_is_unlimited() { + assert!(MemoryLimit::Unlimited.is_unlimited()); + assert!(!MemoryLimit::Size(ReadableSize(1024)).is_unlimited()); + assert!(!MemoryLimit::Percentage(50).is_unlimited()); + assert!(!MemoryLimit::Percentage(1).is_unlimited()); + + // Defensive: these states shouldn't exist via public API, but check anyway + assert!(MemoryLimit::Size(ReadableSize(0)).is_unlimited()); + assert!(MemoryLimit::Percentage(0).is_unlimited()); + } + + #[test] + fn test_parse_100_percent() { + assert_eq!( + "100%".parse::().unwrap(), + MemoryLimit::Percentage(100) + ); + } + + #[test] + fn test_display_percentage() { + assert_eq!(MemoryLimit::Percentage(20).to_string(), "20%"); + assert_eq!(MemoryLimit::Percentage(50).to_string(), "50%"); + 
assert_eq!(MemoryLimit::Percentage(100).to_string(), "100%"); + } + + #[test] + fn test_parse_unlimited() { + assert_eq!( + "unlimited".parse::().unwrap(), + MemoryLimit::Unlimited + ); + assert_eq!( + "UNLIMITED".parse::().unwrap(), + MemoryLimit::Unlimited + ); + assert_eq!( + "Unlimited".parse::().unwrap(), + MemoryLimit::Unlimited + ); + } + + #[test] + fn test_display_unlimited() { + assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited"); + } + + #[test] + fn test_parse_display_roundtrip() { + let cases = vec![ + "50%", + "100%", + "1%", + "2GB", + "512MB", + "unlimited", + "UNLIMITED", + "0", // normalized to unlimited + "0%", // normalized to unlimited + ]; + + for input in cases { + let parsed = input.parse::().unwrap(); + let displayed = parsed.to_string(); + let reparsed = displayed.parse::().unwrap(); + assert_eq!( + parsed, reparsed, + "round-trip failed: '{}' -> '{}' -> '{:?}'", + input, displayed, reparsed + ); + } + } + + #[test] + fn test_zero_normalization() { + // All forms of zero should normalize to Unlimited + assert_eq!("0".parse::().unwrap(), MemoryLimit::Unlimited); + assert_eq!("0%".parse::().unwrap(), MemoryLimit::Unlimited); + assert_eq!("0B".parse::().unwrap(), MemoryLimit::Unlimited); + assert_eq!( + "0KB".parse::().unwrap(), + MemoryLimit::Unlimited + ); + + // Unlimited always displays as "unlimited" + assert_eq!(MemoryLimit::Unlimited.to_string(), "unlimited"); + } +} diff --git a/src/common/base/src/plugins.rs b/src/common/base/src/plugins.rs index bbab003c69..aa1a9d1287 100644 --- a/src/common/base/src/plugins.rs +++ b/src/common/base/src/plugins.rs @@ -32,7 +32,12 @@ impl Plugins { pub fn insert(&self, value: T) { let last = self.write().insert(value); - assert!(last.is_none(), "each type of plugins must be one and only"); + if last.is_some() { + panic!( + "Plugin of type {} already exists", + std::any::type_name::() + ); + } } pub fn get(&self) -> Option { @@ -140,7 +145,7 @@ mod tests { } #[test] - #[should_panic(expected = "each type of plugins must be one and only")] + #[should_panic(expected = "Plugin of type i32 already exists")] fn test_plugin_uniqueness() { let plugins = Plugins::new(); plugins.insert(1i32); diff --git a/src/common/base/src/regex_pattern.rs b/src/common/base/src/regex_pattern.rs new file mode 100644 index 0000000000..7ff46693ba --- /dev/null +++ b/src/common/base/src/regex_pattern.rs @@ -0,0 +1,22 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use lazy_static::lazy_static; +use regex::Regex; + +pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*"; + +lazy_static! 
{ + pub static ref NAME_PATTERN_REG: Regex = Regex::new(&format!("^{NAME_PATTERN}$")).unwrap(); +} diff --git a/src/common/catalog/Cargo.toml b/src/common/catalog/Cargo.toml index 051675fe93..357f180a33 100644 --- a/src/common/catalog/Cargo.toml +++ b/src/common/catalog/Cargo.toml @@ -8,5 +8,6 @@ license.workspace = true workspace = true [dependencies] +const_format.workspace = true [dev-dependencies] diff --git a/src/common/catalog/build.rs b/src/common/catalog/build.rs new file mode 100644 index 0000000000..311d6eef3f --- /dev/null +++ b/src/common/catalog/build.rs @@ -0,0 +1,27 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +fn main() { + // Set DEFAULT_CATALOG_NAME from environment variable or use default value + let default_catalog_name = + std::env::var("DEFAULT_CATALOG_NAME").unwrap_or_else(|_| "greptime".to_string()); + + println!( + "cargo:rustc-env=DEFAULT_CATALOG_NAME={}", + default_catalog_name + ); + + // Rerun build script if the environment variable changes + println!("cargo:rerun-if-env-changed=DEFAULT_CATALOG_NAME"); +} diff --git a/src/common/catalog/src/consts.rs b/src/common/catalog/src/consts.rs index 2bc5db9824..1cd5db8a0c 100644 --- a/src/common/catalog/src/consts.rs +++ b/src/common/catalog/src/consts.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +use const_format::concatcp; + pub const SYSTEM_CATALOG_NAME: &str = "system"; pub const INFORMATION_SCHEMA_NAME: &str = "information_schema"; pub const PG_CATALOG_NAME: &str = "pg_catalog"; pub const SYSTEM_CATALOG_TABLE_NAME: &str = "system_catalog"; -pub const DEFAULT_CATALOG_NAME: &str = "greptime"; +pub const DEFAULT_CATALOG_NAME: &str = env!("DEFAULT_CATALOG_NAME"); pub const DEFAULT_SCHEMA_NAME: &str = "public"; -pub const DEFAULT_PRIVATE_SCHEMA_NAME: &str = "greptime_private"; +pub const DEFAULT_PRIVATE_SCHEMA_NAME: &str = concatcp!(DEFAULT_CATALOG_NAME, "_private"); /// Reserves [0,MIN_USER_FLOW_ID) for internal usage. /// User defined table id starts from this value. 
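With the build script above, the default catalog name becomes a compile-time setting and the
private schema name follows it via `concatcp!`. A minimal sketch of the effect (the `acme`
value is purely hypothetical; without the environment variable the default stays `greptime`):

// Built with `DEFAULT_CATALOG_NAME=acme` in the environment, build.rs re-emits the value
// through `cargo:rustc-env`, so the constants evaluate to:
assert_eq!(DEFAULT_CATALOG_NAME, "acme");
assert_eq!(DEFAULT_PRIVATE_SCHEMA_NAME, "acme_private");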
@@ -84,8 +86,6 @@ pub const INFORMATION_SCHEMA_TRIGGERS_TABLE_ID: u32 = 24; pub const INFORMATION_SCHEMA_GLOBAL_STATUS_TABLE_ID: u32 = 25; /// id for information_schema.SESSION_STATUS pub const INFORMATION_SCHEMA_SESSION_STATUS_TABLE_ID: u32 = 26; -/// id for information_schema.RUNTIME_METRICS -pub const INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID: u32 = 27; /// id for information_schema.PARTITIONS pub const INFORMATION_SCHEMA_PARTITIONS_TABLE_ID: u32 = 28; /// id for information_schema.REGION_PEERS @@ -110,6 +110,8 @@ pub const INFORMATION_SCHEMA_SSTS_MANIFEST_TABLE_ID: u32 = 37; pub const INFORMATION_SCHEMA_SSTS_STORAGE_TABLE_ID: u32 = 38; /// id for information_schema.ssts_index_meta pub const INFORMATION_SCHEMA_SSTS_INDEX_META_TABLE_ID: u32 = 39; +/// id for information_schema.alerts +pub const INFORMATION_SCHEMA_ALERTS_TABLE_ID: u32 = 40; // ----- End of information_schema tables ----- @@ -150,4 +152,9 @@ pub const TRACE_TABLE_NAME_SESSION_KEY: &str = "trace_table_name"; pub fn trace_services_table_name(trace_table_name: &str) -> String { format!("{}_services", trace_table_name) } + +/// Generate the trace operations table name from the trace table name by adding `_operations` suffix. +pub fn trace_operations_table_name(trace_table_name: &str) -> String { + format!("{}_operations", trace_table_name) +} // ---- End of special table and fields ---- diff --git a/src/common/config/Cargo.toml b/src/common/config/Cargo.toml index 1d2b21602f..b45c03a6c3 100644 --- a/src/common/config/Cargo.toml +++ b/src/common/config/Cargo.toml @@ -11,7 +11,6 @@ workspace = true common-base.workspace = true common-error.workspace = true common-macro.workspace = true -common-stat.workspace = true config.workspace = true humantime-serde.workspace = true object-store.workspace = true diff --git a/src/common/config/src/lib.rs b/src/common/config/src/lib.rs index b806924217..cc25ebce16 100644 --- a/src/common/config/src/lib.rs +++ b/src/common/config/src/lib.rs @@ -14,7 +14,6 @@ pub mod config; pub mod error; -pub mod utils; use std::time::Duration; diff --git a/src/common/config/src/utils.rs b/src/common/config/src/utils.rs deleted file mode 100644 index 1bc986b77e..0000000000 --- a/src/common/config/src/utils.rs +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_base::readable_size::ReadableSize; -use common_stat::{get_total_cpu_millicores, get_total_memory_readable}; - -/// `ResourceSpec` holds the static resource specifications of a node, -/// such as CPU cores and memory capacity. These values are fixed -/// at startup and do not change dynamically during runtime. 
-#[derive(Debug, Clone, Copy)] -pub struct ResourceSpec { - pub cpus: i64, - pub memory: Option, -} - -impl Default for ResourceSpec { - fn default() -> Self { - Self { - cpus: get_total_cpu_millicores(), - memory: get_total_memory_readable(), - } - } -} diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index 303d05ceb1..964f41736c 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -36,7 +36,7 @@ object_store_opendal.workspace = true orc-rust = { version = "0.6.3", default-features = false, features = ["async"] } parquet.workspace = true paste.workspace = true -regex = "1.7" +regex.workspace = true serde.workspace = true snafu.workspace = true strum.workspace = true diff --git a/src/common/datasource/src/buffered_writer.rs b/src/common/datasource/src/buffered_writer.rs index e1571b0187..953715b223 100644 --- a/src/common/datasource/src/buffered_writer.rs +++ b/src/common/datasource/src/buffered_writer.rs @@ -12,28 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::future::Future; - use arrow::record_batch::RecordBatch; use async_trait::async_trait; use datafusion::parquet::format::FileMetaData; -use snafu::{OptionExt, ResultExt}; -use tokio::io::{AsyncWrite, AsyncWriteExt}; -use crate::error::{self, Result}; -use crate::share_buffer::SharedBuffer; - -pub struct LazyBufferedWriter { - path: String, - writer_factory: F, - writer: Option, - /// None stands for [`LazyBufferedWriter`] closed. - encoder: Option, - buffer: SharedBuffer, - rows_written: usize, - bytes_written: u64, - threshold: usize, -} +use crate::error::Result; pub trait DfRecordBatchEncoder { fn write(&mut self, batch: &RecordBatch) -> Result<()>; @@ -43,126 +26,3 @@ pub trait DfRecordBatchEncoder { pub trait ArrowWriterCloser { async fn close(mut self) -> Result; } - -impl< - T: AsyncWrite + Send + Unpin, - U: DfRecordBatchEncoder + ArrowWriterCloser, - F: Fn(String) -> Fut, - Fut: Future>, -> LazyBufferedWriter -{ - /// Closes `LazyBufferedWriter` and optionally flushes all data to underlying storage - /// if any row's been written. - pub async fn close_with_arrow_writer(mut self) -> Result<(FileMetaData, u64)> { - let encoder = self - .encoder - .take() - .context(error::BufferedWriterClosedSnafu)?; - let metadata = encoder.close().await?; - - // It's important to shut down! flushes all pending writes - self.close_inner_writer().await?; - Ok((metadata, self.bytes_written)) - } -} - -impl< - T: AsyncWrite + Send + Unpin, - U: DfRecordBatchEncoder, - F: Fn(String) -> Fut, - Fut: Future>, -> LazyBufferedWriter -{ - /// Closes the writer and flushes the buffer data. - pub async fn close_inner_writer(&mut self) -> Result<()> { - // Use `rows_written` to keep a track of if any rows have been written. - // If no row's been written, then we can simply close the underlying - // writer without flush so that no file will be actually created. 
- if self.rows_written != 0 { - self.bytes_written += self.try_flush(true).await?; - } - - if let Some(writer) = &mut self.writer { - writer.shutdown().await.context(error::AsyncWriteSnafu)?; - } - Ok(()) - } - - pub fn new( - threshold: usize, - buffer: SharedBuffer, - encoder: U, - path: impl AsRef, - writer_factory: F, - ) -> Self { - Self { - path: path.as_ref().to_string(), - threshold, - encoder: Some(encoder), - buffer, - rows_written: 0, - bytes_written: 0, - writer_factory, - writer: None, - } - } - - pub async fn write(&mut self, batch: &RecordBatch) -> Result<()> { - let encoder = self - .encoder - .as_mut() - .context(error::BufferedWriterClosedSnafu)?; - encoder.write(batch)?; - self.rows_written += batch.num_rows(); - self.bytes_written += self.try_flush(false).await?; - Ok(()) - } - - async fn try_flush(&mut self, all: bool) -> Result { - let mut bytes_written: u64 = 0; - - // Once buffered data size reaches threshold, split the data in chunks (typically 4MB) - // and write to underlying storage. - while self.buffer.buffer.lock().unwrap().len() >= self.threshold { - let chunk = { - let mut buffer = self.buffer.buffer.lock().unwrap(); - buffer.split_to(self.threshold) - }; - let size = chunk.len(); - - self.maybe_init_writer() - .await? - .write_all(&chunk) - .await - .context(error::AsyncWriteSnafu)?; - - bytes_written += size as u64; - } - - if all { - bytes_written += self.try_flush_all().await?; - } - Ok(bytes_written) - } - - /// Only initiates underlying file writer when rows have been written. - async fn maybe_init_writer(&mut self) -> Result<&mut T> { - if let Some(ref mut writer) = self.writer { - Ok(writer) - } else { - let writer = (self.writer_factory)(self.path.clone()).await?; - Ok(self.writer.insert(writer)) - } - } - - async fn try_flush_all(&mut self) -> Result { - let remain = self.buffer.buffer.lock().unwrap().split(); - let size = remain.len(); - self.maybe_init_writer() - .await? - .write_all(&remain) - .await - .context(error::AsyncWriteSnafu)?; - Ok(size as u64) - } -} diff --git a/src/common/datasource/src/compressed_writer.rs b/src/common/datasource/src/compressed_writer.rs new file mode 100644 index 0000000000..afd2544f4c --- /dev/null +++ b/src/common/datasource/src/compressed_writer.rs @@ -0,0 +1,202 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use async_compression::tokio::write::{BzEncoder, GzipEncoder, XzEncoder, ZstdEncoder}; +use snafu::ResultExt; +use tokio::io::{AsyncWrite, AsyncWriteExt}; + +use crate::compression::CompressionType; +use crate::error::{self, Result}; + +/// A compressed writer that wraps an underlying async writer with compression. +/// +/// This writer supports multiple compression formats including GZIP, BZIP2, XZ, and ZSTD. +/// It provides transparent compression for any async writer implementation. 
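+///
+/// A minimal usage sketch (illustrative only, not part of this change): wrap any
+/// `AsyncWrite`, write through it, then call `shutdown` to flush the encoder.
+///
+/// ```ignore
+/// use tokio::io::AsyncWriteExt;
+///
+/// let file = tokio::fs::File::create("/tmp/out.csv.gz").await?;
+/// let mut writer = CompressedWriter::new(file, CompressionType::Gzip);
+/// writer.write_all(b"id,name\n1,Alice\n").await?;
+/// writer.shutdown().await?;
+/// ```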
+pub struct CompressedWriter {
+    inner: Box<dyn AsyncWrite + Unpin + Send>,
+    compression_type: CompressionType,
+}
+
+impl CompressedWriter {
+    /// Creates a new compressed writer with the specified compression type.
+    ///
+    /// # Arguments
+    ///
+    /// * `writer` - The underlying writer to wrap with compression
+    /// * `compression_type` - The type of compression to apply
+    pub fn new(
+        writer: impl AsyncWrite + Unpin + Send + 'static,
+        compression_type: CompressionType,
+    ) -> Self {
+        let inner: Box<dyn AsyncWrite + Unpin + Send> = match compression_type {
+            CompressionType::Gzip => Box::new(GzipEncoder::new(writer)),
+            CompressionType::Bzip2 => Box::new(BzEncoder::new(writer)),
+            CompressionType::Xz => Box::new(XzEncoder::new(writer)),
+            CompressionType::Zstd => Box::new(ZstdEncoder::new(writer)),
+            CompressionType::Uncompressed => Box::new(writer),
+        };
+
+        Self {
+            inner,
+            compression_type,
+        }
+    }
+
+    /// Returns the compression type used by this writer.
+    pub fn compression_type(&self) -> CompressionType {
+        self.compression_type
+    }
+
+    /// Flush the writer and shutdown compression
+    pub async fn shutdown(mut self) -> Result<()> {
+        self.inner
+            .shutdown()
+            .await
+            .context(error::AsyncWriteSnafu)?;
+        Ok(())
+    }
+}
+
+impl AsyncWrite for CompressedWriter {
+    fn poll_write(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        Pin::new(&mut self.inner).poll_write(cx, buf)
+    }
+
+    fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        Pin::new(&mut self.inner).poll_flush(cx)
+    }
+
+    fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        Pin::new(&mut self.inner).poll_shutdown(cx)
+    }
+}
+
+/// A trait for converting async writers into compressed writers.
+///
+/// This trait is automatically implemented for all types that implement [`AsyncWrite`].
+pub trait IntoCompressedWriter {
+    /// Converts this writer into a [`CompressedWriter`] with the specified compression type.
+ /// + /// # Arguments + /// + /// * `self` - The underlying writer to wrap with compression + /// * `compression_type` - The type of compression to apply + fn into_compressed_writer(self, compression_type: CompressionType) -> CompressedWriter + where + Self: AsyncWrite + Unpin + Send + 'static + Sized, + { + CompressedWriter::new(self, compression_type) + } +} + +impl IntoCompressedWriter for W {} + +#[cfg(test)] +mod tests { + use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex}; + + use super::*; + + #[tokio::test] + async fn test_compressed_writer_gzip() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Gzip); + let original = b"test data for gzip compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_bzip2() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Bzip2); + let original = b"test data for bzip2 compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_xz() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Xz); + let original = b"test data for xz compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_zstd() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Zstd); + let original = b"test data for zstd compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_uncompressed() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Uncompressed); + let original = b"test data for uncompressed"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // Uncompressed data should be the same as the original + assert_eq!(buffer, original); + } +} diff --git a/src/common/datasource/src/error.rs b/src/common/datasource/src/error.rs index cfaa5a19c0..a8aa08e55c 100644 --- a/src/common/datasource/src/error.rs +++ b/src/common/datasource/src/error.rs @@ -194,12 +194,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Buffered 
writer closed"))] - BufferedWriterClosed { - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to write parquet file, path: {}", path))] WriteParquet { path: String, @@ -208,6 +202,14 @@ pub enum Error { #[snafu(source)] error: parquet::errors::ParquetError, }, + + #[snafu(display("Failed to build file stream"))] + BuildFileStream { + #[snafu(implicit)] + location: Location, + #[snafu(source)] + error: datafusion::error::DataFusionError, + }, } pub type Result = std::result::Result; @@ -239,7 +241,7 @@ impl ErrorExt for Error { | ReadRecordBatch { .. } | WriteRecordBatch { .. } | EncodeRecordBatch { .. } - | BufferedWriterClosed { .. } + | BuildFileStream { .. } | OrcReader { .. } => StatusCode::Unexpected, } } diff --git a/src/common/datasource/src/file_format.rs b/src/common/datasource/src/file_format.rs index 7c4e8d6c88..614be170e8 100644 --- a/src/common/datasource/src/file_format.rs +++ b/src/common/datasource/src/file_format.rs @@ -30,12 +30,22 @@ use arrow::record_batch::RecordBatch; use arrow_schema::{ArrowError, Schema as ArrowSchema}; use async_trait::async_trait; use bytes::{Buf, Bytes}; -use datafusion::datasource::physical_plan::FileOpenFuture; +use common_recordbatch::DfSendableRecordBatchStream; +use datafusion::datasource::file_format::file_compression_type::FileCompressionType as DfCompressionType; +use datafusion::datasource::listing::PartitionedFile; +use datafusion::datasource::object_store::ObjectStoreUrl; +use datafusion::datasource::physical_plan::{ + FileGroup, FileOpenFuture, FileScanConfigBuilder, FileSource, FileStream, +}; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datatypes::arrow::datatypes::SchemaRef; use futures::{StreamExt, TryStreamExt}; use object_store::ObjectStore; +use object_store_opendal::OpendalStore; use snafu::ResultExt; +use tokio::io::AsyncWriteExt; use tokio_util::compat::FuturesAsyncWriteCompatExt; use self::csv::CsvFormat; @@ -43,7 +53,8 @@ use self::json::JsonFormat; use self::orc::OrcFormat; use self::parquet::ParquetFormat; use crate::DEFAULT_WRITE_BUFFER_SIZE; -use crate::buffered_writer::{DfRecordBatchEncoder, LazyBufferedWriter}; +use crate::buffered_writer::DfRecordBatchEncoder; +use crate::compressed_writer::{CompressedWriter, IntoCompressedWriter}; use crate::compression::CompressionType; use crate::error::{self, Result}; use crate::share_buffer::SharedBuffer; @@ -195,33 +206,128 @@ pub async fn infer_schemas( ArrowSchema::try_merge(schemas).context(error::MergeSchemaSnafu) } -pub async fn stream_to_file T>( +/// Writes data to a compressed writer if the data is not empty. +/// +/// Does nothing if `data` is empty; otherwise writes all data and returns any error. +async fn write_to_compressed_writer( + compressed_writer: &mut CompressedWriter, + data: &[u8], +) -> Result<()> { + if !data.is_empty() { + compressed_writer + .write_all(data) + .await + .context(error::AsyncWriteSnafu)?; + } + Ok(()) +} + +/// Streams [SendableRecordBatchStream] to a file with optional compression support. +/// Data is buffered and flushed according to the given `threshold`. +/// Ensures that writer resources are cleanly released and that an empty file is not +/// created if no rows are written. +/// +/// Returns the total number of rows successfully written. 
+pub async fn stream_to_file( mut stream: SendableRecordBatchStream, store: ObjectStore, path: &str, threshold: usize, concurrency: usize, - encoder_factory: U, -) -> Result { + compression_type: CompressionType, + encoder_factory: impl Fn(SharedBuffer) -> E, +) -> Result +where + E: DfRecordBatchEncoder, +{ + // Create the file writer with OpenDAL's built-in buffering + let writer = store + .writer_with(path) + .concurrent(concurrency) + .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize) + .await + .with_context(|_| error::WriteObjectSnafu { path })? + .into_futures_async_write() + .compat_write(); + + // Apply compression if needed + let mut compressed_writer = writer.into_compressed_writer(compression_type); + + // Create a buffer for the encoder let buffer = SharedBuffer::with_capacity(threshold); - let encoder = encoder_factory(buffer.clone()); - let mut writer = LazyBufferedWriter::new(threshold, buffer, encoder, path, |path| async { - store - .writer_with(&path) - .concurrent(concurrency) - .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize) - .await - .map(|v| v.into_futures_async_write().compat_write()) - .context(error::WriteObjectSnafu { path }) - }); + let mut encoder = encoder_factory(buffer.clone()); let mut rows = 0; + // Process each record batch while let Some(batch) = stream.next().await { let batch = batch.context(error::ReadRecordBatchSnafu)?; - writer.write(&batch).await?; + + // Write batch using the encoder + encoder.write(&batch)?; rows += batch.num_rows(); + + loop { + let chunk = { + let mut buffer_guard = buffer.buffer.lock().unwrap(); + if buffer_guard.len() < threshold { + break; + } + buffer_guard.split_to(threshold) + }; + write_to_compressed_writer(&mut compressed_writer, &chunk).await?; + } } - writer.close_inner_writer().await?; + + // If no row's been written, just simply close the underlying writer + // without flush so that no file will be actually created. + if rows != 0 { + // Final flush of any remaining data + let final_data = { + let mut buffer_guard = buffer.buffer.lock().unwrap(); + buffer_guard.split() + }; + write_to_compressed_writer(&mut compressed_writer, &final_data).await?; + } + + // Shutdown compression and close writer + compressed_writer.shutdown().await?; + Ok(rows) } + +/// Creates a [FileStream] for reading data from a file with optional column projection +/// and compression support. +/// +/// Returns [SendableRecordBatchStream]. 
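+///
+/// A reading-side sketch, mirroring how the CSV tests below use this function (the
+/// `schema`, `store` and file path are assumed to be known already):
+///
+/// ```ignore
+/// let csv_source = CsvSource::new(true, b',', b'"')
+///     .with_schema(schema.clone())
+///     .with_batch_size(8192);
+/// let stream = file_to_stream(
+///     &store,
+///     "output/data.csv.gz",
+///     schema,
+///     csv_source,
+///     None, // no projection
+///     CompressionType::Gzip,
+/// )
+/// .await?;
+/// let batches = stream.try_collect::<Vec<_>>().await?;
+/// ```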
+pub async fn file_to_stream( + store: &ObjectStore, + filename: &str, + file_schema: SchemaRef, + file_source: Arc, + projection: Option>, + compression_type: CompressionType, +) -> Result { + let df_compression: DfCompressionType = compression_type.into(); + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + file_schema, + file_source.clone(), + ) + .with_file_group(FileGroup::new(vec![PartitionedFile::new( + filename.to_string(), + 0, + )])) + .with_projection(projection) + .with_file_compression_type(df_compression) + .build(); + + let store = Arc::new(OpendalStore::new(store.clone())); + let file_opener = file_source + .with_projection(&config) + .create_file_opener(store, &config, 0); + let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new()) + .context(error::BuildFileStreamSnafu)?; + + Ok(Box::pin(stream)) +} diff --git a/src/common/datasource/src/file_format/csv.rs b/src/common/datasource/src/file_format/csv.rs index efffce8d12..392f3ff49d 100644 --- a/src/common/datasource/src/file_format/csv.rs +++ b/src/common/datasource/src/file_format/csv.rs @@ -157,19 +157,27 @@ pub async fn stream_to_csv( concurrency: usize, format: &CsvFormat, ) -> Result { - stream_to_file(stream, store, path, threshold, concurrency, |buffer| { - let mut builder = WriterBuilder::new(); - if let Some(timestamp_format) = &format.timestamp_format { - builder = builder.with_timestamp_format(timestamp_format.to_owned()) - } - if let Some(date_format) = &format.date_format { - builder = builder.with_date_format(date_format.to_owned()) - } - if let Some(time_format) = &format.time_format { - builder = builder.with_time_format(time_format.to_owned()) - } - builder.build(buffer) - }) + stream_to_file( + stream, + store, + path, + threshold, + concurrency, + format.compression_type, + |buffer| { + let mut builder = WriterBuilder::new(); + if let Some(timestamp_format) = &format.timestamp_format { + builder = builder.with_timestamp_format(timestamp_format.to_owned()) + } + if let Some(date_format) = &format.date_format { + builder = builder.with_date_format(date_format.to_owned()) + } + if let Some(time_format) = &format.time_format { + builder = builder.with_time_format(time_format.to_owned()) + } + builder.build(buffer) + }, + ) .await } @@ -181,13 +189,21 @@ impl DfRecordBatchEncoder for csv::Writer { #[cfg(test)] mod tests { + use std::sync::Arc; + use common_recordbatch::adapter::DfRecordBatchStreamAdapter; + use common_recordbatch::{RecordBatch, RecordBatches}; use common_test_util::find_workspace_path; + use datafusion::datasource::physical_plan::{CsvSource, FileSource}; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; + use datatypes::vectors::{Float64Vector, StringVector, UInt32Vector, VectorRef}; + use futures::TryStreamExt; use super::*; use crate::file_format::{ FORMAT_COMPRESSION_TYPE, FORMAT_DELIMITER, FORMAT_HAS_HEADER, - FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, + FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream, }; use crate::test_util::{format_schema, test_store}; @@ -297,4 +313,166 @@ mod tests { } ); } + + #[tokio::test] + async fn test_compressed_csv() { + // Create test data + let column_schemas = vec![ + ColumnSchema::new("id", ConcreteDataType::uint32_datatype(), false), + ColumnSchema::new("name", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("value", ConcreteDataType::float64_datatype(), false), + ]; + let schema = Arc::new(Schema::new(column_schemas)); + + 
// Create multiple record batches with different data + let batch1_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3])), + Arc::new(StringVector::from(vec!["Alice", "Bob", "Charlie"])), + Arc::new(Float64Vector::from_slice(vec![10.5, 20.3, 30.7])), + ]; + let batch1 = RecordBatch::new(schema.clone(), batch1_columns).unwrap(); + + let batch2_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![4, 5, 6])), + Arc::new(StringVector::from(vec!["David", "Eva", "Frank"])), + Arc::new(Float64Vector::from_slice(vec![40.1, 50.2, 60.3])), + ]; + let batch2 = RecordBatch::new(schema.clone(), batch2_columns).unwrap(); + + let batch3_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![7, 8, 9])), + Arc::new(StringVector::from(vec!["Grace", "Henry", "Ivy"])), + Arc::new(Float64Vector::from_slice(vec![70.4, 80.5, 90.6])), + ]; + let batch3 = RecordBatch::new(schema.clone(), batch3_columns).unwrap(); + + // Combine all batches into a RecordBatches collection + let recordbatches = RecordBatches::try_new(schema, vec![batch1, batch2, batch3]).unwrap(); + + // Test with different compression types + let compression_types = vec![ + CompressionType::Gzip, + CompressionType::Bzip2, + CompressionType::Xz, + CompressionType::Zstd, + ]; + + // Create a temporary file path + let temp_dir = common_test_util::temp_dir::create_temp_dir("test_compressed_csv"); + for compression_type in compression_types { + let format = CsvFormat { + compression_type, + ..CsvFormat::default() + }; + + // Use correct format without Debug formatter + let compressed_file_name = + format!("test_compressed_csv.{}", compression_type.file_extension()); + let compressed_file_path = temp_dir.path().join(&compressed_file_name); + let compressed_file_path_str = compressed_file_path.to_str().unwrap(); + + // Create a simple file store for testing + let store = test_store("/"); + + // Export CSV with compression + let rows = stream_to_csv( + Box::pin(DfRecordBatchStreamAdapter::new(recordbatches.as_stream())), + store, + compressed_file_path_str, + 1024, + 1, + &format, + ) + .await + .unwrap(); + + assert_eq!(rows, 9); + + // Verify compressed file was created and has content + assert!(compressed_file_path.exists()); + let file_size = std::fs::metadata(&compressed_file_path).unwrap().len(); + assert!(file_size > 0); + + // Verify the file is actually compressed + let file_content = std::fs::read(&compressed_file_path).unwrap(); + // Compressed files should not start with CSV header + // They should have compression magic bytes + match compression_type { + CompressionType::Gzip => { + // Gzip magic bytes: 0x1f 0x8b + assert_eq!(file_content[0], 0x1f, "Gzip file should start with 0x1f"); + assert_eq!( + file_content[1], 0x8b, + "Gzip file should have 0x8b as second byte" + ); + } + CompressionType::Bzip2 => { + // Bzip2 magic bytes: 'BZ' + assert_eq!(file_content[0], b'B', "Bzip2 file should start with 'B'"); + assert_eq!( + file_content[1], b'Z', + "Bzip2 file should have 'Z' as second byte" + ); + } + CompressionType::Xz => { + // XZ magic bytes: 0xFD '7zXZ' + assert_eq!(file_content[0], 0xFD, "XZ file should start with 0xFD"); + } + CompressionType::Zstd => { + // Zstd magic bytes: 0x28 0xB5 0x2F 0xFD + assert_eq!(file_content[0], 0x28, "Zstd file should start with 0x28"); + assert_eq!( + file_content[1], 0xB5, + "Zstd file should have 0xB5 as second byte" + ); + } + _ => {} + } + + // Verify the compressed file can be decompressed and content matches original data + let store = test_store("/"); + let schema = 
Arc::new( + CsvFormat { + compression_type, + ..Default::default() + } + .infer_schema(&store, compressed_file_path_str) + .await + .unwrap(), + ); + let csv_source = CsvSource::new(true, b',', b'"') + .with_schema(schema.clone()) + .with_batch_size(8192); + + let stream = file_to_stream( + &store, + compressed_file_path_str, + schema.clone(), + csv_source.clone(), + None, + compression_type, + ) + .await + .unwrap(); + + let batches = stream.try_collect::>().await.unwrap(); + let pretty_print = arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string(); + let expected = r#"+----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 1 | Alice | 10.5 | +| 2 | Bob | 20.3 | +| 3 | Charlie | 30.7 | +| 4 | David | 40.1 | +| 5 | Eva | 50.2 | +| 6 | Frank | 60.3 | +| 7 | Grace | 70.4 | +| 8 | Henry | 80.5 | +| 9 | Ivy | 90.6 | ++----+---------+-------+"#; + assert_eq!(expected, pretty_print); + } + } } diff --git a/src/common/datasource/src/file_format/json.rs b/src/common/datasource/src/file_format/json.rs index c234eec846..cafcd71372 100644 --- a/src/common/datasource/src/file_format/json.rs +++ b/src/common/datasource/src/file_format/json.rs @@ -115,10 +115,17 @@ pub async fn stream_to_json( path: &str, threshold: usize, concurrency: usize, + format: &JsonFormat, ) -> Result { - stream_to_file(stream, store, path, threshold, concurrency, |buffer| { - json::LineDelimitedWriter::new(buffer) - }) + stream_to_file( + stream, + store, + path, + threshold, + concurrency, + format.compression_type, + json::LineDelimitedWriter::new, + ) .await } @@ -130,10 +137,21 @@ impl DfRecordBatchEncoder for json::Writer { #[cfg(test)] mod tests { + use std::sync::Arc; + + use common_recordbatch::adapter::DfRecordBatchStreamAdapter; + use common_recordbatch::{RecordBatch, RecordBatches}; use common_test_util::find_workspace_path; + use datafusion::datasource::physical_plan::{FileSource, JsonSource}; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; + use datatypes::vectors::{Float64Vector, StringVector, UInt32Vector, VectorRef}; + use futures::TryStreamExt; use super::*; - use crate::file_format::{FORMAT_COMPRESSION_TYPE, FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat}; + use crate::file_format::{ + FORMAT_COMPRESSION_TYPE, FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream, + }; use crate::test_util::{format_schema, test_store}; fn test_data_root() -> String { @@ -203,4 +221,165 @@ mod tests { } ); } + + #[tokio::test] + async fn test_compressed_json() { + // Create test data + let column_schemas = vec![ + ColumnSchema::new("id", ConcreteDataType::uint32_datatype(), false), + ColumnSchema::new("name", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("value", ConcreteDataType::float64_datatype(), false), + ]; + let schema = Arc::new(Schema::new(column_schemas)); + + // Create multiple record batches with different data + let batch1_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3])), + Arc::new(StringVector::from(vec!["Alice", "Bob", "Charlie"])), + Arc::new(Float64Vector::from_slice(vec![10.5, 20.3, 30.7])), + ]; + let batch1 = RecordBatch::new(schema.clone(), batch1_columns).unwrap(); + + let batch2_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![4, 5, 6])), + Arc::new(StringVector::from(vec!["David", "Eva", "Frank"])), + Arc::new(Float64Vector::from_slice(vec![40.1, 50.2, 60.3])), + ]; + let batch2 = RecordBatch::new(schema.clone(), batch2_columns).unwrap(); + + let 
batch3_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![7, 8, 9])), + Arc::new(StringVector::from(vec!["Grace", "Henry", "Ivy"])), + Arc::new(Float64Vector::from_slice(vec![70.4, 80.5, 90.6])), + ]; + let batch3 = RecordBatch::new(schema.clone(), batch3_columns).unwrap(); + + // Combine all batches into a RecordBatches collection + let recordbatches = RecordBatches::try_new(schema, vec![batch1, batch2, batch3]).unwrap(); + + // Test with different compression types + let compression_types = vec![ + CompressionType::Gzip, + CompressionType::Bzip2, + CompressionType::Xz, + CompressionType::Zstd, + ]; + + // Create a temporary file path + let temp_dir = common_test_util::temp_dir::create_temp_dir("test_compressed_json"); + for compression_type in compression_types { + let format = JsonFormat { + compression_type, + ..JsonFormat::default() + }; + + let compressed_file_name = + format!("test_compressed_json.{}", compression_type.file_extension()); + let compressed_file_path = temp_dir.path().join(&compressed_file_name); + let compressed_file_path_str = compressed_file_path.to_str().unwrap(); + + // Create a simple file store for testing + let store = test_store("/"); + + // Export JSON with compression + let rows = stream_to_json( + Box::pin(DfRecordBatchStreamAdapter::new(recordbatches.as_stream())), + store, + compressed_file_path_str, + 1024, + 1, + &format, + ) + .await + .unwrap(); + + assert_eq!(rows, 9); + + // Verify compressed file was created and has content + assert!(compressed_file_path.exists()); + let file_size = std::fs::metadata(&compressed_file_path).unwrap().len(); + assert!(file_size > 0); + + // Verify the file is actually compressed + let file_content = std::fs::read(&compressed_file_path).unwrap(); + // Compressed files should not start with '{' (JSON character) + // They should have compression magic bytes + match compression_type { + CompressionType::Gzip => { + // Gzip magic bytes: 0x1f 0x8b + assert_eq!(file_content[0], 0x1f, "Gzip file should start with 0x1f"); + assert_eq!( + file_content[1], 0x8b, + "Gzip file should have 0x8b as second byte" + ); + } + CompressionType::Bzip2 => { + // Bzip2 magic bytes: 'BZ' + assert_eq!(file_content[0], b'B', "Bzip2 file should start with 'B'"); + assert_eq!( + file_content[1], b'Z', + "Bzip2 file should have 'Z' as second byte" + ); + } + CompressionType::Xz => { + // XZ magic bytes: 0xFD '7zXZ' + assert_eq!(file_content[0], 0xFD, "XZ file should start with 0xFD"); + } + CompressionType::Zstd => { + // Zstd magic bytes: 0x28 0xB5 0x2F 0xFD + assert_eq!(file_content[0], 0x28, "Zstd file should start with 0x28"); + assert_eq!( + file_content[1], 0xB5, + "Zstd file should have 0xB5 as second byte" + ); + } + _ => {} + } + + // Verify the compressed file can be decompressed and content matches original data + let store = test_store("/"); + let schema = Arc::new( + JsonFormat { + compression_type, + ..Default::default() + } + .infer_schema(&store, compressed_file_path_str) + .await + .unwrap(), + ); + let json_source = JsonSource::new() + .with_schema(schema.clone()) + .with_batch_size(8192); + + let stream = file_to_stream( + &store, + compressed_file_path_str, + schema.clone(), + json_source.clone(), + None, + compression_type, + ) + .await + .unwrap(); + + let batches = stream.try_collect::>().await.unwrap(); + let pretty_print = arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string(); + let expected = r#"+----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 1 | Alice | 
10.5 | +| 2 | Bob | 20.3 | +| 3 | Charlie | 30.7 | +| 4 | David | 40.1 | +| 5 | Eva | 50.2 | +| 6 | Frank | 60.3 | +| 7 | Grace | 70.4 | +| 8 | Henry | 80.5 | +| 9 | Ivy | 90.6 | ++----+---------+-------+"#; + assert_eq!(expected, pretty_print); + } + } } diff --git a/src/common/datasource/src/lib.rs b/src/common/datasource/src/lib.rs index 72e94c7f36..91663ce22c 100644 --- a/src/common/datasource/src/lib.rs +++ b/src/common/datasource/src/lib.rs @@ -16,6 +16,7 @@ #![feature(type_alias_impl_trait)] pub mod buffered_writer; +pub mod compressed_writer; pub mod compression; pub mod error; pub mod file_format; diff --git a/src/common/datasource/src/test_util.rs b/src/common/datasource/src/test_util.rs index f3f813be34..244df3b7a5 100644 --- a/src/common/datasource/src/test_util.rs +++ b/src/common/datasource/src/test_util.rs @@ -28,7 +28,7 @@ use object_store::ObjectStore; use object_store::services::Fs; use crate::file_format::csv::{CsvFormat, stream_to_csv}; -use crate::file_format::json::stream_to_json; +use crate::file_format::json::{JsonFormat, stream_to_json}; use crate::test_util; pub const TEST_BATCH_SIZE: usize = 100; @@ -122,13 +122,16 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi let output_path = format!("{}/{}", dir.path().display(), "output"); + let json_format = JsonFormat::default(); + assert!( stream_to_json( Box::pin(stream), tmp_store.clone(), &output_path, threshold(size), - 8 + 8, + &json_format, ) .await .is_ok() diff --git a/src/common/error/src/lib.rs b/src/common/error/src/lib.rs index 0052d70cf3..18e6a0c9ae 100644 --- a/src/common/error/src/lib.rs +++ b/src/common/error/src/lib.rs @@ -45,3 +45,19 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap { header.insert(GREPTIME_DB_HEADER_ERROR_MSG, msg); header } + +/// Returns the external root cause of the source error (exclude the current error). +pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> { + // There are some divergence about the behavior of the `sources()` API + // in https://github.com/rust-lang/rust/issues/58520 + // So this function iterates the sources manually. + let mut root = err.source(); + while let Some(r) = root { + if let Some(s) = r.source() { + root = Some(s); + } else { + break; + } + } + root +} diff --git a/src/common/event-recorder/src/recorder.rs b/src/common/event-recorder/src/recorder.rs index ddf0bcdae0..ace7702991 100644 --- a/src/common/event-recorder/src/recorder.rs +++ b/src/common/event-recorder/src/recorder.rs @@ -97,9 +97,9 @@ pub trait Event: Send + Sync + Debug { vec![] } - /// Add the extra row to the event with the default row. - fn extra_row(&self) -> Result { - Ok(Row { values: vec![] }) + /// Add the extra rows to the event with the default row. + fn extra_rows(&self) -> Result> { + Ok(vec![Row { values: vec![] }]) } /// Returns the event as any type. 
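A sketch of how an event implementation can fan out into several inserted rows under the new
`extra_rows` contract (the `DemoEvent` type and its fields are hypothetical; only the
overridden method is shown):

impl Event for DemoEvent {
    fn extra_rows(&self) -> Result<Vec<Row>> {
        // Each returned Row becomes one insert row; build_row_inserts_request prepends the
        // shared event_type, payload and timestamp columns to every one of them.
        Ok(vec![
            Row {
                values: vec![ValueData::U64Value(self.first_value).into()],
            },
            Row {
                values: vec![ValueData::U64Value(self.second_value).into()],
            },
        ])
    }
}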
@@ -159,15 +159,17 @@ pub fn build_row_inserts_request(events: &[&Box]) -> Result = Vec::with_capacity(events.len()); for event in events { - let extra_row = event.extra_row()?; - let mut values = Vec::with_capacity(3 + extra_row.values.len()); - values.extend([ - ValueData::StringValue(event.event_type().to_string()).into(), - ValueData::BinaryValue(event.json_payload()?.into_bytes()).into(), - ValueData::TimestampNanosecondValue(event.timestamp().value()).into(), - ]); - values.extend(extra_row.values); - rows.push(Row { values }); + let extra_rows = event.extra_rows()?; + for extra_row in extra_rows { + let mut values = Vec::with_capacity(3 + extra_row.values.len()); + values.extend([ + ValueData::StringValue(event.event_type().to_string()).into(), + ValueData::BinaryValue(event.json_payload()?.into_bytes()).into(), + ValueData::TimestampNanosecondValue(event.timestamp().value()).into(), + ]); + values.extend(extra_row.values); + rows.push(Row { values }); + } } Ok(RowInsertRequests { diff --git a/src/common/frontend/src/selector.rs b/src/common/frontend/src/selector.rs index 4e6cc9566c..f2dc337cc2 100644 --- a/src/common/frontend/src/selector.rs +++ b/src/common/frontend/src/selector.rs @@ -104,7 +104,7 @@ impl MetaClientSelector { let cfg = ChannelConfig::new() .connect_timeout(Duration::from_secs(30)) .timeout(Duration::from_secs(30)); - let channel_manager = ChannelManager::with_config(cfg); + let channel_manager = ChannelManager::with_config(cfg, None); Self { meta_client, channel_manager, diff --git a/src/common/frontend/src/slow_query_event.rs b/src/common/frontend/src/slow_query_event.rs index 0e65443acb..32ca457da4 100644 --- a/src/common/frontend/src/slow_query_event.rs +++ b/src/common/frontend/src/slow_query_event.rs @@ -107,8 +107,8 @@ impl Event for SlowQueryEvent { ] } - fn extra_row(&self) -> Result { - Ok(Row { + fn extra_rows(&self) -> Result> { + Ok(vec![Row { values: vec![ ValueData::U64Value(self.cost).into(), ValueData::U64Value(self.threshold).into(), @@ -119,7 +119,7 @@ impl Event for SlowQueryEvent { ValueData::TimestampMillisecondValue(self.promql_start.unwrap_or(0)).into(), ValueData::TimestampMillisecondValue(self.promql_end.unwrap_or(0)).into(), ], - }) + }]) } fn json_payload(&self) -> Result { diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index d5b928e2a1..34de004e79 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -47,10 +47,12 @@ h3o = { version = "0.6", optional = true } hyperloglogplus = "0.4" jsonb.workspace = true memchr = "2.7" +mito-codec.workspace = true nalgebra.workspace = true num = "0.4" num-traits = "0.2" paste.workspace = true +regex.workspace = true s2 = { version = "0.0.12", optional = true } serde.workspace = true serde_json.workspace = true diff --git a/src/common/function/src/admin.rs b/src/common/function/src/admin.rs index 11270c3282..e7fd186b86 100644 --- a/src/common/function/src/admin.rs +++ b/src/common/function/src/admin.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+mod build_index_table; mod flush_compact_region; mod flush_compact_table; mod migrate_region; @@ -26,6 +27,7 @@ use reconcile_catalog::ReconcileCatalogFunction; use reconcile_database::ReconcileDatabaseFunction; use reconcile_table::ReconcileTableFunction; +use crate::admin::build_index_table::BuildIndexFunction; use crate::flush_flow::FlushFlowFunction; use crate::function_registry::FunctionRegistry; @@ -40,6 +42,7 @@ impl AdminFunction { registry.register(CompactRegionFunction::factory()); registry.register(FlushTableFunction::factory()); registry.register(CompactTableFunction::factory()); + registry.register(BuildIndexFunction::factory()); registry.register(FlushFlowFunction::factory()); registry.register(ReconcileCatalogFunction::factory()); registry.register(ReconcileDatabaseFunction::factory()); diff --git a/src/common/function/src/admin/build_index_table.rs b/src/common/function/src/admin/build_index_table.rs new file mode 100644 index 0000000000..155f198c79 --- /dev/null +++ b/src/common/function/src/admin/build_index_table.rs @@ -0,0 +1,80 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow::datatypes::DataType as ArrowDataType; +use common_error::ext::BoxedError; +use common_macro::admin_fn; +use common_query::error::{ + InvalidFuncArgsSnafu, MissingTableMutationHandlerSnafu, Result, TableMutationSnafu, + UnsupportedInputDataTypeSnafu, +}; +use datafusion_expr::{Signature, Volatility}; +use datatypes::prelude::*; +use session::context::QueryContextRef; +use session::table_name::table_name_to_full_name; +use snafu::{ResultExt, ensure}; +use table::requests::BuildIndexTableRequest; + +use crate::handlers::TableMutationHandlerRef; + +#[admin_fn( + name = BuildIndexFunction, + display_name = build_index, + sig_fn = build_index_signature, + ret = uint64 +)] +pub(crate) async fn build_index( + table_mutation_handler: &TableMutationHandlerRef, + query_ctx: &QueryContextRef, + params: &[ValueRef<'_>], +) -> Result { + ensure!( + params.len() == 1, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect 1, have: {}", + params.len() + ), + } + ); + + let ValueRef::String(table_name) = params[0] else { + return UnsupportedInputDataTypeSnafu { + function: "build_index", + datatypes: params.iter().map(|v| v.data_type()).collect::>(), + } + .fail(); + }; + + let (catalog_name, schema_name, table_name) = table_name_to_full_name(table_name, query_ctx) + .map_err(BoxedError::new) + .context(TableMutationSnafu)?; + + let affected_rows = table_mutation_handler + .build_index( + BuildIndexTableRequest { + catalog_name, + schema_name, + table_name, + }, + query_ctx.clone(), + ) + .await?; + + Ok(Value::from(affected_rows as u64)) +} + +fn build_index_signature() -> Signature { + Signature::uniform(1, vec![ArrowDataType::Utf8], Volatility::Immutable) +} diff --git a/src/common/function/src/aggrs/aggr_wrapper.rs b/src/common/function/src/aggrs/aggr_wrapper.rs index ed691296ee..54dc1ac78e 100644 --- 
a/src/common/function/src/aggrs/aggr_wrapper.rs +++ b/src/common/function/src/aggrs/aggr_wrapper.rs @@ -29,6 +29,8 @@ use arrow::array::StructArray; use arrow_schema::{FieldRef, Fields}; use common_telemetry::debug; use datafusion::functions_aggregate::all_default_aggregate_functions; +use datafusion::functions_aggregate::count::Count; +use datafusion::functions_aggregate::min_max::{Max, Min}; use datafusion::optimizer::AnalyzerRule; use datafusion::optimizer::analyzer::type_coercion::TypeCoercion; use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter; @@ -413,6 +415,51 @@ impl AggregateUDFImpl for StateWrapper { fn coerce_types(&self, arg_types: &[DataType]) -> datafusion_common::Result> { self.inner.coerce_types(arg_types) } + + fn value_from_stats( + &self, + statistics_args: &datafusion_expr::StatisticsArgs, + ) -> Option { + let inner = self.inner().inner().as_any(); + // only count/min/max need special handling here, for getting result from statistics + // the result of count/min/max is also the result of count_state so can return directly + let can_use_stat = inner.is::() || inner.is::() || inner.is::(); + if !can_use_stat { + return None; + } + + // fix return type by extract the first field's data type from the struct type + let state_type = if let DataType::Struct(fields) = &statistics_args.return_type { + if fields.is_empty() { + return None; + } + fields[0].data_type().clone() + } else { + return None; + }; + + let fixed_args = datafusion_expr::StatisticsArgs { + statistics: statistics_args.statistics, + return_type: &state_type, + is_distinct: statistics_args.is_distinct, + exprs: statistics_args.exprs, + }; + + let ret = self.inner().value_from_stats(&fixed_args)?; + + // wrap the result into struct scalar value + let fields = if let DataType::Struct(fields) = &statistics_args.return_type { + fields + } else { + return None; + }; + + let array = ret.to_array().ok()?; + + let struct_array = StructArray::new(fields.clone(), vec![array], None); + let ret = ScalarValue::Struct(Arc::new(struct_array)); + Some(ret) + } } /// The wrapper's input is the same as the original aggregate function's input, diff --git a/src/common/function/src/aggrs/vector.rs b/src/common/function/src/aggrs/vector.rs index 5af064d002..03489a51d4 100644 --- a/src/common/function/src/aggrs/vector.rs +++ b/src/common/function/src/aggrs/vector.rs @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +use crate::aggrs::vector::avg::VectorAvg; use crate::aggrs::vector::product::VectorProduct; use crate::aggrs::vector::sum::VectorSum; use crate::function_registry::FunctionRegistry; +mod avg; mod product; mod sum; @@ -25,5 +27,6 @@ impl VectorFunction { pub fn register(registry: &FunctionRegistry) { registry.register_aggr(VectorSum::uadf_impl()); registry.register_aggr(VectorProduct::uadf_impl()); + registry.register_aggr(VectorAvg::uadf_impl()); } } diff --git a/src/common/function/src/aggrs/vector/avg.rs b/src/common/function/src/aggrs/vector/avg.rs new file mode 100644 index 0000000000..ddf1823d28 --- /dev/null +++ b/src/common/function/src/aggrs/vector/avg.rs @@ -0,0 +1,270 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::borrow::Cow; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, AsArray, BinaryArray, LargeStringArray, StringArray}; +use arrow::compute::sum; +use arrow::datatypes::UInt64Type; +use arrow_schema::{DataType, Field}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::{ + Accumulator, AggregateUDF, Signature, SimpleAggregateUDF, TypeSignature, Volatility, +}; +use datafusion_functions_aggregate_common::accumulator::AccumulatorArgs; +use nalgebra::{Const, DVector, DVectorView, Dyn, OVector}; + +use crate::scalars::vector::impl_conv::{ + binlit_as_veclit, parse_veclit_from_strlit, veclit_to_binlit, +}; + +/// The accumulator for the `vec_avg` aggregate function. +#[derive(Debug, Default)] +pub struct VectorAvg { + sum: Option>, + count: u64, +} + +impl VectorAvg { + /// Create a new `AggregateUDF` for the `vec_avg` aggregate function. + pub fn uadf_impl() -> AggregateUDF { + let signature = Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Utf8]), + TypeSignature::Exact(vec![DataType::LargeUtf8]), + TypeSignature::Exact(vec![DataType::Binary]), + ], + Volatility::Immutable, + ); + let udaf = SimpleAggregateUDF::new_with_signature( + "vec_avg", + signature, + DataType::Binary, + Arc::new(Self::accumulator), + vec![ + Arc::new(Field::new("sum", DataType::Binary, true)), + Arc::new(Field::new("count", DataType::UInt64, true)), + ], + ); + AggregateUDF::from(udaf) + } + + fn accumulator(args: AccumulatorArgs) -> Result> { + if args.schema.fields().len() != 1 { + return Err(datafusion_common::DataFusionError::Internal(format!( + "expect creating `VEC_AVG` with only one input field, actual {}", + args.schema.fields().len() + ))); + } + + let t = args.schema.field(0).data_type(); + if !matches!(t, DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary) { + return Err(datafusion_common::DataFusionError::Internal(format!( + "unexpected input datatype {t} when creating `VEC_AVG`" + ))); + } + + Ok(Box::new(VectorAvg::default())) + } + + fn inner(&mut self, len: usize) -> &mut OVector { + self.sum + .get_or_insert_with(|| OVector::zeros_generic(Dyn(len), Const::<1>)) + } + + fn update(&mut self, values: &[ArrayRef], is_update: bool) -> Result<()> { + if values.is_empty() { + return Ok(()); + }; + + let vectors = match values[0].data_type() { + DataType::Utf8 => { + let arr: &StringArray = values[0].as_string(); + arr.iter() + .filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into))) + .map(|x| x.map(Cow::Owned)) + .collect::>>()? + } + DataType::LargeUtf8 => { + let arr: &LargeStringArray = values[0].as_string(); + arr.iter() + .filter_map(|x| x.map(|s| parse_veclit_from_strlit(s).map_err(Into::into))) + .map(|x: Result>| x.map(Cow::Owned)) + .collect::>>()? + } + DataType::Binary => { + let arr: &BinaryArray = values[0].as_binary(); + arr.iter() + .filter_map(|x| x.map(|b| binlit_as_veclit(b).map_err(Into::into))) + .collect::>>()? 
+ } + _ => { + return Err(datafusion_common::DataFusionError::NotImplemented(format!( + "unsupported data type {} for `VEC_AVG`", + values[0].data_type() + ))); + } + }; + + if vectors.is_empty() { + return Ok(()); + } + + let len = if is_update { + vectors.len() as u64 + } else { + sum(values[1].as_primitive::()).unwrap_or_default() + }; + + let dims = vectors[0].len(); + let mut sum = DVector::zeros(dims); + for v in vectors { + if v.len() != dims { + return Err(datafusion_common::DataFusionError::Execution( + "vectors length not match: VEC_AVG".to_string(), + )); + } + let v_view = DVectorView::from_slice(&v, dims); + sum += &v_view; + } + + *self.inner(dims) += sum; + self.count += len; + + Ok(()) + } +} + +impl Accumulator for VectorAvg { + fn state(&mut self) -> Result> { + let vector = match &self.sum { + None => ScalarValue::Binary(None), + Some(sum) => ScalarValue::Binary(Some(veclit_to_binlit(sum.as_slice()))), + }; + Ok(vec![vector, ScalarValue::from(self.count)]) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + self.update(values, true) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + self.update(states, false) + } + + fn evaluate(&mut self) -> Result { + match &self.sum { + None => Ok(ScalarValue::Binary(None)), + Some(sum) => Ok(ScalarValue::Binary(Some(veclit_to_binlit( + (sum / self.count as f32).as_slice(), + )))), + } + } + + fn size(&self) -> usize { + size_of_val(self) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::array::StringArray; + use datatypes::scalars::ScalarVector; + use datatypes::vectors::{ConstantVector, StringVector, Vector}; + + use super::*; + + #[test] + fn test_update_batch() { + // test update empty batch, expect not updating anything + let mut vec_avg = VectorAvg::default(); + vec_avg.update_batch(&[]).unwrap(); + assert!(vec_avg.sum.is_none()); + assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap()); + + // test update one not-null value + let mut vec_avg = VectorAvg::default(); + let v: Vec = vec![Arc::new(StringArray::from(vec![ + Some("[1.0,2.0,3.0]".to_string()), + Some("[4.0,5.0,6.0]".to_string()), + ]))]; + vec_avg.update_batch(&v).unwrap(); + assert_eq!( + ScalarValue::Binary(Some(veclit_to_binlit(&[2.5, 3.5, 4.5]))), + vec_avg.evaluate().unwrap() + ); + + // test update one null value + let mut vec_avg = VectorAvg::default(); + let v: Vec = vec![Arc::new(StringArray::from(vec![Option::::None]))]; + vec_avg.update_batch(&v).unwrap(); + assert_eq!(ScalarValue::Binary(None), vec_avg.evaluate().unwrap()); + + // test update no null-value batch + let mut vec_avg = VectorAvg::default(); + let v: Vec = vec![Arc::new(StringArray::from(vec![ + Some("[1.0,2.0,3.0]".to_string()), + Some("[4.0,5.0,6.0]".to_string()), + Some("[7.0,8.0,9.0]".to_string()), + ]))]; + vec_avg.update_batch(&v).unwrap(); + assert_eq!( + ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))), + vec_avg.evaluate().unwrap() + ); + + // test update null-value batch + let mut vec_avg = VectorAvg::default(); + let v: Vec = vec![Arc::new(StringArray::from(vec![ + Some("[1.0,2.0,3.0]".to_string()), + None, + Some("[7.0,8.0,9.0]".to_string()), + ]))]; + vec_avg.update_batch(&v).unwrap(); + assert_eq!( + ScalarValue::Binary(Some(veclit_to_binlit(&[4.0, 5.0, 6.0]))), + vec_avg.evaluate().unwrap() + ); + + let mut vec_avg = VectorAvg::default(); + let v: Vec = vec![Arc::new(StringArray::from(vec![ + None, + Some("[4.0,5.0,6.0]".to_string()), + Some("[7.0,8.0,9.0]".to_string()), + ]))]; + 
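// NOTE (illustrative, not part of the patch): `vec_avg` keeps an element-wise running sum
// plus a row count as its state, and `evaluate` divides the sum by the count. A minimal
// sketch of the same arithmetic with nalgebra, using the two vectors from the first test above:
use nalgebra::DVector;
let sum = DVector::from_vec(vec![1.0f32, 2.0, 3.0]) + DVector::from_vec(vec![4.0f32, 5.0, 6.0]);
let avg = sum / 2.0f32; // two input rows
assert_eq!(avg.as_slice(), &[2.5f32, 3.5, 4.5]); // same as the expected evaluate() result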
vec_avg.update_batch(&v).unwrap(); + assert_eq!( + ScalarValue::Binary(Some(veclit_to_binlit(&[5.5, 6.5, 7.5]))), + vec_avg.evaluate().unwrap() + ); + + // test update with constant vector + let mut vec_avg = VectorAvg::default(); + let v: Vec = vec![ + Arc::new(ConstantVector::new( + Arc::new(StringVector::from_vec(vec!["[1.0,2.0,3.0]".to_string()])), + 4, + )) + .to_arrow_array(), + ]; + vec_avg.update_batch(&v).unwrap(); + assert_eq!( + ScalarValue::Binary(Some(veclit_to_binlit(&[1.0, 2.0, 3.0]))), + vec_avg.evaluate().unwrap() + ); + } +} diff --git a/src/common/function/src/function_registry.rs b/src/common/function/src/function_registry.rs index 75bb71c63a..6208c9569c 100644 --- a/src/common/function/src/function_registry.rs +++ b/src/common/function/src/function_registry.rs @@ -34,6 +34,8 @@ use crate::scalars::json::JsonFunction; use crate::scalars::matches::MatchesFunction; use crate::scalars::matches_term::MatchesTermFunction; use crate::scalars::math::MathFunction; +use crate::scalars::primary_key::DecodePrimaryKeyFunction; +use crate::scalars::string::register_string_functions; use crate::scalars::timestamp::TimestampFunction; use crate::scalars::uddsketch_calc::UddSketchCalcFunction; use crate::scalars::vector::VectorFunction as VectorScalarFunction; @@ -142,6 +144,7 @@ pub static FUNCTION_REGISTRY: LazyLock> = LazyLock::new(|| ExpressionFunction::register(&function_registry); UddSketchCalcFunction::register(&function_registry); HllCalcFunction::register(&function_registry); + DecodePrimaryKeyFunction::register(&function_registry); // Full text search function MatchesFunction::register(&function_registry); @@ -154,6 +157,9 @@ pub static FUNCTION_REGISTRY: LazyLock> = LazyLock::new(|| // Json related functions JsonFunction::register(&function_registry); + // String related functions + register_string_functions(&function_registry); + // Vector related functions VectorScalarFunction::register(&function_registry); VectorAggrFunction::register(&function_registry); diff --git a/src/common/function/src/handlers.rs b/src/common/function/src/handlers.rs index e7ab67e312..0e6060e90c 100644 --- a/src/common/function/src/handlers.rs +++ b/src/common/function/src/handlers.rs @@ -25,7 +25,9 @@ use common_query::Output; use common_query::error::Result; use session::context::QueryContextRef; use store_api::storage::RegionId; -use table::requests::{CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest}; +use table::requests::{ + BuildIndexTableRequest, CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest, +}; /// A trait for handling table mutations in `QueryEngine`. #[async_trait] @@ -47,6 +49,13 @@ pub trait TableMutationHandler: Send + Sync { ctx: QueryContextRef, ) -> Result; + /// Trigger an index build task for the table. + async fn build_index( + &self, + request: BuildIndexTableRequest, + ctx: QueryContextRef, + ) -> Result; + /// Trigger a flush task for a table region. 
async fn flush_region(&self, region_id: RegionId, ctx: QueryContextRef) -> Result; diff --git a/src/common/function/src/scalars.rs b/src/common/function/src/scalars.rs index 6f93f2741d..6cf138b69a 100644 --- a/src/common/function/src/scalars.rs +++ b/src/common/function/src/scalars.rs @@ -20,6 +20,8 @@ pub mod json; pub mod matches; pub mod matches_term; pub mod math; +pub mod primary_key; +pub(crate) mod string; pub mod vector; pub(crate) mod hll_count; diff --git a/src/common/function/src/scalars/date/date_format.rs b/src/common/function/src/scalars/date/date_format.rs index 0e321c957e..dfa5a444ca 100644 --- a/src/common/function/src/scalars/date/date_format.rs +++ b/src/common/function/src/scalars/date/date_format.rs @@ -20,7 +20,9 @@ use common_query::error; use common_time::{Date, Timestamp}; use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder}; -use datafusion_common::arrow::datatypes::{ArrowTimestampType, DataType, Date32Type, TimeUnit}; +use datafusion_common::arrow::datatypes::{ + ArrowTimestampType, DataType, Date32Type, Date64Type, TimeUnit, +}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature}; use snafu::ResultExt; @@ -40,6 +42,7 @@ impl Default for DateFormatFunction { signature: helper::one_of_sigs2( vec![ DataType::Date32, + DataType::Date64, DataType::Timestamp(TimeUnit::Second, None), DataType::Timestamp(TimeUnit::Millisecond, None), DataType::Timestamp(TimeUnit::Microsecond, None), @@ -115,6 +118,29 @@ impl Function for DateFormatFunction { builder.append_option(result.as_deref()); } } + DataType::Date64 => { + let left = left.as_primitive::(); + for i in 0..size { + let date = left.is_valid(i).then(|| { + let ms = left.value(i); + Timestamp::new_millisecond(ms) + }); + let format = formats.is_valid(i).then(|| formats.value(i)); + + let result = match (date, format) { + (Some(ts), Some(fmt)) => { + Some(ts.as_formatted_string(fmt, Some(timezone)).map_err(|e| { + DataFusionError::Execution(format!( + "cannot format {ts:?} as '{fmt}': {e}" + )) + })?) 
+ } + _ => None, + }; + + builder.append_option(result.as_deref()); + } + } x => { return Err(DataFusionError::Execution(format!( "unsupported input data type {x}" @@ -137,7 +163,9 @@ mod tests { use std::sync::Arc; use arrow_schema::Field; - use datafusion_common::arrow::array::{Date32Array, StringArray, TimestampSecondArray}; + use datafusion_common::arrow::array::{ + Date32Array, Date64Array, StringArray, TimestampSecondArray, + }; use datafusion_common::config::ConfigOptions; use datafusion_expr::{TypeSignature, Volatility}; @@ -166,7 +194,7 @@ mod tests { Signature { type_signature: TypeSignature::OneOf(sigs), volatility: Volatility::Immutable - } if sigs.len() == 5)); + } if sigs.len() == 6)); } #[test] @@ -213,6 +241,50 @@ mod tests { } } + #[test] + fn test_date64_date_format() { + let f = DateFormatFunction::default(); + + let dates = vec![Some(123000), None, Some(42000), None]; + let formats = vec![ + "%Y-%m-%d %T.%3f", + "%Y-%m-%d %T.%3f", + "%Y-%m-%d %T.%3f", + "%Y-%m-%d %T.%3f", + ]; + let results = [ + Some("1970-01-01 00:02:03.000"), + None, + Some("1970-01-01 00:00:42.000"), + None, + ]; + + let mut config_options = ConfigOptions::default(); + config_options.extensions.insert(FunctionContext::default()); + let config_options = Arc::new(config_options); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(Date64Array::from(dates))), + ColumnarValue::Array(Arc::new(StringArray::from_iter_values(formats))), + ], + arg_fields: vec![], + number_rows: 4, + return_field: Arc::new(Field::new("x", DataType::Utf8View, false)), + config_options, + }; + let result = f + .invoke_with_args(args) + .and_then(|x| x.to_array(4)) + .unwrap(); + let vector = result.as_string_view(); + + assert_eq!(4, vector.len()); + for (actual, expect) in vector.iter().zip(results) { + assert_eq!(actual, expect); + } + } + #[test] fn test_date_date_format() { let f = DateFormatFunction::default(); diff --git a/src/common/function/src/scalars/expression.rs b/src/common/function/src/scalars/expression.rs index 75920801db..63ed40fc8f 100644 --- a/src/common/function/src/scalars/expression.rs +++ b/src/common/function/src/scalars/expression.rs @@ -14,6 +14,7 @@ mod binary; mod ctx; +mod if_func; mod is_null; mod unary; @@ -22,6 +23,7 @@ pub use ctx::EvalContext; pub use unary::scalar_unary_op; use crate::function_registry::FunctionRegistry; +use crate::scalars::expression::if_func::IfFunction; use crate::scalars::expression::is_null::IsNullFunction; pub(crate) struct ExpressionFunction; @@ -29,5 +31,6 @@ pub(crate) struct ExpressionFunction; impl ExpressionFunction { pub fn register(registry: &FunctionRegistry) { registry.register_scalar(IsNullFunction::default()); + registry.register_scalar(IfFunction::default()); } } diff --git a/src/common/function/src/scalars/expression/if_func.rs b/src/common/function/src/scalars/expression/if_func.rs new file mode 100644 index 0000000000..92108cd307 --- /dev/null +++ b/src/common/function/src/scalars/expression/if_func.rs @@ -0,0 +1,404 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; +use std::fmt::Display; + +use arrow::array::ArrowNativeTypeOp; +use arrow::datatypes::ArrowPrimitiveType; +use datafusion::arrow::array::{Array, ArrayRef, AsArray, BooleanArray, PrimitiveArray}; +use datafusion::arrow::compute::kernels::zip::zip; +use datafusion::arrow::datatypes::DataType; +use datafusion_common::DataFusionError; +use datafusion_expr::type_coercion::binary::comparison_coercion; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; + +use crate::function::Function; + +const NAME: &str = "if"; + +/// MySQL-compatible IF function: IF(condition, true_value, false_value) +/// +/// Returns true_value if condition is TRUE (not NULL and not 0), +/// otherwise returns false_value. +/// +/// MySQL truthy rules: +/// - NULL -> false +/// - 0 (numeric zero) -> false +/// - Any non-zero numeric -> true +/// - Boolean true/false -> use directly +#[derive(Clone, Debug)] +pub struct IfFunction { + signature: Signature, +} + +impl Default for IfFunction { + fn default() -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +impl Display for IfFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", NAME.to_ascii_uppercase()) + } +} + +impl Function for IfFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, input_types: &[DataType]) -> datafusion_common::Result { + // Return the common type of true_value and false_value (args[1] and args[2]) + if input_types.len() < 3 { + return Err(DataFusionError::Plan(format!( + "{} requires 3 arguments, got {}", + NAME, + input_types.len() + ))); + } + let true_type = &input_types[1]; + let false_type = &input_types[2]; + + // Use comparison_coercion to find common type + comparison_coercion(true_type, false_type).ok_or_else(|| { + DataFusionError::Plan(format!( + "Cannot find common type for IF function between {:?} and {:?}", + true_type, false_type + )) + }) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + if args.args.len() != 3 { + return Err(DataFusionError::Plan(format!( + "{} requires exactly 3 arguments, got {}", + NAME, + args.args.len() + ))); + } + + let condition = &args.args[0]; + let true_value = &args.args[1]; + let false_value = &args.args[2]; + + // Convert condition to boolean array using MySQL truthy rules + let bool_array = to_boolean_array(condition, args.number_rows)?; + + // Convert true and false values to arrays + let true_array = true_value.to_array(args.number_rows)?; + let false_array = false_value.to_array(args.number_rows)?; + + // Use zip to select values based on condition + // zip expects &dyn Datum, and ArrayRef (Arc) implements Datum + let result = zip(&bool_array, &true_array, &false_array)?; + Ok(ColumnarValue::Array(result)) + } +} + +/// Convert a ColumnarValue to a BooleanArray using MySQL truthy rules: +/// - NULL -> false +/// - 0 (any numeric zero) -> false +/// - Non-zero numeric -> true +/// - Boolean -> use directly +fn to_boolean_array( + value: &ColumnarValue, + num_rows: usize, +) -> datafusion_common::Result { + let array = value.to_array(num_rows)?; + array_to_bool(array) +} + +/// Convert an integer PrimitiveArray to BooleanArray using MySQL truthy rules: +/// NULL -> false, 0 -> false, non-zero -> true +fn int_array_to_bool(array: &PrimitiveArray) -> 
BooleanArray +where + T: ArrowPrimitiveType, + T::Native: ArrowNativeTypeOp, +{ + BooleanArray::from_iter( + array + .iter() + .map(|opt| Some(opt.is_some_and(|v| !v.is_zero()))), + ) +} + +/// Convert a float PrimitiveArray to BooleanArray using MySQL truthy rules: +/// NULL -> false, 0 (including -0.0) -> false, NaN -> true, other non-zero -> true +fn float_array_to_bool(array: &PrimitiveArray) -> BooleanArray +where + T: ArrowPrimitiveType, + T::Native: ArrowNativeTypeOp + num_traits::Float, +{ + use num_traits::Float; + BooleanArray::from_iter( + array + .iter() + .map(|opt| Some(opt.is_some_and(|v| v.is_nan() || !v.is_zero()))), + ) +} + +/// Convert an Array to BooleanArray using MySQL truthy rules +fn array_to_bool(array: ArrayRef) -> datafusion_common::Result { + use arrow::datatypes::*; + + match array.data_type() { + DataType::Boolean => { + let bool_array = array.as_boolean(); + Ok(BooleanArray::from_iter( + bool_array.iter().map(|opt| Some(opt.unwrap_or(false))), + )) + } + DataType::Int8 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::Int16 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::Int32 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::Int64 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt8 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt16 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt32 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt64 => Ok(int_array_to_bool(array.as_primitive::())), + // Float16 needs special handling since half::f16 doesn't implement num_traits::Float + DataType::Float16 => { + let typed_array = array.as_primitive::(); + Ok(BooleanArray::from_iter(typed_array.iter().map(|opt| { + Some(opt.is_some_and(|v| { + let f = v.to_f32(); + f.is_nan() || !f.is_zero() + })) + }))) + } + DataType::Float32 => Ok(float_array_to_bool(array.as_primitive::())), + DataType::Float64 => Ok(float_array_to_bool(array.as_primitive::())), + // Null type is always false. + // Note: NullArray::is_null() returns false (physical null), so we must handle it explicitly. 
+ // See: https://github.com/apache/arrow-rs/issues/4840 + DataType::Null => Ok(BooleanArray::from(vec![false; array.len()])), + // For other types, treat non-null as true + _ => { + let len = array.len(); + Ok(BooleanArray::from_iter( + (0..len).map(|i| Some(!array.is_null(i))), + )) + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::Field; + use datafusion_common::ScalarValue; + use datafusion_common::arrow::array::{AsArray, Int32Array, StringArray}; + + use super::*; + + #[test] + fn test_if_function_basic() { + let if_func = IfFunction::default(); + assert_eq!("if", if_func.name()); + + // Test IF(true, 'yes', 'no') -> 'yes' + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Boolean(Some(true))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "yes"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_false() { + let if_func = IfFunction::default(); + + // Test IF(false, 'yes', 'no') -> 'no' + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_null_is_false() { + let if_func = IfFunction::default(); + + // Test IF(NULL, 'yes', 'no') -> 'no' (NULL is treated as false) + // Using Boolean(None) - typed null + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Boolean(None)), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + + // Test IF(NULL, 'yes', 'no') -> 'no' using ScalarValue::Null (untyped null from SQL NULL literal) + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Null), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array 
result"); + } + } + + #[test] + fn test_if_function_numeric_truthy() { + let if_func = IfFunction::default(); + + // Test IF(1, 'yes', 'no') -> 'yes' (non-zero is true) + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "yes"); + } else { + panic!("Expected Array result"); + } + + // Test IF(0, 'yes', 'no') -> 'no' (zero is false) + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Int32(Some(0))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_with_arrays() { + let if_func = IfFunction::default(); + + // Test with array condition + let condition = Int32Array::from(vec![Some(1), Some(0), None, Some(5)]); + let true_val = StringArray::from(vec!["yes", "yes", "yes", "yes"]); + let false_val = StringArray::from(vec!["no", "no", "no", "no"]); + + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(condition)), + ColumnarValue::Array(Arc::new(true_val)), + ColumnarValue::Array(Arc::new(false_val)), + ], + arg_fields: vec![], + number_rows: 4, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "yes"); // 1 is true + assert_eq!(str_arr.value(1), "no"); // 0 is false + assert_eq!(str_arr.value(2), "no"); // NULL is false + assert_eq!(str_arr.value(3), "yes"); // 5 is true + } else { + panic!("Expected Array result"); + } + } +} diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index 9b022d71da..f84937fa0f 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -19,7 +19,7 @@ mod json_path_match; mod json_to_string; mod parse_json; -use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString}; +use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetObject, JsonGetString}; use json_is::{ JsonIsArray, JsonIsBool, JsonIsFloat, JsonIsInt, JsonIsNull, JsonIsObject, JsonIsString, }; @@ -39,6 +39,7 @@ impl JsonFunction { registry.register_scalar(JsonGetFloat::default()); registry.register_scalar(JsonGetString::default()); registry.register_scalar(JsonGetBool::default()); + registry.register_scalar(JsonGetObject::default()); registry.register_scalar(JsonIsNull::default()); registry.register_scalar(JsonIsInt::default()); diff --git a/src/common/function/src/scalars/json/json_get.rs b/src/common/function/src/scalars/json/json_get.rs 
index 51dd2fc9b7..92ea9cf990 100644 --- a/src/common/function/src/scalars/json/json_get.rs +++ b/src/common/function/src/scalars/json/json_get.rs @@ -16,10 +16,13 @@ use std::fmt::{self, Display}; use std::sync::Arc; use arrow::compute; +use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{ - Array, AsArray, BooleanBuilder, Float64Builder, Int64Builder, StringViewBuilder, + Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder, + StringViewBuilder, }; use datafusion_common::arrow::datatypes::DataType; +use datafusion_expr::type_coercion::aggregates::STRINGS; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature}; use crate::function::{Function, extract_args}; @@ -212,13 +215,92 @@ impl Display for JsonGetString { } } +/// Get the object from JSON value by path. +pub(super) struct JsonGetObject { + signature: Signature, +} + +impl JsonGetObject { + const NAME: &'static str = "json_get_object"; +} + +impl Default for JsonGetObject { + fn default() -> Self { + Self { + signature: helper::one_of_sigs2( + vec![ + DataType::Binary, + DataType::LargeBinary, + DataType::BinaryView, + ], + STRINGS.to_vec(), + ), + } + } +} + +impl Function for JsonGetObject { + fn name(&self) -> &str { + Self::NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::BinaryView) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let [arg0, arg1] = extract_args(self.name(), &args)?; + let arg0 = compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); + let arg1 = compute::cast(&arg1, &DataType::Utf8View)?; + let paths = arg1.as_string_view(); + + let len = jsons.len(); + let mut builder = BinaryViewBuilder::with_capacity(len); + + for i in 0..len { + let json = jsons.is_valid(i).then(|| jsons.value(i)); + let path = paths.is_valid(i).then(|| paths.value(i)); + let result = if let (Some(json), Some(path)) = (json, path) { + let result = jsonb::jsonpath::parse_json_path(path.as_bytes()).and_then(|path| { + let mut data = Vec::new(); + let mut offset = Vec::new(); + jsonb::get_by_path(json, path, &mut data, &mut offset) + .map(|()| jsonb::is_object(&data).then_some(data)) + }); + result.map_err(|e| DataFusionError::Execution(e.to_string()))? + } else { + None + }; + builder.append_option(result); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +impl Display for JsonGetObject { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", Self::NAME.to_ascii_uppercase()) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; use arrow_schema::Field; - use datafusion_common::arrow::array::{BinaryArray, StringArray}; + use datafusion_common::ScalarValue; + use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray}; use datafusion_common::arrow::datatypes::{Float64Type, Int64Type}; + use datatypes::types::parse_string_to_jsonb; use super::*; @@ -425,4 +507,49 @@ mod tests { assert_eq!(*gt, result); } } + + #[test] + fn test_json_get_object() -> datafusion_common::Result<()> { + let udf = JsonGetObject::default(); + assert_eq!("json_get_object", udf.name()); + assert_eq!( + DataType::BinaryView, + udf.return_type(&[DataType::BinaryView, DataType::Utf8View])? 
+ ); + + let json_value = parse_string_to_jsonb(r#"{"a": {"b": {"c": {"d": 1}}}}"#).unwrap(); + let paths = vec!["$", "$.a", "$.a.b", "$.a.b.c", "$.a.b.c.d", "$.e", "$.a.e"]; + let number_rows = paths.len(); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Binary(Some(json_value))), + ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))), + ], + arg_fields: vec![], + number_rows, + return_field: Arc::new(Field::new("x", DataType::Binary, false)), + config_options: Arc::new(Default::default()), + }; + let result = udf + .invoke_with_args(args) + .and_then(|x| x.to_array(number_rows))?; + let result = result.as_binary_view(); + + let expected = &BinaryViewArray::from_iter( + vec![ + Some(r#"{"a": {"b": {"c": {"d": 1}}}}"#), + Some(r#"{"b": {"c": {"d": 1}}}"#), + Some(r#"{"c": {"d": 1}}"#), + Some(r#"{"d": 1}"#), + None, + None, + None, + ] + .into_iter() + .map(|x| x.and_then(|s| parse_string_to_jsonb(s).ok())), + ); + assert_eq!(result, expected); + Ok(()) + } } diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs index ae134b75dd..6c0cc260b2 100644 --- a/src/common/function/src/scalars/json/json_to_string.rs +++ b/src/common/function/src/scalars/json/json_to_string.rs @@ -32,7 +32,15 @@ impl Default for JsonToStringFunction { fn default() -> Self { Self { // TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type. - signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable), + signature: Signature::uniform( + 1, + vec![ + DataType::Binary, + DataType::LargeBinary, + DataType::BinaryView, + ], + Volatility::Immutable, + ), } } } @@ -57,7 +65,8 @@ impl Function for JsonToStringFunction { args: ScalarFunctionArgs, ) -> datafusion_common::Result { let [arg0] = extract_args(self.name(), &args)?; - let jsons = arg0.as_binary::(); + let arg0 = arrow::compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); let size = jsons.len(); let mut builder = StringViewBuilder::with_capacity(size); diff --git a/src/common/function/src/scalars/primary_key.rs b/src/common/function/src/scalars/primary_key.rs new file mode 100644 index 0000000000..680c663bc5 --- /dev/null +++ b/src/common/function/src/scalars/primary_key.rs @@ -0,0 +1,521 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
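// NOTE (illustrative, not part of the patch): the core lookup performed by `json_get_object`
// above, written as a small helper with simplified error handling. The helper name is
// hypothetical; the jsonb calls are the same ones used by the patch:
fn get_object(json: &[u8], path: &str) -> Option<Vec<u8>> {
    let path = jsonb::jsonpath::parse_json_path(path.as_bytes()).ok()?;
    let (mut data, mut offsets) = (Vec::new(), Vec::new());
    jsonb::get_by_path(json, path, &mut data, &mut offsets).ok()?;
    // Only JSON objects are returned; scalars, arrays and missing paths map to NULL.
    jsonb::is_object(&data).then_some(data)
}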
+ +use std::collections::HashMap; +use std::fmt::{self, Display}; +use std::sync::Arc; + +use datafusion_common::arrow::array::{ + Array, ArrayRef, BinaryArray, BinaryViewArray, DictionaryArray, ListBuilder, StringBuilder, +}; +use datafusion_common::arrow::datatypes::{DataType, Field}; +use datafusion_common::{DataFusionError, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datatypes::arrow::datatypes::UInt32Type; +use datatypes::value::Value; +use mito_codec::row_converter::{ + CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec_with_fields, +}; +use store_api::codec::PrimaryKeyEncoding; +use store_api::metadata::RegionMetadata; +use store_api::storage::ColumnId; +use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId}; + +use crate::function::{Function, extract_args}; +use crate::function_registry::FunctionRegistry; + +type NameValuePair = (String, Option); + +#[derive(Clone, Debug)] +pub(crate) struct DecodePrimaryKeyFunction { + signature: Signature, +} + +const NAME: &str = "decode_primary_key"; +const NULL_VALUE_LITERAL: &str = "null"; + +impl Default for DecodePrimaryKeyFunction { + fn default() -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +impl DecodePrimaryKeyFunction { + pub fn register(registry: &FunctionRegistry) { + registry.register_scalar(Self::default()); + } + + fn return_data_type() -> DataType { + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) + } +} + +impl Function for DecodePrimaryKeyFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(Self::return_data_type()) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let [encoded, _, _] = extract_args(self.name(), &args)?; + let number_rows = args.number_rows; + + let encoding = parse_encoding(&args.args[1])?; + let metadata = parse_region_metadata(&args.args[2])?; + let codec = build_codec(&metadata, encoding); + let name_lookup: HashMap<_, _> = metadata + .column_metadatas + .iter() + .map(|c| (c.column_id, c.column_schema.name.clone())) + .collect(); + + let decoded_rows = decode_primary_keys(encoded, number_rows, codec.as_ref(), &name_lookup)?; + let array = build_list_array(&decoded_rows)?; + + Ok(ColumnarValue::Array(array)) + } +} + +impl Display for DecodePrimaryKeyFunction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "DECODE_PRIMARY_KEY") + } +} + +fn parse_encoding(arg: &ColumnarValue) -> datafusion_common::Result { + let encoding = match arg { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) + | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(v))) => v.as_str(), + ColumnarValue::Scalar(value) => { + return Err(DataFusionError::Execution(format!( + "encoding must be a string literal, got {value:?}" + ))); + } + ColumnarValue::Array(_) => { + return Err(DataFusionError::Execution( + "encoding must be a scalar string".to_string(), + )); + } + }; + + match encoding.to_ascii_lowercase().as_str() { + "dense" => Ok(PrimaryKeyEncoding::Dense), + "sparse" => Ok(PrimaryKeyEncoding::Sparse), + _ => Err(DataFusionError::Execution(format!( + "unsupported primary key encoding: {encoding}" + ))), + } +} + +fn build_codec( + metadata: &RegionMetadata, + encoding: PrimaryKeyEncoding, +) -> Arc { + let fields = metadata.primary_key_columns().map(|c| { + ( + 
c.column_id, + SortField::new(c.column_schema.data_type.clone()), + ) + }); + build_primary_key_codec_with_fields(encoding, fields) +} + +fn parse_region_metadata(arg: &ColumnarValue) -> datafusion_common::Result { + let json = match arg { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) + | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(v))) => v.as_str(), + ColumnarValue::Scalar(value) => { + return Err(DataFusionError::Execution(format!( + "region metadata must be a string literal, got {value:?}" + ))); + } + ColumnarValue::Array(_) => { + return Err(DataFusionError::Execution( + "region metadata must be a scalar string".to_string(), + )); + } + }; + + RegionMetadata::from_json(json) + .map_err(|e| DataFusionError::Execution(format!("failed to parse region metadata: {e:?}"))) +} + +fn decode_primary_keys( + encoded: ArrayRef, + number_rows: usize, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + if let Some(dict) = encoded + .as_any() + .downcast_ref::>() + { + decode_dictionary(dict, number_rows, codec, name_lookup) + } else if let Some(array) = encoded.as_any().downcast_ref::() { + decode_binary_array(array, codec, name_lookup) + } else if let Some(array) = encoded.as_any().downcast_ref::() { + decode_binary_view_array(array, codec, name_lookup) + } else { + Err(DataFusionError::Execution(format!( + "column {PRIMARY_KEY_COLUMN_NAME} must be binary or dictionary(binary) array" + ))) + } +} + +fn decode_dictionary( + dict: &DictionaryArray, + number_rows: usize, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + let values = dict + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Execution("primary key dictionary values are not binary".to_string()) + })?; + + let mut decoded_values = Vec::with_capacity(values.len()); + for i in 0..values.len() { + let pk = values.value(i); + let pairs = decode_one(pk, codec, name_lookup)?; + decoded_values.push(pairs); + } + + let mut rows = Vec::with_capacity(number_rows); + let keys = dict.keys(); + for i in 0..number_rows { + let dict_index = keys.value(i) as usize; + rows.push(decoded_values[dict_index].clone()); + } + + Ok(rows) +} + +fn decode_binary_array( + array: &BinaryArray, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + (0..array.len()) + .map(|i| decode_one(array.value(i), codec, name_lookup)) + .collect() +} + +fn decode_binary_view_array( + array: &BinaryViewArray, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + (0..array.len()) + .map(|i| decode_one(array.value(i), codec, name_lookup)) + .collect() +} + +fn decode_one( + pk: &[u8], + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result> { + let decoded = codec + .decode(pk) + .map_err(|e| DataFusionError::Execution(format!("failed to decode primary key: {e}")))?; + + Ok(match decoded { + CompositeValues::Dense(values) => values + .into_iter() + .map(|(column_id, value)| (column_name(column_id, name_lookup), value_to_string(value))) + .collect(), + CompositeValues::Sparse(values) => { + let mut values: Vec<_> = values + .iter() + .map(|(column_id, value)| { + ( + *column_id, + column_name(*column_id, name_lookup), + value_to_string(value.clone()), + ) + }) + .collect(); + values.sort_by_key(|(column_id, _, _)| { + (ReservedColumnId::is_reserved(*column_id), *column_id) + }); + values + .into_iter() + .map(|(_, name, value)| (name, value)) + 
.collect() + } + }) +} + +fn column_name(column_id: ColumnId, name_lookup: &HashMap) -> String { + if let Some(name) = name_lookup.get(&column_id) { + return name.clone(); + } + + if column_id == ReservedColumnId::table_id() { + return "__table_id".to_string(); + } + if column_id == ReservedColumnId::tsid() { + return "__tsid".to_string(); + } + + column_id.to_string() +} + +fn value_to_string(value: Value) -> Option { + match value { + Value::Null => None, + _ => Some(value.to_string()), + } +} + +fn build_list_array(rows: &[Vec]) -> datafusion_common::Result { + let mut builder = ListBuilder::new(StringBuilder::new()); + + for row in rows { + for (key, value) in row { + let value = value.as_deref().unwrap_or(NULL_VALUE_LITERAL); + builder.values().append_value(format!("{key} : {value}")); + } + builder.append(true); + } + + Ok(Arc::new(builder.finish())) +} + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datafusion_common::ScalarValue; + use datatypes::arrow::array::builder::BinaryDictionaryBuilder; + use datatypes::arrow::array::{BinaryArray, ListArray, StringArray}; + use datatypes::arrow::datatypes::UInt32Type; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + use datatypes::value::Value; + use mito_codec::row_converter::{ + DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortField, SparsePrimaryKeyCodec, + }; + use store_api::codec::PrimaryKeyEncoding; + use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; + use store_api::storage::consts::ReservedColumnId; + use store_api::storage::{ColumnId, RegionId}; + + use super::*; + + fn pk_field() -> Arc { + Arc::new(Field::new_dictionary( + PRIMARY_KEY_COLUMN_NAME, + DataType::UInt32, + DataType::Binary, + false, + )) + } + + fn region_metadata_json( + columns: &[(ColumnId, &str, ConcreteDataType)], + encoding: PrimaryKeyEncoding, + ) -> String { + let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 100, + }); + builder.primary_key_encoding(encoding); + for (id, name, ty) in columns { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new((*name).to_string(), ty.clone(), true), + semantic_type: SemanticType::Tag, + column_id: *id, + }); + } + builder.primary_key(columns.iter().map(|(id, _, _)| *id).collect()); + + builder.build().unwrap().to_json().unwrap() + } + + fn list_row(list: &ListArray, row_idx: usize) -> Vec { + let values = list.value(row_idx); + let values = values.as_any().downcast_ref::().unwrap(); + (0..values.len()) + .map(|i| values.value(i).to_string()) + .collect() + } + + #[test] + fn test_decode_dense_primary_key() { + let columns = vec![ + (0, "host", ConcreteDataType::string_datatype()), + (1, "core", ConcreteDataType::int64_datatype()), + ]; + let metadata_json = region_metadata_json(&columns, PrimaryKeyEncoding::Dense); + let codec = DensePrimaryKeyCodec::with_fields( + columns + .iter() + .map(|(id, _, ty)| (*id, SortField::new(ty.clone()))) + .collect(), + ); + + let rows = vec![ + vec![Value::from("a"), Value::from(1_i64)], + vec![Value::from("b"), Value::from(2_i64)], + vec![Value::from("a"), Value::from(1_i64)], + ]; + + let mut builder = BinaryDictionaryBuilder::::new(); + for row in &rows { + let encoded = codec.encode(row.iter().map(|v| v.as_value_ref())).unwrap(); + 
builder.append(encoded.as_slice()).unwrap(); + } + let dict_array: ArrayRef = Arc::new(builder.finish()); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(dict_array), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("dense".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(metadata_json))), + ], + arg_fields: vec![ + pk_field(), + Arc::new(Field::new("encoding", DataType::Utf8, false)), + Arc::new(Field::new("region_metadata", DataType::Utf8, false)), + ], + number_rows: 3, + return_field: Arc::new(Field::new( + "decoded", + DecodePrimaryKeyFunction::return_data_type(), + false, + )), + config_options: Default::default(), + }; + + let func = DecodePrimaryKeyFunction::default(); + let result = func + .invoke_with_args(args) + .and_then(|v| v.to_array(3)) + .unwrap(); + let list = result.as_any().downcast_ref::().unwrap(); + + let expected = [ + vec!["host : a".to_string(), "core : 1".to_string()], + vec!["host : b".to_string(), "core : 2".to_string()], + vec!["host : a".to_string(), "core : 1".to_string()], + ]; + + for (row_idx, expected_row) in expected.iter().enumerate() { + assert_eq!(*expected_row, list_row(list, row_idx)); + } + } + + #[test] + fn test_decode_sparse_primary_key() { + let columns = vec![ + (10, "k0", ConcreteDataType::string_datatype()), + (11, "k1", ConcreteDataType::string_datatype()), + ]; + let metadata_json = region_metadata_json(&columns, PrimaryKeyEncoding::Sparse); + let codec = SparsePrimaryKeyCodec::schemaless(); + + let rows = vec![ + vec![ + (ReservedColumnId::table_id(), Value::UInt32(1)), + (ReservedColumnId::tsid(), Value::UInt64(100)), + (10, Value::from("a")), + (11, Value::from("b")), + ], + vec![ + (ReservedColumnId::table_id(), Value::UInt32(1)), + (ReservedColumnId::tsid(), Value::UInt64(200)), + (10, Value::from("c")), + (11, Value::from("d")), + ], + ]; + + let mut encoded_values = Vec::with_capacity(rows.len()); + for row in &rows { + let mut buf = Vec::new(); + codec.encode_values(row, &mut buf).unwrap(); + encoded_values.push(buf); + } + + let pk_array: ArrayRef = Arc::new(BinaryArray::from_iter_values( + encoded_values.iter().cloned(), + )); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(pk_array), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("sparse".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(metadata_json))), + ], + arg_fields: vec![ + pk_field(), + Arc::new(Field::new("encoding", DataType::Utf8, false)), + Arc::new(Field::new("region_metadata", DataType::Utf8, false)), + ], + number_rows: rows.len(), + return_field: Arc::new(Field::new( + "decoded", + DecodePrimaryKeyFunction::return_data_type(), + false, + )), + config_options: Default::default(), + }; + + let func = DecodePrimaryKeyFunction::default(); + let result = func + .invoke_with_args(args) + .and_then(|v| v.to_array(rows.len())) + .unwrap(); + let list = result.as_any().downcast_ref::().unwrap(); + + let expected = [ + vec![ + "k0 : a".to_string(), + "k1 : b".to_string(), + "__tsid : 100".to_string(), + "__table_id : 1".to_string(), + ], + vec![ + "k0 : c".to_string(), + "k1 : d".to_string(), + "__tsid : 200".to_string(), + "__table_id : 1".to_string(), + ], + ]; + + for (row_idx, expected_row) in expected.iter().enumerate() { + assert_eq!(*expected_row, list_row(list, row_idx)); + } + } +} diff --git a/src/common/function/src/scalars/string.rs b/src/common/function/src/scalars/string.rs new file mode 100644 index 0000000000..95c6201ee2 --- /dev/null +++ b/src/common/function/src/scalars/string.rs 
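// NOTE (illustrative, not part of the patch): the ordering applied to sparse primary keys by
// `decode_one` above puts user-defined tag columns first and reserved columns last, each group
// sorted by column id. The ids 10/11 and names below are taken from the test data:
let mut cols = vec![
    (ReservedColumnId::table_id(), "__table_id"),
    (11, "k1"),
    (ReservedColumnId::tsid(), "__tsid"),
    (10, "k0"),
];
cols.sort_by_key(|(id, _)| (ReservedColumnId::is_reserved(*id), *id));
let names: Vec<_> = cols.iter().map(|(_, n)| *n).collect();
assert_eq!(names, ["k0", "k1", "__tsid", "__table_id"]); // matches the test expectation above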
@@ -0,0 +1,26 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! String scalar functions + +mod regexp_extract; + +pub(crate) use regexp_extract::RegexpExtractFunction; + +use crate::function_registry::FunctionRegistry; + +/// Register all string functions +pub fn register_string_functions(registry: &FunctionRegistry) { + RegexpExtractFunction::register(registry); +} diff --git a/src/common/function/src/scalars/string/regexp_extract.rs b/src/common/function/src/scalars/string/regexp_extract.rs new file mode 100644 index 0000000000..bc78c4df74 --- /dev/null +++ b/src/common/function/src/scalars/string/regexp_extract.rs @@ -0,0 +1,339 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Implementation of REGEXP_EXTRACT function +use std::fmt; +use std::sync::Arc; + +use datafusion_common::DataFusionError; +use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder}; +use datafusion_common::arrow::compute::cast; +use datafusion_common::arrow::datatypes::DataType; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility}; +use regex::{Regex, RegexBuilder}; + +use crate::function::Function; +use crate::function_registry::FunctionRegistry; + +const NAME: &str = "regexp_extract"; + +// Safety limits +const MAX_REGEX_SIZE: usize = 1024 * 1024; // compiled regex heap cap +const MAX_DFA_SIZE: usize = 2 * 1024 * 1024; // lazy DFA cap +const MAX_TOTAL_RESULT_SIZE: usize = 64 * 1024 * 1024; // total batch cap +const MAX_SINGLE_MATCH: usize = 1024 * 1024; // per-row cap +const MAX_PATTERN_LEN: usize = 10_000; // pattern text length cap + +/// REGEXP_EXTRACT function implementation +/// Extracts the first substring matching the given regular expression pattern. +/// If no match is found, returns NULL. 
+/// +#[derive(Debug)] +pub struct RegexpExtractFunction { + signature: Signature, +} + +impl RegexpExtractFunction { + pub fn register(registry: &FunctionRegistry) { + registry.register_scalar(RegexpExtractFunction::default()); + } +} + +impl Default for RegexpExtractFunction { + fn default() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Utf8View, DataType::Utf8View]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8View]), + TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8View]), + TypeSignature::Exact(vec![DataType::Utf8View, DataType::LargeUtf8]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Utf8, DataType::LargeUtf8]), + TypeSignature::Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl fmt::Display for RegexpExtractFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", NAME.to_ascii_uppercase()) + } +} + +impl Function for RegexpExtractFunction { + fn name(&self) -> &str { + NAME + } + + // Always return LargeUtf8 for simplicity and safety + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::LargeUtf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + if args.args.len() != 2 { + return Err(DataFusionError::Execution( + "REGEXP_EXTRACT requires exactly two arguments (text, pattern)".to_string(), + )); + } + + // Keep original ColumnarValue variants for scalar-pattern fast path + let pattern_is_scalar = matches!(args.args[1], ColumnarValue::Scalar(_)); + + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let text_array = &arrays[0]; + let pattern_array = &arrays[1]; + + // Cast both to LargeUtf8 for uniform access (supports Utf8/Utf8View/Dictionary) + let text_large = cast(text_array.as_ref(), &DataType::LargeUtf8).map_err(|e| { + DataFusionError::Execution(format!("REGEXP_EXTRACT: text cast failed: {e}")) + })?; + let pattern_large = cast(pattern_array.as_ref(), &DataType::LargeUtf8).map_err(|e| { + DataFusionError::Execution(format!("REGEXP_EXTRACT: pattern cast failed: {e}")) + })?; + + let text = text_large.as_string::(); + let pattern = pattern_large.as_string::(); + let len = text.len(); + + // Pre-size result builder with conservative estimate + let mut estimated_total = 0usize; + for i in 0..len { + if !text.is_null(i) { + estimated_total = estimated_total.saturating_add(text.value_length(i) as usize); + if estimated_total > MAX_TOTAL_RESULT_SIZE { + return Err(DataFusionError::ResourcesExhausted(format!( + "REGEXP_EXTRACT total output exceeds {} bytes", + MAX_TOTAL_RESULT_SIZE + ))); + } + } + } + let mut builder = LargeStringBuilder::with_capacity(len, estimated_total); + + // Fast path: if pattern is scalar, compile once + let compiled_scalar: Option = if pattern_is_scalar && len > 0 && !pattern.is_null(0) + { + Some(compile_regex_checked(pattern.value(0))?) 
+ } else { + None + }; + + for i in 0..len { + if text.is_null(i) || pattern.is_null(i) { + builder.append_null(); + continue; + } + + let s = text.value(i); + let pat = pattern.value(i); + + // Compile or reuse regex + let re = if let Some(ref compiled) = compiled_scalar { + compiled + } else { + // TODO: For performance-critical applications with repeating patterns, + // consider adding a small LRU cache here + &compile_regex_checked(pat)? + }; + + // First match only + if let Some(m) = re.find(s) { + let m_str = m.as_str(); + if m_str.len() > MAX_SINGLE_MATCH { + return Err(DataFusionError::Execution( + "REGEXP_EXTRACT match exceeds per-row limit (1MB)".to_string(), + )); + } + builder.append_value(m_str); + } else { + builder.append_null(); + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +// Compile a regex with safety checks +fn compile_regex_checked(pattern: &str) -> datafusion_common::Result { + if pattern.len() > MAX_PATTERN_LEN { + return Err(DataFusionError::Execution(format!( + "REGEXP_EXTRACT pattern too long (> {} chars)", + MAX_PATTERN_LEN + ))); + } + RegexBuilder::new(pattern) + .size_limit(MAX_REGEX_SIZE) + .dfa_size_limit(MAX_DFA_SIZE) + .build() + .map_err(|e| { + DataFusionError::Execution(format!("REGEXP_EXTRACT invalid pattern '{}': {e}", pattern)) + }) +} + +#[cfg(test)] +mod tests { + use datafusion_common::arrow::array::StringArray; + use datafusion_common::arrow::datatypes::Field; + use datafusion_expr::ScalarFunctionArgs; + + use super::*; + + #[test] + fn test_regexp_extract_function_basic() { + let text_array = Arc::new(StringArray::from(vec!["version 1.2.3", "no match here"])); + let pattern_array = Arc::new(StringArray::from(vec!["\\d+\\.\\d+\\.\\d+", "\\d+"])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, false)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "1.2.3"); + assert!(string_array.is_null(1)); // no match should return NULL + } else { + panic!("Expected array result"); + } + } + + #[test] + fn test_regexp_extract_phone_number() { + let text_array = Arc::new(StringArray::from(vec!["Phone: 123-456-7890", "No phone"])); + let pattern_array = Arc::new(StringArray::from(vec![ + "\\d{3}-\\d{3}-\\d{4}", + "\\d{3}-\\d{3}-\\d{4}", + ])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, false)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "123-456-7890"); + 
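// NOTE (illustrative, not part of the patch): the same guarded compilation used by
// REGEXP_EXTRACT above, shown standalone; `RegexBuilder` rejects patterns whose compiled
// size exceeds the configured limits instead of risking excessive memory use:
use regex::RegexBuilder;
let re = RegexBuilder::new(r"\d{3}-\d{3}-\d{4}")
    .size_limit(1024 * 1024)         // MAX_REGEX_SIZE in the patch
    .dfa_size_limit(2 * 1024 * 1024) // MAX_DFA_SIZE in the patch
    .build()
    .expect("pattern within limits");
assert_eq!(re.find("Phone: 123-456-7890").map(|m| m.as_str()), Some("123-456-7890"));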
assert!(string_array.is_null(1)); // no match should return NULL + } else { + panic!("Expected array result"); + } + } + + #[test] + fn test_regexp_extract_email() { + let text_array = Arc::new(StringArray::from(vec![ + "Email: user@domain.com", + "Invalid email", + ])); + let pattern_array = Arc::new(StringArray::from(vec![ + "[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+", + "[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z]+", + ])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, false)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "user@domain.com"); + assert!(string_array.is_null(1)); // no match should return NULL + } else { + panic!("Expected array result"); + } + } + + #[test] + fn test_regexp_extract_with_nulls() { + let text_array = Arc::new(StringArray::from(vec![Some("test 123"), None])); + let pattern_array = Arc::new(StringArray::from(vec![Some("\\d+"), Some("\\d+")])); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(text_array), + ColumnarValue::Array(pattern_array), + ], + arg_fields: vec![ + Arc::new(Field::new("arg_0", DataType::Utf8, true)), + Arc::new(Field::new("arg_1", DataType::Utf8, false)), + ], + return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)), + number_rows: 2, + config_options: Arc::new(datafusion_common::config::ConfigOptions::default()), + }; + + let function = RegexpExtractFunction::default(); + let result = function.invoke_with_args(args).unwrap(); + + if let ColumnarValue::Array(array) = result { + let string_array = array.as_string::(); + assert_eq!(string_array.value(0), "123"); + assert!(string_array.is_null(1)); // NULL input should return NULL + } else { + panic!("Expected array result"); + } + } +} diff --git a/src/common/function/src/scalars/vector.rs b/src/common/function/src/scalars/vector.rs index 75d66f03c5..f265cfe53a 100644 --- a/src/common/function/src/scalars/vector.rs +++ b/src/common/function/src/scalars/vector.rs @@ -14,6 +14,7 @@ mod convert; mod distance; +mod elem_avg; mod elem_product; mod elem_sum; pub mod impl_conv; @@ -64,6 +65,7 @@ impl VectorFunction { registry.register_scalar(vector_subvector::VectorSubvectorFunction::default()); registry.register_scalar(elem_sum::ElemSumFunction::default()); registry.register_scalar(elem_product::ElemProductFunction::default()); + registry.register_scalar(elem_avg::ElemAvgFunction::default()); } } diff --git a/src/common/function/src/scalars/vector/elem_avg.rs b/src/common/function/src/scalars/vector/elem_avg.rs new file mode 100644 index 0000000000..7ebee3ad41 --- /dev/null +++ b/src/common/function/src/scalars/vector/elem_avg.rs @@ -0,0 +1,128 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Display; + +use datafusion::arrow::datatypes::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::ScalarValue; +use datafusion_expr::type_coercion::aggregates::{BINARYS, STRINGS}; +use datafusion_expr::{ScalarFunctionArgs, Signature, TypeSignature, Volatility}; +use nalgebra::DVectorView; + +use crate::function::Function; +use crate::scalars::vector::{VectorCalculator, impl_conv}; + +const NAME: &str = "vec_elem_avg"; + +#[derive(Debug, Clone)] +pub(crate) struct ElemAvgFunction { + signature: Signature, +} + +impl Default for ElemAvgFunction { + fn default() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Uniform(1, STRINGS.to_vec()), + TypeSignature::Uniform(1, BINARYS.to_vec()), + TypeSignature::Uniform(1, vec![DataType::BinaryView]), + ], + Volatility::Immutable, + ), + } + } +} + +impl Function for ElemAvgFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Float32) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let body = |v0: &ScalarValue| -> datafusion_common::Result { + let v0 = + impl_conv::as_veclit(v0)?.map(|v0| DVectorView::from_slice(&v0, v0.len()).mean()); + Ok(ScalarValue::Float32(v0)) + }; + + let calculator = VectorCalculator { + name: self.name(), + func: body, + }; + calculator.invoke_with_single_argument(args) + } +} + +impl Display for ElemAvgFunction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", NAME.to_ascii_uppercase()) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::array::StringViewArray; + use arrow_schema::Field; + use datafusion::arrow::array::{Array, AsArray}; + use datafusion::arrow::datatypes::Float32Type; + use datafusion_common::config::ConfigOptions; + + use super::*; + + #[test] + fn test_elem_avg() { + let func = ElemAvgFunction::default(); + + let input = Arc::new(StringViewArray::from(vec![ + Some("[1.0,2.0,3.0]".to_string()), + Some("[4.0,5.0,6.0]".to_string()), + Some("[7.0,8.0,9.0]".to_string()), + None, + ])); + + let result = func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ColumnarValue::Array(input.clone())], + arg_fields: vec![], + number_rows: input.len(), + return_field: Arc::new(Field::new("x", DataType::Float32, true)), + config_options: Arc::new(ConfigOptions::new()), + }) + .and_then(|v| ColumnarValue::values_to_arrays(&[v])) + .map(|mut a| a.remove(0)) + .unwrap(); + let result = result.as_primitive::(); + + assert_eq!(result.len(), 4); + assert_eq!(result.value(0), 2.0); + assert_eq!(result.value(1), 5.0); + assert_eq!(result.value(2), 8.0); + assert!(result.is_null(3)); + } +} diff --git a/src/common/function/src/state.rs b/src/common/function/src/state.rs index f90479b923..d1a3d341b4 100644 --- a/src/common/function/src/state.rs +++ b/src/common/function/src/state.rs @@ -44,7 +44,8 @@ impl FunctionState { use session::context::QueryContextRef; use 
store_api::storage::RegionId; use table::requests::{ - CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest, + BuildIndexTableRequest, CompactTableRequest, DeleteRequest, FlushTableRequest, + InsertRequest, }; use crate::handlers::{FlowServiceHandler, ProcedureServiceHandler, TableMutationHandler}; @@ -120,6 +121,14 @@ impl FunctionState { Ok(ROWS) } + async fn build_index( + &self, + _request: BuildIndexTableRequest, + _ctx: QueryContextRef, + ) -> Result { + Ok(ROWS) + } + async fn flush_region( &self, _region_id: RegionId, diff --git a/src/common/function/src/system/pg_catalog.rs b/src/common/function/src/system/pg_catalog.rs index c768aae248..b6aee0d7c8 100644 --- a/src/common/function/src/system/pg_catalog.rs +++ b/src/common/function/src/system/pg_catalog.rs @@ -12,18 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod version; - use std::sync::Arc; -use datafusion::arrow::array::{ArrayRef, StringArray, as_boolean_array}; +use common_catalog::consts::{ + DEFAULT_PRIVATE_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME, +}; +use datafusion::arrow::array::{ArrayRef, StringArray, StringBuilder, as_boolean_array}; use datafusion::catalog::TableFunction; use datafusion::common::ScalarValue; use datafusion::common::utils::SingleRowListArrayBuilder; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility}; use datafusion_pg_catalog::pg_catalog::{self, PgCatalogStaticTables}; use datatypes::arrow::datatypes::{DataType, Field}; -use version::PGVersionFunction; +use derive_more::derive::Display; use crate::function::{Function, find_function_context}; use crate::function_registry::FunctionRegistry; @@ -32,10 +33,40 @@ use crate::system::define_nullary_udf; const CURRENT_SCHEMA_FUNCTION_NAME: &str = "current_schema"; const CURRENT_SCHEMAS_FUNCTION_NAME: &str = "current_schemas"; const SESSION_USER_FUNCTION_NAME: &str = "session_user"; +const CURRENT_DATABASE_FUNCTION_NAME: &str = "current_database"; +const OBJ_DESCRIPTION_FUNCTION_NAME: &str = "obj_description"; +const COL_DESCRIPTION_FUNCTION_NAME: &str = "col_description"; +const SHOBJ_DESCRIPTION_FUNCTION_NAME: &str = "shobj_description"; +const PG_MY_TEMP_SCHEMA_FUNCTION_NAME: &str = "pg_my_temp_schema"; define_nullary_udf!(CurrentSchemaFunction); -define_nullary_udf!(CurrentSchemasFunction); define_nullary_udf!(SessionUserFunction); +define_nullary_udf!(CurrentDatabaseFunction); +define_nullary_udf!(PgMyTempSchemaFunction); + +impl Function for CurrentDatabaseFunction { + fn name(&self) -> &str { + CURRENT_DATABASE_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8View) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let func_ctx = find_function_context(&args)?; + let db = func_ctx.query_ctx.current_catalog().to_string(); + + Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(db)))) + } +} // Though "current_schema" can be aliased to "database", to not cause any breaking changes, // we are not doing it: not until https://github.com/apache/datafusion/issues/17469 is resolved. 
@@ -89,6 +120,23 @@ impl Function for SessionUserFunction { } } +#[derive(Display, Debug)] +#[display("{}", self.name())] +pub(super) struct CurrentSchemasFunction { + signature: Signature, +} + +impl CurrentSchemasFunction { + pub fn new() -> Self { + Self { + signature: Signature::new( + TypeSignature::Exact(vec![DataType::Boolean]), + Volatility::Stable, + ), + } + } +} + impl Function for CurrentSchemasFunction { fn name(&self) -> &str { CURRENT_SCHEMAS_FUNCTION_NAME @@ -96,9 +144,9 @@ impl Function for CurrentSchemasFunction { fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { Ok(DataType::List(Arc::new(Field::new( - "x", - DataType::Utf8View, - false, + "item", + DataType::Utf8, + true, )))) } @@ -117,9 +165,9 @@ impl Function for CurrentSchemasFunction { let mut values = vec!["public"]; // include implicit schemas if input.value(0) { - values.push("information_schema"); - values.push("pg_catalog"); - values.push("greptime_private"); + values.push(INFORMATION_SCHEMA_NAME); + values.push(PG_CATALOG_NAME); + values.push(DEFAULT_PRIVATE_SCHEMA_NAME); } let list_array = SingleRowListArrayBuilder::new(Arc::new(StringArray::from(values))); @@ -130,6 +178,175 @@ impl Function for CurrentSchemasFunction { } } +/// PostgreSQL obj_description - returns NULL for compatibility +#[derive(Display, Debug, Clone)] +#[display("{}", self.name())] +pub(super) struct ObjDescriptionFunction { + signature: Signature, +} + +impl ObjDescriptionFunction { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Int64]), + TypeSignature::Exact(vec![DataType::UInt32]), + ], + Volatility::Stable, + ), + } + } +} + +impl Function for ObjDescriptionFunction { + fn name(&self) -> &str { + OBJ_DESCRIPTION_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let num_rows = args.number_rows; + let mut builder = StringBuilder::with_capacity(num_rows, 0); + for _ in 0..num_rows { + builder.append_null(); + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +/// PostgreSQL col_description - returns NULL for compatibility +#[derive(Display, Debug, Clone)] +#[display("{}", self.name())] +pub(super) struct ColDescriptionFunction { + signature: Signature, +} + +impl ColDescriptionFunction { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64, DataType::Int32]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Int32]), + TypeSignature::Exact(vec![DataType::Int64, DataType::Int64]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Int64]), + ], + Volatility::Stable, + ), + } + } +} + +impl Function for ColDescriptionFunction { + fn name(&self) -> &str { + COL_DESCRIPTION_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let num_rows = args.number_rows; + let mut builder = StringBuilder::with_capacity(num_rows, 0); + for _ in 0..num_rows { + builder.append_null(); + } + 
Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +/// PostgreSQL shobj_description - returns NULL for compatibility +#[derive(Display, Debug, Clone)] +#[display("{}", self.name())] +pub(super) struct ShobjDescriptionFunction { + signature: Signature, +} + +impl ShobjDescriptionFunction { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::UInt64, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Utf8]), + ], + Volatility::Stable, + ), + } + } +} + +impl Function for ShobjDescriptionFunction { + fn name(&self) -> &str { + SHOBJ_DESCRIPTION_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let num_rows = args.number_rows; + let mut builder = StringBuilder::with_capacity(num_rows, 0); + for _ in 0..num_rows { + builder.append_null(); + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +/// PostgreSQL pg_my_temp_schema - returns 0 (no temp schema) for compatibility +impl Function for PgMyTempSchemaFunction { + fn name(&self) -> &str { + PG_MY_TEMP_SCHEMA_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::UInt32) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + _args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + Ok(ColumnarValue::Scalar(ScalarValue::UInt32(Some(0)))) + } +} + pub(super) struct PGCatalogFunction; impl PGCatalogFunction { @@ -137,10 +354,10 @@ impl PGCatalogFunction { let static_tables = Arc::new(PgCatalogStaticTables::try_new().expect("load postgres static tables")); - registry.register_scalar(PGVersionFunction::default()); registry.register_scalar(CurrentSchemaFunction::default()); - registry.register_scalar(CurrentSchemasFunction::default()); + registry.register_scalar(CurrentSchemasFunction::new()); registry.register_scalar(SessionUserFunction::default()); + registry.register_scalar(CurrentDatabaseFunction::default()); registry.register(pg_catalog::format_type::create_format_type_udf()); registry.register(pg_catalog::create_pg_get_partkeydef_udf()); registry.register(pg_catalog::has_privilege_udf::create_has_privilege_udf( @@ -164,7 +381,103 @@ impl PGCatalogFunction { registry.register(pg_catalog::create_pg_get_userbyid_udf()); registry.register(pg_catalog::create_pg_table_is_visible()); registry.register(pg_catalog::pg_get_expr_udf::create_pg_get_expr_udf()); - // TODO(sunng87): upgrade datafusion to add - //registry.register(pg_catalog::create_pg_encoding_to_char_udf()); + registry.register(pg_catalog::create_pg_encoding_to_char_udf()); + registry.register(pg_catalog::create_pg_relation_size_udf()); + registry.register(pg_catalog::create_pg_total_relation_size_udf()); + registry.register(pg_catalog::create_pg_stat_get_numscans()); + registry.register(pg_catalog::create_pg_get_constraintdef()); + registry.register(pg_catalog::create_pg_get_partition_ancestors_udf()); + registry.register_scalar(ObjDescriptionFunction::new()); + registry.register_scalar(ColDescriptionFunction::new()); + registry.register_scalar(ShobjDescriptionFunction::new()); + 
registry.register_scalar(PgMyTempSchemaFunction::default()); + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::Field; + use datafusion::arrow::array::Array; + use datafusion_common::ScalarValue; + use datafusion_expr::ColumnarValue; + + use super::*; + + fn create_test_args(args: Vec, number_rows: usize) -> ScalarFunctionArgs { + ScalarFunctionArgs { + args, + arg_fields: vec![], + number_rows, + return_field: Arc::new(Field::new("result", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + } + } + + #[test] + fn test_obj_description_function() { + let func = ObjDescriptionFunction::new(); + assert_eq!("obj_description", func.name()); + assert_eq!(DataType::Utf8, func.return_type(&[]).unwrap()); + + let args = create_test_args( + vec![ + ColumnarValue::Scalar(ScalarValue::Int64(Some(1234))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("pg_class".to_string()))), + ], + 1, + ); + let result = func.invoke_with_args(args).unwrap(); + if let ColumnarValue::Array(arr) = result { + assert_eq!(1, arr.len()); + assert!(arr.is_null(0)); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_col_description_function() { + let func = ColDescriptionFunction::new(); + assert_eq!("col_description", func.name()); + assert_eq!(DataType::Utf8, func.return_type(&[]).unwrap()); + + let args = create_test_args( + vec![ + ColumnarValue::Scalar(ScalarValue::Int64(Some(1234))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ], + 1, + ); + let result = func.invoke_with_args(args).unwrap(); + if let ColumnarValue::Array(arr) = result { + assert_eq!(1, arr.len()); + assert!(arr.is_null(0)); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_shobj_description_function() { + let func = ShobjDescriptionFunction::new(); + assert_eq!("shobj_description", func.name()); + assert_eq!(DataType::Utf8, func.return_type(&[]).unwrap()); + + let args = create_test_args( + vec![ + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("pg_database".to_string()))), + ], + 1, + ); + let result = func.invoke_with_args(args).unwrap(); + if let ColumnarValue::Array(arr) = result { + assert_eq!(1, arr.len()); + assert!(arr.is_null(0)); + } else { + panic!("Expected Array result"); + } } } diff --git a/src/common/function/src/system/pg_catalog/version.rs b/src/common/function/src/system/pg_catalog/version.rs deleted file mode 100644 index 9acdd39472..0000000000 --- a/src/common/function/src/system/pg_catalog/version.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::fmt; - -use datafusion::arrow::datatypes::DataType; -use datafusion_common::ScalarValue; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; - -use crate::function::Function; - -#[derive(Clone, Debug)] -pub(crate) struct PGVersionFunction { - signature: Signature, -} - -impl Default for PGVersionFunction { - fn default() -> Self { - Self { - signature: Signature::exact(vec![], Volatility::Immutable), - } - } -} - -impl fmt::Display for PGVersionFunction { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "pg_catalog.VERSION") - } -} - -impl Function for PGVersionFunction { - fn name(&self) -> &str { - "pg_catalog.version" - } - - fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { - Ok(DataType::Utf8View) - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn invoke_with_args(&self, _: ScalarFunctionArgs) -> datafusion_common::Result { - Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(format!( - "PostgreSQL 16.3 GreptimeDB {}", - common_version::version() - ))))) - } -} diff --git a/src/common/function/src/system/version.rs b/src/common/function/src/system/version.rs index 369ad14080..1c148bd7ab 100644 --- a/src/common/function/src/system/version.rs +++ b/src/common/function/src/system/version.rs @@ -50,7 +50,7 @@ impl Function for VersionFunction { ) } Channel::Postgres => { - format!("16.3-greptimedb-{}", common_version::version()) + format!("PostgreSQL 16.3 GreptimeDB {}", common_version::version()) } _ => common_version::version().to_string(), }; diff --git a/src/common/grpc/Cargo.toml b/src/common/grpc/Cargo.toml index 1684d0b297..9978791a7a 100644 --- a/src/common/grpc/Cargo.toml +++ b/src/common/grpc/Cargo.toml @@ -23,6 +23,7 @@ datatypes.workspace = true flatbuffers = "25.2" hyper.workspace = true lazy_static.workspace = true +notify.workspace = true prost.workspace = true serde.workspace = true serde_json.workspace = true @@ -37,6 +38,7 @@ vec1 = "1.12" criterion = "0.4" hyper-util = { workspace = true, features = ["tokio"] } rand.workspace = true +tempfile.workspace = true [[bench]] name = "bench_main" diff --git a/src/common/grpc/src/channel_manager.rs b/src/common/grpc/src/channel_manager.rs index cdea89cb86..a60604da94 100644 --- a/src/common/grpc/src/channel_manager.rs +++ b/src/common/grpc/src/channel_manager.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::path::Path; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; @@ -22,14 +23,15 @@ use dashmap::DashMap; use dashmap::mapref::entry::Entry; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use tokio_util::sync::CancellationToken; use tonic::transport::{ Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri, }; use tower::Service; -use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, InvalidTlsConfigSnafu, Result}; +use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, Result}; +use crate::reloadable_tls::{ReloadableTlsConfig, TlsConfigLoader, maybe_watch_tls_config}; const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60; pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10; @@ -50,7 +52,7 @@ pub struct ChannelManager { struct Inner { id: u64, config: ChannelConfig, - client_tls_config: Option, + reloadable_client_tls_config: Option>, pool: Arc, channel_recycle_started: AtomicBool, cancel: CancellationToken, @@ -78,7 +80,7 @@ impl Inner { Self { id, config, - client_tls_config: None, + reloadable_client_tls_config: None, pool, channel_recycle_started: AtomicBool::new(false), cancel, @@ -91,57 +93,22 @@ impl ChannelManager { Default::default() } - pub fn with_config(config: ChannelConfig) -> Self { - let inner = Inner::with_config(config); + /// Create a ChannelManager with configuration and optional TLS config + /// + /// Use [`load_client_tls_config`] to create TLS configuration from `ClientTlsOption`. + /// The TLS config supports both static (watch disabled) and dynamic reloading (watch enabled). + /// If you want to use dynamic reloading, please **manually** invoke [`maybe_watch_client_tls_config`] after this method. + pub fn with_config( + config: ChannelConfig, + reloadable_tls_config: Option>, + ) -> Self { + let mut inner = Inner::with_config(config.clone()); + inner.reloadable_client_tls_config = reloadable_tls_config; Self { inner: Arc::new(inner), } } - /// Read tls cert and key files and create a ChannelManager with TLS config. 
- pub fn with_tls_config(config: ChannelConfig) -> Result { - let mut inner = Inner::with_config(config.clone()); - - // setup tls - let path_config = config.client_tls.context(InvalidTlsConfigSnafu { - msg: "no config input", - })?; - - if !path_config.enabled { - // if TLS not enabled, just ignore other tls config - // and not set `client_tls_config` hence not use TLS - return Ok(Self { - inner: Arc::new(inner), - }); - } - - let mut tls_config = ClientTlsConfig::new(); - - if let Some(server_ca) = path_config.server_ca_cert_path { - let server_root_ca_cert = - std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?; - let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert); - tls_config = tls_config.ca_certificate(server_root_ca_cert); - } - - if let (Some(client_cert_path), Some(client_key_path)) = - (&path_config.client_cert_path, &path_config.client_key_path) - { - let client_cert = - std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?; - let client_key = - std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?; - let client_identity = Identity::from_pem(client_cert, client_key); - tls_config = tls_config.identity(client_identity); - } - - inner.client_tls_config = Some(tls_config); - - Ok(Self { - inner: Arc::new(inner), - }) - } - pub fn config(&self) -> &ChannelConfig { &self.inner.config } @@ -211,8 +178,21 @@ impl ChannelManager { self.pool().retain_channel(f); } + /// Clear all channels to force reconnection. + /// This should be called when TLS configuration changes to ensure new connections use updated certificates. + pub fn clear_all_channels(&self) { + self.pool().retain_channel(|_, _| false); + } + fn build_endpoint(&self, addr: &str) -> Result { - let http_prefix = if self.inner.client_tls_config.is_some() { + // Get the latest TLS config from reloadable config (which handles both static and dynamic cases) + let tls_config = self + .inner + .reloadable_client_tls_config + .as_ref() + .and_then(|c| c.get_config()); + + let http_prefix = if tls_config.is_some() { "https" } else { "http" @@ -251,9 +231,9 @@ impl ChannelManager { if let Some(enabled) = self.config().http2_adaptive_window { endpoint = endpoint.http2_adaptive_window(enabled); } - if let Some(tls_config) = &self.inner.client_tls_config { + if let Some(tls_config) = tls_config { endpoint = endpoint - .tls_config(tls_config.clone()) + .tls_config(tls_config) .context(CreateChannelSnafu { addr })?; } @@ -287,13 +267,97 @@ impl ChannelManager { } } -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result> { + let path_config = match tls_option { + Some(path_config) if path_config.enabled => path_config, + _ => return Ok(None), + }; + + let mut tls_config = ClientTlsConfig::new(); + + if let Some(server_ca) = &path_config.server_ca_cert_path { + let server_root_ca_cert = + std::fs::read_to_string(server_ca).context(InvalidConfigFilePathSnafu)?; + let server_root_ca_cert = Certificate::from_pem(server_root_ca_cert); + tls_config = tls_config.ca_certificate(server_root_ca_cert); + } + + if let (Some(client_cert_path), Some(client_key_path)) = + (&path_config.client_cert_path, &path_config.client_key_path) + { + let client_cert = + std::fs::read_to_string(client_cert_path).context(InvalidConfigFilePathSnafu)?; + let client_key = + std::fs::read_to_string(client_key_path).context(InvalidConfigFilePathSnafu)?; + let client_identity = Identity::from_pem(client_cert, 
client_key); + tls_config = tls_config.identity(client_identity); + } + Ok(Some(tls_config)) +} + +impl TlsConfigLoader for ClientTlsOption { + type Error = crate::error::Error; + + fn load(&self) -> Result> { + load_tls_config(Some(self)) + } + + fn watch_paths(&self) -> Vec<&Path> { + let mut paths = Vec::new(); + if let Some(cert_path) = &self.client_cert_path { + paths.push(Path::new(cert_path.as_str())); + } + if let Some(key_path) = &self.client_key_path { + paths.push(Path::new(key_path.as_str())); + } + if let Some(ca_path) = &self.server_ca_cert_path { + paths.push(Path::new(ca_path.as_str())); + } + paths + } + + fn watch_enabled(&self) -> bool { + self.enabled && self.watch + } +} + +/// Type alias for client-side reloadable TLS config +pub type ReloadableClientTlsConfig = ReloadableTlsConfig; + +/// Load client TLS configuration from `ClientTlsOption` and return a `ReloadableClientTlsConfig`. +/// This is the primary way to create TLS configuration for the ChannelManager. +pub fn load_client_tls_config( + tls_option: Option, +) -> Result>> { + match tls_option { + Some(option) if option.enabled => { + let reloadable = ReloadableClientTlsConfig::try_new(option)?; + Ok(Some(Arc::new(reloadable))) + } + _ => Ok(None), + } +} + +pub fn maybe_watch_client_tls_config( + client_tls_config: Arc, + channel_manager: ChannelManager, +) -> Result<()> { + maybe_watch_tls_config(client_tls_config, move || { + // Clear all existing channels to force reconnection with new certificates + channel_manager.clear_all_channels(); + info!("Cleared all existing channels to use new TLS certificates."); + }) +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct ClientTlsOption { /// Whether to enable TLS for client. pub enabled: bool, pub server_ca_cert_path: Option, pub client_cert_path: Option, pub client_key_path: Option, + #[serde(default)] + pub watch: bool, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -613,6 +677,7 @@ mod tests { server_ca_cert_path: Some("some_server_path".to_string()), client_cert_path: Some("some_cert_path".to_string()), client_key_path: Some("some_key_path".to_string()), + watch: false, }); assert_eq!( @@ -634,6 +699,7 @@ mod tests { server_ca_cert_path: Some("some_server_path".to_string()), client_cert_path: Some("some_cert_path".to_string()), client_key_path: Some("some_key_path".to_string()), + watch: false, }), max_recv_message_size: DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE, max_send_message_size: DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE, @@ -659,7 +725,7 @@ mod tests { .http2_adaptive_window(true) .tcp_keepalive(Duration::from_secs(2)) .tcp_nodelay(true); - let mgr = ChannelManager::with_config(config); + let mgr = ChannelManager::with_config(config, None); let res = mgr.build_endpoint("test_addr"); diff --git a/src/common/grpc/src/error.rs b/src/common/grpc/src/error.rs index 147ff70c07..4f9b8e92dd 100644 --- a/src/common/grpc/src/error.rs +++ b/src/common/grpc/src/error.rs @@ -38,6 +38,15 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to watch config file path: {}", path))] + FileWatch { + path: String, + #[snafu(source)] + error: notify::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Write type mismatch, column name: {}, expected: {}, actual: {}", column_name, @@ -108,6 +117,7 @@ impl ErrorExt for Error { match self { Error::InvalidTlsConfig { .. } | Error::InvalidConfigFilePath { .. } + | Error::FileWatch { .. } | Error::TypeMismatch { .. } | Error::InvalidFlightData { .. 
} | Error::NotSupported { .. } => StatusCode::InvalidArguments, diff --git a/src/common/grpc/src/lib.rs b/src/common/grpc/src/lib.rs index 287644b529..8527dd079b 100644 --- a/src/common/grpc/src/lib.rs +++ b/src/common/grpc/src/lib.rs @@ -16,6 +16,7 @@ pub mod channel_manager; pub mod error; pub mod flight; pub mod precision; +pub mod reloadable_tls; pub mod select; pub use arrow_flight::FlightData; diff --git a/src/common/grpc/src/reloadable_tls.rs b/src/common/grpc/src/reloadable_tls.rs new file mode 100644 index 0000000000..c1bd3aca52 --- /dev/null +++ b/src/common/grpc/src/reloadable_tls.rs @@ -0,0 +1,163 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::Path; +use std::result::Result as StdResult; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::mpsc::channel; +use std::sync::{Arc, RwLock}; + +use common_telemetry::{error, info}; +use notify::{EventKind, RecursiveMode, Watcher}; +use snafu::ResultExt; + +use crate::error::{FileWatchSnafu, Result}; + +/// A trait for loading TLS configuration from an option type +pub trait TlsConfigLoader { + type Error; + + /// Load the TLS configuration + fn load(&self) -> StdResult, Self::Error>; + + /// Get paths to certificate files for watching + fn watch_paths(&self) -> Vec<&Path>; + + /// Check if watching is enabled + fn watch_enabled(&self) -> bool; +} + +/// A mutable container for TLS config +/// +/// This struct allows dynamic reloading of certificates and keys. +/// It's generic over the config type (e.g., ServerConfig, ClientTlsConfig) +/// and the option type (e.g., TlsOption, ClientTlsOption). +#[derive(Debug)] +pub struct ReloadableTlsConfig +where + O: TlsConfigLoader, +{ + tls_option: O, + config: RwLock>, + version: AtomicUsize, +} + +impl ReloadableTlsConfig +where + O: TlsConfigLoader, +{ + /// Create config by loading configuration from the option type + pub fn try_new(tls_option: O) -> StdResult { + let config = tls_option.load()?; + Ok(Self { + tls_option, + config: RwLock::new(config), + version: AtomicUsize::new(0), + }) + } + + /// Reread certificates and keys from file system. + pub fn reload(&self) -> StdResult<(), O::Error> { + let config = self.tls_option.load()?; + *self.config.write().unwrap() = config; + self.version.fetch_add(1, Ordering::Relaxed); + Ok(()) + } + + /// Get the config held by this container + pub fn get_config(&self) -> Option + where + T: Clone, + { + self.config.read().unwrap().clone() + } + + /// Get associated option + pub fn get_tls_option(&self) -> &O { + &self.tls_option + } + + /// Get version of current config + /// + /// this version will auto increase when config get reloaded. + pub fn get_version(&self) -> usize { + self.version.load(Ordering::Relaxed) + } +} + +/// Watch TLS configuration files for changes and reload automatically +/// +/// This is a generic function that works with any ReloadableTlsConfig. +/// When changes are detected, it calls the provided callback after reloading. 
+/// +/// T: the original TLS config +/// O: the compiled TLS option +/// F: the hook function to be called after reloading +/// E: the error type for the loading operation +pub fn maybe_watch_tls_config( + tls_config: Arc>, + on_reload: F, +) -> Result<()> +where + T: Send + Sync + 'static, + O: TlsConfigLoader + Send + Sync + 'static, + E: std::error::Error + Send + Sync + 'static, + F: Fn() + Send + 'static, +{ + if !tls_config.get_tls_option().watch_enabled() { + return Ok(()); + } + + let tls_config_for_watcher = tls_config.clone(); + + let (tx, rx) = channel::>(); + let mut watcher = notify::recommended_watcher(tx).context(FileWatchSnafu { path: "" })?; + + // Watch all paths returned by the TlsConfigLoader + for path in tls_config.get_tls_option().watch_paths() { + watcher + .watch(path, RecursiveMode::NonRecursive) + .with_context(|_| FileWatchSnafu { + path: path.display().to_string(), + })?; + } + + info!("Spawning background task for watching TLS cert/key file changes"); + std::thread::spawn(move || { + let _watcher = watcher; + loop { + match rx.recv() { + Ok(Ok(event)) => { + if let EventKind::Modify(_) | EventKind::Create(_) = event.kind { + info!("Detected TLS cert/key file change: {:?}", event); + if let Err(err) = tls_config_for_watcher.reload() { + error!("Failed to reload TLS config: {}", err); + } else { + info!("Reloaded TLS cert/key file successfully."); + on_reload(); + } + } + } + Ok(Err(err)) => { + error!("Failed to watch TLS cert/key file: {}", err); + } + Err(err) => { + error!("TLS cert/key file watcher channel closed: {}", err); + } + } + } + }); + + Ok(()) +} diff --git a/src/common/grpc/tests/mod.rs b/src/common/grpc/tests/mod.rs index d119f22836..93188e35fc 100644 --- a/src/common/grpc/tests/mod.rs +++ b/src/common/grpc/tests/mod.rs @@ -12,14 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_grpc::channel_manager::{ChannelConfig, ChannelManager, ClientTlsOption}; +use common_grpc::channel_manager::{ + ChannelConfig, ChannelManager, ClientTlsOption, load_client_tls_config, + maybe_watch_client_tls_config, +}; #[tokio::test] async fn test_mtls_config() { // test no config let config = ChannelConfig::new(); - let re = ChannelManager::with_tls_config(config); - assert!(re.is_err()); + let re = load_client_tls_config(config.client_tls.clone()); + assert!(re.is_ok()); + assert!(re.unwrap().is_none()); // test wrong file let config = ChannelConfig::new().client_tls_config(ClientTlsOption { @@ -27,9 +31,10 @@ async fn test_mtls_config() { server_ca_cert_path: Some("tests/tls/wrong_ca.pem".to_string()), client_cert_path: Some("tests/tls/wrong_client.pem".to_string()), client_key_path: Some("tests/tls/wrong_client.key".to_string()), + watch: false, }); - let re = ChannelManager::with_tls_config(config); + let re = load_client_tls_config(config.client_tls.clone()); assert!(re.is_err()); // test corrupted file content @@ -38,9 +43,12 @@ async fn test_mtls_config() { server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), client_cert_path: Some("tests/tls/client.pem".to_string()), client_key_path: Some("tests/tls/corrupted".to_string()), + watch: false, }); - let re = ChannelManager::with_tls_config(config).unwrap(); + let tls_config = load_client_tls_config(config.client_tls.clone()).unwrap(); + let re = ChannelManager::with_config(config, tls_config); + let re = re.get("127.0.0.1:0"); assert!(re.is_err()); @@ -50,9 +58,112 @@ async fn test_mtls_config() { server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), client_cert_path: Some("tests/tls/client.pem".to_string()), client_key_path: Some("tests/tls/client.key".to_string()), + watch: false, }); - let re = ChannelManager::with_tls_config(config).unwrap(); + let tls_config = load_client_tls_config(config.client_tls.clone()).unwrap(); + let re = ChannelManager::with_config(config, tls_config); let re = re.get("127.0.0.1:0"); let _ = re.unwrap(); } + +#[tokio::test] +async fn test_reloadable_client_tls_config() { + common_telemetry::init_default_ut_logging(); + + let dir = tempfile::tempdir().unwrap(); + let cert_path = dir.path().join("client.pem"); + let key_path = dir.path().join("client.key"); + + std::fs::copy("tests/tls/client.pem", &cert_path).expect("failed to copy cert to tmpdir"); + std::fs::copy("tests/tls/client.key", &key_path).expect("failed to copy key to tmpdir"); + + assert!(std::fs::exists(&cert_path).unwrap()); + assert!(std::fs::exists(&key_path).unwrap()); + + let client_tls_option = ClientTlsOption { + enabled: true, + server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), + client_cert_path: Some( + cert_path + .clone() + .into_os_string() + .into_string() + .expect("failed to convert path to string"), + ), + client_key_path: Some( + key_path + .clone() + .into_os_string() + .into_string() + .expect("failed to convert path to string"), + ), + watch: true, + }; + + let reloadable_config = load_client_tls_config(Some(client_tls_option)) + .expect("failed to load tls config") + .expect("tls config should be present"); + + let config = ChannelConfig::new(); + let manager = ChannelManager::with_config(config, Some(reloadable_config.clone())); + + maybe_watch_client_tls_config(reloadable_config.clone(), manager.clone()) + .expect("failed to watch client config"); + + assert_eq!(0, reloadable_config.get_version()); + assert!(reloadable_config.get_config().is_some()); + + // Create a channel to verify 
it gets cleared on reload + let _ = manager.get("127.0.0.1:0").expect("failed to get channel"); + + // Simulate file change by copying a different key file + let tmp_file = key_path.with_extension("tmp"); + std::fs::copy("tests/tls/server.key", &tmp_file).expect("Failed to copy temp key file"); + std::fs::rename(&tmp_file, &key_path).expect("Failed to rename temp key file"); + + const MAX_RETRIES: usize = 30; + let mut retries = 0; + let mut version_updated = false; + + while retries < MAX_RETRIES { + if reloadable_config.get_version() > 0 { + version_updated = true; + break; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retries += 1; + } + + assert!(version_updated, "TLS config did not reload in time"); + assert!(reloadable_config.get_version() > 0); + assert!(reloadable_config.get_config().is_some()); +} + +#[tokio::test] +async fn test_channel_manager_with_reloadable_tls() { + common_telemetry::init_default_ut_logging(); + + let client_tls_option = ClientTlsOption { + enabled: true, + server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), + client_cert_path: Some("tests/tls/client.pem".to_string()), + client_key_path: Some("tests/tls/client.key".to_string()), + watch: false, + }; + + let reloadable_config = load_client_tls_config(Some(client_tls_option)) + .expect("failed to load tls config") + .expect("tls config should be present"); + + let config = ChannelConfig::new(); + let manager = ChannelManager::with_config(config, Some(reloadable_config.clone())); + + // Test that we can get a channel + let channel = manager.get("127.0.0.1:0"); + assert!(channel.is_ok()); + + // Test that config is properly set + assert_eq!(0, reloadable_config.get_version()); + assert!(reloadable_config.get_config().is_some()); +} diff --git a/src/common/macro/src/row/schema.rs b/src/common/macro/src/row/schema.rs index 67848a36a0..82296655f9 100644 --- a/src/common/macro/src/row/schema.rs +++ b/src/common/macro/src/row/schema.rs @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use greptime_proto::v1::ColumnDataTypeExtension; use greptime_proto::v1::column_data_type_extension::TypeExt; -use proc_macro2::TokenStream as TokenStream2; +use proc_macro2::{Span, TokenStream as TokenStream2}; use quote::quote; use syn::spanned::Spanned; use syn::{DeriveInput, Result}; @@ -69,57 +70,7 @@ fn impl_schema_method(fields: &[ParsedField<'_>]) -> Result { let semantic_type_val = convert_semantic_type_to_proto_semantic_type(column_attribute.semantic_type) as i32; let semantic_type = syn::LitInt::new(&semantic_type_val.to_string(), ident.span()); let extension = match extension { - Some(ext) => { - match ext.type_ext { - Some(TypeExt::DecimalType(ext)) => { - let precision = syn::LitInt::new(&ext.precision.to_string(), ident.span()); - let scale = syn::LitInt::new(&ext.scale.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::DecimalType(DecimalTypeExtension { precision: #precision, scale: #scale })) }) - } - } - Some(TypeExt::JsonType(ext)) => { - let json_type = syn::LitInt::new(&ext.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonType(#json_type)) }) - } - } - Some(TypeExt::VectorType(ext)) => { - let dim = syn::LitInt::new(&ext.dim.to_string(), ident.span()); - quote! 
{ - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: #dim })) }) - } - } - // TODO(sunng87): revisit all these implementations - Some(TypeExt::ListType(ext)) => { - let item_type = syn::Ident::new(&ext.datatype.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::ListType(ListTypeExtension { item_type: #item_type })) }) - } - } - Some(TypeExt::StructType(ext)) => { - let fields = ext.fields.iter().map(|field| { - let field_name = syn::Ident::new(&field.name.clone(), ident.span()); - let field_type = syn::Ident::new(&field.datatype.to_string(), ident.span()); - quote! { - StructField { name: #field_name, type_: #field_type } - } - }).collect::>(); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::StructType(StructTypeExtension { fields: [#(#fields),*] })) }) - } - } - Some(TypeExt::JsonNativeType(ext)) => { - let inner = syn::Ident::new(&ext.datatype.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonNativeType(JsonNativeTypeExtension { datatype: #inner })) }) - } - } - None => { - quote! { None } - } - } - } + Some(ext) => column_data_type_extension_to_tokens(&ext, ident.span()), None => quote! { None }, }; @@ -141,3 +92,125 @@ fn impl_schema_method(fields: &[ParsedField<'_>]) -> Result { } }) } + +fn column_data_type_extension_to_tokens( + extension: &ColumnDataTypeExtension, + span: Span, +) -> TokenStream2 { + match extension.type_ext.as_ref() { + Some(TypeExt::DecimalType(ext)) => { + let precision = syn::LitInt::new(&ext.precision.to_string(), span); + let scale = syn::LitInt::new(&ext.scale.to_string(), span); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DecimalType(DecimalTypeExtension { + precision: #precision, + scale: #scale, + })), + }) + } + } + Some(TypeExt::JsonType(ext)) => { + let json_type = syn::LitInt::new(&ext.to_string(), span); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonType(#json_type)), + }) + } + } + Some(TypeExt::VectorType(ext)) => { + let dim = syn::LitInt::new(&ext.dim.to_string(), span); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: #dim })), + }) + } + } + Some(TypeExt::ListType(ext)) => { + let datatype = syn::LitInt::new(&ext.datatype.to_string(), span); + let datatype_extension = ext + .datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::ListType(Box::new(ListTypeExtension { + datatype: #datatype, + datatype_extension: #datatype_extension, + }))), + }) + } + } + Some(TypeExt::StructType(ext)) => { + let fields = ext.fields.iter().map(|field| { + let field_name = &field.name; + let datatype = syn::LitInt::new(&field.datatype.to_string(), span); + let datatype_extension = field + .datatype_extension + .as_ref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + greptime_proto::v1::StructField { + name: #field_name.to_string(), + datatype: #datatype, + datatype_extension: #datatype_extension, + } + } + }); + quote! 
{ + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::StructType(StructTypeExtension { + fields: vec![#(#fields),*], + })), + }) + } + } + Some(TypeExt::JsonNativeType(ext)) => { + let inner = syn::LitInt::new(&ext.datatype.to_string(), span); + let datatype_extension = ext + .datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonNativeType(Box::new( + JsonNativeTypeExtension { + datatype: #inner, + datatype_extension: #datatype_extension, + }, + ))), + }) + } + } + Some(TypeExt::DictionaryType(ext)) => { + let key_datatype = syn::LitInt::new(&ext.key_datatype.to_string(), span); + let value_datatype = syn::LitInt::new(&ext.value_datatype.to_string(), span); + let key_datatype_extension = ext + .key_datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + let value_datatype_extension = ext + .value_datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DictionaryType(Box::new( + DictionaryTypeExtension { + key_datatype: #key_datatype, + key_datatype_extension: #key_datatype_extension, + value_datatype: #value_datatype, + value_datatype_extension: #value_datatype_extension, + }, + ))), + }) + } + } + None => quote! { None }, + } +} diff --git a/src/common/macro/src/row/utils.rs b/src/common/macro/src/row/utils.rs index 40f990a40a..1768b2747a 100644 --- a/src/common/macro/src/row/utils.rs +++ b/src/common/macro/src/row/utils.rs @@ -309,5 +309,8 @@ pub(crate) fn convert_column_data_type_to_value_data_ident( ColumnDataType::Vector => format_ident!("VectorValue"), ColumnDataType::List => format_ident!("ListValue"), ColumnDataType::Struct => format_ident!("StructValue"), + ColumnDataType::Dictionary => { + panic!("Dictionary data type is not supported in row macros yet") + } } } diff --git a/src/common/mem-prof/src/jemalloc.rs b/src/common/mem-prof/src/jemalloc.rs index 05966b4754..a9359dad41 100644 --- a/src/common/mem-prof/src/jemalloc.rs +++ b/src/common/mem-prof/src/jemalloc.rs @@ -32,6 +32,7 @@ use crate::error::{FlamegraphSnafu, ParseJeHeapSnafu, Result}; const PROF_DUMP: &[u8] = b"prof.dump\0"; const OPT_PROF: &[u8] = b"opt.prof\0"; const PROF_ACTIVE: &[u8] = b"prof.active\0"; +const PROF_GDUMP: &[u8] = b"prof.gdump\0"; pub async fn dump_profile() -> Result> { ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu); @@ -119,3 +120,16 @@ fn is_prof_enabled() -> Result { // safety: OPT_PROF variable, if present, is always a boolean value. Ok(unsafe { tikv_jemalloc_ctl::raw::read::(OPT_PROF).context(ReadOptProfSnafu)? }) } + +pub fn set_gdump_active(active: bool) -> Result<()> { + ensure!(is_prof_enabled()?, ProfilingNotEnabledSnafu); + unsafe { + tikv_jemalloc_ctl::raw::update(PROF_GDUMP, active).context(error::UpdateGdumpSnafu)?; + } + Ok(()) +} + +pub fn is_gdump_active() -> Result { + // safety: PROF_GDUMP, if present, is a boolean value. + unsafe { Ok(tikv_jemalloc_ctl::raw::read::(PROF_GDUMP).context(error::ReadGdumpSnafu)?) 
} +} diff --git a/src/common/mem-prof/src/jemalloc/error.rs b/src/common/mem-prof/src/jemalloc/error.rs index 1787e97a7d..79e4b8f9a6 100644 --- a/src/common/mem-prof/src/jemalloc/error.rs +++ b/src/common/mem-prof/src/jemalloc/error.rs @@ -71,6 +71,18 @@ pub enum Error { #[snafu(source)] error: tikv_jemalloc_ctl::Error, }, + + #[snafu(display("Failed to read jemalloc gdump flag"))] + ReadGdump { + #[snafu(source)] + error: tikv_jemalloc_ctl::Error, + }, + + #[snafu(display("Failed to update jemalloc gdump flag"))] + UpdateGdump { + #[snafu(source)] + error: tikv_jemalloc_ctl::Error, + }, } impl ErrorExt for Error { @@ -84,6 +96,8 @@ impl ErrorExt for Error { Error::ActivateProf { .. } => StatusCode::Internal, Error::DeactivateProf { .. } => StatusCode::Internal, Error::ReadProfActive { .. } => StatusCode::Internal, + Error::ReadGdump { .. } => StatusCode::Internal, + Error::UpdateGdump { .. } => StatusCode::Internal, } } diff --git a/src/common/mem-prof/src/lib.rs b/src/common/mem-prof/src/lib.rs index 3fa6273f6e..9ff67e7277 100644 --- a/src/common/mem-prof/src/lib.rs +++ b/src/common/mem-prof/src/lib.rs @@ -19,7 +19,7 @@ mod jemalloc; #[cfg(not(windows))] pub use jemalloc::{ activate_heap_profile, deactivate_heap_profile, dump_flamegraph, dump_pprof, dump_profile, - is_heap_profile_active, + is_gdump_active, is_heap_profile_active, set_gdump_active, }; #[cfg(windows)] @@ -51,3 +51,13 @@ pub fn deactivate_heap_profile() -> error::Result<()> { pub fn is_heap_profile_active() -> error::Result { error::ProfilingNotSupportedSnafu.fail() } + +#[cfg(windows)] +pub fn is_gdump_active() -> error::Result { + error::ProfilingNotSupportedSnafu.fail() +} + +#[cfg(windows)] +pub fn set_gdump_active(_: bool) -> error::Result<()> { + error::ProfilingNotSupportedSnafu.fail() +} diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index 3ea7627f9c..7438a237b5 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -77,7 +77,10 @@ serde_json.workspace = true serde_with.workspace = true session.workspace = true snafu.workspace = true -sqlx = { workspace = true, optional = true } +sqlx = { workspace = true, features = [ + "mysql", + "chrono", +], optional = true } store-api.workspace = true strum.workspace = true table = { workspace = true, features = ["testing"] } diff --git a/src/common/meta/src/cluster.rs b/src/common/meta/src/cluster.rs index 63001970b6..74485513e9 100644 --- a/src/common/meta/src/cluster.rs +++ b/src/common/meta/src/cluster.rs @@ -120,10 +120,16 @@ pub struct NodeInfo { pub start_time_ms: u64, // The node build cpus #[serde(default)] - pub cpus: u32, + pub total_cpu_millicores: i64, // The node build memory bytes #[serde(default)] - pub memory_bytes: u64, + pub total_memory_bytes: i64, + // The node build cpu usage millicores + #[serde(default)] + pub cpu_usage_millicores: i64, + // The node build memory usage bytes + #[serde(default)] + pub memory_usage_bytes: i64, // The node build hostname #[serde(default)] pub hostname: String, @@ -333,8 +339,10 @@ mod tests { version: "".to_string(), git_commit: "".to_string(), start_time_ms: 1, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; diff --git a/src/common/meta/src/datanode.rs b/src/common/meta/src/datanode.rs index 2083b5886b..8b521d8e43 100644 --- a/src/common/meta/src/datanode.rs +++ b/src/common/meta/src/datanode.rs @@ -25,8 +25,7 @@ use 
store_api::region_engine::{RegionRole, RegionStatistic}; use store_api::storage::RegionId; use table::metadata::TableId; -use crate::error; -use crate::error::Result; +use crate::error::{self, DeserializeFromJsonSnafu, Result}; use crate::heartbeat::utils::get_datanode_workloads; const DATANODE_STAT_PREFIX: &str = "__meta_datanode_stat"; @@ -66,10 +65,12 @@ pub struct Stat { pub node_epoch: u64, /// The datanode workloads. pub datanode_workloads: DatanodeWorkloads, + /// The GC statistics of the datanode. + pub gc_stat: Option, } /// The statistics of a region. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct RegionStat { /// The region_id. pub id: RegionId, @@ -126,11 +127,13 @@ pub trait TopicStatsReporter: Send + Sync { fn reportable_topics(&mut self) -> Vec; } -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] pub enum RegionManifestInfo { Mito { manifest_version: u64, flushed_entry_id: u64, + /// Number of files removed in the manifest's `removed_files` field. + file_removed_cnt: u64, }, Metric { data_manifest_version: u64, @@ -222,11 +225,12 @@ impl TryFrom<&HeartbeatRequest> for Stat { node_epoch, node_workloads, topic_stats, + extensions, .. } = value; match (header, peer) { - (Some(_header), Some(peer)) => { + (Some(header), Some(peer)) => { let region_stats = region_stats .iter() .map(RegionStat::from) @@ -234,6 +238,14 @@ impl TryFrom<&HeartbeatRequest> for Stat { let topic_stats = topic_stats.iter().map(TopicStat::from).collect::>(); let datanode_workloads = get_datanode_workloads(node_workloads.as_ref()); + + let gc_stat = GcStat::from_extensions(extensions).map_err(|err| { + common_telemetry::error!( + "Failed to deserialize GcStat from extensions: {}", + err + ); + header.clone() + })?; Ok(Self { timestamp_millis: time_util::current_time_millis(), // datanode id @@ -247,6 +259,7 @@ impl TryFrom<&HeartbeatRequest> for Stat { topic_stats, node_epoch: *node_epoch, datanode_workloads, + gc_stat, }) } (header, _) => Err(header.clone()), @@ -260,9 +273,11 @@ impl From for RegionManifestInfo { store_api::region_engine::RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt, } => RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt, }, store_api::region_engine::RegionManifestInfo::Metric { data_manifest_version, @@ -319,6 +334,43 @@ impl From<&api::v1::meta::TopicStat> for TopicStat { } } +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct GcStat { + /// Number of GC tasks currently running on the datanode. + pub running_gc_tasks: u32, + /// The maximum number of concurrent GC tasks the datanode can handle. 
+ pub gc_concurrency: u32, +} + +impl GcStat { + pub const GC_STAT_KEY: &str = "__gc_stat"; + + pub fn new(running_gc_tasks: u32, gc_concurrency: u32) -> Self { + Self { + running_gc_tasks, + gc_concurrency, + } + } + + pub fn into_extensions(&self, extensions: &mut std::collections::HashMap>) { + let bytes = serde_json::to_vec(self).unwrap_or_default(); + extensions.insert(Self::GC_STAT_KEY.to_string(), bytes); + } + + pub fn from_extensions( + extensions: &std::collections::HashMap>, + ) -> Result> { + extensions + .get(Self::GC_STAT_KEY) + .map(|bytes| { + serde_json::from_slice(bytes).with_context(|_| DeserializeFromJsonSnafu { + input: String::from_utf8_lossy(bytes).to_string(), + }) + }) + .transpose() + } +} + /// The key of the datanode stat in the memory store. /// /// The format is `__meta_datanode_stat-0-{node_id}`. diff --git a/src/common/meta/src/ddl/alter_database.rs b/src/common/meta/src/ddl/alter_database.rs index 6e199cb92a..736459e533 100644 --- a/src/common/meta/src/ddl/alter_database.rs +++ b/src/common/meta/src/ddl/alter_database.rs @@ -47,6 +47,9 @@ fn build_new_schema_value( SetDatabaseOption::Ttl(ttl) => { value.ttl = Some(*ttl); } + SetDatabaseOption::Other(key, val) => { + value.extra_options.insert(key.clone(), val.clone()); + } } } } @@ -54,6 +57,9 @@ fn build_new_schema_value( for key in keys.0.iter() { match key { UnsetDatabaseOption::Ttl => value.ttl = None, + UnsetDatabaseOption::Other(key) => { + value.extra_options.remove(key); + } } } } @@ -234,4 +240,41 @@ mod tests { build_new_schema_value(current_schema_value, &unset_ttl_alter_kind).unwrap(); assert_eq!(new_schema_value.ttl, None); } + + #[test] + fn test_build_new_schema_value_with_compaction_options() { + let set_compaction = AlterDatabaseKind::SetDatabaseOptions(SetDatabaseOptions(vec![ + SetDatabaseOption::Other("compaction.type".to_string(), "twcs".to_string()), + SetDatabaseOption::Other("compaction.twcs.time_window".to_string(), "1d".to_string()), + ])); + + let current_schema_value = SchemaNameValue::default(); + let new_schema_value = + build_new_schema_value(current_schema_value.clone(), &set_compaction).unwrap(); + + assert_eq!( + new_schema_value.extra_options.get("compaction.type"), + Some(&"twcs".to_string()) + ); + assert_eq!( + new_schema_value + .extra_options + .get("compaction.twcs.time_window"), + Some(&"1d".to_string()) + ); + + let unset_compaction = AlterDatabaseKind::UnsetDatabaseOptions(UnsetDatabaseOptions(vec![ + UnsetDatabaseOption::Other("compaction.type".to_string()), + ])); + + let new_schema_value = build_new_schema_value(new_schema_value, &unset_compaction).unwrap(); + + assert_eq!(new_schema_value.extra_options.get("compaction.type"), None); + assert_eq!( + new_schema_value + .extra_options + .get("compaction.twcs.time_window"), + Some(&"1d".to_string()) + ); + } } diff --git a/src/common/meta/src/ddl/tests/alter_table.rs b/src/common/meta/src/ddl/tests/alter_table.rs index e16a85b403..a9ba4a0aa8 100644 --- a/src/common/meta/src/ddl/tests/alter_table.rs +++ b/src/common/meta/src/ddl/tests/alter_table.rs @@ -182,7 +182,7 @@ fn alter_request_handler(_peer: Peer, request: RegionRequest) -> Result Result>> { - let schemas = results + let mut schemas = results .iter_mut() .map(|r| r.extensions.remove(key)) .collect::>(); @@ -454,20 +454,24 @@ pub fn extract_column_metadatas( // Verify all the physical schemas are the same // Safety: previous check ensures this vec is not empty - let first = schemas.first().unwrap(); - ensure!( - schemas.iter().all(|x| x == first), - 
MetadataCorruptionSnafu { - err_msg: "The table column metadata schemas from datanodes are not the same." - } - ); + let first_column_metadatas = schemas + .swap_remove(0) + .map(|first_bytes| ColumnMetadata::decode_list(&first_bytes).context(DecodeJsonSnafu)) + .transpose()?; - if let Some(first) = first { - let column_metadatas = ColumnMetadata::decode_list(first).context(DecodeJsonSnafu)?; - Ok(Some(column_metadatas)) - } else { - Ok(None) + for s in schemas { + // check decoded column metadata instead of bytes because it contains extension map. + let column_metadata = s + .map(|bytes| ColumnMetadata::decode_list(&bytes).context(DecodeJsonSnafu)) + .transpose()?; + ensure!( + column_metadata == first_column_metadatas, + MetadataCorruptionSnafu { + err_msg: "The table column metadata schemas from datanodes are not the same." + } + ); } + Ok(first_column_metadatas) } #[cfg(test)] diff --git a/src/common/meta/src/ddl_manager.rs b/src/common/meta/src/ddl_manager.rs index 9ade13052d..56cee9697b 100644 --- a/src/common/meta/src/ddl_manager.rs +++ b/src/common/meta/src/ddl_manager.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use common_error::ext::BoxedError; use common_procedure::{ BoxedProcedureLoader, Output, ProcedureId, ProcedureManagerRef, ProcedureWithId, watcher, }; @@ -66,6 +67,19 @@ use crate::rpc::ddl::{ }; use crate::rpc::router::RegionRoute; +/// A configurator that customizes or enhances a [`DdlManager`]. +#[async_trait::async_trait] +pub trait DdlManagerConfigurator: Send + Sync { + /// Configures the given [`DdlManager`] using the provided [`DdlManagerConfigureContext`]. + async fn configure( + &self, + ddl_manager: DdlManager, + ctx: C, + ) -> std::result::Result; +} + +pub type DdlManagerConfiguratorRef = Arc>; + pub type DdlManagerRef = Arc; pub type BoxedProcedureLoaderFactory = dyn Fn(DdlContext) -> BoxedProcedureLoader; @@ -148,11 +162,8 @@ impl DdlManager { } #[cfg(feature = "enterprise")] - pub fn with_trigger_ddl_manager( - mut self, - trigger_ddl_manager: Option, - ) -> Self { - self.trigger_ddl_manager = trigger_ddl_manager; + pub fn with_trigger_ddl_manager(mut self, trigger_ddl_manager: TriggerDdlManagerRef) -> Self { + self.trigger_ddl_manager = Some(trigger_ddl_manager); self } diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs index 9a9d955f58..c731c90490 100644 --- a/src/common/meta/src/instruction.rs +++ b/src/common/meta/src/instruction.rs @@ -17,7 +17,7 @@ use std::fmt::{Display, Formatter}; use std::time::Duration; use serde::{Deserialize, Deserializer, Serialize}; -use store_api::storage::{RegionId, RegionNumber}; +use store_api::storage::{FileRefsManifest, GcReport, RegionId, RegionNumber}; use strum::Display; use table::metadata::TableId; use table::table_name::TableName; @@ -55,6 +55,10 @@ impl Display for RegionIdent { /// The result of downgrade leader region. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] pub struct DowngradeRegionReply { + /// The [RegionId]. + /// For compatibility, it is defaulted to [RegionId::new(0, 0)]. + #[serde(default)] + pub region_id: RegionId, /// Returns the `last_entry_id` if available. pub last_entry_id: Option, /// Returns the `metadata_last_entry_id` if available (Only available for metric engine). @@ -246,7 +250,7 @@ pub struct UpgradeRegion { /// `None` stands for no wait, /// it's helpful to verify whether the leader region is ready. #[serde(with = "humantime_serde")] - pub replay_timeout: Option, + pub replay_timeout: Duration, /// The hint for replaying memtable. 
#[serde(default)] pub location_id: Option, @@ -335,6 +339,16 @@ pub struct FlushRegions { pub error_strategy: FlushErrorStrategy, } +impl Display for FlushRegions { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "FlushRegions(region_ids={:?}, strategy={:?}, error_strategy={:?})", + self.region_ids, self.strategy, self.error_strategy + ) + } +} + impl FlushRegions { /// Create synchronous single-region flush pub fn sync_single(region_id: RegionId) -> Self { @@ -413,6 +427,93 @@ where }) } +/// Instruction to get file references for specified regions. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GetFileRefs { + /// List of region IDs to get file references from active FileHandles (in-memory). + pub query_regions: Vec, + /// Mapping from the source region ID (where to read the manifest) to + /// the target region IDs (whose file references to look for). + /// Key: The region ID of the manifest. + /// Value: The list of region IDs to find references for in that manifest. + pub related_regions: HashMap>, +} + +impl Display for GetFileRefs { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "GetFileRefs(region_ids={:?})", self.query_regions) + } +} + +/// Instruction to trigger garbage collection for a region. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GcRegions { + /// The region ID to perform GC on, only regions that are currently on the given datanode can be garbage collected, regions not on the datanode will report errors. + pub regions: Vec, + /// The file references manifest containing temporary file references. + pub file_refs_manifest: FileRefsManifest, + /// Whether to perform a full file listing to find orphan files. + pub full_file_listing: bool, +} + +impl Display for GcRegions { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "GcRegion(regions={:?}, file_refs_count={}, full_file_listing={})", + self.regions, + self.file_refs_manifest.file_refs.len(), + self.full_file_listing + ) + } +} + +/// Reply for GetFileRefs instruction. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GetFileRefsReply { + /// The file references manifest. + pub file_refs_manifest: FileRefsManifest, + /// Whether the operation was successful. + pub success: bool, + /// Error message if any. + pub error: Option, +} + +impl Display for GetFileRefsReply { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "GetFileRefsReply(success={}, file_refs_count={}, error={:?})", + self.success, + self.file_refs_manifest.file_refs.len(), + self.error + ) + } +} + +/// Reply for GC instruction. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GcRegionsReply { + pub result: Result, +} + +impl Display for GcRegionsReply { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "GcReply(result={})", + match &self.result { + Ok(report) => format!( + "GcReport(deleted_files_count={}, need_retry_regions_count={})", + report.deleted_files.len(), + report.need_retry_regions.len() + ), + Err(err) => format!("Err({})", err), + } + ) + } +} + #[derive(Debug, Clone, Serialize, Deserialize, Display, PartialEq)] pub enum Instruction { /// Opens regions. @@ -421,19 +522,88 @@ pub enum Instruction { /// Closes regions. #[serde(deserialize_with = "single_or_multiple_from", alias = "CloseRegion")] CloseRegions(Vec), - /// Upgrades a region. 
- UpgradeRegion(UpgradeRegion), - /// Downgrades a region. - DowngradeRegion(DowngradeRegion), + /// Upgrades regions. + #[serde(deserialize_with = "single_or_multiple_from", alias = "UpgradeRegion")] + UpgradeRegions(Vec), + #[serde( + deserialize_with = "single_or_multiple_from", + alias = "DowngradeRegion" + )] + /// Downgrades regions. + DowngradeRegions(Vec), /// Invalidates batch cache. InvalidateCaches(Vec), /// Flushes regions. FlushRegions(FlushRegions), + /// Gets file references for regions. + GetFileRefs(GetFileRefs), + /// Triggers garbage collection for a region. + GcRegions(GcRegions), +} + +impl Instruction { + /// Converts the instruction into a vector of [OpenRegion]. + pub fn into_open_regions(self) -> Option> { + match self { + Self::OpenRegions(open_regions) => Some(open_regions), + _ => None, + } + } + + /// Converts the instruction into a vector of [RegionIdent]. + pub fn into_close_regions(self) -> Option> { + match self { + Self::CloseRegions(close_regions) => Some(close_regions), + _ => None, + } + } + + /// Converts the instruction into a [FlushRegions]. + pub fn into_flush_regions(self) -> Option { + match self { + Self::FlushRegions(flush_regions) => Some(flush_regions), + _ => None, + } + } + + /// Converts the instruction into a [DowngradeRegion]. + pub fn into_downgrade_regions(self) -> Option> { + match self { + Self::DowngradeRegions(downgrade_region) => Some(downgrade_region), + _ => None, + } + } + + /// Converts the instruction into a [UpgradeRegion]. + pub fn into_upgrade_regions(self) -> Option> { + match self { + Self::UpgradeRegions(upgrade_region) => Some(upgrade_region), + _ => None, + } + } + + pub fn into_get_file_refs(self) -> Option { + match self { + Self::GetFileRefs(get_file_refs) => Some(get_file_refs), + _ => None, + } + } + + pub fn into_gc_regions(self) -> Option { + match self { + Self::GcRegions(gc_regions) => Some(gc_regions), + _ => None, + } + } } /// The reply of [UpgradeRegion]. #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] pub struct UpgradeRegionReply { + /// The [RegionId]. + /// For compatibility, it is defaulted to [RegionId::new(0, 0)]. + #[serde(default)] + pub region_id: RegionId, /// Returns true if `last_entry_id` has been replayed to the latest. pub ready: bool, /// Indicates whether the region exists. 
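The new `UpgradeRegions`/`DowngradeRegions` variants keep `deserialize_with = "single_or_multiple_from"` together with the legacy alias so that payloads from older nodes, which still carry a single object instead of a list, continue to parse. A minimal, self-contained sketch of that compatibility pattern is below; `Item` and `Message` are illustrative names rather than types from this patch, and it assumes `serde` (with the derive feature) and `serde_json`.

use serde::{Deserialize, Deserializer, Serialize};

#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct Item {
    id: u64,
}

// Untagged helper: tries the old single-object shape first, then the new list shape.
#[derive(Deserialize)]
#[serde(untagged)]
enum SingleOrMultiple {
    Single(Item),
    Multiple(Vec<Item>),
}

fn single_or_multiple<'de, D>(deserializer: D) -> Result<Vec<Item>, D::Error>
where
    D: Deserializer<'de>,
{
    Ok(match SingleOrMultiple::deserialize(deserializer)? {
        SingleOrMultiple::Single(item) => vec![item],
        SingleOrMultiple::Multiple(items) => items,
    })
}

#[derive(Debug, Serialize, Deserialize, PartialEq)]
enum Message {
    // New payloads serialize as a list; old single-object payloads still
    // deserialize through the helper and the legacy alias.
    #[serde(deserialize_with = "single_or_multiple", alias = "Legacy")]
    Items(Vec<Item>),
}

fn main() {
    let legacy: Message = serde_json::from_str(r#"{"Legacy":{"id":1}}"#).unwrap();
    let current: Message = serde_json::from_str(r#"{"Items":[{"id":1}]}"#).unwrap();
    assert_eq!(legacy, current);
}

Both forms normalize to a `Vec`, which is why the reply side mirrors the same trick with `UpgradeRegionsCompat`/`DowngradeRegionsCompat` further down.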
@@ -452,6 +622,72 @@ impl Display for UpgradeRegionReply { } } +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct DowngradeRegionsReply { + pub replies: Vec, +} + +impl DowngradeRegionsReply { + pub fn new(replies: Vec) -> Self { + Self { replies } + } + + pub fn single(reply: DowngradeRegionReply) -> Self { + Self::new(vec![reply]) + } +} + +#[derive(Deserialize)] +#[serde(untagged)] +enum DowngradeRegionsCompat { + Single(DowngradeRegionReply), + Multiple(DowngradeRegionsReply), +} + +fn downgrade_regions_compat_from<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let helper = DowngradeRegionsCompat::deserialize(deserializer)?; + Ok(match helper { + DowngradeRegionsCompat::Single(x) => DowngradeRegionsReply::new(vec![x]), + DowngradeRegionsCompat::Multiple(reply) => reply, + }) +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +pub struct UpgradeRegionsReply { + pub replies: Vec, +} + +impl UpgradeRegionsReply { + pub fn new(replies: Vec) -> Self { + Self { replies } + } + + pub fn single(reply: UpgradeRegionReply) -> Self { + Self::new(vec![reply]) + } +} + +#[derive(Deserialize)] +#[serde(untagged)] +enum UpgradeRegionsCompat { + Single(UpgradeRegionReply), + Multiple(UpgradeRegionsReply), +} + +fn upgrade_regions_compat_from<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let helper = UpgradeRegionsCompat::deserialize(deserializer)?; + Ok(match helper { + UpgradeRegionsCompat::Single(x) => UpgradeRegionsReply::new(vec![x]), + UpgradeRegionsCompat::Multiple(reply) => reply, + }) +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[serde(tag = "type", rename_all = "snake_case")] pub enum InstructionReply { @@ -459,9 +695,19 @@ pub enum InstructionReply { OpenRegions(SimpleReply), #[serde(alias = "close_region")] CloseRegions(SimpleReply), - UpgradeRegion(UpgradeRegionReply), - DowngradeRegion(DowngradeRegionReply), + #[serde( + deserialize_with = "upgrade_regions_compat_from", + alias = "upgrade_region" + )] + UpgradeRegions(UpgradeRegionsReply), + #[serde( + alias = "downgrade_region", + deserialize_with = "downgrade_regions_compat_from" + )] + DowngradeRegions(DowngradeRegionsReply), FlushRegions(FlushRegionReply), + GetFileRefs(GetFileRefsReply), + GcRegions(GcRegionsReply), } impl Display for InstructionReply { @@ -469,11 +715,15 @@ impl Display for InstructionReply { match self { Self::OpenRegions(reply) => write!(f, "InstructionReply::OpenRegions({})", reply), Self::CloseRegions(reply) => write!(f, "InstructionReply::CloseRegions({})", reply), - Self::UpgradeRegion(reply) => write!(f, "InstructionReply::UpgradeRegion({})", reply), - Self::DowngradeRegion(reply) => { - write!(f, "InstructionReply::DowngradeRegion({})", reply) + Self::UpgradeRegions(reply) => { + write!(f, "InstructionReply::UpgradeRegions({:?})", reply.replies) + } + Self::DowngradeRegions(reply) => { + write!(f, "InstructionReply::DowngradeRegions({:?})", reply.replies) } Self::FlushRegions(reply) => write!(f, "InstructionReply::FlushRegions({})", reply), + Self::GetFileRefs(reply) => write!(f, "InstructionReply::GetFileRefs({})", reply), + Self::GcRegions(reply) => write!(f, "InstructionReply::GcRegion({})", reply), } } } @@ -493,10 +743,35 @@ impl InstructionReply { _ => panic!("Expected OpenRegions reply"), } } + + pub fn expect_upgrade_regions_reply(self) -> Vec { + match self { + Self::UpgradeRegions(reply) => reply.replies, + _ => panic!("Expected UpgradeRegion reply"), + } + } + + pub fn 
expect_downgrade_regions_reply(self) -> Vec { + match self { + Self::DowngradeRegions(reply) => reply.replies, + _ => panic!("Expected DowngradeRegion reply"), + } + } + + pub fn expect_flush_regions_reply(self) -> FlushRegionReply { + match self { + Self::FlushRegions(reply) => reply, + _ => panic!("Expected FlushRegions reply"), + } + } } #[cfg(test)] mod tests { + use std::collections::HashSet; + + use store_api::storage::FileId; + use super::*; #[test] @@ -532,11 +807,60 @@ mod tests { r#"{"CloseRegions":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#, serialized ); + + let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion { + region_id: RegionId::new(1024, 1), + last_entry_id: None, + metadata_last_entry_id: None, + replay_timeout: Duration::from_millis(1000), + location_id: None, + replay_entry_id: None, + metadata_replay_entry_id: None, + }]); + + let serialized = serde_json::to_string(&upgrade_region).unwrap(); + assert_eq!( + r#"{"UpgradeRegions":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null}]}"#, + serialized + ); + } + + #[test] + fn test_serialize_instruction_reply() { + let downgrade_region_reply = InstructionReply::DowngradeRegions( + DowngradeRegionsReply::single(DowngradeRegionReply { + region_id: RegionId::new(1024, 1), + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: None, + }), + ); + + let serialized = serde_json::to_string(&downgrade_region_reply).unwrap(); + assert_eq!( + r#"{"type":"downgrade_regions","replies":[{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null}]}"#, + serialized + ); + + let upgrade_region_reply = + InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply { + region_id: RegionId::new(1024, 1), + ready: true, + exists: true, + error: None, + })); + let serialized = serde_json::to_string(&upgrade_region_reply).unwrap(); + assert_eq!( + r#"{"type":"upgrade_regions","replies":[{"region_id":4398046511105,"ready":true,"exists":true,"error":null}]}"#, + serialized + ); } #[test] fn test_deserialize_instruction() { - let open_region_instruction = r#"{"OpenRegion":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#; + // legacy open region instruction + let open_region_instruction = r#"{"OpenRegion":{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}}"#; let open_region_instruction: Instruction = serde_json::from_str(open_region_instruction).unwrap(); let open_region = Instruction::OpenRegions(vec![OpenRegion::new( @@ -553,7 +877,8 @@ mod tests { )]); assert_eq!(open_region_instruction, open_region); - let close_region_instruction = r#"{"CloseRegion":[{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}]}"#; + // legacy close region instruction + let close_region_instruction = r#"{"CloseRegion":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"}}"#; let close_region_instruction: Instruction = serde_json::from_str(close_region_instruction).unwrap(); let close_region = Instruction::CloseRegions(vec![RegionIdent { @@ -564,6 +889,35 @@ mod tests { }]); assert_eq!(close_region_instruction, close_region); + // legacy downgrade region 
instruction + let downgrade_region_instruction = r#"{"DowngradeRegions":{"region_id":4398046511105,"flush_timeout":{"secs":1,"nanos":0}}}"#; + let downgrade_region_instruction: Instruction = + serde_json::from_str(downgrade_region_instruction).unwrap(); + let downgrade_region = Instruction::DowngradeRegions(vec![DowngradeRegion { + region_id: RegionId::new(1024, 1), + flush_timeout: Some(Duration::from_millis(1000)), + }]); + assert_eq!(downgrade_region_instruction, downgrade_region); + + // legacy upgrade region instruction + let upgrade_region_instruction = r#"{"UpgradeRegion":{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"replay_timeout":"1s","location_id":null,"replay_entry_id":null,"metadata_replay_entry_id":null}}"#; + let upgrade_region_instruction: Instruction = + serde_json::from_str(upgrade_region_instruction).unwrap(); + let upgrade_region = Instruction::UpgradeRegions(vec![UpgradeRegion { + region_id: RegionId::new(1024, 1), + last_entry_id: None, + metadata_last_entry_id: None, + replay_timeout: Duration::from_millis(1000), + location_id: None, + replay_entry_id: None, + metadata_replay_entry_id: None, + }]); + assert_eq!(upgrade_region_instruction, upgrade_region); + } + + #[test] + fn test_deserialize_instruction_reply() { + // legacy close region reply let close_region_instruction_reply = r#"{"result":true,"error":null,"type":"close_region"}"#; let close_region_instruction_reply: InstructionReply = @@ -574,6 +928,7 @@ mod tests { }); assert_eq!(close_region_instruction_reply, close_region_reply); + // legacy open region reply let open_region_instruction_reply = r#"{"result":true,"error":null,"type":"open_region"}"#; let open_region_instruction_reply: InstructionReply = serde_json::from_str(open_region_instruction_reply).unwrap(); @@ -582,6 +937,34 @@ mod tests { error: None, }); assert_eq!(open_region_instruction_reply, open_region_reply); + + // legacy downgrade region reply + let downgrade_region_instruction_reply = r#"{"region_id":4398046511105,"last_entry_id":null,"metadata_last_entry_id":null,"exists":true,"error":null,"type":"downgrade_region"}"#; + let downgrade_region_instruction_reply: InstructionReply = + serde_json::from_str(downgrade_region_instruction_reply).unwrap(); + let downgrade_region_reply = InstructionReply::DowngradeRegions( + DowngradeRegionsReply::single(DowngradeRegionReply { + region_id: RegionId::new(1024, 1), + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: None, + }), + ); + assert_eq!(downgrade_region_instruction_reply, downgrade_region_reply); + + // legacy upgrade region reply + let upgrade_region_instruction_reply = r#"{"region_id":4398046511105,"ready":true,"exists":true,"error":null,"type":"upgrade_region"}"#; + let upgrade_region_instruction_reply: InstructionReply = + serde_json::from_str(upgrade_region_instruction_reply).unwrap(); + let upgrade_region_reply = + InstructionReply::UpgradeRegions(UpgradeRegionsReply::single(UpgradeRegionReply { + region_id: RegionId::new(1024, 1), + ready: true, + exists: true, + error: None, + })); + assert_eq!(upgrade_region_instruction_reply, upgrade_region_reply); } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -756,4 +1139,30 @@ mod tests { _ => panic!("Expected FlushRegions instruction"), } } + + #[test] + fn test_serialize_get_file_refs_instruction_reply() { + let mut manifest = FileRefsManifest::default(); + let r0 = RegionId::new(1024, 1); + let r1 = RegionId::new(1024, 2); + manifest + .file_refs + .insert(r0, 
HashSet::from([FileId::random()])); + manifest + .file_refs + .insert(r1, HashSet::from([FileId::random()])); + manifest.manifest_version.insert(r0, 10); + manifest.manifest_version.insert(r1, 20); + + let instruction_reply = InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: manifest, + success: true, + error: None, + }); + + let serialized = serde_json::to_string(&instruction_reply).unwrap(); + let deserialized = serde_json::from_str(&serialized).unwrap(); + + assert_eq!(instruction_reply, deserialized); + } } diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index a1d98db301..55dbc0ad01 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -121,6 +121,7 @@ use std::ops::{Deref, DerefMut}; use std::sync::Arc; use bytes::Bytes; +use common_base::regex_pattern::NAME_PATTERN; use common_catalog::consts::{ DEFAULT_CATALOG_NAME, DEFAULT_PRIVATE_SCHEMA_NAME, DEFAULT_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, }; @@ -164,7 +165,6 @@ use crate::rpc::router::{LeaderState, RegionRoute, region_distribution}; use crate::rpc::store::BatchDeleteRequest; use crate::state_store::PoisonValue; -pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*"; pub const TOPIC_NAME_PATTERN: &str = r"[a-zA-Z0-9_:-][a-zA-Z0-9_:\-\.@#]*"; pub const LEGACY_MAINTENANCE_KEY: &str = "__maintenance"; pub const MAINTENANCE_KEY: &str = "__switches/maintenance"; @@ -269,10 +269,6 @@ pub type FlowId = u32; /// The partition of flow. pub type FlowPartitionId = u32; -lazy_static! { - pub static ref NAME_PATTERN_REGEX: Regex = Regex::new(NAME_PATTERN).unwrap(); -} - lazy_static! { pub static ref TOPIC_NAME_PATTERN_REGEX: Regex = Regex::new(TOPIC_NAME_PATTERN).unwrap(); } diff --git a/src/common/meta/src/key/datanode_table.rs b/src/common/meta/src/key/datanode_table.rs index 68105a478a..8aca2fcaf7 100644 --- a/src/common/meta/src/key/datanode_table.rs +++ b/src/common/meta/src/key/datanode_table.rs @@ -164,6 +164,25 @@ impl DatanodeTableManager { .transpose() } + pub async fn batch_get( + &self, + keys: &[DatanodeTableKey], + ) -> Result> { + let req = BatchGetRequest::default().with_keys(keys.iter().map(|k| k.to_bytes()).collect()); + let resp = self.kv_backend.batch_get(req).await?; + let values = resp + .kvs + .into_iter() + .map(|kv| { + Ok(( + DatanodeTableKey::from_bytes(&kv.key)?, + DatanodeTableValue::try_from_raw_value(&kv.value)?, + )) + }) + .collect::>>()?; + Ok(values) + } + pub fn tables( &self, datanode_id: DatanodeId, diff --git a/src/common/meta/src/key/table_route.rs b/src/common/meta/src/key/table_route.rs index 5f6782f002..fe1f11bf15 100644 --- a/src/common/meta/src/key/table_route.rs +++ b/src/common/meta/src/key/table_route.rs @@ -661,13 +661,32 @@ impl TableRouteStorage { /// Returns batch of [`TableRouteValue`] that respects the order of `table_ids`. pub async fn batch_get(&self, table_ids: &[TableId]) -> Result>> { - let mut table_routes = self.batch_get_inner(table_ids).await?; - self.remap_routes_addresses(&mut table_routes).await?; + let raw_table_routes = self.batch_get_inner(table_ids).await?; - Ok(table_routes) + Ok(raw_table_routes + .into_iter() + .map(|v| v.map(|x| x.inner)) + .collect()) } - async fn batch_get_inner(&self, table_ids: &[TableId]) -> Result>> { + /// Returns batch of [`TableRouteValue`] wrapped with [`DeserializedValueWithBytes`]. + /// + /// The return value is a vector of [`Option>`]. + /// Note: This method remaps the addresses of the table routes, but does not update their raw byte representations. 
+ pub async fn batch_get_with_raw_bytes( + &self, + table_ids: &[TableId], + ) -> Result>>> { + let mut raw_table_routes = self.batch_get_inner(table_ids).await?; + self.remap_routes_addresses(&mut raw_table_routes).await?; + + Ok(raw_table_routes) + } + + async fn batch_get_inner( + &self, + table_ids: &[TableId], + ) -> Result>>> { let keys = table_ids .iter() .map(|id| TableRouteKey::new(*id).to_bytes()) @@ -685,7 +704,7 @@ impl TableRouteStorage { keys.into_iter() .map(|key| { if let Some(value) = kvs.get(&key) { - Ok(Some(TableRouteValue::try_from_raw_value(value)?)) + Ok(Some(DeserializedValueWithBytes::from_inner_slice(value)?)) } else { Ok(None) } @@ -695,14 +714,14 @@ impl TableRouteStorage { async fn remap_routes_addresses( &self, - table_routes: &mut [Option], + table_routes: &mut [Option>], ) -> Result<()> { let keys = table_routes .iter() .flat_map(|table_route| { table_route .as_ref() - .map(extract_address_keys) + .map(|x| extract_address_keys(&x.inner)) .unwrap_or_default() }) .collect::>() diff --git a/src/common/meta/src/key/topic_region.rs b/src/common/meta/src/key/topic_region.rs index 844a46735f..c34229cf9e 100644 --- a/src/common/meta/src/key/topic_region.rs +++ b/src/common/meta/src/key/topic_region.rs @@ -33,7 +33,7 @@ use crate::rpc::store::{ // The TopicRegionKey is a key for the topic-region mapping in the kvbackend. // The layout of the key is `__topic_region/{topic_name}/{region_id}`. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TopicRegionKey<'a> { pub region_id: RegionId, pub topic: &'a str, diff --git a/src/common/meta/src/kv_backend.rs b/src/common/meta/src/kv_backend.rs index cdd7102e11..7f747508d4 100644 --- a/src/common/meta/src/kv_backend.rs +++ b/src/common/meta/src/kv_backend.rs @@ -34,6 +34,8 @@ pub mod memory; #[cfg(any(feature = "mysql_kvbackend", feature = "pg_kvbackend"))] pub mod rds; pub mod test; +#[cfg(any(test, feature = "testing"))] +pub mod test_util; pub mod txn; pub mod util; pub type KvBackendRef = Arc + Send + Sync>; diff --git a/src/common/meta/src/kv_backend/test_util.rs b/src/common/meta/src/kv_backend/test_util.rs new file mode 100644 index 0000000000..ce502c3332 --- /dev/null +++ b/src/common/meta/src/kv_backend/test_util.rs @@ -0,0 +1,125 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use derive_builder::Builder; + +use crate::error::Result; +use crate::kv_backend::txn::{Txn, TxnResponse}; +use crate::kv_backend::{ + BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest, + BatchPutResponse, DeleteRangeRequest, DeleteRangeResponse, KvBackend, PutRequest, PutResponse, + RangeRequest, RangeResponse, TxnService, +}; + +pub type MockFn = Arc Result + Send + Sync>; + +/// A mock kv backend for testing. 
+#[derive(Builder)] +pub struct MockKvBackend { + #[builder(setter(strip_option), default)] + pub range_fn: Option>, + #[builder(setter(strip_option), default)] + pub put_fn: Option>, + #[builder(setter(strip_option), default)] + pub batch_put_fn: Option>, + #[builder(setter(strip_option), default)] + pub batch_get_fn: Option>, + #[builder(setter(strip_option), default)] + pub delete_range_fn: Option>, + #[builder(setter(strip_option), default)] + pub batch_delete_fn: Option>, + #[builder(setter(strip_option), default)] + pub txn: Option>, + #[builder(setter(strip_option), default)] + pub max_txn_ops: Option, +} + +#[async_trait::async_trait] +impl TxnService for MockKvBackend { + type Error = crate::error::Error; + + async fn txn(&self, txn: Txn) -> Result { + if let Some(f) = &self.txn { + f(txn) + } else { + unimplemented!() + } + } + + fn max_txn_ops(&self) -> usize { + self.max_txn_ops.unwrap() + } +} + +#[async_trait::async_trait] +impl KvBackend for MockKvBackend { + fn name(&self) -> &str { + "mock_kv_backend" + } + + fn as_any(&self) -> &dyn Any { + self + } + + async fn range(&self, req: RangeRequest) -> Result { + if let Some(f) = &self.range_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn put(&self, req: PutRequest) -> Result { + if let Some(f) = &self.put_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn batch_put(&self, req: BatchPutRequest) -> Result { + if let Some(f) = &self.batch_put_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn batch_get(&self, req: BatchGetRequest) -> Result { + if let Some(f) = &self.batch_get_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn delete_range(&self, req: DeleteRangeRequest) -> Result { + if let Some(f) = &self.delete_range_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn batch_delete(&self, req: BatchDeleteRequest) -> Result { + if let Some(f) = &self.batch_delete_fn { + f(req) + } else { + unimplemented!() + } + } +} diff --git a/src/common/meta/src/region_registry.rs b/src/common/meta/src/region_registry.rs index 1f672d563d..f1741b281b 100644 --- a/src/common/meta/src/region_registry.rs +++ b/src/common/meta/src/region_registry.rs @@ -67,6 +67,7 @@ impl LeaderRegionManifestInfo { RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt: _, } => LeaderRegionManifestInfo::Mito { manifest_version, flushed_entry_id, diff --git a/src/common/meta/src/rpc/ddl.rs b/src/common/meta/src/rpc/ddl.rs index b9a871775f..2fe936e6fd 100644 --- a/src/common/meta/src/rpc/ddl.rs +++ b/src/common/meta/src/rpc/ddl.rs @@ -47,6 +47,7 @@ use serde_with::{DefaultOnNull, serde_as}; use session::context::{QueryContextBuilder, QueryContextRef}; use snafu::{OptionExt, ResultExt}; use table::metadata::{RawTableInfo, TableId}; +use table::requests::validate_database_option; use table::table_name::TableName; use table::table_reference::TableReference; @@ -1059,14 +1060,21 @@ impl TryFrom for SetDatabaseOption { type Error = error::Error; fn try_from(PbOption { key, value }: PbOption) -> Result { - match key.to_ascii_lowercase().as_str() { + let key_lower = key.to_ascii_lowercase(); + match key_lower.as_str() { TTL_KEY => { let ttl = DatabaseTimeToLive::from_humantime_or_str(&value) .map_err(|_| InvalidSetDatabaseOptionSnafu { key, value }.build())?; Ok(SetDatabaseOption::Ttl(ttl)) } - _ => InvalidSetDatabaseOptionSnafu { key, value }.fail(), + _ => { + if validate_database_option(&key_lower) { + Ok(SetDatabaseOption::Other(key_lower, value)) + } else { + 
InvalidSetDatabaseOptionSnafu { key, value }.fail() + } + } } } } @@ -1074,20 +1082,29 @@ impl TryFrom for SetDatabaseOption { #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub enum SetDatabaseOption { Ttl(DatabaseTimeToLive), + Other(String, String), } #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub enum UnsetDatabaseOption { Ttl, + Other(String), } impl TryFrom<&str> for UnsetDatabaseOption { type Error = error::Error; fn try_from(key: &str) -> Result { - match key.to_ascii_lowercase().as_str() { + let key_lower = key.to_ascii_lowercase(); + match key_lower.as_str() { TTL_KEY => Ok(UnsetDatabaseOption::Ttl), - _ => InvalidUnsetDatabaseOptionSnafu { key }.fail(), + _ => { + if validate_database_option(&key_lower) { + Ok(UnsetDatabaseOption::Other(key_lower)) + } else { + InvalidUnsetDatabaseOptionSnafu { key }.fail() + } + } } } } diff --git a/src/common/procedure/src/event.rs b/src/common/procedure/src/event.rs index bc76de7842..d659236369 100644 --- a/src/common/procedure/src/event.rs +++ b/src/common/procedure/src/event.rs @@ -92,25 +92,96 @@ impl Event for ProcedureEvent { schema } - fn extra_row(&self) -> Result { - let error_str = match &self.state { - ProcedureState::Failed { error } => format!("{:?}", error), - ProcedureState::PrepareRollback { error } => format!("{:?}", error), - ProcedureState::RollingBack { error } => format!("{:?}", error), - ProcedureState::Retrying { error } => format!("{:?}", error), - ProcedureState::Poisoned { error, .. } => format!("{:?}", error), - _ => "".to_string(), - }; - let mut row = vec![ - ValueData::StringValue(self.procedure_id.to_string()).into(), - ValueData::StringValue(self.state.as_str_name().to_string()).into(), - ValueData::StringValue(error_str).into(), - ]; - row.append(&mut self.internal_event.extra_row()?.values); - Ok(Row { values: row }) + fn extra_rows(&self) -> Result> { + let mut internal_event_extra_rows = self.internal_event.extra_rows()?; + let mut rows = Vec::with_capacity(internal_event_extra_rows.len()); + for internal_event_extra_row in internal_event_extra_rows.iter_mut() { + let error_str = match &self.state { + ProcedureState::Failed { error } => format!("{:?}", error), + ProcedureState::PrepareRollback { error } => format!("{:?}", error), + ProcedureState::RollingBack { error } => format!("{:?}", error), + ProcedureState::Retrying { error } => format!("{:?}", error), + ProcedureState::Poisoned { error, .. 
} => format!("{:?}", error), + _ => "".to_string(), + }; + let mut values = Vec::with_capacity(3 + internal_event_extra_row.values.len()); + values.extend([ + ValueData::StringValue(self.procedure_id.to_string()).into(), + ValueData::StringValue(self.state.as_str_name().to_string()).into(), + ValueData::StringValue(error_str).into(), + ]); + values.append(&mut internal_event_extra_row.values); + rows.push(Row { values }); + } + + Ok(rows) } fn as_any(&self) -> &dyn Any { self } } + +#[cfg(test)] +mod tests { + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, ColumnSchema, Row, SemanticType}; + use common_event_recorder::Event; + + use crate::{ProcedureEvent, ProcedureId, ProcedureState}; + + #[derive(Debug)] + struct TestEvent; + + impl Event for TestEvent { + fn event_type(&self) -> &str { + "test_event" + } + + fn extra_schema(&self) -> Vec { + vec![ColumnSchema { + column_name: "test_event_column".to_string(), + datatype: ColumnDataType::String.into(), + semantic_type: SemanticType::Field.into(), + ..Default::default() + }] + } + + fn extra_rows(&self) -> common_event_recorder::error::Result> { + Ok(vec![ + Row { + values: vec![ValueData::StringValue("test_event1".to_string()).into()], + }, + Row { + values: vec![ValueData::StringValue("test_event2".to_string()).into()], + }, + ]) + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + } + + #[test] + fn test_procedure_event_extra_rows() { + let procedure_event = ProcedureEvent::new( + ProcedureId::random(), + Box::new(TestEvent {}), + ProcedureState::Running, + ); + + let procedure_event_extra_rows = procedure_event.extra_rows().unwrap(); + assert_eq!(procedure_event_extra_rows.len(), 2); + assert_eq!(procedure_event_extra_rows[0].values.len(), 4); + assert_eq!( + procedure_event_extra_rows[0].values[3], + ValueData::StringValue("test_event1".to_string()).into() + ); + assert_eq!(procedure_event_extra_rows[1].values.len(), 4); + assert_eq!( + procedure_event_extra_rows[1].values[3], + ValueData::StringValue("test_event2".to_string()).into() + ); + } +} diff --git a/src/common/query/Cargo.toml b/src/common/query/Cargo.toml index 7cdc5a8a45..48328ea612 100644 --- a/src/common/query/Cargo.toml +++ b/src/common/query/Cargo.toml @@ -14,6 +14,7 @@ workspace = true api.workspace = true async-trait.workspace = true bytes.workspace = true +common-base.workspace = true common-error.workspace = true common-macro.workspace = true common-recordbatch.workspace = true @@ -22,6 +23,7 @@ datafusion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true datatypes.workspace = true +once_cell.workspace = true serde.workspace = true snafu.workspace = true sqlparser.workspace = true diff --git a/src/common/query/src/error.rs b/src/common/query/src/error.rs index 163efb30a7..e70b9f4833 100644 --- a/src/common/query/src/error.rs +++ b/src/common/query/src/error.rs @@ -52,9 +52,6 @@ pub enum Error { data_type: ArrowDatatype, }, - #[snafu(display("Failed to downcast vector: {}", err_msg))] - DowncastVector { err_msg: String }, - #[snafu(display("Invalid input type: {}", err_msg))] InvalidInputType { #[snafu(implicit)] @@ -199,6 +196,9 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Invalid character in prefix config: {}", prefix))] + InvalidColumnPrefix { prefix: String }, } pub type Result = std::result::Result; @@ -206,8 +206,7 @@ pub type Result = std::result::Result; impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - Error::DowncastVector { .. 
} - | Error::InvalidInputState { .. } + Error::InvalidInputState { .. } | Error::ToScalarValue { .. } | Error::GetScalarVector { .. } | Error::ArrowCompute { .. } @@ -227,7 +226,8 @@ impl ErrorExt for Error { Error::UnsupportedInputDataType { .. } | Error::TypeCast { .. } - | Error::InvalidFuncArgs { .. } => StatusCode::InvalidArguments, + | Error::InvalidFuncArgs { .. } + | Error::InvalidColumnPrefix { .. } => StatusCode::InvalidArguments, Error::ConvertDfRecordBatchStream { source, .. } => source.status_code(), diff --git a/src/common/query/src/prelude.rs b/src/common/query/src/prelude.rs index f467906402..c27b94294e 100644 --- a/src/common/query/src/prelude.rs +++ b/src/common/query/src/prelude.rs @@ -12,15 +12,61 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_base::regex_pattern::NAME_PATTERN_REG; pub use datafusion_common::ScalarValue; +use once_cell::sync::OnceCell; +use snafu::ensure; pub use crate::columnar_value::ColumnarValue; +use crate::error::{InvalidColumnPrefixSnafu, Result}; -/// Default timestamp column name for Prometheus metrics. -pub const GREPTIME_TIMESTAMP: &str = "greptime_timestamp"; -/// Default value column name for Prometheus metrics. -pub const GREPTIME_VALUE: &str = "greptime_value"; -/// Default counter column name for OTLP metrics. +/// Default time index column name. +static GREPTIME_TIMESTAMP_CELL: OnceCell = OnceCell::new(); + +/// Default value column name. +static GREPTIME_VALUE_CELL: OnceCell = OnceCell::new(); + +pub fn set_default_prefix(prefix: Option<&str>) -> Result<()> { + match prefix { + None => { + // use default greptime prefix + GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string()); + GREPTIME_VALUE_CELL.get_or_init(|| GREPTIME_VALUE.to_string()); + } + Some(s) if s.trim().is_empty() => { + // use "" to disable prefix + GREPTIME_TIMESTAMP_CELL.get_or_init(|| "timestamp".to_string()); + GREPTIME_VALUE_CELL.get_or_init(|| "value".to_string()); + } + Some(x) => { + ensure!( + NAME_PATTERN_REG.is_match(x), + InvalidColumnPrefixSnafu { prefix: x } + ); + GREPTIME_TIMESTAMP_CELL.get_or_init(|| format!("{}_timestamp", x)); + GREPTIME_VALUE_CELL.get_or_init(|| format!("{}_value", x)); + } + } + Ok(()) +} + +/// Get the default timestamp column name. +/// Returns the configured value, or `greptime_timestamp` if not set. +pub fn greptime_timestamp() -> &'static str { + GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string()) +} + +/// Get the default value column name. +/// Returns the configured value, or `greptime_value` if not set. +pub fn greptime_value() -> &'static str { + GREPTIME_VALUE_CELL.get_or_init(|| GREPTIME_VALUE.to_string()) +} + +/// Default timestamp column name constant for backward compatibility. +const GREPTIME_TIMESTAMP: &str = "greptime_timestamp"; +/// Default value column name constant for backward compatibility. +const GREPTIME_VALUE: &str = "greptime_value"; +/// Default counter column name for OTLP metrics (legacy mode). 
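`set_default_prefix` together with `greptime_timestamp()`/`greptime_value()` turns the default column names into values that are configured at most once at startup and read from anywhere afterwards, backed by `OnceCell`. The standalone sketch below shows the same configure-once pattern; the character check is a deliberate simplification standing in for the `NAME_PATTERN_REG` validation the real code performs.

use once_cell::sync::OnceCell;

static TIMESTAMP_COLUMN: OnceCell<String> = OnceCell::new();

/// Configure the default timestamp column name once, before first use.
fn set_default_prefix(prefix: Option<&str>) -> Result<(), String> {
    match prefix {
        None => {
            TIMESTAMP_COLUMN.get_or_init(|| "greptime_timestamp".to_string());
        }
        Some(p) if p.trim().is_empty() => {
            // An empty prefix disables the prefix entirely.
            TIMESTAMP_COLUMN.get_or_init(|| "timestamp".to_string());
        }
        Some(p) => {
            // Simplified validity check; the real code matches against NAME_PATTERN_REG.
            if !p.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
                return Err(format!("invalid prefix: {p}"));
            }
            TIMESTAMP_COLUMN.get_or_init(|| format!("{p}_timestamp"));
        }
    }
    Ok(())
}

/// Read the configured name, falling back to the historical default.
fn timestamp_column() -> &'static str {
    TIMESTAMP_COLUMN.get_or_init(|| "greptime_timestamp".to_string())
}

fn main() {
    set_default_prefix(Some("myapp")).unwrap();
    assert_eq!(timestamp_column(), "myapp_timestamp");
    // Later calls cannot override an already-initialized value; this mirrors
    // the get_or_init behavior of the real setter.
    set_default_prefix(Some("other")).unwrap();
    assert_eq!(timestamp_column(), "myapp_timestamp");
}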
pub const GREPTIME_COUNT: &str = "greptime_count"; /// Default physical table name pub const GREPTIME_PHYSICAL_TABLE: &str = "greptime_physical_table"; diff --git a/src/common/recordbatch/benches/iter_record_batch_rows.rs b/src/common/recordbatch/benches/iter_record_batch_rows.rs index b819a4658e..7b95189550 100644 --- a/src/common/recordbatch/benches/iter_record_batch_rows.rs +++ b/src/common/recordbatch/benches/iter_record_batch_rows.rs @@ -26,7 +26,6 @@ use datatypes::arrow::datatypes::{ Int32Type, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use datatypes::schema::SchemaRef; fn prepare_record_batch(rows: usize) -> RecordBatch { let schema = Schema::new(vec![ @@ -56,14 +55,6 @@ fn prepare_record_batch(rows: usize) -> RecordBatch { RecordBatch::try_new(Arc::new(schema), columns).unwrap() } -fn iter_by_greptimedb_values(schema: SchemaRef, record_batch: RecordBatch) { - let record_batch = - common_recordbatch::RecordBatch::try_from_df_record_batch(schema, record_batch).unwrap(); - for row in record_batch.rows() { - black_box(row); - } -} - fn iter_by_loop_rows_and_columns(record_batch: RecordBatch) { for i in 0..record_batch.num_rows() { for column in record_batch.columns() { @@ -125,19 +116,6 @@ pub fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("iter_record_batch"); for rows in [1usize, 10, 100, 1_000, 10_000] { - group.bench_with_input( - BenchmarkId::new("by_greptimedb_values", rows), - &rows, - |b, rows| { - let record_batch = prepare_record_batch(*rows); - let schema = - Arc::new(datatypes::schema::Schema::try_from(record_batch.schema()).unwrap()); - b.iter(|| { - iter_by_greptimedb_values(schema.clone(), record_batch.clone()); - }) - }, - ); - group.bench_with_input( BenchmarkId::new("by_loop_rows_and_columns", rows), &rows, diff --git a/src/common/recordbatch/src/adapter.rs b/src/common/recordbatch/src/adapter.rs index fdec79fdef..7e504559b6 100644 --- a/src/common/recordbatch/src/adapter.rs +++ b/src/common/recordbatch/src/adapter.rs @@ -314,10 +314,10 @@ impl Stream for RecordBatchStreamAdapter { metric_collector.record_batch_metrics, ); } - Poll::Ready(Some(RecordBatch::try_from_df_record_batch( + Poll::Ready(Some(Ok(RecordBatch::from_df_record_batch( self.schema(), df_record_batch, - ))) + )))) } Poll::Ready(None) => { if let Metrics::Unresolved(df_plan) | Metrics::PartialResolved(df_plan, _) = diff --git a/src/common/recordbatch/src/error.rs b/src/common/recordbatch/src/error.rs index e07d152d2d..6d794463a0 100644 --- a/src/common/recordbatch/src/error.rs +++ b/src/common/recordbatch/src/error.rs @@ -133,18 +133,6 @@ pub enum Error { source: datatypes::error::Error, }, - #[snafu(display( - "Failed to downcast vector of type '{:?}' to type '{:?}'", - from_type, - to_type - ))] - DowncastVector { - from_type: ConcreteDataType, - to_type: ConcreteDataType, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Error occurs when performing arrow computation"))] ArrowCompute { #[snafu(source)] @@ -193,6 +181,20 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Exceeded memory limit: {}", msg))] + ExceedMemoryLimit { + msg: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to align JSON array, reason: {reason}"))] + AlignJsonArray { + reason: String, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { @@ -208,9 +210,8 @@ impl ErrorExt for Error { | Error::ToArrowScalar { .. 
} | Error::ProjectArrowRecordBatch { .. } | Error::PhysicalExpr { .. } - | Error::RecordBatchSliceIndexOverflow { .. } => StatusCode::Internal, - - Error::DowncastVector { .. } => StatusCode::Unexpected, + | Error::RecordBatchSliceIndexOverflow { .. } + | Error::AlignJsonArray { .. } => StatusCode::Internal, Error::PollStream { .. } => StatusCode::EngineExecuteQuery, @@ -229,6 +230,8 @@ impl ErrorExt for Error { Error::StreamTimeout { .. } => StatusCode::Cancelled, Error::StreamCancelled { .. } => StatusCode::Cancelled, + + Error::ExceedMemoryLimit { .. } => StatusCode::RuntimeResourcesExhausted, } } diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 7ae4a419d6..c1253cfa1c 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -18,28 +18,32 @@ pub mod adapter; pub mod cursor; pub mod error; pub mod filter; -mod recordbatch; +pub mod recordbatch; pub mod util; +use std::fmt; use std::pin::Pin; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use adapter::RecordBatchMetrics; use arc_swap::ArcSwapOption; +use common_base::readable_size::ReadableSize; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; +use datatypes::arrow::array::{ArrayRef, AsArray, StringBuilder}; use datatypes::arrow::compute::SortOptions; pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch; use datatypes::arrow::util::pretty; use datatypes::prelude::{ConcreteDataType, VectorRef}; -use datatypes::scalars::{ScalarVector, ScalarVectorBuilder}; use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; use datatypes::types::{JsonFormat, jsonb_to_string}; -use datatypes::vectors::{BinaryVector, StringVectorBuilder}; use error::Result; use futures::task::{Context, Poll}; use futures::{Stream, TryStreamExt}; pub use recordbatch::RecordBatch; -use snafu::{OptionExt, ResultExt, ensure}; +use snafu::{ResultExt, ensure}; + +use crate::error::NewDfRecordBatchSnafu; pub trait RecordBatchStream: Stream> { fn name(&self) -> &str { @@ -89,20 +93,14 @@ pub fn map_json_type_to_string( mapped_schema: &SchemaRef, ) -> Result { let mut vectors = Vec::with_capacity(original_schema.column_schemas().len()); - for (vector, schema) in batch.columns.iter().zip(original_schema.column_schemas()) { + for (vector, schema) in batch.columns().iter().zip(original_schema.column_schemas()) { if let ConcreteDataType::Json(j) = &schema.data_type { if matches!(&j.format, JsonFormat::Jsonb) { - let mut string_vector_builder = StringVectorBuilder::with_capacity(vector.len()); - let binary_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::DowncastVectorSnafu { - from_type: schema.data_type.clone(), - to_type: ConcreteDataType::binary_datatype(), - })?; - for value in binary_vector.iter_data() { + let mut string_vector_builder = StringBuilder::new(); + let binary_vector = vector.as_binary::(); + for value in binary_vector.iter() { let Some(value) = value else { - string_vector_builder.push(None); + string_vector_builder.append_null(); continue; }; let string_value = @@ -110,11 +108,11 @@ pub fn map_json_type_to_string( from_type: schema.data_type.clone(), to_type: ConcreteDataType::string_datatype(), })?; - string_vector_builder.push(Some(string_value.as_str())); + string_vector_builder.append_value(string_value); } let string_vector = string_vector_builder.finish(); - vectors.push(Arc::new(string_vector) as VectorRef); + vectors.push(Arc::new(string_vector) as ArrayRef); } else { 
vectors.push(vector.clone()); } @@ -123,7 +121,15 @@ pub fn map_json_type_to_string( } } - RecordBatch::new(mapped_schema.clone(), vectors) + let record_batch = datatypes::arrow::record_batch::RecordBatch::try_new( + mapped_schema.arrow_schema().clone(), + vectors, + ) + .context(NewDfRecordBatchSnafu)?; + Ok(RecordBatch::from_df_record_batch( + mapped_schema.clone(), + record_batch, + )) } /// Maps the json type to string in the schema. @@ -406,6 +412,395 @@ impl> + Unpin> Stream for RecordBatchStream } } +/// Memory permit for a stream, providing privileged access or rate limiting. +/// +/// The permit tracks whether this stream has privileged Top-K status. +/// When dropped, it automatically releases any privileged slot it holds. +pub struct MemoryPermit { + tracker: QueryMemoryTracker, + is_privileged: AtomicBool, +} + +impl MemoryPermit { + /// Check if this permit currently has privileged status. + pub fn is_privileged(&self) -> bool { + self.is_privileged.load(Ordering::Acquire) + } + + /// Ensure this permit has privileged status by acquiring a slot if available. + /// Returns true if privileged (either already privileged or just acquired privilege). + fn ensure_privileged(&self) -> bool { + if self.is_privileged.load(Ordering::Acquire) { + return true; + } + + // Try to claim a privileged slot + self.tracker + .privileged_count + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| { + if count < self.tracker.privileged_slots { + Some(count + 1) + } else { + None + } + }) + .map(|_| { + self.is_privileged.store(true, Ordering::Release); + true + }) + .unwrap_or(false) + } + + /// Track additional memory usage with this permit. + /// Returns error if limit is exceeded. + /// + /// # Arguments + /// * `additional` - Additional memory size to track in bytes + /// * `stream_tracked` - Total memory already tracked by this stream + /// + /// # Behavior + /// - Privileged streams: Can push global memory usage up to full limit + /// - Standard-tier streams: Can push global memory usage up to limit * standard_tier_memory_fraction (default: 0.7) + /// - Standard-tier streams automatically attempt to acquire privilege if slots become available + /// - The configured limit is absolute hard limit - no stream can exceed it + pub fn track(&self, additional: usize, stream_tracked: usize) -> Result<()> { + // Ensure privileged status if possible + let is_privileged = self.ensure_privileged(); + + self.tracker + .track_internal(additional, is_privileged, stream_tracked) + } + + /// Release tracked memory. + /// + /// # Arguments + /// * `amount` - Amount of memory to release in bytes + pub fn release(&self, amount: usize) { + self.tracker.release(amount); + } +} + +impl Drop for MemoryPermit { + fn drop(&mut self) { + // Release privileged slot if we had one + if self.is_privileged.load(Ordering::Acquire) { + self.tracker + .privileged_count + .fetch_sub(1, Ordering::Release); + } + } +} + +/// Memory tracker for RecordBatch streams. Clone to share the same limit across queries. 
+/// +/// Implements a two-tier memory allocation strategy: +/// - **Privileged tier**: First N streams (default: 20) can use up to the full memory limit +/// - **Standard tier**: Remaining streams are restricted to a fraction of the limit (default: 70%) +/// - Privilege is granted on a first-come-first-served basis +/// - The configured limit is an absolute hard cap - no stream can exceed it +#[derive(Clone)] +pub struct QueryMemoryTracker { + current: Arc, + limit: usize, + standard_tier_memory_fraction: f64, + privileged_count: Arc, + privileged_slots: usize, + on_update: Option>, + on_reject: Option>, +} + +impl fmt::Debug for QueryMemoryTracker { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QueryMemoryTracker") + .field("current", &self.current.load(Ordering::Acquire)) + .field("limit", &self.limit) + .field( + "standard_tier_memory_fraction", + &self.standard_tier_memory_fraction, + ) + .field( + "privileged_count", + &self.privileged_count.load(Ordering::Acquire), + ) + .field("privileged_slots", &self.privileged_slots) + .field("on_update", &self.on_update.is_some()) + .field("on_reject", &self.on_reject.is_some()) + .finish() + } +} + +impl QueryMemoryTracker { + // Default privileged slots when max_concurrent_queries is 0. + const DEFAULT_PRIVILEGED_SLOTS: usize = 20; + // Ratio for privileged tier: 70% queries get privileged access, standard tier uses 70% memory. + const DEFAULT_PRIVILEGED_TIER_RATIO: f64 = 0.7; + + /// Create a new memory tracker with the given limit and max_concurrent_queries. + /// Calculates privileged slots as 70% of max_concurrent_queries (or 20 if max_concurrent_queries is 0). + /// + /// # Arguments + /// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited. + /// * `max_concurrent_queries` - Maximum number of concurrent queries (0 = unlimited). + pub fn new(limit: usize, max_concurrent_queries: usize) -> Self { + let privileged_slots = Self::calculate_privileged_slots(max_concurrent_queries); + Self::with_privileged_slots(limit, privileged_slots) + } + + /// Create a new memory tracker with custom privileged slots limit. + pub fn with_privileged_slots(limit: usize, privileged_slots: usize) -> Self { + Self::with_config(limit, privileged_slots, Self::DEFAULT_PRIVILEGED_TIER_RATIO) + } + + /// Create a new memory tracker with full configuration. + /// + /// # Arguments + /// * `limit` - Maximum memory usage in bytes (hard limit for all streams). 0 means unlimited. + /// * `privileged_slots` - Maximum number of streams that can get privileged status. + /// * `standard_tier_memory_fraction` - Memory fraction for standard-tier streams (range: [0.0, 1.0]). + /// + /// # Panics + /// Panics if `standard_tier_memory_fraction` is not in the range [0.0, 1.0]. + pub fn with_config( + limit: usize, + privileged_slots: usize, + standard_tier_memory_fraction: f64, + ) -> Self { + assert!( + (0.0..=1.0).contains(&standard_tier_memory_fraction), + "standard_tier_memory_fraction must be in [0.0, 1.0], got {}", + standard_tier_memory_fraction + ); + + Self { + current: Arc::new(AtomicUsize::new(0)), + limit, + standard_tier_memory_fraction, + privileged_count: Arc::new(AtomicUsize::new(0)), + privileged_slots, + on_update: None, + on_reject: None, + } + } + + /// Register a new permit for memory tracking. + /// The first `privileged_slots` permits get privileged status automatically. + /// The returned permit can be shared across multiple streams of the same query. 
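Based only on the signatures introduced in this patch, the intended wiring appears to be: one `QueryMemoryTracker` shared by all queries, one permit per query, and every stream of that query wrapped in a `MemoryTrackedStream` so they all draw from the same permit. A short sketch, assuming these types are re-exported at the crate root (the `common_recordbatch::` paths are a guess):

use std::sync::Arc;

use common_recordbatch::{MemoryTrackedStream, QueryMemoryTracker, SendableRecordBatchStream};

/// Wrap all streams of a single query so they share one permit, while
/// separate queries compete for memory under the shared tracker.
fn wrap_query_streams(
    tracker: &QueryMemoryTracker,
    streams: Vec<SendableRecordBatchStream>,
) -> Vec<MemoryTrackedStream> {
    // One permit per query; it holds a privileged slot if one is free,
    // otherwise the query is capped at the standard-tier fraction.
    let permit = Arc::new(tracker.register_permit());
    streams
        .into_iter()
        .map(|stream| MemoryTrackedStream::new(stream, permit.clone()))
        .collect()
}

Dropping the wrapped streams releases their tracked bytes, and dropping the last clone of the permit gives the privileged slot back, as the `Drop` implementations in this patch spell out.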
+ pub fn register_permit(&self) -> MemoryPermit { + // Try to claim a privileged slot + let is_privileged = self + .privileged_count + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |count| { + if count < self.privileged_slots { + Some(count + 1) + } else { + None + } + }) + .is_ok(); + + MemoryPermit { + tracker: self.clone(), + is_privileged: AtomicBool::new(is_privileged), + } + } + + /// Set a callback to be called whenever the usage changes successfully. + /// The callback receives the new total usage in bytes. + /// + /// # Note + /// The callback is called after both successful `track()` and `release()` operations. + /// It is called even when `limit == 0` (unlimited mode) to track actual usage. + pub fn with_on_update(mut self, on_update: F) -> Self + where + F: Fn(usize) + Send + Sync + 'static, + { + self.on_update = Some(Arc::new(on_update)); + self + } + + /// Set a callback to be called when memory allocation is rejected. + /// + /// # Note + /// This is only called when `track()` fails due to exceeding the limit. + /// It is never called when `limit == 0` (unlimited mode). + pub fn with_on_reject(mut self, on_reject: F) -> Self + where + F: Fn() + Send + Sync + 'static, + { + self.on_reject = Some(Arc::new(on_reject)); + self + } + + /// Get the current memory usage in bytes. + pub fn current(&self) -> usize { + self.current.load(Ordering::Acquire) + } + + fn calculate_privileged_slots(max_concurrent_queries: usize) -> usize { + if max_concurrent_queries == 0 { + Self::DEFAULT_PRIVILEGED_SLOTS + } else { + ((max_concurrent_queries as f64 * Self::DEFAULT_PRIVILEGED_TIER_RATIO) as usize).max(1) + } + } + + /// Internal method to track additional memory usage. + /// + /// Called by `MemoryPermit::track()`. Use `MemoryPermit::track()` instead of calling this directly. + fn track_internal( + &self, + additional: usize, + is_privileged: bool, + stream_tracked: usize, + ) -> Result<()> { + // Calculate effective global limit based on stream privilege + // Privileged streams: can push global usage up to full limit + // Standard-tier streams: can only push global usage up to fraction of limit + let effective_limit = if is_privileged { + self.limit + } else { + (self.limit as f64 * self.standard_tier_memory_fraction) as usize + }; + + let mut new_total = 0; + let result = self + .current + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { + new_total = current.saturating_add(additional); + + if self.limit == 0 { + // Unlimited mode + return Some(new_total); + } + + // Check if new global total exceeds effective limit + // The configured limit is absolute hard limit - no stream can exceed it + if new_total <= effective_limit { + Some(new_total) + } else { + None + } + }); + + match result { + Ok(_) => { + if let Some(callback) = &self.on_update { + callback(new_total); + } + Ok(()) + } + Err(current) => { + if let Some(callback) = &self.on_reject { + callback(); + } + let msg = format!( + "{} requested, {} used globally ({}%), {} used by this stream (privileged: {}), effective limit: {} ({}%), hard limit: {}", + ReadableSize(additional as u64), + ReadableSize(current as u64), + if self.limit > 0 { + current * 100 / self.limit + } else { + 0 + }, + ReadableSize(stream_tracked as u64), + is_privileged, + ReadableSize(effective_limit as u64), + if self.limit > 0 { + effective_limit * 100 / self.limit + } else { + 0 + }, + ReadableSize(self.limit as u64) + ); + error::ExceedMemoryLimitSnafu { msg }.fail() + } + } + } + + /// Release tracked memory. 
+ /// + /// # Arguments + /// * `amount` - Amount of memory to release in bytes + pub fn release(&self, amount: usize) { + if let Ok(old_value) = + self.current + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { + Some(current.saturating_sub(amount)) + }) + && let Some(callback) = &self.on_update + { + callback(old_value.saturating_sub(amount)); + } + } +} + +/// A wrapper stream that tracks memory usage of RecordBatches. +pub struct MemoryTrackedStream { + inner: SendableRecordBatchStream, + permit: Arc, + // Total tracked size, released when stream drops. + total_tracked: usize, +} + +impl MemoryTrackedStream { + pub fn new(inner: SendableRecordBatchStream, permit: Arc) -> Self { + Self { + inner, + permit, + total_tracked: 0, + } + } +} + +impl Stream for MemoryTrackedStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match Pin::new(&mut self.inner).poll_next(cx) { + Poll::Ready(Some(Ok(batch))) => { + let additional = batch.buffer_memory_size(); + + if let Err(e) = self.permit.track(additional, self.total_tracked) { + return Poll::Ready(Some(Err(e))); + } + + self.total_tracked += additional; + + Poll::Ready(Some(Ok(batch))) + } + Poll::Ready(Some(Err(e))) => Poll::Ready(Some(Err(e))), + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl Drop for MemoryTrackedStream { + fn drop(&mut self) { + if self.total_tracked > 0 { + self.permit.release(self.total_tracked); + } + } +} + +impl RecordBatchStream for MemoryTrackedStream { + fn schema(&self) -> SchemaRef { + self.inner.schema() + } + + fn output_ordering(&self) -> Option<&[OrderOption]> { + self.inner.output_ordering() + } + + fn metrics(&self) -> Option { + self.inner.metrics() + } +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -496,4 +891,157 @@ mod tests { assert_eq!(collected[0], batch1); assert_eq!(collected[1], batch2); } + + #[test] + fn test_query_memory_tracker_basic() { + let tracker = Arc::new(QueryMemoryTracker::new(1000, 0)); + + // Register first stream - should get privileged status + let permit1 = tracker.register_permit(); + assert!(permit1.is_privileged()); + + // Privileged stream can use up to limit + assert!(permit1.track(500, 0).is_ok()); + assert_eq!(tracker.current(), 500); + + // Register second stream - also privileged + let permit2 = tracker.register_permit(); + assert!(permit2.is_privileged()); + // Can add more but cannot exceed hard limit (1000) + assert!(permit2.track(400, 0).is_ok()); + assert_eq!(tracker.current(), 900); + + permit1.release(500); + permit2.release(400); + assert_eq!(tracker.current(), 0); + } + + #[test] + fn test_query_memory_tracker_privileged_limit() { + // Privileged slots = 2 for easy testing + // Limit: 1000, standard-tier fraction: 0.7 (default) + // Privileged can push global to 1000, standard-tier can push global to 700 + let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 2)); + + // First 2 streams are privileged + let permit1 = tracker.register_permit(); + let permit2 = tracker.register_permit(); + assert!(permit1.is_privileged()); + assert!(permit2.is_privileged()); + + // Third stream is standard-tier (not privileged) + let permit3 = tracker.register_permit(); + assert!(!permit3.is_privileged()); + + // Privileged stream uses some memory + assert!(permit1.track(300, 0).is_ok()); + assert_eq!(tracker.current(), 300); + + // Standard-tier can add up to 400 (total 
becomes 700, its effective limit) + assert!(permit3.track(400, 0).is_ok()); + assert_eq!(tracker.current(), 700); + + // Standard-tier stream cannot push global beyond 700 + let err = permit3.track(100, 400).unwrap_err(); + let err_msg = err.to_string(); + assert!(err_msg.contains("400B used by this stream")); + assert!(err_msg.contains("effective limit: 700B (70%)")); + assert!(err_msg.contains("700B used globally (70%)")); + assert_eq!(tracker.current(), 700); + + permit1.release(300); + permit3.release(400); + assert_eq!(tracker.current(), 0); + } + + #[test] + fn test_query_memory_tracker_promotion() { + // Privileged slots = 1 for easy testing + let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1)); + + // First stream is privileged + let permit1 = tracker.register_permit(); + assert!(permit1.is_privileged()); + + // Second stream is standard-tier (can only use 500) + let permit2 = tracker.register_permit(); + assert!(!permit2.is_privileged()); + + // Standard-tier can only track 500 + assert!(permit2.track(400, 0).is_ok()); + assert_eq!(tracker.current(), 400); + + // Drop first permit to release privileged slot + drop(permit1); + + // Second stream can now be promoted and use more memory + assert!(permit2.track(500, 400).is_ok()); + assert!(permit2.is_privileged()); + assert_eq!(tracker.current(), 900); + + permit2.release(900); + assert_eq!(tracker.current(), 0); + } + + #[test] + fn test_query_memory_tracker_privileged_hard_limit() { + // Test that the configured limit is absolute hard limit for all streams + // Privileged: can use full limit (1000) + // Standard-tier: can use 0.7x limit (700 with defaults) + let tracker = Arc::new(QueryMemoryTracker::new(1000, 0)); + + let permit1 = tracker.register_permit(); + assert!(permit1.is_privileged()); + + // Privileged can use up to full limit (1000) + assert!(permit1.track(900, 0).is_ok()); + assert_eq!(tracker.current(), 900); + + // Privileged cannot exceed hard limit (1000) + assert!(permit1.track(200, 900).is_err()); + assert_eq!(tracker.current(), 900); + + // Can add within hard limit + assert!(permit1.track(100, 900).is_ok()); + assert_eq!(tracker.current(), 1000); + + // Cannot exceed even by 1 byte + assert!(permit1.track(1, 1000).is_err()); + assert_eq!(tracker.current(), 1000); + + permit1.release(1000); + assert_eq!(tracker.current(), 0); + } + + #[test] + fn test_query_memory_tracker_standard_tier_fraction() { + // Test standard-tier streams use fraction of limit + // Limit: 1000, default fraction: 0.7, so standard-tier can use 700 + let tracker = Arc::new(QueryMemoryTracker::with_privileged_slots(1000, 1)); + + let permit1 = tracker.register_permit(); + assert!(permit1.is_privileged()); + + let permit2 = tracker.register_permit(); + assert!(!permit2.is_privileged()); + + // Standard-tier can use up to 700 (1000 * 0.7 default) + assert!(permit2.track(600, 0).is_ok()); + assert_eq!(tracker.current(), 600); + + // Cannot exceed standard-tier limit (700) + assert!(permit2.track(200, 600).is_err()); + assert_eq!(tracker.current(), 600); + + // Can add within standard-tier limit + assert!(permit2.track(100, 600).is_ok()); + assert_eq!(tracker.current(), 700); + + // Cannot exceed standard-tier limit + assert!(permit2.track(1, 700).is_err()); + assert_eq!(tracker.current(), 700); + + permit2.release(700); + assert_eq!(tracker.current(), 0); + } } diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index 3cc30ce1ba..a9dd663c2c 100644 --- 
a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -20,10 +20,10 @@ use datafusion::arrow::util::pretty::pretty_format_batches; use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::compute; use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef}; -use datatypes::arrow::array::RecordBatchOptions; +use datatypes::arrow::array::{Array, AsArray, RecordBatchOptions, StructArray, new_null_array}; +use datatypes::extension::json::is_json_extension_type; use datatypes::prelude::DataType; use datatypes::schema::SchemaRef; -use datatypes::value::Value; use datatypes::vectors::{Helper, VectorRef}; use serde::ser::{Error, SerializeStruct}; use serde::{Serialize, Serializer}; @@ -31,15 +31,14 @@ use snafu::{OptionExt, ResultExt, ensure}; use crate::DfRecordBatch; use crate::error::{ - self, ArrowComputeSnafu, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, - ProjectArrowRecordBatchSnafu, Result, + self, AlignJsonArraySnafu, ArrowComputeSnafu, ColumnNotExistsSnafu, DataTypesSnafu, + NewDfRecordBatchSnafu, ProjectArrowRecordBatchSnafu, Result, }; /// A two-dimensional batch of column-oriented data with a defined schema. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { pub schema: SchemaRef, - pub columns: Vec, df_record_batch: DfRecordBatch, } @@ -61,12 +60,13 @@ impl RecordBatch { // TODO(LFC): Remove the casting here once `Batch` is no longer used. let arrow_arrays = Self::cast_view_arrays(schema.arrow_schema(), arrow_arrays)?; + let arrow_arrays = maybe_align_json_array_with_schema(schema.arrow_schema(), arrow_arrays)?; + let df_record_batch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays) .context(error::NewDfRecordBatchSnafu)?; Ok(RecordBatch { schema, - columns, df_record_batch, }) } @@ -92,14 +92,8 @@ impl RecordBatch { /// Create an empty [`RecordBatch`] from `schema`. pub fn new_empty(schema: SchemaRef) -> RecordBatch { let df_record_batch = DfRecordBatch::new_empty(schema.arrow_schema().clone()); - let columns = schema - .column_schemas() - .iter() - .map(|col| col.data_type.create_mutable_vector(0).to_vector()) - .collect(); RecordBatch { schema, - columns, df_record_batch, } } @@ -114,17 +108,12 @@ impl RecordBatch { .context(error::NewDfRecordBatchSnafu)?; Ok(RecordBatch { schema, - columns: vec![], df_record_batch, }) } pub fn try_project(&self, indices: &[usize]) -> Result { let schema = Arc::new(self.schema.try_project(indices).context(DataTypesSnafu)?); - let mut columns = Vec::with_capacity(indices.len()); - for index in indices { - columns.push(self.columns[*index].clone()); - } let df_record_batch = self.df_record_batch.project(indices).with_context(|_| { ProjectArrowRecordBatchSnafu { schema: self.schema.clone(), @@ -134,7 +123,6 @@ impl RecordBatch { Ok(Self { schema, - columns, df_record_batch, }) } @@ -142,21 +130,11 @@ impl RecordBatch { /// Create a new [`RecordBatch`] from `schema` and `df_record_batch`. /// /// This method doesn't check the schema. 
- pub fn try_from_df_record_batch( - schema: SchemaRef, - df_record_batch: DfRecordBatch, - ) -> Result { - let columns = df_record_batch - .columns() - .iter() - .map(|c| Helper::try_into_vector(c.clone()).context(error::DataTypesSnafu)) - .collect::>>()?; - - Ok(RecordBatch { + pub fn from_df_record_batch(schema: SchemaRef, df_record_batch: DfRecordBatch) -> RecordBatch { + RecordBatch { schema, - columns, df_record_batch, - }) + } } #[inline] @@ -170,23 +148,22 @@ impl RecordBatch { } #[inline] - pub fn columns(&self) -> &[VectorRef] { - &self.columns + pub fn columns(&self) -> &[ArrayRef] { + self.df_record_batch.columns() } #[inline] - pub fn column(&self, idx: usize) -> &VectorRef { - &self.columns[idx] + pub fn column(&self, idx: usize) -> &ArrayRef { + self.df_record_batch.column(idx) } - pub fn column_by_name(&self, name: &str) -> Option<&VectorRef> { - let idx = self.schema.column_index_by_name(name)?; - Some(&self.columns[idx]) + pub fn column_by_name(&self, name: &str) -> Option<&ArrayRef> { + self.df_record_batch.column_by_name(name) } #[inline] pub fn num_columns(&self) -> usize { - self.columns.len() + self.df_record_batch.num_columns() } #[inline] @@ -194,11 +171,6 @@ impl RecordBatch { self.df_record_batch.num_rows() } - /// Create an iterator to traverse the data by row - pub fn rows(&self) -> RecordBatchRowIterator<'_> { - RecordBatchRowIterator::new(self) - } - pub fn column_vectors( &self, table_name: &str, @@ -207,9 +179,14 @@ impl RecordBatch { let mut vectors = HashMap::with_capacity(self.num_columns()); // column schemas in recordbatch must match its vectors, otherwise it's corrupted - for (vector_schema, vector) in self.schema.column_schemas().iter().zip(self.columns.iter()) + for (field, array) in self + .df_record_batch + .schema() + .fields() + .iter() + .zip(self.df_record_batch.columns().iter()) { - let column_name = &vector_schema.name; + let column_name = field.name(); let column_schema = table_schema .column_schema_by_name(column_name) @@ -217,15 +194,12 @@ impl RecordBatch { table_name, column_name, })?; - let vector = if vector_schema.data_type != column_schema.data_type { - vector - .cast(&column_schema.data_type) - .with_context(|_| CastVectorSnafu { - from_type: vector.data_type(), - to_type: column_schema.data_type.clone(), - })? + let vector = if field.data_type() != &column_schema.data_type.as_arrow_type() { + let array = compute::cast(array, &column_schema.data_type.as_arrow_type()) + .context(ArrowComputeSnafu)?; + Helper::try_into_vector(array).context(DataTypesSnafu)? } else { - vector.clone() + Helper::try_into_vector(array).context(DataTypesSnafu)? }; let _ = vectors.insert(column_name.clone(), vector); @@ -250,8 +224,69 @@ impl RecordBatch { visit_index: offset + len } ); - let columns = self.columns.iter().map(|vector| vector.slice(offset, len)); - RecordBatch::new(self.schema.clone(), columns) + let sliced = self.df_record_batch.slice(offset, len); + Ok(RecordBatch::from_df_record_batch( + self.schema.clone(), + sliced, + )) + } + + /// Returns the total number of bytes of memory pointed to by the arrays in this `RecordBatch`. + /// + /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. + /// Note that this does not always correspond to the exact memory usage of an array, + /// since multiple arrays can share the same buffers or slices thereof. 
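// --- Example (annotation, not part of the patch) ---------------------------------
// A small sketch of the ArrayRef-based column accessors and the buffer size
// accounting described above, following the same construction pattern as the tests
// in this file; exact byte counts depend on Arrow's buffer layout, so only a lower
// bound is asserted.
#[test]
fn example_array_columns_and_buffer_memory_size() {
    use std::sync::Arc;

    use datatypes::arrow::array::{AsArray, UInt32Array};
    use datatypes::arrow::datatypes::UInt32Type;

    let column_schemas = vec![ColumnSchema::new(
        "numbers",
        ConcreteDataType::uint32_datatype(),
        false,
    )];
    let schema = Arc::new(Schema::new(column_schemas));
    let columns: Vec<VectorRef> = vec![Arc::new(UInt32Vector::from_slice([1, 2, 3, 4]))];
    let batch = RecordBatch::new(schema, columns).unwrap();

    // Columns are now plain Arrow arrays backed by the inner DfRecordBatch.
    let expected = &UInt32Array::from_iter_values([1u32, 2, 3, 4]);
    let actual = batch.column(0).as_primitive::<UInt32Type>();
    assert_eq!(expected, actual);

    // The tracked size is the sum of the arrays' Arrow buffer sizes, so it is at
    // least the raw size of the values.
    assert!(batch.buffer_memory_size() >= 4 * std::mem::size_of::<u32>());
}
// ----------------------------------------------------------------------------------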
+ pub fn buffer_memory_size(&self) -> usize { + self.df_record_batch + .columns() + .iter() + .map(|array| array.get_buffer_memory_size()) + .sum() + } + + /// Iterate the values as strings in the column at index `i`. + /// + /// Note that if the underlying array is not a valid GreptimeDB vector, an empty iterator is + /// returned. + /// + /// # Panics + /// if index `i` is out of bound. + pub fn iter_column_as_string(&self, i: usize) -> Box> + '_> { + macro_rules! iter { + ($column: ident) => { + Box::new( + (0..$column.len()) + .map(|i| $column.is_valid(i).then(|| $column.value(i).to_string())), + ) + }; + } + + let column = self.df_record_batch.column(i); + match column.data_type() { + ArrowDataType::Utf8 => { + let column = column.as_string::(); + let iter = iter!(column); + iter as _ + } + ArrowDataType::LargeUtf8 => { + let column = column.as_string::(); + iter!(column) + } + ArrowDataType::Utf8View => { + let column = column.as_string_view(); + iter!(column) + } + _ => { + if let Ok(column) = Helper::try_into_vector(column) { + Box::new( + (0..column.len()) + .map(move |i| (!column.is_null(i)).then(|| column.get(i).to_string())), + ) + } else { + Box::new(std::iter::empty()) + } + } + } } } @@ -265,8 +300,9 @@ impl Serialize for RecordBatch { let mut s = serializer.serialize_struct("record", 2)?; s.serialize_field("schema", &**self.schema.arrow_schema())?; - let vec = self - .columns + let columns = self.df_record_batch.columns(); + let columns = Helper::try_into_vectors(columns).map_err(Error::custom)?; + let vec = columns .iter() .map(|c| c.serialize_to_json()) .collect::, _>>() @@ -277,44 +313,6 @@ impl Serialize for RecordBatch { } } -pub struct RecordBatchRowIterator<'a> { - record_batch: &'a RecordBatch, - rows: usize, - columns: usize, - row_cursor: usize, -} - -impl<'a> RecordBatchRowIterator<'a> { - fn new(record_batch: &'a RecordBatch) -> RecordBatchRowIterator<'a> { - RecordBatchRowIterator { - record_batch, - rows: record_batch.df_record_batch.num_rows(), - columns: record_batch.df_record_batch.num_columns(), - row_cursor: 0, - } - } -} - -impl Iterator for RecordBatchRowIterator<'_> { - type Item = Vec; - - fn next(&mut self) -> Option { - if self.row_cursor == self.rows { - None - } else { - let mut row = Vec::with_capacity(self.columns); - - for col in 0..self.columns { - let column = self.record_batch.column(col); - row.push(column.get(self.row_cursor)); - } - - self.row_cursor += 1; - Some(row) - } - } -} - /// merge multiple recordbatch into a single pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Result { let batches_len = batches.len(); @@ -322,40 +320,287 @@ pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Resul return Ok(RecordBatch::new_empty(schema)); } - let n_rows = batches.iter().map(|b| b.num_rows()).sum(); - let n_columns = schema.num_columns(); - // Collect arrays from each batch - let mut merged_columns = Vec::with_capacity(n_columns); - - for col_idx in 0..n_columns { - let mut acc = schema.column_schemas()[col_idx] - .data_type - .create_mutable_vector(n_rows); - - for batch in batches { - let column = batch.column(col_idx); - acc.extend_slice_of(column.as_ref(), 0, column.len()) - .context(error::DataTypesSnafu)?; - } - - merged_columns.push(acc.to_vector()); - } + let record_batch = compute::concat_batches( + schema.arrow_schema(), + batches.iter().map(|x| x.df_record_batch()), + ) + .context(ArrowComputeSnafu)?; // Create a new RecordBatch with merged columns - RecordBatch::new(schema, 
merged_columns) + Ok(RecordBatch::from_df_record_batch(schema, record_batch)) +} + +/// Align a json array `json_array` to the json type `schema_type`. The `schema_type` is often the +/// "largest" json type after some insertions in the table schema, while the json array previously +/// written in the SST could be lagged behind it. So it's important to "amend" the json array's +/// missing fields with null arrays, to align the array's data type with the provided one. +/// +/// # Panics +/// +/// - The json array is not an Arrow [StructArray], or the provided data type `schema_type` is not +/// of Struct type. Both of which shouldn't happen unless we switch our implementation of how +/// json array is physically stored. +pub fn align_json_array(json_array: &ArrayRef, schema_type: &ArrowDataType) -> Result { + let json_type = json_array.data_type(); + if json_type == schema_type { + return Ok(json_array.clone()); + } + + let json_array = json_array.as_struct(); + let array_fields = json_array.fields(); + let array_columns = json_array.columns(); + let ArrowDataType::Struct(schema_fields) = schema_type else { + unreachable!() + }; + let mut aligned = Vec::with_capacity(schema_fields.len()); + + // Compare the fields in the json array and the to-be-aligned schema, amending with null arrays + // on the way. It's very important to note that fields in the json array and in the json type + // are both SORTED. + + let mut i = 0; // point to the schema fields + let mut j = 0; // point to the array fields + while i < schema_fields.len() && j < array_fields.len() { + let schema_field = &schema_fields[i]; + let array_field = &array_fields[j]; + if schema_field.name() == array_field.name() { + if matches!(schema_field.data_type(), ArrowDataType::Struct(_)) { + // A `StructArray`s in a json array must be another json array. (Like a nested json + // object in a json value.) + aligned.push(align_json_array( + &array_columns[j], + schema_field.data_type(), + )?); + } else { + aligned.push(array_columns[j].clone()); + } + j += 1; + } else { + aligned.push(new_null_array(schema_field.data_type(), json_array.len())); + } + i += 1; + } + if i < schema_fields.len() { + for field in &schema_fields[i..] { + aligned.push(new_null_array(field.data_type(), json_array.len())); + } + } + ensure!( + j == array_fields.len(), + AlignJsonArraySnafu { + reason: format!( + "this json array has more fields {:?}", + array_fields[j..] 
+ .iter() + .map(|x| x.name()) + .collect::>(), + ) + } + ); + + let json_array = + StructArray::try_new(schema_fields.clone(), aligned, json_array.nulls().cloned()) + .context(NewDfRecordBatchSnafu)?; + Ok(Arc::new(json_array)) +} + +fn maybe_align_json_array_with_schema( + schema: &ArrowSchemaRef, + arrays: Vec, +) -> Result> { + if schema.fields().iter().all(|f| !is_json_extension_type(f)) { + return Ok(arrays); + } + + let mut aligned = Vec::with_capacity(arrays.len()); + for (field, array) in schema.fields().iter().zip(arrays.into_iter()) { + if !is_json_extension_type(field) { + aligned.push(array); + continue; + } + + let json_array = align_json_array(&array, field.data_type())?; + aligned.push(json_array); + } + Ok(aligned) } #[cfg(test)] mod tests { use std::sync::Arc; - use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use datatypes::arrow::array::{ + AsArray, BooleanArray, Float64Array, Int64Array, ListArray, UInt32Array, + }; + use datatypes::arrow::datatypes::{ + DataType, Field, Fields, Int64Type, Schema as ArrowSchema, UInt32Type, + }; + use datatypes::arrow_array::StringArray; use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; use datatypes::vectors::{StringVector, UInt32Vector}; use super::*; + #[test] + fn test_align_json_array() -> Result<()> { + struct TestCase { + json_array: ArrayRef, + schema_type: DataType, + expected: std::result::Result, + } + + impl TestCase { + fn new( + json_array: StructArray, + schema_type: Fields, + expected: std::result::Result, String>, + ) -> Self { + Self { + json_array: Arc::new(json_array), + schema_type: DataType::Struct(schema_type.clone()), + expected: expected + .map(|x| Arc::new(StructArray::new(schema_type, x, None)) as ArrayRef), + } + } + + fn test(self) -> Result<()> { + let result = align_json_array(&self.json_array, &self.schema_type); + match (result, self.expected) { + (Ok(json_array), Ok(expected)) => assert_eq!(&json_array, &expected), + (Ok(json_array), Err(e)) => { + panic!("expecting error {e} but actually get: {json_array:?}") + } + (Err(e), Err(expected)) => assert_eq!(e.to_string(), expected), + (Err(e), Ok(_)) => return Err(e), + } + Ok(()) + } + } + + // Test empty json array can be aligned with a complex json type. + TestCase::new( + StructArray::new_empty_fields(2, None), + Fields::from(vec![ + Field::new("int", DataType::Int64, true), + Field::new_struct( + "nested", + vec![Field::new("bool", DataType::Boolean, true)], + true, + ), + Field::new("string", DataType::Utf8, true), + ]), + Ok(vec![ + Arc::new(Int64Array::new_null(2)) as ArrayRef, + Arc::new(StructArray::new_null( + Fields::from(vec![Arc::new(Field::new("bool", DataType::Boolean, true))]), + 2, + )), + Arc::new(StringArray::new_null(2)), + ]), + ) + .test()?; + + // Test simple json array alignment. + TestCase::new( + StructArray::from(vec![( + Arc::new(Field::new("float", DataType::Float64, true)), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef, + )]), + Fields::from(vec![ + Field::new("float", DataType::Float64, true), + Field::new("string", DataType::Utf8, true), + ]), + Ok(vec![ + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef, + Arc::new(StringArray::new_null(3)), + ]), + ) + .test()?; + + // Test complex json array alignment. 
+ TestCase::new( + StructArray::from(vec![ + ( + Arc::new(Field::new_list( + "list", + Field::new_list_field(DataType::Int64, true), + true, + )), + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1)]), + None, + Some(vec![Some(2), Some(3)]), + ])) as ArrayRef, + ), + ( + Arc::new(Field::new_struct( + "nested", + vec![Field::new("int", DataType::Int64, true)], + true, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("int", DataType::Int64, true)), + Arc::new(Int64Array::from(vec![-1, -2, -3])) as ArrayRef, + )])), + ), + ( + Arc::new(Field::new("string", DataType::Utf8, true)), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ), + ]), + Fields::from(vec![ + Field::new("bool", DataType::Boolean, true), + Field::new_list("list", Field::new_list_field(DataType::Int64, true), true), + Field::new_struct( + "nested", + vec![ + Field::new("float", DataType::Float64, true), + Field::new("int", DataType::Int64, true), + ], + true, + ), + Field::new("string", DataType::Utf8, true), + ]), + Ok(vec![ + Arc::new(BooleanArray::new_null(3)) as ArrayRef, + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1)]), + None, + Some(vec![Some(2), Some(3)]), + ])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("float", DataType::Float64, true)), + Arc::new(Float64Array::new_null(3)) as ArrayRef, + ), + ( + Arc::new(Field::new("int", DataType::Int64, true)), + Arc::new(Int64Array::from(vec![-1, -2, -3])), + ), + ])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ]), + ) + .test()?; + + // Test align failed. + TestCase::new( + StructArray::try_from(vec![ + ("i", Arc::new(Int64Array::from(vec![1])) as ArrayRef), + ("j", Arc::new(Int64Array::from(vec![2])) as ArrayRef), + ]) + .unwrap(), + Fields::from(vec![Field::new("i", DataType::Int64, true)]), + Err( + r#"Failed to align JSON array, reason: this json array has more fields ["j"]"# + .to_string(), + ), + ) + .test()?; + Ok(()) + } + #[test] fn test_record_batch() { let arrow_schema = Arc::new(ArrowSchema::new(vec![ @@ -368,21 +613,21 @@ mod tests { let c2 = Arc::new(UInt32Vector::from_slice([4, 5, 6])); let columns: Vec = vec![c1, c2]; + let expected = vec![ + Arc::new(UInt32Array::from_iter_values([1, 2, 3])) as ArrayRef, + Arc::new(UInt32Array::from_iter_values([4, 5, 6])), + ]; + let batch = RecordBatch::new(schema.clone(), columns.clone()).unwrap(); assert_eq!(3, batch.num_rows()); - assert_eq!(&columns, batch.columns()); - for (i, expect) in columns.iter().enumerate().take(batch.num_columns()) { - let column = batch.column(i); - assert_eq!(expect, column); - } + assert_eq!(expected, batch.df_record_batch().columns()); assert_eq!(schema, batch.schema); - assert_eq!(columns[0], *batch.column_by_name("c1").unwrap()); - assert_eq!(columns[1], *batch.column_by_name("c2").unwrap()); + assert_eq!(&expected[0], batch.column_by_name("c1").unwrap()); + assert_eq!(&expected[1], batch.column_by_name("c2").unwrap()); assert!(batch.column_by_name("c3").is_none()); - let converted = - RecordBatch::try_from_df_record_batch(schema, batch.df_record_batch().clone()).unwrap(); + let converted = RecordBatch::from_df_record_batch(schema, batch.df_record_batch().clone()); assert_eq!(batch, converted); assert_eq!(*batch.df_record_batch(), converted.into_df_record_batch()); } @@ -407,64 +652,6 @@ mod tests { ); } - #[test] - fn test_record_batch_visitor() { - let column_schemas = vec![ - ColumnSchema::new("numbers", ConcreteDataType::uint32_datatype(), false), - ColumnSchema::new("strings", 
ConcreteDataType::string_datatype(), true), - ]; - let schema = Arc::new(Schema::new(column_schemas)); - let columns: Vec = vec![ - Arc::new(UInt32Vector::from_slice(vec![1, 2, 3, 4])), - Arc::new(StringVector::from(vec![ - None, - Some("hello"), - Some("greptime"), - None, - ])), - ]; - let recordbatch = RecordBatch::new(schema, columns).unwrap(); - - let mut record_batch_iter = recordbatch.rows(); - assert_eq!( - vec![Value::UInt32(1), Value::Null], - record_batch_iter - .next() - .unwrap() - .into_iter() - .collect::>() - ); - - assert_eq!( - vec![Value::UInt32(2), Value::String("hello".into())], - record_batch_iter - .next() - .unwrap() - .into_iter() - .collect::>() - ); - - assert_eq!( - vec![Value::UInt32(3), Value::String("greptime".into())], - record_batch_iter - .next() - .unwrap() - .into_iter() - .collect::>() - ); - - assert_eq!( - vec![Value::UInt32(4), Value::Null], - record_batch_iter - .next() - .unwrap() - .into_iter() - .collect::>() - ); - - assert!(record_batch_iter.next().is_none()); - } - #[test] fn test_record_batch_slice() { let column_schemas = vec![ @@ -483,26 +670,16 @@ mod tests { ]; let recordbatch = RecordBatch::new(schema, columns).unwrap(); let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice"); - let mut record_batch_iter = recordbatch.rows(); - assert_eq!( - vec![Value::UInt32(2), Value::String("hello".into())], - record_batch_iter - .next() - .unwrap() - .into_iter() - .collect::>() - ); - assert_eq!( - vec![Value::UInt32(3), Value::String("greptime".into())], - record_batch_iter - .next() - .unwrap() - .into_iter() - .collect::>() - ); + let expected = &UInt32Array::from_iter_values([2u32, 3]); + let array = recordbatch.column(0); + let actual = array.as_primitive::(); + assert_eq!(expected, actual); - assert!(record_batch_iter.next().is_none()); + let expected = &StringArray::from(vec!["hello", "greptime"]); + let array = recordbatch.column(1); + let actual = array.as_string::(); + assert_eq!(expected, actual); assert!(recordbatch.slice(1, 5).is_err()); } diff --git a/src/common/sql/src/convert.rs b/src/common/sql/src/convert.rs index 0ff2e44061..edb793baf6 100644 --- a/src/common/sql/src/convert.rs +++ b/src/common/sql/src/convert.rs @@ -211,8 +211,7 @@ pub fn sql_value_to_value( | Value::Duration(_) | Value::IntervalYearMonth(_) | Value::IntervalDayTime(_) - | Value::IntervalMonthDayNano(_) - | Value::Json(_) => match unary_op { + | Value::IntervalMonthDayNano(_) => match unary_op { UnaryOperator::Plus => {} UnaryOperator::Minus => { value = value @@ -222,19 +221,25 @@ pub fn sql_value_to_value( _ => return InvalidUnaryOpSnafu { unary_op, value }.fail(), }, - Value::String(_) | Value::Binary(_) | Value::List(_) | Value::Struct(_) => { + Value::String(_) + | Value::Binary(_) + | Value::List(_) + | Value::Struct(_) + | Value::Json(_) => { return InvalidUnaryOpSnafu { unary_op, value }.fail(); } } } - if value.data_type() != *data_type { + let value_datatype = value.data_type(); + // The datatype of json value is determined by its actual data, so we can't simply "cast" it here. 
+ if value_datatype.is_json() || value_datatype == *data_type { + Ok(value) + } else { datatypes::types::cast(value, data_type).with_context(|_| InvalidCastSnafu { sql_value: sql_val.clone(), datatype: data_type, }) - } else { - Ok(value) } } diff --git a/src/common/sql/src/default_constraint.rs b/src/common/sql/src/default_constraint.rs index 0366f9aec3..e2a57337a5 100644 --- a/src/common/sql/src/default_constraint.rs +++ b/src/common/sql/src/default_constraint.rs @@ -16,6 +16,7 @@ use common_time::timezone::Timezone; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnDefaultConstraint; use datatypes::schema::constraint::{CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_FN}; +use snafu::ensure; use sqlparser::ast::ValueWithSpan; pub use sqlparser::ast::{ BinaryOperator, ColumnDef, ColumnOption, ColumnOptionDef, DataType, Expr, Function, @@ -37,6 +38,14 @@ pub fn parse_column_default_constraint( .iter() .find(|o| matches!(o.option, ColumnOption::Default(_))) { + ensure!( + !data_type.is_json(), + UnsupportedDefaultValueSnafu { + column_name, + reason: "json column cannot have a default value", + } + ); + let default_constraint = match &opt.option { ColumnOption::Default(Expr::Value(v)) => ColumnDefaultConstraint::Value( sql_value_to_value(column_name, data_type, &v.value, timezone, None, false)?, @@ -82,7 +91,7 @@ pub fn parse_column_default_constraint( } else { return UnsupportedDefaultValueSnafu { column_name, - expr: *expr.clone(), + reason: format!("expr '{expr}' not supported"), } .fail(); } @@ -90,14 +99,14 @@ pub fn parse_column_default_constraint( ColumnOption::Default(others) => { return UnsupportedDefaultValueSnafu { column_name, - expr: others.clone(), + reason: format!("expr '{others}' not supported"), } .fail(); } _ => { return UnsupportedDefaultValueSnafu { column_name, - expr: Expr::Value(SqlValue::Null.into()), + reason: format!("option '{}' not supported", opt.option), } .fail(); } diff --git a/src/common/sql/src/error.rs b/src/common/sql/src/error.rs index b777b54103..ed23df0cc1 100644 --- a/src/common/sql/src/error.rs +++ b/src/common/sql/src/error.rs @@ -55,13 +55,11 @@ pub enum Error { }, #[snafu(display( - "Unsupported expr in default constraint: {} for column: {}", - expr, - column_name + "Unsupported default constraint for column: '{column_name}', reason: {reason}" ))] UnsupportedDefaultValue { column_name: String, - expr: Expr, + reason: String, #[snafu(implicit)] location: Location, }, diff --git a/src/common/stat/Cargo.toml b/src/common/stat/Cargo.toml index 3d0198f6a2..d0e8b5448f 100644 --- a/src/common/stat/Cargo.toml +++ b/src/common/stat/Cargo.toml @@ -6,11 +6,14 @@ license.workspace = true [dependencies] common-base.workspace = true +common-runtime.workspace = true +common-telemetry.workspace = true lazy_static.workspace = true nix.workspace = true num_cpus.workspace = true prometheus.workspace = true sysinfo.workspace = true +tokio.workspace = true [lints] workspace = true diff --git a/src/common/stat/src/cgroups.rs b/src/common/stat/src/cgroups.rs index fe26f5ec36..ce8f5ac87a 100644 --- a/src/common/stat/src/cgroups.rs +++ b/src/common/stat/src/cgroups.rs @@ -117,7 +117,10 @@ pub fn get_cpu_limit_from_cgroups() -> Option { None } -fn get_cpu_usage() -> Option { +/// Get the usage of cpu in millicores from cgroups filesystem. +/// +/// - Return `None` if it's not in the cgroups v2 environment or fails to read the cpu usage. 
+pub fn get_cpu_usage_from_cgroups() -> Option { // In certain bare-metal environments, the `/sys/fs/cgroup/cpu.stat` file may be present and reflect system-wide CPU usage rather than container-specific metrics. // To ensure accurate collection of container-level CPU usage, verify the existence of the `/sys/fs/cgroup/memory.current` file. // The presence of this file typically indicates execution within a containerized environment, thereby validating the relevance of the collected CPU usage data. @@ -142,6 +145,22 @@ fn get_cpu_usage() -> Option { fields[1].trim().parse::().ok() } +// Calculate the cpu usage in millicores from cgroups filesystem. +// +// - Return `0` if the current cpu usage is equal to the last cpu usage or the interval is 0. +pub(crate) fn calculate_cpu_usage( + current_cpu_usage_usecs: i64, + last_cpu_usage_usecs: i64, + interval_milliseconds: i64, +) -> i64 { + let diff = current_cpu_usage_usecs - last_cpu_usage_usecs; + if diff > 0 && interval_milliseconds > 0 { + ((diff as f64 / interval_milliseconds as f64).round() as i64).max(1) + } else { + 0 + } +} + // Check whether the cgroup is v2. // - Return `true` if the cgroup is v2, otherwise return `false`. // - Return `None` if the detection fails or not on linux. @@ -230,7 +249,7 @@ impl Collector for CgroupsMetricsCollector { } fn collect(&self) -> Vec { - if let Some(cpu_usage) = get_cpu_usage() { + if let Some(cpu_usage) = get_cpu_usage_from_cgroups() { self.cpu_usage.set(cpu_usage); } diff --git a/src/common/stat/src/lib.rs b/src/common/stat/src/lib.rs index 2c6cbea3f1..544b9439c8 100644 --- a/src/common/stat/src/lib.rs +++ b/src/common/stat/src/lib.rs @@ -13,66 +13,7 @@ // limitations under the License. mod cgroups; +mod resource; pub use cgroups::*; -use common_base::readable_size::ReadableSize; -use sysinfo::System; - -/// Get the total CPU in millicores. -pub fn get_total_cpu_millicores() -> i64 { - // Get CPU limit from cgroups filesystem. - if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() { - cgroup_cpu_limit - } else { - // Get total CPU cores from host system. - num_cpus::get() as i64 * 1000 - } -} - -/// Get the total memory in bytes. -pub fn get_total_memory_bytes() -> i64 { - // Get memory limit from cgroups filesystem. - if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() { - cgroup_memory_limit - } else { - // Get total memory from host system. - if sysinfo::IS_SUPPORTED_SYSTEM { - let mut sys_info = System::new(); - sys_info.refresh_memory(); - sys_info.total_memory() as i64 - } else { - // If the system is not supported, return -1. - -1 - } - } -} - -/// Get the total CPU cores. The result will be rounded to the nearest integer. -/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2. -pub fn get_total_cpu_cores() -> usize { - ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize -} - -/// Get the total memory in readable size. 
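// --- Example (annotation, not part of the patch) ---------------------------------
// A worked example for `calculate_cpu_usage` (added in cgroups.rs above): the delta
// of `usage_usec` (microseconds of CPU time) divided by the wall-clock interval in
// milliseconds yields millicores, e.g. 2_500_000 us over a 5_000 ms window is
// 0.5 cores, i.e. 500 millicores. Written as a test-style sketch because the
// function is crate-private.
#[test]
fn example_calculate_cpu_usage() {
    // 2.5 s of CPU time over a 5 s window -> 500 millicores.
    assert_eq!(calculate_cpu_usage(10_000_000, 7_500_000, 5_000), 500);
    // Any non-zero usage is reported as at least 1 millicore.
    assert_eq!(calculate_cpu_usage(7_500_001, 7_500_000, 5_000), 1);
    // No progress (or a counter going backwards) reports 0.
    assert_eq!(calculate_cpu_usage(7_500_000, 7_500_000, 5_000), 0);
}
// ----------------------------------------------------------------------------------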
-pub fn get_total_memory_readable() -> Option { - if get_total_memory_bytes() > 0 { - Some(ReadableSize(get_total_memory_bytes() as u64)) - } else { - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_get_total_cpu_cores() { - assert!(get_total_cpu_cores() > 0); - } - - #[test] - fn test_get_total_memory_readable() { - assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0)); - } -} +pub use resource::*; diff --git a/src/common/stat/src/resource.rs b/src/common/stat/src/resource.rs new file mode 100644 index 0000000000..babfa54a19 --- /dev/null +++ b/src/common/stat/src/resource.rs @@ -0,0 +1,187 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::time::Duration; + +use common_base::readable_size::ReadableSize; +use common_runtime::JoinHandle; +use common_telemetry::info; +use sysinfo::System; +use tokio::time::sleep; + +use crate::cgroups::calculate_cpu_usage; +use crate::{ + get_cpu_limit_from_cgroups, get_cpu_usage_from_cgroups, get_memory_limit_from_cgroups, + get_memory_usage_from_cgroups, +}; + +/// Get the total CPU in millicores. If the CPU limit is unset, it will return the total CPU cores from host system. +pub fn get_total_cpu_millicores() -> i64 { + // Get CPU limit from cgroups filesystem. + if let Some(cgroup_cpu_limit) = get_cpu_limit_from_cgroups() { + cgroup_cpu_limit + } else { + // Get total CPU cores from host system. + num_cpus::get() as i64 * 1000 + } +} + +/// Get the total memory in bytes. If the memory limit is unset, it will return the total memory from host system. +/// If the system is not supported to get the total host memory, it will return 0. +pub fn get_total_memory_bytes() -> i64 { + // Get memory limit from cgroups filesystem. + if let Some(cgroup_memory_limit) = get_memory_limit_from_cgroups() { + cgroup_memory_limit + } else { + // Get total memory from host system. + if sysinfo::IS_SUPPORTED_SYSTEM { + let mut sys_info = System::new(); + sys_info.refresh_memory(); + sys_info.total_memory() as i64 + } else { + // If the system is not supported, return 0 + 0 + } + } +} + +/// Get the total CPU cores. The result will be rounded to the nearest integer. +/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2. +pub fn get_total_cpu_cores() -> usize { + ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize +} + +/// Get the total memory in readable size. +pub fn get_total_memory_readable() -> Option { + if get_total_memory_bytes() > 0 { + Some(ReadableSize(get_total_memory_bytes() as u64)) + } else { + None + } +} + +/// A reference to a `ResourceStat` implementation. +pub type ResourceStatRef = Arc; + +/// A trait for getting resource statistics. +pub trait ResourceStat { + /// Get the total CPU in millicores. + fn get_total_cpu_millicores(&self) -> i64; + /// Get the total memory in bytes. 
+ fn get_total_memory_bytes(&self) -> i64; + /// Get the CPU usage in millicores. + fn get_cpu_usage_millicores(&self) -> i64; + /// Get the memory usage in bytes. + fn get_memory_usage_bytes(&self) -> i64; +} + +/// A implementation of `ResourceStat` trait. +pub struct ResourceStatImpl { + cpu_usage_millicores: Arc, + last_cpu_usage_usecs: Arc, + calculate_interval: Duration, + handler: Option>, +} + +impl Default for ResourceStatImpl { + fn default() -> Self { + Self { + cpu_usage_millicores: Arc::new(AtomicI64::new(0)), + last_cpu_usage_usecs: Arc::new(AtomicI64::new(0)), + calculate_interval: Duration::from_secs(5), + handler: None, + } + } +} + +impl ResourceStatImpl { + /// Start collecting CPU usage periodically. It will calculate the CPU usage in millicores based on rate of change of CPU usage usage_usec in `/sys/fs/cgroup/cpu.stat`. + /// It ONLY works in cgroup v2 environment. + pub fn start_collect_cpu_usage(&mut self) { + if self.handler.is_some() { + return; + } + + let cpu_usage_millicores = self.cpu_usage_millicores.clone(); + let last_cpu_usage_usecs = self.last_cpu_usage_usecs.clone(); + let calculate_interval = self.calculate_interval; + + let handler = common_runtime::spawn_global(async move { + info!( + "Starting to collect CPU usage periodically for every {} seconds", + calculate_interval.as_secs() + ); + loop { + let current_cpu_usage_usecs = get_cpu_usage_from_cgroups(); + if let Some(current_cpu_usage_usecs) = current_cpu_usage_usecs { + // Skip the first time to collect CPU usage. + if last_cpu_usage_usecs.load(Ordering::Relaxed) == 0 { + last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed); + continue; + } + let cpu_usage = calculate_cpu_usage( + current_cpu_usage_usecs, + last_cpu_usage_usecs.load(Ordering::Relaxed), + calculate_interval.as_millis() as i64, + ); + cpu_usage_millicores.store(cpu_usage, Ordering::Relaxed); + last_cpu_usage_usecs.store(current_cpu_usage_usecs, Ordering::Relaxed); + } + sleep(calculate_interval).await; + } + }); + + self.handler = Some(handler); + } +} + +impl ResourceStat for ResourceStatImpl { + /// Get the total CPU in millicores. + fn get_total_cpu_millicores(&self) -> i64 { + get_total_cpu_millicores() + } + + /// Get the total memory in bytes. + fn get_total_memory_bytes(&self) -> i64 { + get_total_memory_bytes() + } + + /// Get the CPU usage in millicores. + fn get_cpu_usage_millicores(&self) -> i64 { + self.cpu_usage_millicores.load(Ordering::Relaxed) + } + + /// Get the memory usage in bytes. + /// It ONLY works in cgroup v2 environment. 
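// --- Example (annotation, not part of the patch) ---------------------------------
// A hedged sketch of a fixed-value `ResourceStat` stub; something like this could be
// handed to consumers that take a `ResourceStatRef` (e.g. the datanode heartbeat
// task in this patch) in tests where cgroup data is unavailable. The struct name and
// the constant values are illustrative assumptions.
struct ConstantResourceStat;

impl ResourceStat for ConstantResourceStat {
    fn get_total_cpu_millicores(&self) -> i64 {
        4000 // 4 cores
    }

    fn get_total_memory_bytes(&self) -> i64 {
        8 * 1024 * 1024 * 1024 // 8 GiB
    }

    fn get_cpu_usage_millicores(&self) -> i64 {
        250 // 0.25 core
    }

    fn get_memory_usage_bytes(&self) -> i64 {
        1024 * 1024 * 1024 // 1 GiB
    }
}

// Usage: `let stat: ResourceStatRef = std::sync::Arc::new(ConstantResourceStat);`
// ----------------------------------------------------------------------------------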
+ fn get_memory_usage_bytes(&self) -> i64 { + get_memory_usage_from_cgroups().unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_total_cpu_cores() { + assert!(get_total_cpu_cores() > 0); + } + + #[test] + fn test_get_total_memory_readable() { + assert!(get_total_memory_readable().unwrap() > ReadableSize::mb(0)); + } +} diff --git a/src/common/telemetry/Cargo.toml b/src/common/telemetry/Cargo.toml index d0bc6876bc..92c3304d53 100644 --- a/src/common/telemetry/Cargo.toml +++ b/src/common/telemetry/Cargo.toml @@ -35,5 +35,5 @@ tokio.workspace = true tracing = "0.1" tracing-appender.workspace = true tracing-log = "0.2" -tracing-opentelemetry = "0.31.0" +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true diff --git a/src/common/telemetry/src/lib.rs b/src/common/telemetry/src/lib.rs index ba46bfa0d9..cd60d61645 100644 --- a/src/common/telemetry/src/lib.rs +++ b/src/common/telemetry/src/lib.rs @@ -21,7 +21,10 @@ mod panic_hook; pub mod tracing_context; mod tracing_sampler; -pub use logging::{RELOAD_HANDLE, init_default_ut_logging, init_global_logging}; +pub use logging::{ + LOG_RELOAD_HANDLE, TRACE_RELOAD_HANDLE, get_or_init_tracer, init_default_ut_logging, + init_global_logging, +}; pub use metric::dump_metrics; pub use panic_hook::set_panic_hook; pub use {common_error, tracing, tracing_subscriber}; diff --git a/src/common/telemetry/src/logging.rs b/src/common/telemetry/src/logging.rs index d2b8a64b39..1b371c1d78 100644 --- a/src/common/telemetry/src/logging.rs +++ b/src/common/telemetry/src/logging.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::env; use std::io::IsTerminal; -use std::sync::{Arc, Mutex, Once}; +use std::sync::{Arc, Mutex, Once, RwLock}; use std::time::Duration; use common_base::serde::empty_string_as_default; @@ -25,15 +25,17 @@ use opentelemetry::trace::TracerProvider; use opentelemetry::{KeyValue, global}; use opentelemetry_otlp::{Protocol, SpanExporter, WithExportConfig, WithHttpConfig}; use opentelemetry_sdk::propagation::TraceContextPropagator; -use opentelemetry_sdk::trace::Sampler; +use opentelemetry_sdk::trace::{Sampler, Tracer}; use opentelemetry_semantic_conventions::resource; use serde::{Deserialize, Serialize}; +use tracing::callsite; +use tracing::metadata::LevelFilter; use tracing_appender::non_blocking::WorkerGuard; use tracing_appender::rolling::{RollingFileAppender, Rotation}; use tracing_log::LogTracer; use tracing_subscriber::filter::{FilterFn, Targets}; use tracing_subscriber::fmt::Layer; -use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::layer::{Layered, SubscriberExt}; use tracing_subscriber::prelude::*; use tracing_subscriber::{EnvFilter, Registry, filter}; @@ -48,10 +50,192 @@ pub const DEFAULT_OTLP_HTTP_ENDPOINT: &str = "http://localhost:4318/v1/traces"; /// The default logs directory. 
pub const DEFAULT_LOGGING_DIR: &str = "logs"; -// Handle for reloading log level -pub static RELOAD_HANDLE: OnceCell> = +/// Handle for reloading log level +pub static LOG_RELOAD_HANDLE: OnceCell> = OnceCell::new(); +type DynSubscriber = Layered, Registry>; +type OtelTraceLayer = tracing_opentelemetry::OpenTelemetryLayer; + +#[derive(Clone)] +pub struct TraceReloadHandle { + inner: Arc>>, +} + +impl TraceReloadHandle { + fn new(inner: Arc>>) -> Self { + Self { inner } + } + + pub fn reload(&self, new_layer: Option) { + let mut guard = self.inner.write().unwrap(); + *guard = new_layer; + drop(guard); + + callsite::rebuild_interest_cache(); + } +} + +/// A tracing layer that can be dynamically reloaded. +/// +/// Mostly copied from [`tracing_subscriber::reload::Layer`]. +struct TraceLayer { + inner: Arc>>, +} + +impl TraceLayer { + fn new(initial: Option) -> (Self, TraceReloadHandle) { + let inner = Arc::new(RwLock::new(initial)); + ( + Self { + inner: inner.clone(), + }, + TraceReloadHandle::new(inner), + ) + } + + fn with_layer(&self, f: impl FnOnce(&OtelTraceLayer) -> R) -> Option { + self.inner + .read() + .ok() + .and_then(|guard| guard.as_ref().map(f)) + } + + fn with_layer_mut(&self, f: impl FnOnce(&mut OtelTraceLayer) -> R) -> Option { + self.inner + .write() + .ok() + .and_then(|mut guard| guard.as_mut().map(f)) + } +} + +impl tracing_subscriber::Layer for TraceLayer { + fn on_register_dispatch(&self, subscriber: &tracing::Dispatch) { + let _ = self.with_layer(|layer| layer.on_register_dispatch(subscriber)); + } + + fn on_layer(&mut self, subscriber: &mut DynSubscriber) { + let _ = self.with_layer_mut(|layer| layer.on_layer(subscriber)); + } + + fn register_callsite( + &self, + metadata: &'static tracing::Metadata<'static>, + ) -> tracing::subscriber::Interest { + self.with_layer(|layer| layer.register_callsite(metadata)) + .unwrap_or_else(tracing::subscriber::Interest::always) + } + + fn enabled( + &self, + metadata: &tracing::Metadata<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) -> bool { + self.with_layer(|layer| layer.enabled(metadata, ctx)) + .unwrap_or(true) + } + + fn on_new_span( + &self, + attrs: &tracing::span::Attributes<'_>, + id: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_new_span(attrs, id, ctx)); + } + + fn max_level_hint(&self) -> Option { + self.with_layer(|layer| layer.max_level_hint()).flatten() + } + + fn on_record( + &self, + span: &tracing::span::Id, + values: &tracing::span::Record<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_record(span, values, ctx)); + } + + fn on_follows_from( + &self, + span: &tracing::span::Id, + follows: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_follows_from(span, follows, ctx)); + } + + fn event_enabled( + &self, + event: &tracing::Event<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) -> bool { + self.with_layer(|layer| layer.event_enabled(event, ctx)) + .unwrap_or(true) + } + + fn on_event( + &self, + event: &tracing::Event<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_event(event, ctx)); + } + + fn on_enter( + &self, + id: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_enter(id, ctx)); + } + 
+ fn on_exit( + &self, + id: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_exit(id, ctx)); + } + + fn on_close( + &self, + id: tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_close(id, ctx)); + } + + fn on_id_change( + &self, + old: &tracing::span::Id, + new: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_id_change(old, new, ctx)); + } + + unsafe fn downcast_raw(&self, id: std::any::TypeId) -> Option<*const ()> { + self.inner.read().ok().and_then(|guard| { + guard + .as_ref() + .and_then(|layer| unsafe { layer.downcast_raw(id) }) + }) + } +} + +/// Handle for reloading trace level +pub static TRACE_RELOAD_HANDLE: OnceCell = OnceCell::new(); + +static TRACER: OnceCell> = OnceCell::new(); + +#[derive(Debug)] +enum TraceState { + Ready(Tracer), + Deferred(TraceContext), +} + /// The logging options that used to initialize the logger. #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(default)] @@ -167,6 +351,13 @@ impl PartialEq for LoggingOptions { impl Eq for LoggingOptions {} +#[derive(Clone, Debug)] +struct TraceContext { + app_name: String, + node_id: String, + logging_opts: LoggingOptions, +} + impl Default for LoggingOptions { fn default() -> Self { Self { @@ -242,6 +433,7 @@ pub fn init_global_logging( ) -> Vec { static START: Once = Once::new(); let mut guards = vec![]; + let node_id = node_id.unwrap_or_else(|| "none".to_string()); START.call_once(|| { // Enable log compatible layer to convert log record to tracing span. @@ -357,10 +549,37 @@ pub fn init_global_logging( let (dyn_filter, reload_handle) = tracing_subscriber::reload::Layer::new(filter.clone()); - RELOAD_HANDLE + LOG_RELOAD_HANDLE .set(reload_handle) .expect("reload handle already set, maybe init_global_logging get called twice?"); + let mut initial_tracer = None; + let trace_state = if opts.enable_otlp_tracing { + let tracer = create_tracer(app_name, &node_id, opts); + initial_tracer = Some(tracer.clone()); + TraceState::Ready(tracer) + } else { + TraceState::Deferred(TraceContext { + app_name: app_name.to_string(), + node_id: node_id.clone(), + logging_opts: opts.clone(), + }) + }; + + TRACER + .set(Mutex::new(trace_state)) + .expect("trace state already initialized"); + + let initial_trace_layer = initial_tracer + .as_ref() + .map(|tracer| tracing_opentelemetry::layer().with_tracer(tracer.clone())); + + let (dyn_trace_layer, trace_reload_handle) = TraceLayer::new(initial_trace_layer); + + TRACE_RELOAD_HANDLE + .set(trace_reload_handle) + .unwrap_or_else(|_| panic!("failed to set trace reload handle")); + // Must enable 'tokio_unstable' cfg to use this feature. 
// For example: `RUSTFLAGS="--cfg tokio_unstable" cargo run -F common-telemetry/console -- standalone start` #[cfg(feature = "tokio-console")] @@ -383,6 +602,7 @@ pub fn init_global_logging( Registry::default() .with(dyn_filter) + .with(dyn_trace_layer) .with(tokio_console_layer) .with(stdout_logging_layer) .with(file_logging_layer) @@ -396,53 +616,61 @@ pub fn init_global_logging( #[cfg(not(feature = "tokio-console"))] let subscriber = Registry::default() .with(dyn_filter) + .with(dyn_trace_layer) .with(stdout_logging_layer) .with(file_logging_layer) .with(err_file_logging_layer) .with(slow_query_logging_layer); - if opts.enable_otlp_tracing { - global::set_text_map_propagator(TraceContextPropagator::new()); + global::set_text_map_propagator(TraceContextPropagator::new()); - let sampler = opts - .tracing_sample_ratio - .as_ref() - .map(create_sampler) - .map(Sampler::ParentBased) - .unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn))); - - let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder() - .with_batch_exporter(build_otlp_exporter(opts)) - .with_sampler(sampler) - .with_resource( - opentelemetry_sdk::Resource::builder_empty() - .with_attributes([ - KeyValue::new(resource::SERVICE_NAME, app_name.to_string()), - KeyValue::new( - resource::SERVICE_INSTANCE_ID, - node_id.unwrap_or("none".to_string()), - ), - KeyValue::new(resource::SERVICE_VERSION, common_version::version()), - KeyValue::new(resource::PROCESS_PID, std::process::id().to_string()), - ]) - .build(), - ) - .build(); - let tracer = provider.tracer("greptimedb"); - - tracing::subscriber::set_global_default( - subscriber.with(tracing_opentelemetry::layer().with_tracer(tracer)), - ) + tracing::subscriber::set_global_default(subscriber) .expect("error setting global tracing subscriber"); - } else { - tracing::subscriber::set_global_default(subscriber) - .expect("error setting global tracing subscriber"); - } }); guards } +fn create_tracer(app_name: &str, node_id: &str, opts: &LoggingOptions) -> Tracer { + let sampler = opts + .tracing_sample_ratio + .as_ref() + .map(create_sampler) + .map(Sampler::ParentBased) + .unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn))); + + let resource = opentelemetry_sdk::Resource::builder_empty() + .with_attributes([ + KeyValue::new(resource::SERVICE_NAME, app_name.to_string()), + KeyValue::new(resource::SERVICE_INSTANCE_ID, node_id.to_string()), + KeyValue::new(resource::SERVICE_VERSION, common_version::version()), + KeyValue::new(resource::PROCESS_PID, std::process::id().to_string()), + ]) + .build(); + + opentelemetry_sdk::trace::SdkTracerProvider::builder() + .with_batch_exporter(build_otlp_exporter(opts)) + .with_sampler(sampler) + .with_resource(resource) + .build() + .tracer("greptimedb") +} + +/// Ensure that the OTLP tracer has been constructed, building it lazily if needed. 
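// --- Example (annotation, not part of the patch) ---------------------------------
// A hedged sketch of how a runtime "enable tracing" switch could combine
// `get_or_init_tracer()` with `TRACE_RELOAD_HANDLE`, building the layer the same way
// the initial layer is built above; the function name and the error handling are
// illustrative assumptions.
fn example_enable_otlp_tracing_at_runtime() {
    let tracer = get_or_init_tracer().expect("trace state must be initialized first");
    let handle = TRACE_RELOAD_HANDLE
        .get()
        .expect("trace reload handle must be initialized first");
    // Swap in an OpenTelemetry layer backed by the (lazily created) tracer.
    handle.reload(Some(tracing_opentelemetry::layer().with_tracer(tracer)));
    // Passing `None` later would stop exporting spans again:
    // handle.reload(None);
}
// ----------------------------------------------------------------------------------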
+pub fn get_or_init_tracer() -> Result { + let state = TRACER.get().ok_or("trace state is not initialized")?; + let mut guard = state.lock().expect("trace state lock poisoned"); + + match &mut *guard { + TraceState::Ready(tracer) => Ok(tracer.clone()), + TraceState::Deferred(context) => { + let tracer = create_tracer(&context.app_name, &context.node_id, &context.logging_opts); + *guard = TraceState::Ready(tracer.clone()); + Ok(tracer) + } + } +} + fn build_otlp_exporter(opts: &LoggingOptions) -> SpanExporter { let protocol = opts .otlp_export_protocol diff --git a/src/common/test-util/src/recordbatch.rs b/src/common/test-util/src/recordbatch.rs index eb666e167a..aa68f79356 100644 --- a/src/common/test-util/src/recordbatch.rs +++ b/src/common/test-util/src/recordbatch.rs @@ -28,7 +28,7 @@ pub async fn check_output_stream(output: OutputData, expected: &str) { _ => unreachable!(), }; let pretty_print = recordbatches.pretty_print().unwrap(); - assert_eq!(pretty_print, expected, "actual: \n{}", pretty_print); + assert_eq!(pretty_print, expected.trim(), "actual: \n{}", pretty_print); } pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) { diff --git a/src/common/time/src/timezone.rs b/src/common/time/src/timezone.rs index fec659248f..41cc1f7842 100644 --- a/src/common/time/src/timezone.rs +++ b/src/common/time/src/timezone.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::fmt::Display; -use std::str::FromStr; use chrono::{FixedOffset, TimeZone}; use chrono_tz::{OffsetComponents, Tz}; @@ -102,7 +101,7 @@ impl Timezone { .parse::() .context(ParseOffsetStrSnafu { raw: tz_string })?; Self::hours_mins_opt(hrs, mins) - } else if let Ok(tz) = Tz::from_str(tz_string) { + } else if let Ok(tz) = Tz::from_str_insensitive(tz_string) { Ok(Self::Named(tz)) } else { ParseTimezoneNameSnafu { raw: tz_string }.fail() @@ -203,6 +202,10 @@ mod tests { Timezone::Named(Tz::Asia__Shanghai), Timezone::from_tz_string("Asia/Shanghai").unwrap() ); + assert_eq!( + Timezone::Named(Tz::Asia__Shanghai), + Timezone::from_tz_string("Asia/ShangHai").unwrap() + ); assert_eq!( Timezone::Named(Tz::UTC), Timezone::from_tz_string("UTC").unwrap() diff --git a/src/common/version/Cargo.toml b/src/common/version/Cargo.toml index 3a8a2a511e..adee41afd7 100644 --- a/src/common/version/Cargo.toml +++ b/src/common/version/Cargo.toml @@ -11,7 +11,7 @@ workspace = true codec = ["dep:serde"] [dependencies] -const_format = "0.2" +const_format.workspace = true serde = { workspace = true, optional = true } shadow-rs = { version = "1.2.1", default-features = false } diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index 3dcffd0ac9..265ede339e 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -30,6 +30,7 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true common-version.workspace = true diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs index 2f2fcd2697..19b4647b8e 100644 --- a/src/datanode/src/config.rs +++ b/src/datanode/src/config.rs @@ -28,7 +28,6 @@ use mito2::config::MitoConfig; pub(crate) use object_store::config::ObjectStoreConfig; use query::options::QueryOptions; use serde::{Deserialize, Serialize}; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::heartbeat_options::HeartbeatOptions; use servers::http::HttpOptions; 
@@ -66,6 +65,7 @@ impl Default for StorageConfig { #[serde(default)] pub struct DatanodeOptions { pub node_id: Option, + pub default_column_prefix: Option, pub workload_types: Vec, pub require_lease_before_startup: bool, pub init_regions_in_background: bool, @@ -81,7 +81,6 @@ pub struct DatanodeOptions { pub region_engine: Vec, pub logging: LoggingOptions, pub enable_telemetry: bool, - pub export_metrics: ExportMetricsOption, pub tracing: TracingOptions, pub query: QueryOptions, pub memory: MemoryOptions, @@ -119,6 +118,7 @@ impl Default for DatanodeOptions { fn default() -> Self { Self { node_id: None, + default_column_prefix: None, workload_types: vec![DatanodeWorkloadType::Hybrid], require_lease_before_startup: false, init_regions_in_background: false, @@ -136,7 +136,6 @@ impl Default for DatanodeOptions { logging: LoggingOptions::default(), heartbeat: HeartbeatOptions::datanode_default(), enable_telemetry: true, - export_metrics: ExportMetricsOption::default(), tracing: TracingOptions::default(), query: QueryOptions::default(), memory: MemoryOptions::default(), diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index ed8b41f0c7..5a1279db9b 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -27,6 +27,8 @@ use common_meta::key::runtime_switch::RuntimeSwitchManager; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; pub use common_procedure::options::ProcedureConfig; +use common_query::prelude::set_default_prefix; +use common_stat::ResourceStatImpl; use common_telemetry::{error, info, warn}; use common_wal::config::DatanodeWalConfig; use common_wal::config::kafka::DatanodeKafkaConfig; @@ -46,7 +48,6 @@ use object_store::manager::{ObjectStoreManager, ObjectStoreManagerRef}; use object_store::util::normalize_dir; use query::QueryEngineFactory; use query::dummy_catalog::{DummyCatalogManager, TableProviderFactoryRef}; -use servers::export_metrics::ExportMetricsTask; use servers::server::ServerHandlers; use snafu::{OptionExt, ResultExt, ensure}; use store_api::path_utils::WAL_DIR; @@ -58,9 +59,9 @@ use tokio::sync::Notify; use crate::config::{DatanodeOptions, RegionEngineConfig, StorageConfig}; use crate::error::{ - self, BuildMetricEngineSnafu, BuildMitoEngineSnafu, CreateDirSnafu, GetMetadataSnafu, - MissingCacheSnafu, MissingNodeIdSnafu, OpenLogStoreSnafu, Result, ShutdownInstanceSnafu, - ShutdownServerSnafu, StartServerSnafu, + self, BuildDatanodeSnafu, BuildMetricEngineSnafu, BuildMitoEngineSnafu, CreateDirSnafu, + GetMetadataSnafu, MissingCacheSnafu, MissingNodeIdSnafu, OpenLogStoreSnafu, Result, + ShutdownInstanceSnafu, ShutdownServerSnafu, StartServerSnafu, }; use crate::event_listener::{ NoopRegionServerEventListener, RegionServerEventListenerRef, RegionServerEventReceiver, @@ -82,7 +83,6 @@ pub struct Datanode { greptimedb_telemetry_task: Arc, leases_notifier: Option>, plugins: Plugins, - export_metrics_task: Option, } impl Datanode { @@ -94,10 +94,6 @@ impl Datanode { self.start_telemetry(); - if let Some(t) = self.export_metrics_task.as_ref() { - t.start(None).context(StartServerSnafu)? 
- } - self.services.start_all().await.context(StartServerSnafu) } @@ -219,6 +215,9 @@ impl DatanodeBuilder { pub async fn build(mut self) -> Result { let node_id = self.opts.node_id.context(MissingNodeIdSnafu)?; + set_default_prefix(self.opts.default_column_prefix.as_deref()) + .map_err(BoxedError::new) + .context(BuildDatanodeSnafu)?; let meta_client = self.meta_client.take(); @@ -282,6 +281,9 @@ impl DatanodeBuilder { open_all_regions.await?; } + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + let heartbeat_task = if let Some(meta_client) = meta_client { Some( HeartbeatTask::try_new( @@ -290,6 +292,7 @@ impl DatanodeBuilder { meta_client, cache_registry, self.plugins.clone(), + Arc::new(resource_stat), ) .await?, ) @@ -310,10 +313,6 @@ impl DatanodeBuilder { None }; - let export_metrics_task = - ExportMetricsTask::try_new(&self.opts.export_metrics, Some(&self.plugins)) - .context(StartServerSnafu)?; - Ok(Datanode { services: ServerHandlers::default(), heartbeat_task, @@ -322,7 +321,6 @@ impl DatanodeBuilder { region_event_receiver, leases_notifier, plugins: self.plugins.clone(), - export_metrics_task, }) } @@ -513,6 +511,7 @@ impl DatanodeBuilder { file_ref_manager, partition_expr_fetcher.clone(), plugins, + opts.max_concurrent_queries, ); #[cfg(feature = "enterprise")] @@ -555,6 +554,7 @@ impl DatanodeBuilder { file_ref_manager, partition_expr_fetcher, plugins, + opts.max_concurrent_queries, ); #[cfg(feature = "enterprise")] @@ -576,6 +576,7 @@ impl DatanodeBuilder { file_ref_manager, partition_expr_fetcher.clone(), plugins, + opts.max_concurrent_queries, ); #[cfg(feature = "enterprise")] diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index a2e6f674e2..74bddbaede 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -165,6 +165,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to build datanode"))] + BuildDatanode { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Failed to build http client"))] BuildHttpClient { #[snafu(implicit)] @@ -315,6 +322,21 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to run gc for region {}", region_id))] + GcMitoEngine { + region_id: RegionId, + source: mito2::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Invalid arguments for GC: {}", msg))] + InvalidGcArgs { + msg: String, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to list SST entries from storage"))] ListStorageSsts { #[snafu(implicit)] @@ -429,7 +451,8 @@ impl ErrorExt for Error { | MissingRequiredField { .. } | RegionEngineNotFound { .. } | ParseAddr { .. } - | TomlFormat { .. } => StatusCode::InvalidArguments, + | TomlFormat { .. } + | BuildDatanode { .. } => StatusCode::InvalidArguments, PayloadNotExist { .. } | Unexpected { .. } @@ -438,9 +461,11 @@ impl ErrorExt for Error { AsyncTaskExecute { source, .. } => source.status_code(), - CreateDir { .. } | RemoveDir { .. } | ShutdownInstance { .. } | DataFusion { .. } => { - StatusCode::Internal - } + CreateDir { .. } + | RemoveDir { .. } + | ShutdownInstance { .. } + | DataFusion { .. } + | InvalidGcArgs { .. } => StatusCode::Internal, RegionNotFound { .. } => StatusCode::RegionNotFound, RegionNotReady { .. } => StatusCode::RegionNotReady, @@ -458,7 +483,7 @@ impl ErrorExt for Error { StopRegionEngine { source, .. } => source.status_code(), FindLogicalRegions { source, .. 
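The error.rs hunk above introduces BuildDatanode, GcMitoEngine and InvalidGcArgs and wires them into the status-code mapping, with GcMitoEngine delegating to its mito source. A reduced snafu sketch of that shape follows; the u64 region id, the io::Error source and the string status codes are simplifications, not the project's real types.

use snafu::{ResultExt, Snafu};

// Reduced model of the new error variants: one wraps a lower-level source error,
// the other carries only a message.
#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Failed to run gc for region {}", region_id))]
    GcRegion {
        region_id: u64,
        source: std::io::Error,
    },

    #[snafu(display("Invalid arguments for GC: {}", msg))]
    InvalidGcArgs { msg: String },
}

fn status_code(err: &Error) -> &'static str {
    match err {
        // The real ErrorExt impl delegates GcMitoEngine to source.status_code();
        // here we just label it.
        Error::GcRegion { .. } => "delegated-to-source",
        Error::InvalidGcArgs { .. } => "Internal",
    }
}

fn run_gc(region_id: u64) -> Result<(), Error> {
    // Simulate a storage-layer failure bubbling up from the engine.
    let io_result: Result<(), std::io::Error> = Err(std::io::Error::new(
        std::io::ErrorKind::NotFound,
        "manifest missing",
    ));
    io_result.context(GcRegionSnafu { region_id })
}

fn main() {
    let err = run_gc(1).unwrap_err();
    println!("{err}: status = {}", status_code(&err));
}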
} => source.status_code(), - BuildMitoEngine { source, .. } => source.status_code(), + BuildMitoEngine { source, .. } | GcMitoEngine { source, .. } => source.status_code(), BuildMetricEngine { source, .. } => source.status_code(), ListStorageSsts { source, .. } => source.status_code(), ConcurrentQueryLimiterClosed { .. } | ConcurrentQueryLimiterTimeout { .. } => { diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 9c059e5698..33ba648830 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -20,7 +20,6 @@ use std::time::Duration; use api::v1::meta::heartbeat_request::NodeWorkloads; use api::v1::meta::{DatanodeWorkloads, HeartbeatRequest, NodeInfo, Peer, RegionRole, RegionStat}; use common_base::Plugins; -use common_config::utils::ResourceSpec; use common_meta::cache_invalidator::CacheInvalidatorRef; use common_meta::datanode::REGION_STATISTIC_KEY; use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS; @@ -31,19 +30,20 @@ use common_meta::heartbeat::handler::{ }; use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef}; use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message; +use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, trace, warn}; use common_workload::DatanodeWorkloadType; use meta_client::MetaClientRef; use meta_client::client::{HeartbeatSender, MetaClient}; use servers::addrs; -use snafu::ResultExt; +use snafu::{OptionExt as _, ResultExt}; use tokio::sync::{Notify, mpsc}; use tokio::time::Instant; use self::handler::RegionHeartbeatResponseHandler; use crate::alive_keeper::{CountdownTaskHandlerExtRef, RegionAliveKeeper}; use crate::config::DatanodeOptions; -use crate::error::{self, MetaClientInitSnafu, Result}; +use crate::error::{self, MetaClientInitSnafu, RegionEngineNotFoundSnafu, Result}; use crate::event_listener::RegionServerEventReceiver; use crate::metrics::{self, HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT}; use crate::region_server::RegionServer; @@ -63,7 +63,7 @@ pub struct HeartbeatTask { interval: u64, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, region_alive_keeper: Arc, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, } impl Drop for HeartbeatTask { @@ -80,6 +80,7 @@ impl HeartbeatTask { meta_client: MetaClientRef, cache_invalidator: CacheInvalidatorRef, plugins: Plugins, + resource_stat: ResourceStatRef, ) -> Result { let countdown_task_handler_ext = plugins.get::(); let region_alive_keeper = Arc::new(RegionAliveKeeper::new( @@ -109,7 +110,7 @@ impl HeartbeatTask { interval: opts.heartbeat.interval.as_millis() as u64, resp_handler_executor, region_alive_keeper, - resource_spec: Default::default(), + resource_stat, }) } @@ -186,6 +187,7 @@ impl HeartbeatTask { .context(error::HandleHeartbeatResponseSnafu) } + #[allow(deprecated)] /// Start heartbeat task, spawn background task. pub async fn start( &self, @@ -237,14 +239,21 @@ impl HeartbeatTask { self.region_alive_keeper.start(Some(event_receiver)).await?; let mut last_sent = Instant::now(); - let cpus = self.resource_spec.cpus as u32; - let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes(); + let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); + let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); + let resource_stat = self.resource_stat.clone(); + let gc_limiter = self + .region_server + .mito_engine() + .context(RegionEngineNotFoundSnafu { name: "mito" })? 
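The heartbeat hunk above swaps the static ResourceSpec for a ResourceStatRef handle whose totals are read once before the loop and whose usage is sampled on every tick. The trait below is only a guess at the common-stat surface inferred from those call sites: the method names come from the diff, while the i64 return types and the FixedStat provider are assumptions.

use std::sync::Arc;

// Guessed shape of the resource-stat handle used by the heartbeat task.
trait ResourceStat: Send + Sync {
    fn get_total_cpu_millicores(&self) -> i64;
    fn get_total_memory_bytes(&self) -> i64;
    fn get_cpu_usage_millicores(&self) -> i64;
    fn get_memory_usage_bytes(&self) -> i64;
}

type ResourceStatRef = Arc<dyn ResourceStat>;

// A fake provider returning fixed numbers, standing in for ResourceStatImpl.
struct FixedStat;

impl ResourceStat for FixedStat {
    fn get_total_cpu_millicores(&self) -> i64 {
        8_000
    }
    fn get_total_memory_bytes(&self) -> i64 {
        16 * 1024 * 1024 * 1024
    }
    fn get_cpu_usage_millicores(&self) -> i64 {
        1_250
    }
    fn get_memory_usage_bytes(&self) -> i64 {
        4 * 1024 * 1024 * 1024
    }
}

// Totals are captured once; usage is re-read on each heartbeat tick.
fn sample_tick(stat: &ResourceStatRef) -> (i64, i64) {
    (stat.get_cpu_usage_millicores(), stat.get_memory_usage_bytes())
}

fn main() {
    let stat: ResourceStatRef = Arc::new(FixedStat);
    let (cpu, mem) = sample_tick(&stat);
    println!(
        "usage {cpu}m / {mem}B of {}m / {}B",
        stat.get_total_cpu_millicores(),
        stat.get_total_memory_bytes()
    );
}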
+ .gc_limiter(); common_runtime::spawn_hb(async move { let sleep = tokio::time::sleep(Duration::from_millis(0)); tokio::pin!(sleep); let build_info = common_version::build_info(); + let heartbeat_request = HeartbeatRequest { peer: self_peer, node_epoch, @@ -252,8 +261,13 @@ impl HeartbeatTask { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms: node_epoch, - cpus, - memory_bytes, + total_cpu_millicores, + total_memory_bytes, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto. + cpus: total_cpu_millicores as u32, + memory_bytes: total_memory_bytes as u64, hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -275,8 +289,13 @@ impl HeartbeatTask { if let Some(message) = message { match outgoing_message_to_mailbox_message(message) { Ok(message) => { + let mut extensions = heartbeat_request.extensions.clone(); + let gc_stat = gc_limiter.gc_stat(); + gc_stat.into_extensions(&mut extensions); + let req = HeartbeatRequest { mailbox_message: Some(message), + extensions, ..heartbeat_request.clone() }; HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc(); @@ -297,12 +316,24 @@ impl HeartbeatTask { let topic_stats = region_server_clone.topic_stats(); let now = Instant::now(); let duration_since_epoch = (now - epoch).as_millis() as u64; - let req = HeartbeatRequest { + + let mut extensions = heartbeat_request.extensions.clone(); + let gc_stat = gc_limiter.gc_stat(); + gc_stat.into_extensions(&mut extensions); + + let mut req = HeartbeatRequest { region_stats, topic_stats, duration_since_epoch, + extensions, ..heartbeat_request.clone() }; + + if let Some(info) = req.info.as_mut() { + info.cpu_usage_millicores = resource_stat.get_cpu_usage_millicores(); + info.memory_usage_bytes = resource_stat.get_memory_usage_bytes(); + } + sleep.as_mut().reset(now + Duration::from_millis(interval)); Some(req) } diff --git a/src/datanode/src/heartbeat/handler.rs b/src/datanode/src/heartbeat/handler.rs index 14a671a14b..8954513653 100644 --- a/src/datanode/src/heartbeat/handler.rs +++ b/src/datanode/src/heartbeat/handler.rs @@ -13,60 +13,69 @@ // limitations under the License. use async_trait::async_trait; -use common_meta::RegionIdent; use common_meta::error::{InvalidHeartbeatResponseSnafu, Result as MetaResult}; use common_meta::heartbeat::handler::{ HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext, }; use common_meta::instruction::{Instruction, InstructionReply}; use common_telemetry::error; -use futures::future::BoxFuture; use snafu::OptionExt; -use store_api::storage::RegionId; +use store_api::storage::GcReport; mod close_region; mod downgrade_region; +mod file_ref; mod flush_region; +mod gc_worker; mod open_region; mod upgrade_region; +use crate::heartbeat::handler::close_region::CloseRegionsHandler; +use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler; +use crate::heartbeat::handler::file_ref::GetFileRefsHandler; +use crate::heartbeat::handler::flush_region::FlushRegionsHandler; +use crate::heartbeat::handler::gc_worker::GcRegionsHandler; +use crate::heartbeat::handler::open_region::OpenRegionsHandler; +use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; use crate::heartbeat::task_tracker::TaskTracker; use crate::region_server::RegionServer; -/// Handler for [Instruction::OpenRegion] and [Instruction::CloseRegion]. +/// The handler for [`Instruction`]s. 
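The loop above piggybacks GC statistics on every heartbeat through the request's extensions map (gc_stat.into_extensions(&mut extensions)). The sketch below illustrates that round trip with a byte-valued map; the key name and the little-endian encoding are illustrative assumptions, not the real wire format.

use std::collections::HashMap;

// Sketch of the into_extensions idea: piggyback extra per-node stats on the
// heartbeat via its string-keyed extension map.
#[derive(Debug, PartialEq)]
struct GcStat {
    running_gc_tasks: u64,
}

impl GcStat {
    fn into_extensions(self, extensions: &mut HashMap<String, Vec<u8>>) {
        // Key and encoding are made up for the example.
        extensions.insert(
            "gc_stat.running_gc_tasks".to_string(),
            self.running_gc_tasks.to_le_bytes().to_vec(),
        );
    }

    fn from_extensions(extensions: &HashMap<String, Vec<u8>>) -> Option<Self> {
        let raw = extensions.get("gc_stat.running_gc_tasks")?;
        Some(Self {
            running_gc_tasks: u64::from_le_bytes(raw.as_slice().try_into().ok()?),
        })
    }
}

fn main() {
    let mut extensions = HashMap::new();
    GcStat { running_gc_tasks: 2 }.into_extensions(&mut extensions);
    assert_eq!(
        GcStat::from_extensions(&extensions),
        Some(GcStat { running_gc_tasks: 2 })
    );
}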
#[derive(Clone)] pub struct RegionHeartbeatResponseHandler { region_server: RegionServer, - catchup_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>, open_region_parallelism: usize, + gc_tasks: TaskTracker, } -/// Handler of the instruction. -pub type InstructionHandler = - Box BoxFuture<'static, Option> + Send>; +#[async_trait::async_trait] +pub trait InstructionHandler: Send + Sync { + type Instruction; + async fn handle( + &self, + ctx: &HandlerContext, + instruction: Self::Instruction, + ) -> Option; +} #[derive(Clone)] pub struct HandlerContext { region_server: RegionServer, - catchup_tasks: TaskTracker<()>, downgrade_tasks: TaskTracker<()>, flush_tasks: TaskTracker<()>, + gc_tasks: TaskTracker, } impl HandlerContext { - fn region_ident_to_region_id(region_ident: &RegionIdent) -> RegionId { - RegionId::new(region_ident.table_id, region_ident.region_number) - } - #[cfg(test)] pub fn new_for_test(region_server: RegionServer) -> Self { Self { region_server, - catchup_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(), + gc_tasks: TaskTracker::new(), } } } @@ -76,11 +85,11 @@ impl RegionHeartbeatResponseHandler { pub fn new(region_server: RegionServer) -> Self { Self { region_server, - catchup_tasks: TaskTracker::new(), downgrade_tasks: TaskTracker::new(), flush_tasks: TaskTracker::new(), // Default to half of the number of CPUs. open_region_parallelism: (num_cpus::get() / 2).max(1), + gc_tasks: TaskTracker::new(), } } @@ -90,54 +99,114 @@ impl RegionHeartbeatResponseHandler { self } - /// Builds the [InstructionHandler]. - fn build_handler(&self, instruction: Instruction) -> MetaResult { + fn build_handler(&self, instruction: &Instruction) -> MetaResult> { match instruction { - Instruction::OpenRegions(open_regions) => { - let open_region_parallelism = self.open_region_parallelism; - Ok(Box::new(move |handler_context| { - handler_context - .handle_open_regions_instruction(open_regions, open_region_parallelism) - })) - } - Instruction::CloseRegions(close_regions) => Ok(Box::new(move |handler_context| { - handler_context.handle_close_regions_instruction(close_regions) - })), - Instruction::DowngradeRegion(downgrade_region) => { - Ok(Box::new(move |handler_context| { - handler_context.handle_downgrade_region_instruction(downgrade_region) - })) - } - Instruction::UpgradeRegion(upgrade_region) => Ok(Box::new(move |handler_context| { - handler_context.handle_upgrade_region_instruction(upgrade_region) - })), + Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler.into())), + Instruction::OpenRegions(_) => Ok(Box::new( + OpenRegionsHandler { + open_region_parallelism: self.open_region_parallelism, + } + .into(), + )), + Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())), + Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())), + Instruction::UpgradeRegions(_) => Ok(Box::new( + UpgradeRegionsHandler { + upgrade_region_parallelism: self.open_region_parallelism, + } + .into(), + )), + Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())), + Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), - Instruction::FlushRegions(flush_regions) => Ok(Box::new(move |handler_context| { - handler_context.handle_flush_regions_instruction(flush_regions) - })), } } } +#[allow(clippy::enum_variant_names)] +pub enum InstructionHandlers { + 
CloseRegions(CloseRegionsHandler), + OpenRegions(OpenRegionsHandler), + FlushRegions(FlushRegionsHandler), + DowngradeRegions(DowngradeRegionsHandler), + UpgradeRegions(UpgradeRegionsHandler), + GetFileRefs(GetFileRefsHandler), + GcRegions(GcRegionsHandler), +} + +macro_rules! impl_from_handler { + ($($handler:ident => $variant:ident),*) => { + $( + impl From<$handler> for InstructionHandlers { + fn from(handler: $handler) -> Self { + InstructionHandlers::$variant(handler) + } + } + )* + }; +} + +impl_from_handler!( + CloseRegionsHandler => CloseRegions, + OpenRegionsHandler => OpenRegions, + FlushRegionsHandler => FlushRegions, + DowngradeRegionsHandler => DowngradeRegions, + UpgradeRegionsHandler => UpgradeRegions, + GetFileRefsHandler => GetFileRefs, + GcRegionsHandler => GcRegions +); + +macro_rules! dispatch_instr { + ( + $( $instr_variant:ident => $handler_variant:ident ),* $(,)? + ) => { + impl InstructionHandlers { + pub async fn handle( + &self, + ctx: &HandlerContext, + instruction: Instruction, + ) -> Option { + match (self, instruction) { + $( + ( + InstructionHandlers::$handler_variant(handler), + Instruction::$instr_variant(instr), + ) => handler.handle(ctx, instr).await, + )* + // Safety: must be used in pairs with `build_handler`. + _ => unreachable!(), + } + } + /// Check whether this instruction is acceptable by any handler. + pub fn is_acceptable(instruction: &Instruction) -> bool { + matches!( + instruction, + $( + Instruction::$instr_variant { .. } + )|* + ) + } + } + }; +} + +dispatch_instr!( + CloseRegions => CloseRegions, + OpenRegions => OpenRegions, + FlushRegions => FlushRegions, + DowngradeRegions => DowngradeRegions, + UpgradeRegions => UpgradeRegions, + GetFileRefs => GetFileRefs, + GcRegions => GcRegions, +); + #[async_trait] impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { fn is_acceptable(&self, ctx: &HeartbeatResponseHandlerContext) -> bool { - matches!(ctx.incoming_message.as_ref(), |Some(( - _, - Instruction::DowngradeRegion { .. }, - ))| Some(( - _, - Instruction::UpgradeRegion { .. } - )) | Some(( - _, - Instruction::FlushRegions { .. } - )) | Some(( - _, - Instruction::OpenRegions { .. } - )) | Some(( - _, - Instruction::CloseRegions { .. 
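impl_from_handler! and dispatch_instr! above generate the From conversions, the (handler, instruction) dispatch match and is_acceptable from a single variant table. Here is a condensed, self-contained version of that macro pattern with two toy variants; the handler and instruction names are invented for the example.

// Toy instruction set; the real table pairs Instruction variants with handler structs.
enum Instruction {
    Open(u32),
    Close(u32),
}

struct OpenHandler;
struct CloseHandler;

impl OpenHandler {
    fn handle(&self, region: u32) -> String {
        format!("opened region {region}")
    }
}
impl CloseHandler {
    fn handle(&self, region: u32) -> String {
        format!("closed region {region}")
    }
}

enum Handlers {
    Open(OpenHandler),
    Close(CloseHandler),
}

macro_rules! dispatch {
    ($($instr:ident => $handler:ident),* $(,)?) => {
        impl Handlers {
            fn handle(&self, instruction: Instruction) -> String {
                match (self, instruction) {
                    $((Handlers::$handler(h), Instruction::$instr(arg)) => h.handle(arg),)*
                    // Safety: callers build the matching handler first, as build_handler does.
                    _ => unreachable!("handler/instruction mismatch"),
                }
            }

            // The acceptance check is generated from the same table, so it can never
            // drift from the dispatch arms.
            fn is_acceptable(instruction: &Instruction) -> bool {
                matches!(instruction, $(Instruction::$instr { .. })|*)
            }
        }
    };
}

dispatch!(Open => Open, Close => Close);

fn main() {
    assert!(Handlers::is_acceptable(&Instruction::Open(1)));
    println!("{}", Handlers::Open(OpenHandler).handle(Instruction::Open(1)));
    println!("{}", Handlers::Close(CloseHandler).handle(Instruction::Close(2)));
}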
} - ))) + if let Some((_, instruction)) = ctx.incoming_message.as_ref() { + return InstructionHandlers::is_acceptable(instruction); + } + false } async fn handle(&self, ctx: &mut HeartbeatResponseHandlerContext) -> MetaResult { @@ -148,18 +217,22 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { let mailbox = ctx.mailbox.clone(); let region_server = self.region_server.clone(); - let catchup_tasks = self.catchup_tasks.clone(); let downgrade_tasks = self.downgrade_tasks.clone(); let flush_tasks = self.flush_tasks.clone(); - let handler = self.build_handler(instruction)?; + let gc_tasks = self.gc_tasks.clone(); + let handler = self.build_handler(&instruction)?; let _handle = common_runtime::spawn_global(async move { - let reply = handler(HandlerContext { - region_server, - catchup_tasks, - downgrade_tasks, - flush_tasks, - }) - .await; + let reply = handler + .handle( + &HandlerContext { + region_server, + downgrade_tasks, + flush_tasks, + gc_tasks, + }, + instruction, + ) + .await; if let Some(reply) = reply && let Err(e) = mailbox.send((meta, reply)).await @@ -179,6 +252,7 @@ mod tests { use std::sync::Arc; use std::time::Duration; + use common_meta::RegionIdent; use common_meta::heartbeat::mailbox::{ HeartbeatMailbox, IncomingMessage, MailboxRef, MessageMeta, }; @@ -249,20 +323,20 @@ mod tests { ); // Downgrade region - let instruction = Instruction::DowngradeRegion(DowngradeRegion { + let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion { region_id: RegionId::new(2048, 1), flush_timeout: Some(Duration::from_secs(1)), - }); + }]); assert!( heartbeat_handler .is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction))) ); // Upgrade region - let instruction = Instruction::UpgradeRegion(UpgradeRegion { + let instruction = Instruction::UpgradeRegions(vec![UpgradeRegion { region_id, ..Default::default() - }); + }]); assert!( heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction))) ); @@ -447,10 +521,10 @@ mod tests { // Should be ok, if we try to downgrade it twice. for _ in 0..2 { let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0"); - let instruction = Instruction::DowngradeRegion(DowngradeRegion { + let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion { region_id, flush_timeout: Some(Duration::from_secs(1)), - }); + }]); let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction)); let control = heartbeat_handler.handle(&mut ctx).await.unwrap(); @@ -458,33 +532,27 @@ mod tests { let (_, reply) = heartbeat_env.receiver.recv().await.unwrap(); - if let InstructionReply::DowngradeRegion(reply) = reply { - assert!(reply.exists); - assert!(reply.error.is_none()); - assert_eq!(reply.last_entry_id.unwrap(), 0); - } else { - unreachable!() - } + let reply = &reply.expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.is_none()); + assert_eq!(reply.last_entry_id.unwrap(), 0); } // Downgrades a not exists region. 
let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0"); - let instruction = Instruction::DowngradeRegion(DowngradeRegion { + let instruction = Instruction::DowngradeRegions(vec![DowngradeRegion { region_id: RegionId::new(2048, 1), flush_timeout: Some(Duration::from_secs(1)), - }); + }]); let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction)); let control = heartbeat_handler.handle(&mut ctx).await.unwrap(); assert_matches!(control, HandleControl::Continue); let (_, reply) = heartbeat_env.receiver.recv().await.unwrap(); - if let InstructionReply::DowngradeRegion(reply) = reply { - assert!(!reply.exists); - assert!(reply.error.is_none()); - assert!(reply.last_entry_id.is_none()); - } else { - unreachable!() - } + let reply = reply.expect_downgrade_regions_reply(); + assert!(!reply[0].exists); + assert!(reply[0].error.is_none()); + assert!(reply[0].last_entry_id.is_none()); } } diff --git a/src/datanode/src/heartbeat/handler/close_region.rs b/src/datanode/src/heartbeat/handler/close_region.rs index c942642731..770d6a75cc 100644 --- a/src/datanode/src/heartbeat/handler/close_region.rs +++ b/src/datanode/src/heartbeat/handler/close_region.rs @@ -16,56 +16,61 @@ use common_meta::RegionIdent; use common_meta::instruction::{InstructionReply, SimpleReply}; use common_telemetry::warn; use futures::future::join_all; -use futures_util::future::BoxFuture; use store_api::region_request::{RegionCloseRequest, RegionRequest}; +use store_api::storage::RegionId; use crate::error; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; -impl HandlerContext { - pub(crate) fn handle_close_regions_instruction( - self, - region_idents: Vec, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let region_ids = region_idents - .into_iter() - .map(|region_ident| Self::region_ident_to_region_id(®ion_ident)) - .collect::>(); +#[derive(Debug, Clone, Copy, Default)] +pub struct CloseRegionsHandler; - let futs = region_ids.iter().map(|region_id| { - self.region_server - .handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {})) - }); +#[async_trait::async_trait] +impl InstructionHandler for CloseRegionsHandler { + type Instruction = Vec; - let results = join_all(futs).await; + async fn handle( + &self, + ctx: &HandlerContext, + region_idents: Self::Instruction, + ) -> Option { + let region_ids = region_idents + .into_iter() + .map(|region_ident| RegionId::new(region_ident.table_id, region_ident.region_number)) + .collect::>(); - let mut errors = vec![]; - for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) { - match result { - Ok(_) => (), - Err(error::Error::RegionNotFound { .. }) => { - warn!( - "Received a close regions instruction from meta, but target region:{} is not found.", - region_id - ); - } - Err(err) => errors.push(format!("region:{region_id}: {err:?}")), + let futs = region_ids.iter().map(|region_id| { + ctx.region_server + .handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {})) + }); + + let results = join_all(futs).await; + + let mut errors = vec![]; + for (region_id, result) in region_ids.into_iter().zip(results.into_iter()) { + match result { + Ok(_) => (), + Err(error::Error::RegionNotFound { .. 
}) => { + warn!( + "Received a close regions instruction from meta, but target region:{} is not found.", + region_id + ); } + Err(err) => errors.push(format!("region:{region_id}: {err:?}")), } + } - if errors.is_empty() { - return Some(InstructionReply::CloseRegions(SimpleReply { - result: true, - error: None, - })); - } + if errors.is_empty() { + return Some(InstructionReply::CloseRegions(SimpleReply { + result: true, + error: None, + })); + } - Some(InstructionReply::CloseRegions(SimpleReply { - result: false, - error: Some(errors.join("; ")), - })) - }) + Some(InstructionReply::CloseRegions(SimpleReply { + result: false, + error: Some(errors.join("; ")), + })) } } diff --git a/src/datanode/src/heartbeat/handler/downgrade_region.rs b/src/datanode/src/heartbeat/handler/downgrade_region.rs index 06d3ab046e..779023a52f 100644 --- a/src/datanode/src/heartbeat/handler/downgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/downgrade_region.rs @@ -12,209 +12,242 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_meta::instruction::{DowngradeRegion, DowngradeRegionReply, InstructionReply}; +use common_meta::instruction::{ + DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, InstructionReply, +}; use common_telemetry::tracing::info; use common_telemetry::{error, warn}; -use futures_util::future::BoxFuture; +use futures::future::join_all; use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState}; use store_api::region_request::{RegionFlushRequest, RegionRequest}; use store_api::storage::RegionId; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::heartbeat::task_tracker::WaitResult; -impl HandlerContext { - async fn downgrade_to_follower_gracefully( +#[derive(Debug, Clone, Copy, Default)] +pub struct DowngradeRegionsHandler; + +impl DowngradeRegionsHandler { + async fn handle_downgrade_region( + ctx: &HandlerContext, + DowngradeRegion { + region_id, + flush_timeout, + }: DowngradeRegion, + ) -> DowngradeRegionReply { + let Some(writable) = ctx.region_server.is_region_leader(region_id) else { + warn!("Region: {region_id} is not found"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: false, + error: None, + }; + }; + + let region_server_moved = ctx.region_server.clone(); + + // Ignores flush request + if !writable { + warn!( + "Region: {region_id} is not writable, flush_timeout: {:?}", + flush_timeout + ); + return ctx.downgrade_to_follower_gracefully(region_id).await; + } + + // If flush_timeout is not set, directly convert region to follower. + let Some(flush_timeout) = flush_timeout else { + return ctx.downgrade_to_follower_gracefully(region_id).await; + }; + + // Sets region to downgrading, + // the downgrading region will reject all write requests. + // However, the downgrading region will still accept read, flush requests. + match ctx + .region_server + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::DowngradingLeader) + .await + { + Ok(SetRegionRoleStateResponse::Success { .. 
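CloseRegionsHandler above is the first implementor of the new InstructionHandler trait, whose associated Instruction type names the payload each handler accepts. A minimal compilable version of that trait follows, assuming tokio and async-trait as dependencies; Ctx and the string reply stand in for HandlerContext and InstructionReply.

use async_trait::async_trait;

// Stand-in for HandlerContext.
struct Ctx {
    node: &'static str,
}

// Reduced form of the handler trait: the associated type keeps each handler's
// payload strongly typed, which is what the dispatch enum relies on.
#[async_trait]
trait InstructionHandler: Send + Sync {
    type Instruction;
    async fn handle(&self, ctx: &Ctx, instruction: Self::Instruction) -> Option<String>;
}

struct CloseRegions;

#[async_trait]
impl InstructionHandler for CloseRegions {
    type Instruction = Vec<u32>;

    async fn handle(&self, ctx: &Ctx, regions: Self::Instruction) -> Option<String> {
        Some(format!("{} closed {} regions", ctx.node, regions.len()))
    }
}

#[tokio::main]
async fn main() {
    let reply = CloseRegions
        .handle(&Ctx { node: "dn-1" }, vec![1, 2, 3])
        .await;
    println!("{}", reply.unwrap());
}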
}) => {} + Ok(SetRegionRoleStateResponse::NotFound) => { + warn!("Region: {region_id} is not found"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: false, + error: None, + }; + } + Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => { + error!(err; "Failed to convert region to downgrading leader - invalid transition"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!("{err:?}")), + }; + } + Err(err) => { + error!(err; "Failed to convert region to downgrading leader"); + return DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!("{err:?}")), + }; + } + } + + let register_result = ctx + .downgrade_tasks + .try_register( + region_id, + Box::pin(async move { + info!("Flush region: {region_id} before converting region to follower"); + region_server_moved + .handle_request( + region_id, + RegionRequest::Flush(RegionFlushRequest { + row_group_size: None, + }), + ) + .await?; + + Ok(()) + }), + ) + .await; + + if register_result.is_busy() { + warn!("Another flush task is running for the region: {region_id}"); + } + + let mut watcher = register_result.into_watcher(); + let result = ctx.downgrade_tasks.wait(&mut watcher, flush_timeout).await; + + match result { + WaitResult::Timeout => DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!( + "Flush region timeout, region: {region_id}, timeout: {:?}", + flush_timeout + )), + }, + WaitResult::Finish(Ok(_)) => ctx.downgrade_to_follower_gracefully(region_id).await, + WaitResult::Finish(Err(err)) => DowngradeRegionReply { + region_id, + last_entry_id: None, + metadata_last_entry_id: None, + exists: true, + error: Some(format!("{err:?}")), + }, + } + } +} + +#[async_trait::async_trait] +impl InstructionHandler for DowngradeRegionsHandler { + type Instruction = Vec; + + async fn handle( &self, - region_id: RegionId, + ctx: &HandlerContext, + downgrade_regions: Self::Instruction, ) -> Option { + let futures = downgrade_regions + .into_iter() + .map(|downgrade_region| Self::handle_downgrade_region(ctx, downgrade_region)); + // Join all futures; parallelism is governed by the underlying flush scheduler. 
+ let results = join_all(futures).await; + + Some(InstructionReply::DowngradeRegions( + DowngradeRegionsReply::new(results), + )) + } +} + +impl HandlerContext { + async fn downgrade_to_follower_gracefully(&self, region_id: RegionId) -> DowngradeRegionReply { match self .region_server .set_region_role_state_gracefully(region_id, SettableRegionRoleState::Follower) .await { - Ok(SetRegionRoleStateResponse::Success(success)) => { - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: success.last_entry_id(), - metadata_last_entry_id: success.metadata_last_entry_id(), - exists: true, - error: None, - })) - } + Ok(SetRegionRoleStateResponse::Success(success)) => DowngradeRegionReply { + region_id, + last_entry_id: success.last_entry_id(), + metadata_last_entry_id: success.metadata_last_entry_id(), + exists: true, + error: None, + }, Ok(SetRegionRoleStateResponse::NotFound) => { warn!("Region: {region_id} is not found"); - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { + DowngradeRegionReply { + region_id, last_entry_id: None, metadata_last_entry_id: None, exists: false, error: None, - })) + } } Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => { error!(err; "Failed to convert region to follower - invalid transition"); - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { + DowngradeRegionReply { + region_id, last_entry_id: None, metadata_last_entry_id: None, exists: true, error: Some(format!("{err:?}")), - })) + } } Err(err) => { error!(err; "Failed to convert region to {}", SettableRegionRoleState::Follower); - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { + DowngradeRegionReply { + region_id, last_entry_id: None, metadata_last_entry_id: None, exists: true, error: Some(format!("{err:?}")), - })) + } } } } - - pub(crate) fn handle_downgrade_region_instruction( - self, - DowngradeRegion { - region_id, - flush_timeout, - }: DowngradeRegion, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let Some(writable) = self.region_server.is_region_leader(region_id) else { - warn!("Region: {region_id} is not found"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: false, - error: None, - })); - }; - - let region_server_moved = self.region_server.clone(); - - // Ignores flush request - if !writable { - warn!( - "Region: {region_id} is not writable, flush_timeout: {:?}", - flush_timeout - ); - return self.downgrade_to_follower_gracefully(region_id).await; - } - - // If flush_timeout is not set, directly convert region to follower. - let Some(flush_timeout) = flush_timeout else { - return self.downgrade_to_follower_gracefully(region_id).await; - }; - - // Sets region to downgrading, - // the downgrading region will reject all write requests. - // However, the downgrading region will still accept read, flush requests. - match self - .region_server - .set_region_role_state_gracefully( - region_id, - SettableRegionRoleState::DowngradingLeader, - ) - .await - { - Ok(SetRegionRoleStateResponse::Success { .. 
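Both the downgrade path above and the new GC handler funnel their work through TaskTracker: try_register either starts the flush or GC task or reports that one is already running, and the caller waits on a watcher with a timeout. The sketch below is a toy single-file version of that idea built on tokio's watch channel; the real TaskTracker API (RegisterResult, is_busy, into_watcher, wait_until_finish) is richer than this.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use tokio::sync::watch;

type TaskResult = Option<Result<(), String>>;

// Toy per-region task tracker: the first caller spawns the work, later callers
// for the same region reuse the same completion channel instead of duplicating it.
#[derive(Clone, Default)]
struct TaskTracker {
    running: Arc<Mutex<HashMap<u64, watch::Receiver<TaskResult>>>>,
}

impl TaskTracker {
    fn try_register<F>(&self, region_id: u64, fut: F) -> watch::Receiver<TaskResult>
    where
        F: std::future::Future<Output = Result<(), String>> + Send + 'static,
    {
        let mut running = self.running.lock().unwrap();
        if let Some(rx) = running.get(&region_id) {
            // Busy: another task is already working on this region, just watch it.
            return rx.clone();
        }
        let (tx, rx) = watch::channel(None);
        running.insert(region_id, rx.clone());
        let tracker = self.clone();
        tokio::spawn(async move {
            let result = fut.await;
            tracker.running.lock().unwrap().remove(&region_id);
            let _ = tx.send(Some(result));
        });
        rx
    }

    // Returns None on timeout, mirroring the WaitResult::Timeout branch above.
    async fn wait(&self, rx: &mut watch::Receiver<TaskResult>, timeout: Duration) -> TaskResult {
        tokio::time::timeout(timeout, async {
            loop {
                if let Some(result) = rx.borrow_and_update().clone() {
                    return result;
                }
                if rx.changed().await.is_err() {
                    return Err("task sender dropped".to_string());
                }
            }
        })
        .await
        .ok()
    }
}

#[tokio::main]
async fn main() {
    let tracker = TaskTracker::default();
    let mut watcher = tracker.try_register(1, async {
        tokio::time::sleep(Duration::from_millis(50)).await;
        Ok(())
    });
    match tracker.wait(&mut watcher, Duration::from_millis(500)).await {
        Some(result) => println!("region 1 task finished: {result:?}"),
        None => println!("region 1 task timed out"),
    }
}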
}) => {} - Ok(SetRegionRoleStateResponse::NotFound) => { - warn!("Region: {region_id} is not found"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: false, - error: None, - })); - } - Ok(SetRegionRoleStateResponse::InvalidTransition(err)) => { - error!(err; "Failed to convert region to downgrading leader - invalid transition"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!("{err:?}")), - })); - } - Err(err) => { - error!(err; "Failed to convert region to downgrading leader"); - return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!("{err:?}")), - })); - } - } - - let register_result = self - .downgrade_tasks - .try_register( - region_id, - Box::pin(async move { - info!("Flush region: {region_id} before converting region to follower"); - region_server_moved - .handle_request( - region_id, - RegionRequest::Flush(RegionFlushRequest { - row_group_size: None, - }), - ) - .await?; - - Ok(()) - }), - ) - .await; - - if register_result.is_busy() { - warn!("Another flush task is running for the region: {region_id}"); - } - - let mut watcher = register_result.into_watcher(); - let result = self.downgrade_tasks.wait(&mut watcher, flush_timeout).await; - - match result { - WaitResult::Timeout => { - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!( - "Flush region timeout, region: {region_id}, timeout: {:?}", - flush_timeout - )), - })) - } - WaitResult::Finish(Ok(_)) => self.downgrade_to_follower_gracefully(region_id).await, - WaitResult::Finish(Err(err)) => { - Some(InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id: None, - metadata_last_entry_id: None, - exists: true, - error: Some(format!("{err:?}")), - })) - } - } - }) - } } #[cfg(test)] mod tests { use std::assert_matches::assert_matches; + use std::sync::Arc; use std::time::Duration; - use common_meta::instruction::{DowngradeRegion, InstructionReply}; + use common_meta::heartbeat::handler::{HandleControl, HeartbeatResponseHandler}; + use common_meta::heartbeat::mailbox::MessageMeta; + use common_meta::instruction::{DowngradeRegion, Instruction}; + use mito2::config::MitoConfig; use mito2::engine::MITO_ENGINE_NAME; + use mito2::test_util::{CreateRequestBuilder, TestEnv}; use store_api::region_engine::{ - RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, + RegionEngine, RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, }; use store_api::region_request::RegionRequest; use store_api::storage::RegionId; use tokio::time::Instant; use crate::error; - use crate::heartbeat::handler::HandlerContext; + use crate::heartbeat::handler::downgrade_region::DowngradeRegionsHandler; + use crate::heartbeat::handler::tests::HeartbeatResponseTestEnv; + use crate::heartbeat::handler::{ + HandlerContext, InstructionHandler, RegionHeartbeatResponseHandler, + }; use crate::tests::{MockRegionEngine, mock_region_server}; #[tokio::test] @@ -227,20 +260,20 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = 
DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout, + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(!reply.exists); - assert!(reply.error.is_none()); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(!reply.exists); + assert!(reply.error.is_none()); + assert!(reply.last_entry_id.is_none()); } } @@ -270,20 +303,20 @@ mod tests { let waits = vec![None, Some(Duration::from_millis(100u64))]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout, + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.is_none()); - assert_eq!(reply.last_entry_id.unwrap(), 1024); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.is_none()); + assert_eq!(reply.last_entry_id.unwrap(), 1024); } } @@ -305,20 +338,20 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); let flush_timeout = Duration::from_millis(100); - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: Some(flush_timeout), - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout: Some(flush_timeout), + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.unwrap().contains("timeout")); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.as_ref().unwrap().contains("timeout")); + assert!(reply.last_entry_id.is_none()); } #[tokio::test] @@ -344,36 +377,38 @@ mod tests { ]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout, + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.unwrap().contains("timeout")); - assert!(reply.last_entry_id.is_none()); - } + + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.as_ref().unwrap().contains("timeout")); + assert!(reply.last_entry_id.is_none()); } let timer = Instant::now(); - let reply = handler_context - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: Some(Duration::from_millis(500)), - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout: Some(Duration::from_millis(500)), + }], + ) .await; - assert_matches!(reply, 
Some(InstructionReply::DowngradeRegion(_))); // Must less than 300 ms. assert!(timer.elapsed().as_millis() < 300); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.is_none()); - assert_eq!(reply.last_entry_id.unwrap(), 1024); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.is_none()); + assert_eq!(reply.last_entry_id.unwrap(), 1024); } #[tokio::test] @@ -405,36 +440,36 @@ mod tests { ]; for flush_timeout in waits { - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout, + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!(reply.error.unwrap().contains("timeout")); - assert!(reply.last_entry_id.is_none()); - } - } - let timer = Instant::now(); - let reply = handler_context - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: Some(Duration::from_millis(500)), - }) - .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - // Must less than 300 ms. - assert!(timer.elapsed().as_millis() < 300); - - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; assert!(reply.exists); - assert!(reply.error.unwrap().contains("flush failed")); + assert!(reply.error.as_ref().unwrap().contains("timeout")); assert!(reply.last_entry_id.is_none()); } + let timer = Instant::now(); + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout: Some(Duration::from_millis(500)), + }], + ) + .await; + // Must less than 300 ms. 
+ assert!(timer.elapsed().as_millis() < 300); + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!(reply.error.as_ref().unwrap().contains("flush failed")); + assert!(reply.last_entry_id.is_none()); } #[tokio::test] @@ -449,19 +484,19 @@ mod tests { }); mock_region_server.register_test_region(region_id, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: None, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout: None, + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(!reply.exists); - assert!(reply.error.is_none()); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(!reply.exists); + assert!(reply.error.is_none()); + assert!(reply.last_entry_id.is_none()); } #[tokio::test] @@ -480,23 +515,77 @@ mod tests { }); mock_region_server.register_test_region(region_id, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); - let reply = handler_context - .clone() - .handle_downgrade_region_instruction(DowngradeRegion { - region_id, - flush_timeout: None, - }) + let reply = DowngradeRegionsHandler + .handle( + &handler_context, + vec![DowngradeRegion { + region_id, + flush_timeout: None, + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_))); - if let InstructionReply::DowngradeRegion(reply) = reply.unwrap() { - assert!(reply.exists); - assert!( - reply - .error - .unwrap() - .contains("Failed to set region to readonly") - ); - assert!(reply.last_entry_id.is_none()); - } + let reply = &reply.unwrap().expect_downgrade_regions_reply()[0]; + assert!(reply.exists); + assert!( + reply + .error + .as_ref() + .unwrap() + .contains("Failed to set region to readonly") + ); + assert!(reply.last_entry_id.is_none()); + } + + #[tokio::test] + async fn test_downgrade_regions() { + common_telemetry::init_default_ut_logging(); + + let mut region_server = mock_region_server(); + let heartbeat_handler = RegionHeartbeatResponseHandler::new(region_server.clone()); + let mut engine_env = TestEnv::with_prefix("downgrade-regions").await; + let engine = engine_env.create_engine(MitoConfig::default()).await; + region_server.register_engine(Arc::new(engine.clone())); + let region_id = RegionId::new(1024, 1); + let region_id1 = RegionId::new(1024, 2); + let builder = CreateRequestBuilder::new(); + let create_req = builder.build(); + region_server + .handle_request(region_id, RegionRequest::Create(create_req)) + .await + .unwrap(); + let create_req1 = builder.build(); + region_server + .handle_request(region_id1, RegionRequest::Create(create_req1)) + .await + .unwrap(); + let meta = MessageMeta::new_test(1, "test", "dn-1", "meta-0"); + let instruction = Instruction::DowngradeRegions(vec![ + DowngradeRegion { + region_id, + flush_timeout: Some(Duration::from_secs(1)), + }, + DowngradeRegion { + region_id: region_id1, + flush_timeout: Some(Duration::from_secs(1)), + }, + ]); + let mut heartbeat_env = HeartbeatResponseTestEnv::new(); + let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction)); + let control = heartbeat_handler.handle(&mut ctx).await.unwrap(); + assert_matches!(control, 
HandleControl::Continue); + + let (_, reply) = heartbeat_env.receiver.recv().await.unwrap(); + let reply = reply.expect_downgrade_regions_reply(); + assert_eq!(reply[0].region_id, region_id); + assert!(reply[0].exists); + assert!(reply[0].error.is_none()); + assert_eq!(reply[0].last_entry_id, Some(0)); + assert_eq!(reply[1].region_id, region_id1); + assert!(reply[1].exists); + assert!(reply[1].error.is_none()); + assert_eq!(reply[1].last_entry_id, Some(0)); + + assert_eq!(engine.role(region_id).unwrap(), RegionRole::Follower); + assert_eq!(engine.role(region_id1).unwrap(), RegionRole::Follower); } } diff --git a/src/datanode/src/heartbeat/handler/file_ref.rs b/src/datanode/src/heartbeat/handler/file_ref.rs new file mode 100644 index 0000000000..4d2ac325a8 --- /dev/null +++ b/src/datanode/src/heartbeat/handler/file_ref.rs @@ -0,0 +1,61 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_error::ext::ErrorExt; +use common_meta::instruction::{GetFileRefs, GetFileRefsReply, InstructionReply}; +use store_api::storage::FileRefsManifest; + +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; + +pub struct GetFileRefsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for GetFileRefsHandler { + type Instruction = GetFileRefs; + + async fn handle( + &self, + ctx: &HandlerContext, + get_file_refs: Self::Instruction, + ) -> Option { + let region_server = &ctx.region_server; + + // Get the MitoEngine + let Some(mito_engine) = region_server.mito_engine() else { + return Some(InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: FileRefsManifest::default(), + success: false, + error: Some("MitoEngine not found".to_string()), + })); + }; + match mito_engine + .get_snapshot_of_file_refs(get_file_refs.query_regions, get_file_refs.related_regions) + .await + { + Ok(all_file_refs) => { + // Return the file references + Some(InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: all_file_refs, + success: true, + error: None, + })) + } + Err(e) => Some(InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest: FileRefsManifest::default(), + success: false, + error: Some(format!("Failed to get file refs: {}", e.output_msg())), + })), + } + } +} diff --git a/src/datanode/src/heartbeat/handler/flush_region.rs b/src/datanode/src/heartbeat/handler/flush_region.rs index 963d3bf488..a86d672eca 100644 --- a/src/datanode/src/heartbeat/handler/flush_region.rs +++ b/src/datanode/src/heartbeat/handler/flush_region.rs @@ -18,16 +18,51 @@ use common_meta::instruction::{ FlushErrorStrategy, FlushRegionReply, FlushRegions, FlushStrategy, InstructionReply, }; use common_telemetry::{debug, warn}; -use futures_util::future::BoxFuture; use store_api::region_request::{RegionFlushRequest, RegionRequest}; use store_api::storage::RegionId; -use crate::error::{self, RegionNotFoundSnafu, RegionNotReadySnafu, UnexpectedSnafu}; -use crate::heartbeat::handler::HandlerContext; +use crate::error::{self, 
RegionNotFoundSnafu, RegionNotReadySnafu, Result, UnexpectedSnafu}; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; + +pub struct FlushRegionsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for FlushRegionsHandler { + type Instruction = FlushRegions; + + async fn handle( + &self, + ctx: &HandlerContext, + flush_regions: FlushRegions, + ) -> Option { + let start_time = Instant::now(); + let strategy = flush_regions.strategy; + let region_ids = flush_regions.region_ids; + let error_strategy = flush_regions.error_strategy; + + let reply = if matches!(strategy, FlushStrategy::Async) { + // Asynchronous hint mode: fire-and-forget, no reply expected + ctx.handle_flush_hint(region_ids).await; + None + } else { + // Synchronous mode: return reply with results + let reply = ctx.handle_flush_sync(region_ids, error_strategy).await; + Some(InstructionReply::FlushRegions(reply)) + }; + + let elapsed = start_time.elapsed(); + debug!( + "FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}", + strategy, elapsed, reply + ); + + reply + } +} impl HandlerContext { /// Performs the actual region flush operation. - async fn perform_region_flush(&self, region_id: RegionId) -> Result<(), error::Error> { + async fn perform_region_flush(&self, region_id: RegionId) -> Result<()> { let request = RegionRequest::Flush(RegionFlushRequest { row_group_size: None, }); @@ -92,7 +127,7 @@ impl HandlerContext { } /// Flushes a single region synchronously with proper error handling. - async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<(), error::Error> { + async fn flush_single_region_sync(&self, region_id: RegionId) -> Result<()> { // Check if region is leader and writable let Some(writable) = self.region_server.is_region_leader(region_id) else { return Err(RegionNotFoundSnafu { region_id }.build()); @@ -135,37 +170,6 @@ impl HandlerContext { .build()), } } - - /// Unified handler for FlushRegions with all flush semantics. 
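FlushRegionsHandler above distinguishes an async fire-and-forget hint (no reply) from a synchronous flush whose reply carries overall_success plus per-region results, and the tests later exercise the FailFast versus TryAll error strategies. Below is a condensed synchronous model of just that strategy loop; flush_one and its odd/even failure rule are invented for illustration.

// A condensed model of the sync flush path: record per-region results and stop
// early only under the fail-fast error strategy.
#[derive(Clone, Copy, Debug)]
enum FlushErrorStrategy {
    FailFast,
    TryAll,
}

fn flush_one(region_id: u64) -> Result<(), String> {
    // Stand-in for the real flush request; even ids "fail" for illustration.
    if region_id % 2 == 1 {
        Ok(())
    } else {
        Err(format!("region {region_id} not found"))
    }
}

fn flush_sync(
    region_ids: &[u64],
    strategy: FlushErrorStrategy,
) -> (bool, Vec<(u64, Result<(), String>)>) {
    let mut results = Vec::new();
    let mut overall_success = true;
    for &region_id in region_ids {
        let result = flush_one(region_id);
        let failed = result.is_err();
        results.push((region_id, result));
        if failed {
            overall_success = false;
            if matches!(strategy, FlushErrorStrategy::FailFast) {
                break; // fail fast: later regions are not attempted
            }
        }
    }
    (overall_success, results)
}

fn main() {
    let regions = [1, 2, 3];

    let (ok, results) = flush_sync(&regions, FlushErrorStrategy::FailFast);
    assert!(!ok);
    assert_eq!(results.len(), 2); // stopped at the first failure

    let (ok, results) = flush_sync(&regions, FlushErrorStrategy::TryAll);
    assert!(!ok);
    assert_eq!(results.len(), 3); // every region attempted
}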
- pub(crate) fn handle_flush_regions_instruction( - self, - flush_regions: FlushRegions, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let start_time = Instant::now(); - let strategy = flush_regions.strategy; - let region_ids = flush_regions.region_ids; - let error_strategy = flush_regions.error_strategy; - - let reply = if matches!(strategy, FlushStrategy::Async) { - // Asynchronous hint mode: fire-and-forget, no reply expected - self.handle_flush_hint(region_ids).await; - None - } else { - // Synchronous mode: return reply with results - let reply = self.handle_flush_sync(region_ids, error_strategy).await; - Some(InstructionReply::FlushRegions(reply)) - }; - - let elapsed = start_time.elapsed(); - debug!( - "FlushRegions strategy: {:?}, elapsed: {:?}, reply: {:?}", - strategy, elapsed, reply - ); - - reply - }) - } } #[cfg(test)] @@ -201,9 +205,8 @@ mod tests { // Async hint mode let flush_instruction = FlushRegions::async_batch(region_ids.clone()); - let reply = handler_context - .clone() - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle(&handler_context, flush_instruction) .await; assert!(reply.is_none()); // Hint mode returns no reply assert_eq!(*flushed_region_ids.read().unwrap(), region_ids); @@ -212,8 +215,8 @@ mod tests { flushed_region_ids.write().unwrap().clear(); let not_found_region_ids = (0..2).map(|i| RegionId::new(2048, i)).collect::>(); let flush_instruction = FlushRegions::async_batch(not_found_region_ids); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle(&handler_context, flush_instruction) .await; assert!(reply.is_none()); assert!(flushed_region_ids.read().unwrap().is_empty()); @@ -238,20 +241,14 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); let flush_instruction = FlushRegions::sync_single(region_id); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle(&handler_context, flush_instruction) .await; - - assert!(reply.is_some()); - if let Some(InstructionReply::FlushRegions(flush_reply)) = reply { - assert!(flush_reply.overall_success); - assert_eq!(flush_reply.results.len(), 1); - assert_eq!(flush_reply.results[0].0, region_id); - assert!(flush_reply.results[0].1.is_ok()); - } else { - panic!("Expected FlushRegions reply"); - } - + let flush_reply = reply.unwrap().expect_flush_regions_reply(); + assert!(flush_reply.overall_success); + assert_eq!(flush_reply.results.len(), 1); + assert_eq!(flush_reply.results[0].0, region_id); + assert!(flush_reply.results[0].1.is_ok()); assert_eq!(*flushed_region_ids.read().unwrap(), vec![region_id]); } @@ -281,18 +278,13 @@ mod tests { // Sync batch with fail-fast strategy let flush_instruction = FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::FailFast); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle(&handler_context, flush_instruction) .await; - - assert!(reply.is_some()); - if let Some(InstructionReply::FlushRegions(flush_reply)) = reply { - assert!(!flush_reply.overall_success); // Should fail due to non-existent regions - // With fail-fast, only process regions until first failure - assert!(flush_reply.results.len() <= region_ids.len()); - } else { - panic!("Expected FlushRegions reply"); - } + let flush_reply = reply.unwrap().expect_flush_regions_reply(); + 
assert!(!flush_reply.overall_success); // Should fail due to non-existent regions + // With fail-fast, only process regions until first failure + assert!(flush_reply.results.len() <= region_ids.len()); } #[tokio::test] @@ -317,20 +309,26 @@ mod tests { // Sync batch with try-all strategy let flush_instruction = FlushRegions::sync_batch(region_ids.clone(), FlushErrorStrategy::TryAll); - let reply = handler_context - .handle_flush_regions_instruction(flush_instruction) + let reply = FlushRegionsHandler + .handle(&handler_context, flush_instruction) .await; + let flush_reply = reply.unwrap().expect_flush_regions_reply(); + assert!(!flush_reply.overall_success); // Should fail due to one non-existent region + // With try-all, should process all regions + assert_eq!(flush_reply.results.len(), region_ids.len()); + // First should succeed, second should fail + assert!(flush_reply.results[0].1.is_ok()); + assert!(flush_reply.results[1].1.is_err()); + } - assert!(reply.is_some()); - if let Some(InstructionReply::FlushRegions(flush_reply)) = reply { - assert!(!flush_reply.overall_success); // Should fail due to one non-existent region - // With try-all, should process all regions - assert_eq!(flush_reply.results.len(), region_ids.len()); - // First should succeed, second should fail - assert!(flush_reply.results[0].1.is_ok()); - assert!(flush_reply.results[1].1.is_err()); - } else { - panic!("Expected FlushRegions reply"); - } + #[test] + fn test_flush_regions_display() { + let region_id = RegionId::new(1024, 1); + let flush_regions = FlushRegions::sync_single(region_id); + let display = format!("{}", flush_regions); + assert_eq!( + display, + "FlushRegions(region_ids=[4398046511105(1024, 1)], strategy=Sync, error_strategy=FailFast)" + ); } } diff --git a/src/datanode/src/heartbeat/handler/gc_worker.rs b/src/datanode/src/heartbeat/handler/gc_worker.rs new file mode 100644 index 0000000000..9329dcb0c6 --- /dev/null +++ b/src/datanode/src/heartbeat/handler/gc_worker.rs @@ -0,0 +1,167 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_meta::instruction::{GcRegions, GcRegionsReply, InstructionReply}; +use common_telemetry::{debug, warn}; +use mito2::gc::LocalGcWorker; +use snafu::{OptionExt, ResultExt, ensure}; +use store_api::storage::{FileRefsManifest, RegionId}; + +use crate::error::{GcMitoEngineSnafu, InvalidGcArgsSnafu, Result, UnexpectedSnafu}; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; + +pub struct GcRegionsHandler; + +#[async_trait::async_trait] +impl InstructionHandler for GcRegionsHandler { + type Instruction = GcRegions; + + async fn handle( + &self, + ctx: &HandlerContext, + gc_regions: Self::Instruction, + ) -> Option { + let region_ids = gc_regions.regions.clone(); + debug!("Received gc regions instruction: {:?}", region_ids); + + let (region_id, gc_worker) = match self + .create_gc_worker( + ctx, + region_ids, + &gc_regions.file_refs_manifest, + gc_regions.full_file_listing, + ) + .await + { + Ok(worker) => worker, + Err(e) => { + return Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!("Failed to create GC worker: {}", e)), + })); + } + }; + + let register_result = ctx + .gc_tasks + .try_register( + region_id, + Box::pin(async move { + debug!("Starting gc worker for region {}", region_id); + let report = gc_worker + .run() + .await + .context(GcMitoEngineSnafu { region_id })?; + debug!("Gc worker for region {} finished", region_id); + Ok(report) + }), + ) + .await; + if register_result.is_busy() { + warn!("Another gc task is running for the region: {region_id}"); + return Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!( + "Another gc task is running for the region: {region_id}" + )), + })); + } + let mut watcher = register_result.into_watcher(); + let result = ctx.gc_tasks.wait_until_finish(&mut watcher).await; + match result { + Ok(report) => Some(InstructionReply::GcRegions(GcRegionsReply { + result: Ok(report), + })), + Err(err) => Some(InstructionReply::GcRegions(GcRegionsReply { + result: Err(format!("{err:?}")), + })), + } + } +} + +impl GcRegionsHandler { + /// Create a GC worker for the given region IDs. + /// Return the first region ID(after sort by given region id) and the GC worker. + async fn create_gc_worker( + &self, + ctx: &HandlerContext, + mut region_ids: Vec, + file_ref_manifest: &FileRefsManifest, + full_file_listing: bool, + ) -> Result<(RegionId, LocalGcWorker)> { + // always use the smallest region id on datanode as the target region id + region_ids.sort_by_key(|r| r.region_number()); + + let mito_engine = ctx + .region_server + .mito_engine() + .with_context(|| UnexpectedSnafu { + violated: "MitoEngine not found".to_string(), + })?; + + let region_id = *region_ids.first().with_context(|| InvalidGcArgsSnafu { + msg: "No region ids provided".to_string(), + })?; + + // also need to ensure all regions are on this datanode + ensure!( + region_ids + .iter() + .all(|rid| mito_engine.find_region(*rid).is_some()), + InvalidGcArgsSnafu { + msg: format!( + "Some regions are not on current datanode:{:?}", + region_ids + .iter() + .filter(|rid| mito_engine.find_region(**rid).is_none()) + .collect::>() + ), + } + ); + + // Find the access layer from one of the regions that exists on this datanode + let access_layer = mito_engine + .find_region(region_id) + .with_context(|| InvalidGcArgsSnafu { + msg: format!( + "None of the regions is on current datanode:{:?}", + region_ids + ), + })? 
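create_gc_worker above sorts the requested regions, keys the GC task by the smallest one, and bails out if any region is not hosted on this datanode. The helper below distills that validation into plain functions over u64 ids; the real code sorts by region number and returns snafu errors rather than strings.

use std::collections::HashSet;

// Distilled version of the GC target selection and validation.
fn pick_gc_target(mut requested: Vec<u64>, local: &HashSet<u64>) -> Result<u64, String> {
    requested.sort_unstable();
    let target = *requested
        .first()
        .ok_or_else(|| "No region ids provided".to_string())?;
    let missing: Vec<_> = requested.iter().filter(|r| !local.contains(r)).collect();
    if !missing.is_empty() {
        return Err(format!(
            "Some regions are not on current datanode: {missing:?}"
        ));
    }
    Ok(target)
}

fn main() {
    let local: HashSet<u64> = [1, 2, 3].into_iter().collect();
    assert_eq!(pick_gc_target(vec![3, 1, 2], &local), Ok(1));
    assert!(pick_gc_target(vec![1, 9], &local).is_err());
    assert!(pick_gc_target(vec![], &local).is_err());
}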
+ .access_layer(); + + // if region happen to be dropped before this but after gc scheduler send gc instr, + // need to deal with it properly(it is ok for region to be dropped after GC worker started) + // region not found here can only be drop table/database case, since region migration is prevented by lock in gc procedure + // TODO(discord9): add integration test for this drop case + let mito_regions = region_ids + .iter() + .filter_map(|rid| mito_engine.find_region(*rid).map(|r| (*rid, r))) + .collect(); + + let cache_manager = mito_engine.cache_manager(); + + let gc_worker = LocalGcWorker::try_new( + access_layer.clone(), + Some(cache_manager), + mito_regions, + mito_engine.mito_config().gc.clone(), + file_ref_manifest.clone(), + &mito_engine.gc_limiter(), + full_file_listing, + ) + .await + .context(GcMitoEngineSnafu { region_id })?; + + Ok((region_id, gc_worker)) + } +} diff --git a/src/datanode/src/heartbeat/handler/open_region.rs b/src/datanode/src/heartbeat/handler/open_region.rs index e6ea973eec..76ca806a98 100644 --- a/src/datanode/src/heartbeat/handler/open_region.rs +++ b/src/datanode/src/heartbeat/handler/open_region.rs @@ -14,54 +14,59 @@ use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply}; use common_meta::wal_options_allocator::prepare_wal_options; -use futures_util::future::BoxFuture; use store_api::path_utils::table_dir; use store_api::region_request::{PathType, RegionOpenRequest}; +use store_api::storage::RegionId; -use crate::heartbeat::handler::HandlerContext; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; -impl HandlerContext { - pub(crate) fn handle_open_regions_instruction( - self, - open_regions: Vec, - open_region_parallelism: usize, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let requests = open_regions - .into_iter() - .map(|open_region| { - let OpenRegion { - region_ident, - region_storage_path, - mut region_options, - region_wal_options, - skip_wal_replay, - } = open_region; - let region_id = Self::region_ident_to_region_id(®ion_ident); - prepare_wal_options(&mut region_options, region_id, ®ion_wal_options); - let request = RegionOpenRequest { - engine: region_ident.engine, - table_dir: table_dir(®ion_storage_path, region_id.table_id()), - path_type: PathType::Bare, - options: region_options, - skip_wal_replay, - checkpoint: None, - }; - (region_id, request) - }) - .collect::>(); +pub struct OpenRegionsHandler { + pub open_region_parallelism: usize, +} - let result = self - .region_server - .handle_batch_open_requests(open_region_parallelism, requests, false) - .await; - let success = result.is_ok(); - let error = result.as_ref().map_err(|e| format!("{e:?}")).err(); - Some(InstructionReply::OpenRegions(SimpleReply { - result: success, - error, - })) - }) +#[async_trait::async_trait] +impl InstructionHandler for OpenRegionsHandler { + type Instruction = Vec; + async fn handle( + &self, + ctx: &HandlerContext, + open_regions: Self::Instruction, + ) -> Option { + let requests = open_regions + .into_iter() + .map(|open_region| { + let OpenRegion { + region_ident, + region_storage_path, + mut region_options, + region_wal_options, + skip_wal_replay, + } = open_region; + let region_id = RegionId::new(region_ident.table_id, region_ident.region_number); + prepare_wal_options(&mut region_options, region_id, ®ion_wal_options); + let request = RegionOpenRequest { + engine: region_ident.engine, + table_dir: table_dir(®ion_storage_path, region_id.table_id()), + path_type: PathType::Bare, + options: 
region_options, + skip_wal_replay, + checkpoint: None, + }; + (region_id, request) + }) + .collect::>(); + + let result = ctx + .region_server + .handle_batch_open_requests(self.open_region_parallelism, requests, false) + .await; + let success = result.is_ok(); + let error = result.as_ref().map_err(|e| format!("{e:?}")).err(); + + Some(InstructionReply::OpenRegions(SimpleReply { + result: success, + error, + })) } } diff --git a/src/datanode/src/heartbeat/handler/upgrade_region.rs b/src/datanode/src/heartbeat/handler/upgrade_region.rs index c1f238e059..d89d0d08b2 100644 --- a/src/datanode/src/heartbeat/handler/upgrade_region.rs +++ b/src/datanode/src/heartbeat/handler/upgrade_region.rs @@ -12,135 +12,224 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply}; -use common_telemetry::{info, warn}; -use futures_util::future::BoxFuture; -use store_api::region_request::{RegionCatchupRequest, RegionRequest, ReplayCheckpoint}; +use common_error::ext::{BoxedError, ErrorExt}; +use common_error::status_code::StatusCode; +use common_meta::instruction::{ + InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply, +}; +use common_telemetry::{debug, info, warn}; +use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint}; +use store_api::storage::RegionId; -use crate::heartbeat::handler::HandlerContext; -use crate::heartbeat::task_tracker::WaitResult; +use crate::error::Result; +use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; -impl HandlerContext { - pub(crate) fn handle_upgrade_region_instruction( - self, - UpgradeRegion { - region_id, - last_entry_id, - metadata_last_entry_id, - replay_timeout, - location_id, - replay_entry_id, - metadata_replay_entry_id, - }: UpgradeRegion, - ) -> BoxFuture<'static, Option> { - Box::pin(async move { - let Some(writable) = self.region_server.is_region_leader(region_id) else { - return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: false, - error: None, - })); - }; +#[derive(Debug, Clone, Copy, Default)] +pub struct UpgradeRegionsHandler { + pub upgrade_region_parallelism: usize, +} - if writable { - return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: true, - exists: true, - error: None, - })); - } +#[cfg(test)] +impl UpgradeRegionsHandler { + fn new_test() -> UpgradeRegionsHandler { + UpgradeRegionsHandler { + upgrade_region_parallelism: 8, + } + } +} - let region_server_moved = self.region_server.clone(); - - let checkpoint = match (replay_entry_id, metadata_replay_entry_id) { - (Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint { - entry_id, - metadata_entry_id, - }), - _ => None, - }; - - // The catchup task is almost zero cost if the inside region is writable. - // Therefore, it always registers a new catchup task. 
- let register_result = self - .catchup_tasks - .try_register( - region_id, - Box::pin(async move { - info!("Executing region: {region_id} catchup to: last entry id {last_entry_id:?}"); - region_server_moved - .handle_request( - region_id, - RegionRequest::Catchup(RegionCatchupRequest { - set_writable: true, - entry_id: last_entry_id, - metadata_entry_id: metadata_last_entry_id, - location_id, - checkpoint, - }), - ) - .await?; - - Ok(()) - }), - ) - .await; - - if register_result.is_busy() { - warn!("Another catchup task is running for the region: {region_id}"); - } - - // Returns immediately - let Some(replay_timeout) = replay_timeout else { - return Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: true, - error: None, - })); - }; - - // We don't care that it returns a newly registered or running task. - let mut watcher = register_result.into_watcher(); - let result = self.catchup_tasks.wait(&mut watcher, replay_timeout).await; - - match result { - WaitResult::Timeout => Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: true, - error: None, - })), - WaitResult::Finish(Ok(_)) => { - Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { +impl UpgradeRegionsHandler { + fn convert_responses_to_replies( + responses: Result)>>, + catchup_regions: &[RegionId], + ) -> Vec { + match responses { + Ok(responses) => responses + .into_iter() + .map(|(region_id, result)| match result { + Ok(()) => UpgradeRegionReply { + region_id, ready: true, exists: true, error: None, - })) - } - WaitResult::Finish(Err(err)) => { - Some(InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready: false, - exists: true, - error: Some(format!("{err:?}")), - })) + }, + Err(err) => { + if err.status_code() == StatusCode::RegionNotFound { + UpgradeRegionReply { + region_id, + ready: false, + exists: false, + error: Some(format!("{err:?}")), + } + } else { + UpgradeRegionReply { + region_id, + ready: false, + exists: true, + error: Some(format!("{err:?}")), + } + } + } + }) + .collect::>(), + Err(err) => catchup_regions + .iter() + .map(|region_id| UpgradeRegionReply { + region_id: *region_id, + ready: false, + exists: true, + error: Some(format!("{err:?}")), + }) + .collect::>(), + } + } +} + +impl UpgradeRegionsHandler { + // Handles upgrade regions instruction. + // + // Returns batch of upgrade region replies, the order of the replies is not guaranteed. + async fn handle_upgrade_regions( + &self, + ctx: &HandlerContext, + upgrade_regions: Vec, + ) -> Vec { + let num_upgrade_regions = upgrade_regions.len(); + let mut replies = Vec::with_capacity(num_upgrade_regions); + let mut catchup_requests = Vec::with_capacity(num_upgrade_regions); + let mut catchup_regions = Vec::with_capacity(num_upgrade_regions); + let mut timeout = None; + + for upgrade_region in upgrade_regions { + let Some(writable) = ctx.region_server.is_region_leader(upgrade_region.region_id) + else { + // Region is not found. + debug!("Region {} is not found", upgrade_region.region_id); + replies.push(UpgradeRegionReply { + region_id: upgrade_region.region_id, + ready: false, + exists: false, + error: None, + }); + continue; + }; + + // Ignores the catchup requests for writable regions. 
+ if writable { + warn!( + "Region {} is writable, ignores the catchup request", + upgrade_region.region_id + ); + replies.push(UpgradeRegionReply { + region_id: upgrade_region.region_id, + ready: true, + exists: true, + error: None, + }); + } else { + let UpgradeRegion { + last_entry_id, + metadata_last_entry_id, + location_id, + replay_entry_id, + metadata_replay_entry_id, + replay_timeout, + .. + } = upgrade_region; + match timeout { + Some(timeout) => { + debug_assert_eq!(timeout, replay_timeout); + } + None => { + // TODO(weny): required the replay_timeout. + timeout = Some(replay_timeout); + } } + + let checkpoint = match (replay_entry_id, metadata_replay_entry_id) { + (Some(entry_id), metadata_entry_id) => Some(ReplayCheckpoint { + entry_id, + metadata_entry_id, + }), + _ => None, + }; + + catchup_regions.push(upgrade_region.region_id); + catchup_requests.push(( + upgrade_region.region_id, + RegionCatchupRequest { + set_writable: true, + entry_id: last_entry_id, + metadata_entry_id: metadata_last_entry_id, + location_id, + checkpoint, + }, + )); } - }) + } + + let Some(timeout) = timeout else { + // No replay timeout, so we don't need to catchup the regions. + info!("All regions are writable, no need to catchup"); + debug_assert_eq!(replies.len(), num_upgrade_regions); + return replies; + }; + + match tokio::time::timeout( + timeout, + ctx.region_server + .handle_batch_catchup_requests(self.upgrade_region_parallelism, catchup_requests), + ) + .await + { + Ok(responses) => { + replies.extend( + Self::convert_responses_to_replies(responses, &catchup_regions).into_iter(), + ); + } + Err(_) => { + replies.extend(catchup_regions.iter().map(|region_id| UpgradeRegionReply { + region_id: *region_id, + ready: false, + exists: true, + error: None, + })); + } + } + + replies + } +} + +#[async_trait::async_trait] +impl InstructionHandler for UpgradeRegionsHandler { + type Instruction = Vec; + + async fn handle( + &self, + ctx: &HandlerContext, + upgrade_regions: Self::Instruction, + ) -> Option { + let replies = self.handle_upgrade_regions(ctx, upgrade_regions).await; + + Some(InstructionReply::UpgradeRegions(UpgradeRegionsReply::new( + replies, + ))) } } #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; use std::time::Duration; - use common_meta::instruction::{InstructionReply, UpgradeRegion}; + use common_meta::instruction::UpgradeRegion; use mito2::engine::MITO_ENGINE_NAME; use store_api::region_engine::RegionRole; use store_api::storage::RegionId; - use tokio::time::Instant; use crate::error; - use crate::heartbeat::handler::HandlerContext; + use crate::heartbeat::handler::upgrade_region::UpgradeRegionsHandler; + use crate::heartbeat::handler::{HandlerContext, InstructionHandler}; use crate::tests::{MockRegionEngine, mock_region_server}; #[tokio::test] @@ -152,23 +241,32 @@ mod tests { let handler_context = HandlerContext::new_for_test(mock_region_server); let region_id = RegionId::new(1024, 1); - let waits = vec![None, Some(Duration::from_millis(100u64))]; + let region_id2 = RegionId::new(1024, 2); + let replay_timeout = Duration::from_millis(100u64); + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![ + UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }, + UpgradeRegion { + region_id: region_id2, + replay_timeout, + ..Default::default() + }, + ], + ) + .await; - for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, 
- ..Default::default() - }) - .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.exists); - assert!(reply.error.is_none()); - } + let replies = &reply.unwrap().expect_upgrade_regions_reply(); + assert_eq!(replies[0].region_id, region_id); + assert_eq!(replies[1].region_id, region_id2); + for reply in replies { + assert!(!reply.exists); + assert!(reply.error.is_none()); } } @@ -176,6 +274,7 @@ mod tests { async fn test_region_writable() { let mock_region_server = mock_region_server(); let region_id = RegionId::new(1024, 1); + let region_id2 = RegionId::new(1024, 2); let (mock_engine, _) = MockRegionEngine::with_custom_apply_fn(MITO_ENGINE_NAME, |region_engine| { @@ -185,28 +284,35 @@ mod tests { unreachable!(); })); }); - mock_region_server.register_test_region(region_id, mock_engine); - + mock_region_server.register_test_region(region_id, mock_engine.clone()); + mock_region_server.register_test_region(region_id2, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); + let replay_timeout = Duration::from_millis(100u64); + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![ + UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }, + UpgradeRegion { + region_id: region_id2, + replay_timeout, + ..Default::default() + }, + ], + ) + .await; - let waits = vec![None, Some(Duration::from_millis(100u64))]; - - for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, - ..Default::default() - }) - .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } + let replies = &reply.unwrap().expect_upgrade_regions_reply(); + assert_eq!(replies[0].region_id, region_id); + assert_eq!(replies[1].region_id, region_id2); + for reply in replies { + assert!(reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none()); } } @@ -226,30 +332,27 @@ mod tests { mock_region_server.register_test_region(region_id, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); - - let waits = vec![None, Some(Duration::from_millis(100u64))]; - - for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { + let replay_timeout = Duration::from_millis(100u64); + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![UpgradeRegion { region_id, replay_timeout, ..Default::default() - }) - .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); + }], + ) + .await; - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } - } + let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; + assert!(!reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none(), "error: {:?}", reply.error); } #[tokio::test] async fn test_region_not_ready_with_retry() { + common_telemetry::init_default_ut_logging(); let mock_region_server = mock_region_server(); let region_id = RegionId::new(1024, 1); @@ -258,57 +361,48 @@ mod tests { // Region is not ready. 
region_engine.mock_role = Some(Some(RegionRole::Follower)); region_engine.handle_request_mock_fn = Some(Box::new(|_, _| Ok(0))); - // Note: Don't change. region_engine.handle_request_delay = Some(Duration::from_millis(300)); }); mock_region_server.register_test_region(region_id, mock_engine); - - let waits = vec![ - Some(Duration::from_millis(100u64)), - Some(Duration::from_millis(100u64)), - ]; - + let waits = vec![Duration::from_millis(100u64), Duration::from_millis(100u64)]; let handler_context = HandlerContext::new_for_test(mock_region_server); - for replay_timeout in waits { - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout, - ..Default::default() - }) + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![UpgradeRegion { + region_id, + replay_timeout, + ..Default::default() + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } - } - - let timer = Instant::now(); - let reply = handler_context - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout: Some(Duration::from_millis(500)), - ..Default::default() - }) - .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - // Must less than 300 ms. - assert!(timer.elapsed().as_millis() < 300); - - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(reply.ready); + let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; + assert!(!reply.ready); assert!(reply.exists); - assert!(reply.error.is_none()); + assert!(reply.error.is_none(), "error: {:?}", reply.error); } + + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![UpgradeRegion { + region_id, + replay_timeout: Duration::from_millis(500), + ..Default::default() + }], + ) + .await; + let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; + assert!(reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none(), "error: {:?}", reply.error); } #[tokio::test] async fn test_region_error() { + common_telemetry::init_default_ut_logging(); let mock_region_server = mock_region_server(); let region_id = RegionId::new(1024, 1); @@ -328,38 +422,37 @@ mod tests { mock_region_server.register_test_region(region_id, mock_engine); let handler_context = HandlerContext::new_for_test(mock_region_server); - - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - ..Default::default() - }) + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![UpgradeRegion { + region_id, + ..Default::default() + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); // It didn't wait for handle returns; it had no idea about the error. 
- if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_none()); - } + let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; + assert!(!reply.ready); + assert!(reply.exists); + assert!(reply.error.is_none()); - let reply = handler_context - .clone() - .handle_upgrade_region_instruction(UpgradeRegion { - region_id, - replay_timeout: Some(Duration::from_millis(200)), - ..Default::default() - }) + let reply = UpgradeRegionsHandler::new_test() + .handle( + &handler_context, + vec![UpgradeRegion { + region_id, + replay_timeout: Duration::from_millis(200), + ..Default::default() + }], + ) .await; - assert_matches!(reply, Some(InstructionReply::UpgradeRegion(_))); - if let InstructionReply::UpgradeRegion(reply) = reply.unwrap() { - assert!(!reply.ready); - assert!(reply.exists); - assert!(reply.error.is_some()); - assert!(reply.error.unwrap().contains("mock_error")); - } + let reply = &reply.unwrap().expect_upgrade_regions_reply()[0]; + assert!(!reply.ready); + assert!(reply.exists); + assert!(reply.error.is_some()); + assert!(reply.error.as_ref().unwrap().contains("mock_error")); } } diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs index 1b0e513375..4e763f5858 100644 --- a/src/datanode/src/metrics.rs +++ b/src/datanode/src/metrics.rs @@ -75,4 +75,20 @@ lazy_static! { &[RESULT_TYPE] ) .unwrap(); + + /// Total count of failed region server requests. + pub static ref REGION_SERVER_REQUEST_FAILURE_COUNT: IntCounterVec = register_int_counter_vec!( + "greptime_datanode_region_request_fail_count", + "failed region server requests count", + &[REGION_REQUEST_TYPE] + ) + .unwrap(); + + /// Total count of failed insert requests to region server. + pub static ref REGION_SERVER_INSERT_FAIL_COUNT: IntCounterVec = register_int_counter_vec!( + "greptime_datanode_region_failed_insert_count", + "failed region server insert requests count", + &[REGION_REQUEST_TYPE] + ) + .unwrap(); } diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index 341ee9442c..88680ed195 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -66,7 +66,8 @@ use store_api::region_engine::{ SettableRegionRoleState, }; use store_api::region_request::{ - AffectedRows, BatchRegionDdlRequest, RegionCloseRequest, RegionOpenRequest, RegionRequest, + AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest, + RegionOpenRequest, RegionRequest, }; use store_api::storage::RegionId; use tokio::sync::{Semaphore, SemaphorePermit}; @@ -158,6 +159,27 @@ impl RegionServer { } } + /// Gets the MitoEngine if it's registered. 
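+    /// Prefers the handle stored by the server; otherwise falls back to looking up the
+    /// engine registered under the mito engine name and downcasting it.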
+ pub fn mito_engine(&self) -> Option { + if let Some(mito) = self.inner.mito_engine.read().unwrap().clone() { + Some(mito) + } else { + self.inner + .engines + .read() + .unwrap() + .get(MITO_ENGINE_NAME) + .cloned() + .and_then(|e| { + let mito = e.as_any().downcast_ref::().cloned(); + if mito.is_none() { + warn!("Mito engine not found in region server engines"); + } + mito + }) + } + } + #[tracing::instrument(skip_all)] pub async fn handle_batch_open_requests( &self, @@ -170,6 +192,17 @@ impl RegionServer { .await } + #[tracing::instrument(skip_all)] + pub async fn handle_batch_catchup_requests( + &self, + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result)>> { + self.inner + .handle_batch_catchup_requests(parallelism, requests) + .await + } + #[tracing::instrument(skip_all, fields(request_type = request.request_type()))] pub async fn handle_request( &self, @@ -378,6 +411,14 @@ impl RegionServer { #[cfg(test)] /// Registers a region for test purpose. pub(crate) fn register_test_region(&self, region_id: RegionId, engine: RegionEngineRef) { + { + let mut engines = self.inner.engines.write().unwrap(); + if !engines.contains_key(engine.name()) { + debug!("Registering test engine: {}", engine.name()); + engines.insert(engine.name().to_string(), engine.clone()); + } + } + self.inner .region_map .insert(region_id, RegionEngineWithStatus::Ready(engine)); @@ -477,7 +518,7 @@ impl RegionServer { let manifest_info = match manifest_info { ManifestInfo::MitoManifestInfo(info) => { - RegionManifestInfo::mito(info.data_manifest_version, 0) + RegionManifestInfo::mito(info.data_manifest_version, 0, 0) } ManifestInfo::MetricManifestInfo(info) => RegionManifestInfo::metric( info.data_manifest_version, @@ -559,6 +600,8 @@ impl RegionServer { #[async_trait] impl RegionServerHandler for RegionServer { async fn handle(&self, request: region_request::Body) -> ServerResult { + let failed_requests_cnt = crate::metrics::REGION_SERVER_REQUEST_FAILURE_COUNT + .with_label_values(&[request.as_ref()]); let response = match &request { region_request::Body::Creates(_) | region_request::Body::Drops(_) @@ -576,6 +619,9 @@ impl RegionServerHandler for RegionServer { _ => self.handle_requests_in_serial(request).await, } .map_err(BoxedError::new) + .inspect_err(|_| { + failed_requests_cnt.inc(); + }) .context(ExecuteGrpcRequestSnafu)?; Ok(RegionResponseV1 { @@ -676,14 +722,14 @@ struct RegionServerInner { runtime: Runtime, event_listener: RegionServerEventListenerRef, table_provider_factory: TableProviderFactoryRef, - // The number of queries allowed to be executed at the same time. - // Act as last line of defense on datanode to prevent query overloading. + /// The number of queries allowed to be executed at the same time. + /// Act as last line of defense on datanode to prevent query overloading. parallelism: Option, - // The topic stats reporter. + /// The topic stats reporter. topic_stats_reporter: RwLock>>, - // HACK(zhongzc): Direct MitoEngine handle for diagnostics. This couples the - // server with a concrete engine; acceptable for now to fetch Mito-specific - // info (e.g., list SSTs). Consider a diagnostics trait later. + /// HACK(zhongzc): Direct MitoEngine handle for diagnostics. This couples the + /// server with a concrete engine; acceptable for now to fetch Mito-specific + /// info (e.g., list SSTs). Consider a diagnostics trait later. 
mito_engine: RwLock>, } @@ -951,6 +997,116 @@ impl RegionServerInner { .collect::>()) } + pub async fn handle_batch_catchup_requests_inner( + &self, + engine: RegionEngineRef, + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result)>> { + for (region_id, _) in &requests { + self.set_region_status_not_ready(*region_id, &engine, &RegionChange::Catchup); + } + let region_ids = requests + .iter() + .map(|(region_id, _)| *region_id) + .collect::>(); + let mut responses = Vec::with_capacity(requests.len()); + match engine + .handle_batch_catchup_requests(parallelism, requests) + .await + { + Ok(results) => { + for (region_id, result) in results { + match result { + Ok(_) => { + if let Err(e) = self + .set_region_status_ready( + region_id, + engine.clone(), + RegionChange::Catchup, + ) + .await + { + error!(e; "Failed to set region to ready: {}", region_id); + responses.push((region_id, Err(BoxedError::new(e)))); + } else { + responses.push((region_id, Ok(()))); + } + } + Err(e) => { + self.unset_region_status(region_id, &engine, RegionChange::Catchup); + error!(e; "Failed to catchup region: {}", region_id); + responses.push((region_id, Err(e))); + } + } + } + } + Err(e) => { + for region_id in region_ids { + self.unset_region_status(region_id, &engine, RegionChange::Catchup); + } + error!(e; "Failed to catchup batch regions"); + return error::UnexpectedSnafu { + violated: format!("Failed to catchup batch regions: {:?}", e), + } + .fail(); + } + } + + Ok(responses) + } + + pub async fn handle_batch_catchup_requests( + &self, + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result)>> { + let mut engine_grouped_requests: HashMap> = HashMap::new(); + + let mut responses = Vec::with_capacity(requests.len()); + for (region_id, request) in requests { + if let Ok(engine) = self.get_engine(region_id, &RegionChange::Catchup) { + match engine { + CurrentEngine::Engine(engine) => { + engine_grouped_requests + .entry(engine.name().to_string()) + .or_default() + .push((region_id, request)); + } + CurrentEngine::EarlyReturn(_) => { + return error::UnexpectedSnafu { + violated: format!("Unexpected engine type for region {}", region_id), + } + .fail(); + } + } + } else { + responses.push(( + region_id, + Err(BoxedError::new( + error::RegionNotFoundSnafu { region_id }.build(), + )), + )); + } + } + + for (engine, requests) in engine_grouped_requests { + let engine = self + .engines + .read() + .unwrap() + .get(&engine) + .with_context(|| RegionEngineNotFoundSnafu { name: &engine })? + .clone(); + responses.extend( + self.handle_batch_catchup_requests_inner(engine, parallelism, requests) + .await?, + ); + } + + Ok(responses) + } + // Handle requests in batch. // // limitation: all create requests must be in the same engine. @@ -1044,7 +1200,8 @@ impl RegionServerInner { | RegionRequest::Flush(_) | RegionRequest::Compact(_) | RegionRequest::Truncate(_) - | RegionRequest::BuildIndex(_) => RegionChange::None, + | RegionRequest::BuildIndex(_) + | RegionRequest::EnterStaging(_) => RegionChange::None, RegionRequest::Catchup(_) => RegionChange::Catchup, }; @@ -1079,6 +1236,11 @@ impl RegionServerInner { }) } Err(err) => { + if matches!(region_change, RegionChange::Ingest) { + crate::metrics::REGION_SERVER_INSERT_FAIL_COUNT + .with_label_values(&[request_type]) + .inc(); + } // Removes the region status if the operation fails. 
self.unset_region_status(region_id, &engine, region_change); Err(err) diff --git a/src/datanode/src/store.rs b/src/datanode/src/store.rs index 8d1c8c99dc..6dc6f280c6 100644 --- a/src/datanode/src/store.rs +++ b/src/datanode/src/store.rs @@ -47,10 +47,7 @@ pub(crate) async fn new_object_store_without_cache( Ok(object_store) } -pub(crate) async fn new_object_store( - store: ObjectStoreConfig, - data_home: &str, -) -> Result { +pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Result { let object_store = new_raw_object_store(&store, data_home) .await .context(error::ObjectStoreSnafu)?; @@ -59,7 +56,7 @@ pub(crate) async fn new_object_store( let object_store = { // It's safe to unwrap here because we already checked above. let cache_config = store.cache_config().unwrap(); - if let Some(cache_layer) = build_cache_layer(cache_config).await? { + if let Some(cache_layer) = build_cache_layer(cache_config, data_home).await? { // Adds cache layer object_store.layer(cache_layer) } else { @@ -79,17 +76,22 @@ pub(crate) async fn new_object_store( async fn build_cache_layer( cache_config: &ObjectStorageCacheConfig, + data_home: &str, ) -> Result>> { // No need to build cache layer if read cache is disabled. if !cache_config.enable_read_cache { return Ok(None); } - - let atomic_temp_dir = join_dir(&cache_config.cache_path, ATOMIC_WRITE_DIR); + let cache_base_dir = if cache_config.cache_path.is_empty() { + data_home + } else { + &cache_config.cache_path + }; + let atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR); clean_temp_dir(&atomic_temp_dir).context(error::ObjectStoreSnafu)?; let cache_store = Fs::default() - .root(&cache_config.cache_path) + .root(cache_base_dir) .atomic_write_dir(&atomic_temp_dir) .build() .context(error::BuildCacheStoreSnafu)?; diff --git a/src/datanode/src/tests.rs b/src/datanode/src/tests.rs index 5f7db4d928..557a8c92ed 100644 --- a/src/datanode/src/tests.rs +++ b/src/datanode/src/tests.rs @@ -34,7 +34,8 @@ use session::context::QueryContextRef; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, - SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, + RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, + SettableRegionRoleState, SyncManifestResponse, }; use store_api::region_request::{AffectedRows, RegionRequest}; use store_api::storage::{RegionId, ScanRequest, SequenceNumber}; @@ -291,6 +292,13 @@ impl RegionEngine for MockRegionEngine { unimplemented!() } + async fn remap_manifests( + &self, + _request: RemapManifestsRequest, + ) -> Result { + unimplemented!() + } + fn as_any(&self) -> &dyn Any { self } diff --git a/src/datatypes/src/arrow_array.rs b/src/datatypes/src/arrow_array.rs index 97aa299fad..ac5e6444af 100644 --- a/src/datatypes/src/arrow_array.rs +++ b/src/datatypes/src/arrow_array.rs @@ -12,9 +12,117 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{
+    DataType, DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
+    DurationSecondType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
+    TimestampNanosecondType, TimestampSecondType,
+};
+use common_time::time::Time;
+use common_time::{Duration, Timestamp};
+
 pub type BinaryArray = arrow::array::BinaryArray;
 pub type MutableBinaryArray = arrow::array::BinaryBuilder;
 pub type StringArray = arrow::array::StringArray;
 pub type MutableStringArray = arrow::array::StringBuilder;
 pub type LargeStringArray = arrow::array::LargeStringArray;
 pub type MutableLargeStringArray = arrow::array::LargeStringBuilder;
+
+/// Get the [Timestamp] value at index `i` of the timestamp array.
+///
+/// Note: This method does not check for nulls and the value is arbitrary
+/// if [`is_null`](arrow::array::Array::is_null) returns true for the index.
+///
+/// # Panics
+/// 1. if index `i` is out of bounds;
+/// 2. or the array is not timestamp type.
+pub fn timestamp_array_value(array: &ArrayRef, i: usize) -> Timestamp {
+    let DataType::Timestamp(time_unit, _) = &array.data_type() else {
+        unreachable!()
+    };
+    let v = match time_unit {
+        TimeUnit::Second => {
+            let array = array.as_primitive::<TimestampSecondType>();
+            array.value(i)
+        }
+        TimeUnit::Millisecond => {
+            let array = array.as_primitive::<TimestampMillisecondType>();
+            array.value(i)
+        }
+        TimeUnit::Microsecond => {
+            let array = array.as_primitive::<TimestampMicrosecondType>();
+            array.value(i)
+        }
+        TimeUnit::Nanosecond => {
+            let array = array.as_primitive::<TimestampNanosecondType>();
+            array.value(i)
+        }
+    };
+    Timestamp::new(v, time_unit.into())
+}
+
+/// Get the [Time] value at index `i` of the time array.
+///
+/// Note: This method does not check for nulls and the value is arbitrary
+/// if [`is_null`](arrow::array::Array::is_null) returns true for the index.
+///
+/// # Panics
+/// 1. if index `i` is out of bounds;
+/// 2. or the array is not `Time32` or `Time64` type.
+pub fn time_array_value(array: &ArrayRef, i: usize) -> Time {
+    match array.data_type() {
+        DataType::Time32(time_unit) | DataType::Time64(time_unit) => match time_unit {
+            TimeUnit::Second => {
+                let array = array.as_primitive::<Time32SecondType>();
+                Time::new_second(array.value(i) as i64)
+            }
+            TimeUnit::Millisecond => {
+                let array = array.as_primitive::<Time32MillisecondType>();
+                Time::new_millisecond(array.value(i) as i64)
+            }
+            TimeUnit::Microsecond => {
+                let array = array.as_primitive::<Time64MicrosecondType>();
+                Time::new_microsecond(array.value(i))
+            }
+            TimeUnit::Nanosecond => {
+                let array = array.as_primitive::<Time64NanosecondType>();
+                Time::new_nanosecond(array.value(i))
+            }
+        },
+        _ => unreachable!(),
+    }
+}
+
+/// Get the [Duration] value at index `i` of the duration array.
+///
+/// Note: This method does not check for nulls and the value is arbitrary
+/// if [`is_null`](arrow::array::Array::is_null) returns true for the index.
+///
+/// # Panics
+/// 1. if index `i` is out of bounds;
+/// 2. or the array is not duration type.
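+///
+/// # Example
+/// ```ignore
+/// // Illustrative sketch only: assumes a non-null `DurationMillisecondArray` input.
+/// use std::sync::Arc;
+/// use arrow::array::{ArrayRef, DurationMillisecondArray};
+///
+/// let array: ArrayRef = Arc::new(DurationMillisecondArray::from(vec![1500i64]));
+/// let value = duration_array_value(&array, 0); // 1500 milliseconds as a common_time::Duration
+/// ```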
+pub fn duration_array_value(array: &ArrayRef, i: usize) -> Duration {
+    let DataType::Duration(time_unit) = array.data_type() else {
+        unreachable!();
+    };
+    let v = match time_unit {
+        TimeUnit::Second => {
+            let array = array.as_primitive::<DurationSecondType>();
+            array.value(i)
+        }
+        TimeUnit::Millisecond => {
+            let array = array.as_primitive::<DurationMillisecondType>();
+            array.value(i)
+        }
+        TimeUnit::Microsecond => {
+            let array = array.as_primitive::<DurationMicrosecondType>();
+            array.value(i)
+        }
+        TimeUnit::Nanosecond => {
+            let array = array.as_primitive::<DurationNanosecondType>();
+            array.value(i)
+        }
+    };
+    Duration::new(v, time_unit.into())
+}
diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs
index eb47d30305..25fd095a9f 100644
--- a/src/datatypes/src/data_type.rs
+++ b/src/datatypes/src/data_type.rs
@@ -15,7 +15,6 @@
 use std::fmt;
 use std::sync::Arc;
 
-use arrow::compute::cast as arrow_array_cast;
 use arrow::datatypes::{
     DataType as ArrowDataType, IntervalUnit as ArrowIntervalUnit, TimeUnit as ArrowTimeUnit,
 };
@@ -33,8 +32,8 @@ use crate::types::{
     BinaryType, BooleanType, DateType, Decimal128Type, DictionaryType, DurationMicrosecondType,
     DurationMillisecondType, DurationNanosecondType, DurationSecondType, DurationType, Float32Type,
     Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTimeType,
-    IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonFormat, JsonType, ListType,
-    NullType, StringType, StructType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
+    IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonType, ListType, NullType,
+    StringType, StructType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
     TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
     UInt8Type, UInt16Type, UInt32Type, UInt64Type, VectorType,
 };
@@ -277,6 +276,10 @@ impl ConcreteDataType {
         matches!(self, ConcreteDataType::Null(NullType))
     }
 
+    pub(crate) fn is_struct(&self) -> bool {
+        matches!(self, ConcreteDataType::Struct(_))
+    }
+
     /// Try to cast the type as a [`ListType`].
     pub fn as_list(&self) -> Option<&ListType> {
         match self {
@@ -348,9 +351,9 @@ impl ConcreteDataType {
         }
     }
 
-    pub fn as_json(&self) -> Option<JsonType> {
+    pub fn as_json(&self) -> Option<&JsonType> {
         match self {
-            ConcreteDataType::Json(j) => Some(j.clone()),
+            ConcreteDataType::Json(j) => Some(j),
             _ => None,
         }
     }
@@ -364,8 +367,10 @@ impl ConcreteDataType {
 
     /// Checks if the data type can cast to another data type.
     pub fn can_arrow_type_cast_to(&self, to_type: &ConcreteDataType) -> bool {
-        let array = arrow_array::new_empty_array(&self.as_arrow_type());
-        arrow_array_cast(array.as_ref(), &to_type.as_arrow_type()).is_ok()
+        match (self, to_type) {
+            (ConcreteDataType::Json(this), ConcreteDataType::Json(that)) => that.is_include(this),
+            _ => arrow::compute::can_cast_types(&self.as_arrow_type(), &to_type.as_arrow_type()),
+        }
     }
 
     /// Try to cast data type as a [`DurationType`].
@@ -673,7 +678,7 @@ impl ConcreteDataType { } pub fn json_native_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::Json(JsonType::new(JsonFormat::Native(Box::new(inner_type)))) + ConcreteDataType::Json(JsonType::new_native((&inner_type).into())) } } diff --git a/src/datatypes/src/error.rs b/src/datatypes/src/error.rs index 85e78ce1eb..65aca699ec 100644 --- a/src/datatypes/src/error.rs +++ b/src/datatypes/src/error.rs @@ -189,7 +189,7 @@ pub enum Error { location: Location, }, - #[snafu(display("Invalid JSON text: {}", value))] + #[snafu(display("Invalid JSON: {}", value))] InvalidJson { value: String, #[snafu(implicit)] @@ -259,6 +259,21 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to merge JSON datatype: {reason}"))] + MergeJsonDatatype { + reason: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse or serialize arrow metadata"))] + ArrowMetadata { + #[snafu(source)] + error: arrow::error::ArrowError, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { @@ -281,7 +296,8 @@ impl ErrorExt for Error { | InvalidJsonb { .. } | InvalidVector { .. } | InvalidFulltextOption { .. } - | InvalidSkippingIndexOption { .. } => StatusCode::InvalidArguments, + | InvalidSkippingIndexOption { .. } + | MergeJsonDatatype { .. } => StatusCode::InvalidArguments, ValueExceedsPrecision { .. } | CastType { .. } @@ -299,7 +315,8 @@ impl ErrorExt for Error { | ConvertArrowArrayToScalars { .. } | ConvertScalarToArrowArray { .. } | ParseExtendedType { .. } - | InconsistentStructFieldsAndItems { .. } => StatusCode::Internal, + | InconsistentStructFieldsAndItems { .. } + | ArrowMetadata { .. } => StatusCode::Internal, } } diff --git a/src/datatypes/src/extension.rs b/src/datatypes/src/extension.rs new file mode 100644 index 0000000000..83776cdcc1 --- /dev/null +++ b/src/datatypes/src/extension.rs @@ -0,0 +1,15 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod json; diff --git a/src/datatypes/src/extension/json.rs b/src/datatypes/src/extension/json.rs new file mode 100644 index 0000000000..abc75bb35b --- /dev/null +++ b/src/datatypes/src/extension/json.rs @@ -0,0 +1,109 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
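+
+//! Arrow extension type (`greptime.json`) used to mark columns that carry JSON values,
+//! together with the [`JsonMetadata`] describing how the JSON is structured in the
+//! underlying Arrow data type.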
+
+use std::sync::Arc;
+
+use arrow_schema::extension::ExtensionType;
+use arrow_schema::{ArrowError, DataType, FieldRef};
+use serde::{Deserialize, Serialize};
+
+use crate::json::JsonStructureSettings;
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct JsonMetadata {
+    /// Indicates how the JSON is stored in the underlying data type.
+    ///
+    /// This field can be `None` when the data is converted to its fully structured in-memory form.
+    pub json_structure_settings: Option<JsonStructureSettings>,
+}
+
+#[derive(Debug, Clone)]
+pub struct JsonExtensionType(Arc<JsonMetadata>);
+
+impl JsonExtensionType {
+    pub fn new(metadata: Arc<JsonMetadata>) -> Self {
+        JsonExtensionType(metadata)
+    }
+}
+
+impl ExtensionType for JsonExtensionType {
+    const NAME: &'static str = "greptime.json";
+    type Metadata = Arc<JsonMetadata>;
+
+    fn metadata(&self) -> &Self::Metadata {
+        &self.0
+    }
+
+    fn serialize_metadata(&self) -> Option<String> {
+        serde_json::to_string(self.metadata()).ok()
+    }
+
+    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
+        if let Some(metadata) = metadata {
+            let metadata = serde_json::from_str(metadata).map_err(|e| {
+                ArrowError::ParseError(format!("Failed to deserialize JSON metadata: {}", e))
+            })?;
+            Ok(Arc::new(metadata))
+        } else {
+            Ok(Arc::new(JsonMetadata::default()))
+        }
+    }
+
+    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
+        match data_type {
+            // object
+            DataType::Struct(_)
+            // array
+            | DataType::List(_)
+            | DataType::ListView(_)
+            | DataType::LargeList(_)
+            | DataType::LargeListView(_)
+            // string
+            | DataType::Utf8
+            | DataType::Utf8View
+            | DataType::LargeUtf8
+            // number
+            | DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Float32
+            | DataType::Float64
+            // boolean
+            | DataType::Boolean
+            // null
+            | DataType::Null
+            // legacy json type
+            | DataType::Binary => Ok(()),
+            dt => Err(ArrowError::SchemaError(format!(
+                "Unexpected data type {dt}"
+            ))),
+        }
+    }
+
+    fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
+        let json = Self(metadata);
+        json.supports_data_type(data_type)?;
+        Ok(json)
+    }
+}
+
+/// Checks if this field is to be treated as the json extension type.
+pub fn is_json_extension_type(field: &FieldRef) -> bool {
+    field.extension_type_name() == Some(JsonExtensionType::NAME)
+}
diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs
index 380cc8ce06..b12c63c479 100644
--- a/src/datatypes/src/json.rs
+++ b/src/datatypes/src/json.rs
@@ -19,16 +19,18 @@
 //! The struct will carry all the fields of the Json object. We will not flatten any json object in this implementation.
 //!
 
-use std::collections::HashSet;
+pub mod value;
+
+use std::collections::{BTreeMap, HashSet};
 use std::sync::Arc;
 
-use common_base::bytes::StringBytes;
-use ordered_float::OrderedFloat;
+use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value as Json};
 use snafu::{ResultExt, ensure};
 
-use crate::data_type::{ConcreteDataType, DataType};
 use crate::error::{self, Error};
+use crate::json::value::{JsonValue, JsonVariant};
+use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType};
 use crate::types::{StructField, StructType};
 use crate::value::{ListValue, StructValue, Value};
 
@@ -45,7 +47,7 @@ use crate::value::{ListValue, StructValue, Value};
 /// convert them to fully structured StructValue for user-facing APIs: the UI protocol and the UDF interface.
/// /// **Important**: This settings only controls the internal form of JSON encoding. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub enum JsonStructureSettings { // TODO(sunng87): provide a limit Structured(Option), @@ -101,7 +103,7 @@ impl JsonStructureSettings { pub fn encode_with_type( &self, json: Json, - data_type: Option<&ConcreteDataType>, + data_type: Option<&JsonNativeType>, ) -> Result { let context = JsonContext { key_path: String::new(), @@ -111,6 +113,12 @@ impl JsonStructureSettings { } } +impl Default for JsonStructureSettings { + fn default() -> Self { + Self::Structured(None) + } +} + impl<'a> JsonContext<'a> { /// Create a new context with an updated key path pub fn with_key(&self, key: &str) -> JsonContext<'a> { @@ -139,70 +147,65 @@ impl<'a> JsonContext<'a> { /// Main encoding function with key path tracking pub fn encode_json_with_context<'a>( json: Json, - data_type: Option<&ConcreteDataType>, + data_type: Option<&JsonNativeType>, context: &JsonContext<'a>, -) -> Result { +) -> Result { // Check if the entire encoding should be unstructured if matches!(context.settings, JsonStructureSettings::UnstructuredRaw) { let json_string = json.to_string(); - let struct_value = StructValue::try_new( - vec![Value::String(json_string.into())], - StructType::new(Arc::new(vec![StructField::new( - JsonStructureSettings::RAW_FIELD.to_string(), - ConcreteDataType::string_datatype(), - true, - )])), - )?; - return Ok(Value::Struct(struct_value)); + return Ok([(JsonStructureSettings::RAW_FIELD, json_string)].into()); } // Check if current key should be treated as unstructured if context.is_unstructured_key() { - return Ok(Value::String(json.to_string().into())); + return Ok(json.to_string().into()); } match json { Json::Object(json_object) => { - ensure!( - matches!(data_type, Some(ConcreteDataType::Struct(_)) | None), - error::InvalidJsonSnafu { - value: "JSON object can only be encoded to Struct type".to_string(), + let object_type = match data_type.as_ref() { + Some(JsonNativeType::Object(x)) => Some(x), + None => None, + _ => { + return error::InvalidJsonSnafu { + value: "JSON object value must be encoded with object type", + } + .fail(); } - ); - - let data_type = data_type.and_then(|x| x.as_struct()); - let struct_value = encode_json_object_with_context(json_object, data_type, context)?; - Ok(Value::Struct(struct_value)) + }; + encode_json_object_with_context(json_object, object_type, context) } Json::Array(json_array) => { - let item_type = if let Some(ConcreteDataType::List(list_type)) = data_type { - Some(list_type.item_type()) - } else { - None + let item_type = match data_type.as_ref() { + Some(JsonNativeType::Array(x)) => Some(x.as_ref()), + None => None, + _ => { + return error::InvalidJsonSnafu { + value: "JSON array value must be encoded with array type", + } + .fail(); + } }; - let list_value = encode_json_array_with_context(json_array, item_type, context)?; - Ok(Value::List(list_value)) + encode_json_array_with_context(json_array, item_type, context) } _ => { // For non-collection types, verify type compatibility if let Some(expected_type) = data_type { - let (value, actual_type) = - encode_json_value_with_context(json, Some(expected_type), context)?; - if &actual_type == expected_type { + let value = encode_json_value_with_context(json, Some(expected_type), context)?; + let actual_type = value.json_type().native_type(); + if actual_type == expected_type { Ok(value) } else { Err(error::InvalidJsonSnafu { value: format!( "JSON value type {} 
does not match expected type {}", - actual_type.name(), - expected_type.name() + actual_type, expected_type ), } .build()) } } else { - let (value, _) = encode_json_value_with_context(json, None, context)?; - Ok(value) + encode_json_value_with_context(json, None, context) } } } @@ -210,31 +213,21 @@ pub fn encode_json_with_context<'a>( fn encode_json_object_with_context<'a>( mut json_object: Map, - fields: Option<&StructType>, + fields: Option<&JsonObjectType>, context: &JsonContext<'a>, -) -> Result { - let total_json_keys = json_object.len(); - let mut items = Vec::with_capacity(total_json_keys); - let mut struct_fields = Vec::with_capacity(total_json_keys); +) -> Result { + let mut object = BTreeMap::new(); // First, process fields from the provided schema in their original order if let Some(fields) = fields { - for field in fields.fields().iter() { - let field_name = field.name(); - + for (field_name, field_type) in fields { if let Some(value) = json_object.remove(field_name) { let field_context = context.with_key(field_name); - let (value, data_type) = - encode_json_value_with_context(value, Some(field.data_type()), &field_context)?; - items.push(value); - struct_fields.push(StructField::new( - field_name.to_string(), - data_type, - true, // JSON fields are always nullable - )); + let value = + encode_json_value_with_context(value, Some(field_type), &field_context)?; + object.insert(field_name.clone(), value.into_variant()); } else { // Field exists in schema but not in JSON - add null value - items.push(Value::Null); - struct_fields.push(field.clone()); + object.insert(field_name.clone(), ().into()); } } } @@ -243,139 +236,111 @@ fn encode_json_object_with_context<'a>( for (key, value) in json_object { let field_context = context.with_key(&key); - let (value, data_type) = encode_json_value_with_context(value, None, &field_context)?; - items.push(value); + let value = encode_json_value_with_context(value, None, &field_context)?; - struct_fields.push(StructField::new( - key.clone(), - data_type, - true, // JSON fields are always nullable - )); + object.insert(key, value.into_variant()); } - let struct_type = StructType::new(Arc::new(struct_fields)); - StructValue::try_new(items, struct_type) + Ok(JsonValue::new(JsonVariant::Object(object))) } fn encode_json_array_with_context<'a>( json_array: Vec, - item_type: Option<&ConcreteDataType>, + item_type: Option<&JsonNativeType>, context: &JsonContext<'a>, -) -> Result { +) -> Result { let json_array_len = json_array.len(); let mut items = Vec::with_capacity(json_array_len); - let mut element_type = None; + let mut element_type = item_type.cloned(); for (index, value) in json_array.into_iter().enumerate() { let array_context = context.with_key(&index.to_string()); - let (item_value, item_type) = - encode_json_value_with_context(value, item_type, &array_context)?; - items.push(item_value); + let item_value = + encode_json_value_with_context(value, element_type.as_ref(), &array_context)?; + let item_type = item_value.json_type().native_type().clone(); + items.push(item_value.into_variant()); // Determine the common type for the list if let Some(current_type) = &element_type { - // For now, we'll use the first non-null type we encounter - // In a more sophisticated implementation, we might want to find a common supertype - if *current_type == ConcreteDataType::null_datatype() - && item_type != ConcreteDataType::null_datatype() - { - element_type = Some(item_type); - } + // It's valid for json array to have different types of items, for example, 
+ // ["a string", 1]. However, the `JsonValue` will be converted to Arrow list array, + // which requires all items have exactly same type. So we forbid the different types + // case here. Besides, it's not common for items in a json array to differ. So I think + // we are good here. + ensure!( + item_type == *current_type, + error::InvalidJsonSnafu { + value: "all items in json array must have the same type" + } + ); } else { element_type = Some(item_type); } } - // Use provided item_type if available, otherwise determine from elements - let element_type = if let Some(item_type) = item_type { - item_type.clone() - } else { - element_type.unwrap_or_else(ConcreteDataType::string_datatype) - }; - - Ok(ListValue::new(items, Arc::new(element_type))) + Ok(JsonValue::new(JsonVariant::Array(items))) } /// Helper function to encode a JSON value to a Value and determine its ConcreteDataType with context fn encode_json_value_with_context<'a>( json: Json, - expected_type: Option<&ConcreteDataType>, + expected_type: Option<&JsonNativeType>, context: &JsonContext<'a>, -) -> Result<(Value, ConcreteDataType), Error> { +) -> Result { // Check if current key should be treated as unstructured if context.is_unstructured_key() { - return Ok(( - Value::String(json.to_string().into()), - ConcreteDataType::string_datatype(), - )); + return Ok(json.to_string().into()); } match json { - Json::Null => Ok((Value::Null, ConcreteDataType::null_datatype())), - Json::Bool(b) => Ok((Value::Boolean(b), ConcreteDataType::boolean_datatype())), + Json::Null => Ok(JsonValue::null()), + Json::Bool(b) => Ok(b.into()), Json::Number(n) => { if let Some(i) = n.as_i64() { // Use int64 for all integer numbers when possible if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(i, expected) { - return Ok((value, expected.clone())); + return Ok(value); } - Ok((Value::Int64(i), ConcreteDataType::int64_datatype())) + Ok(i.into()) } else if let Some(u) = n.as_u64() { // Use int64 for unsigned integers that fit, otherwise use u64 if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(u, expected) { - return Ok((value, expected.clone())); + return Ok(value); } if u <= i64::MAX as u64 { - Ok((Value::Int64(u as i64), ConcreteDataType::int64_datatype())) + Ok((u as i64).into()) } else { - Ok((Value::UInt64(u), ConcreteDataType::uint64_datatype())) + Ok(u.into()) } } else if let Some(f) = n.as_f64() { // Try to use the expected type if provided if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(f, expected) { - return Ok((value, expected.clone())); + return Ok(value); } // Default to f64 for floating point numbers - Ok(( - Value::Float64(OrderedFloat(f)), - ConcreteDataType::float64_datatype(), - )) + Ok(f.into()) } else { // Fallback to string representation - Ok(( - Value::String(StringBytes::from(n.to_string())), - ConcreteDataType::string_datatype(), - )) + Ok(n.to_string().into()) } } Json::String(s) => { if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(s.as_str(), expected) { - return Ok((value, expected.clone())); + return Ok(value); } - Ok(( - Value::String(StringBytes::from(s.clone())), - ConcreteDataType::string_datatype(), - )) - } - Json::Array(arr) => { - let list_value = encode_json_array_with_context(arr, expected_type, context)?; - let data_type = list_value.datatype().clone(); - Ok((Value::List(list_value), (*data_type).clone())) - } - Json::Object(obj) => { - let struct_value = 
encode_json_object_with_context(obj, None, context)?; - let data_type = ConcreteDataType::Struct(struct_value.struct_type().clone()); - Ok((Value::Struct(struct_value), data_type)) + Ok(s.into()) } + Json::Array(arr) => encode_json_array_with_context(arr, expected_type, context), + Json::Object(obj) => encode_json_object_with_context(obj, None, context), } } @@ -395,7 +360,6 @@ pub fn decode_value_with_context<'a>( } match value { - Value::Json(inner) => decode_value_with_context(*inner, context), Value::Struct(struct_value) => decode_struct_with_context(struct_value, context), Value::List(list_value) => decode_list_with_context(list_value, context), _ => decode_primitive_value(value), @@ -562,11 +526,13 @@ fn decode_struct_with_settings<'a>( key_path: field_context.key_path.clone(), settings: &JsonStructureSettings::Structured(None), }; - let (decoded_value, data_type) = encode_json_value_with_context( + let decoded_value = encode_json_value_with_context( json_value, None, // Don't force a specific type, let it be inferred from JSON &structured_context, - )?; + )? + .into_value(); + let data_type = decoded_value.data_type(); items.push(decoded_value); struct_fields.push(StructField::new( @@ -644,8 +610,9 @@ fn decode_unstructured_raw_struct(struct_value: StructValue) -> Result Result( value: T, - expected_type: &ConcreteDataType, -) -> Result + expected_type: &JsonNativeType, +) -> Result where - T: Into, + T: Into, { let value = value.into(); - expected_type.try_cast(value.clone()).ok_or_else(|| { + let cast_error = || { error::CastTypeSnafu { - msg: format!( - "Cannot cast from {} to {}", - value.data_type().name(), - expected_type.name() - ), + msg: format!("Cannot cast value {value} to {expected_type}"), } - .build() - }) + .fail() + }; + let actual_type = value.json_type().native_type(); + match (actual_type, expected_type) { + (x, y) if x == y => Ok(value), + (JsonNativeType::Number(x), JsonNativeType::Number(y)) => match (x, y) { + (JsonNumberType::U64, JsonNumberType::I64) => { + if let Some(i) = value.as_i64() { + Ok(i.into()) + } else { + cast_error() + } + } + (JsonNumberType::I64, JsonNumberType::U64) => { + if let Some(i) = value.as_u64() { + Ok(i.into()) + } else { + cast_error() + } + } + (_, JsonNumberType::F64) => { + if let Some(f) = value.as_f64() { + Ok(f.into()) + } else { + cast_error() + } + } + _ => cast_error(), + }, + (_, JsonNativeType::String) => Ok(value.to_string().into()), + _ => cast_error(), + } } #[cfg(test)] @@ -695,6 +688,7 @@ mod tests { use serde_json::json; use super::*; + use crate::data_type::ConcreteDataType; use crate::types::ListType; #[test] @@ -891,15 +885,15 @@ mod tests { let json = Json::from(42); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json.clone(), Some(&ConcreteDataType::int8_datatype())) + .encode_with_type(json.clone(), Some(&JsonNativeType::u64())) .unwrap() .into_json_inner() .unwrap(); - assert_eq!(result, Value::Int8(42)); + assert_eq!(result, Value::UInt64(42)); // Test with expected string type let result = settings - .encode_with_type(json, Some(&ConcreteDataType::string_datatype())) + .encode_with_type(json, Some(&JsonNativeType::String)) .unwrap() .into_json_inner() .unwrap(); @@ -910,23 +904,11 @@ mod tests { fn test_encode_json_array_mixed_types() { let json = json!([1, "hello", true, 3.15]); let settings = JsonStructureSettings::Structured(None); - let result = settings - .encode_with_type(json, None) - .unwrap() - .into_json_inner() - .unwrap(); - - if let 
Value::List(list_value) = result { - assert_eq!(list_value.items().len(), 4); - // The first non-null type should determine the list type - // In this case, it should be string since we can't find a common numeric type - assert_eq!( - list_value.datatype(), - Arc::new(ConcreteDataType::int64_datatype()) - ); - } else { - panic!("Expected List value"); - } + let result = settings.encode_with_type(json, None); + assert_eq!( + result.unwrap_err().to_string(), + "Invalid JSON: all items in json array must have the same type" + ); } #[test] @@ -944,7 +926,7 @@ mod tests { // Empty arrays default to string type assert_eq!( list_value.datatype(), - Arc::new(ConcreteDataType::string_datatype()) + Arc::new(ConcreteDataType::null_datatype()) ); } else { panic!("Expected List value"); @@ -980,16 +962,10 @@ mod tests { }); // Define expected struct type - let fields = vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - ]; - let struct_type = StructType::new(Arc::new(fields)); - let concrete_type = ConcreteDataType::Struct(struct_type); + let concrete_type = JsonNativeType::Object(JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ])); let settings = JsonStructureSettings::Structured(None); let result = settings @@ -1001,15 +977,15 @@ mod tests { if let Value::Struct(struct_value) = result { assert_eq!(struct_value.items().len(), 2); let struct_fields = struct_value.struct_type().fields(); - assert_eq!(struct_fields[0].name(), "name"); + assert_eq!(struct_fields[0].name(), "age"); assert_eq!( struct_fields[0].data_type(), - &ConcreteDataType::string_datatype() + &ConcreteDataType::int64_datatype() ); - assert_eq!(struct_fields[1].name(), "age"); + assert_eq!(struct_fields[1].name(), "name"); assert_eq!( struct_fields[1].data_type(), - &ConcreteDataType::int64_datatype() + &ConcreteDataType::string_datatype() ); } else { panic!("Expected Struct value"); @@ -1025,34 +1001,24 @@ mod tests { }); // Define schema with specific field order - let fields = vec![ - StructField::new( - "a_field".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new( - "m_field".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new( - "z_field".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - ]; - let struct_type = StructType::new(Arc::new(fields)); + let json_type = JsonObjectType::from([ + ("a_field".to_string(), JsonNativeType::String), + ("m_field".to_string(), JsonNativeType::String), + ("z_field".to_string(), JsonNativeType::String), + ]); - let result = encode_json_object_with_context( + let Value::Struct(result) = encode_json_object_with_context( json.as_object().unwrap().clone(), - Some(&struct_type), + Some(&json_type), &JsonContext { key_path: String::new(), settings: &JsonStructureSettings::Structured(None), }, ) - .unwrap(); + .map(|x| x.into_value()) + .unwrap() else { + unreachable!() + }; // Verify field order is preserved from schema let struct_fields = result.struct_type().fields(); @@ -1076,37 +1042,35 @@ mod tests { }); // Define schema with only name and age - let fields = vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - ]; - let struct_type = StructType::new(Arc::new(fields)); + let json_type = 
JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ]); - let result = encode_json_object_with_context( + let Value::Struct(result) = encode_json_object_with_context( json.as_object().unwrap().clone(), - Some(&struct_type), + Some(&json_type), &JsonContext { key_path: String::new(), settings: &JsonStructureSettings::Structured(None), }, ) - .unwrap(); + .map(|x| x.into_value()) + .unwrap() else { + unreachable!() + }; - // Verify schema fields come first in order + // verify fields are sorted in json value let struct_fields = result.struct_type().fields(); - assert_eq!(struct_fields[0].name(), "name"); + assert_eq!(struct_fields[0].name(), "active"); assert_eq!(struct_fields[1].name(), "age"); - assert_eq!(struct_fields[2].name(), "active"); + assert_eq!(struct_fields[2].name(), "name"); // Verify values are correct let items = result.items(); - assert_eq!(items[0], Value::String("Alice".into())); + assert_eq!(items[0], Value::Boolean(true)); assert_eq!(items[1], Value::Int64(25)); - assert_eq!(items[2], Value::Boolean(true)); + assert_eq!(items[2], Value::String("Alice".into())); } #[test] @@ -1117,35 +1081,33 @@ mod tests { }); // Define schema with name and age - let fields = vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - ]; - let struct_type = StructType::new(Arc::new(fields)); + let json_type = JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ]); - let result = encode_json_object_with_context( + let Value::Struct(result) = encode_json_object_with_context( json.as_object().unwrap().clone(), - Some(&struct_type), + Some(&json_type), &JsonContext { key_path: String::new(), settings: &JsonStructureSettings::Structured(None), }, ) - .unwrap(); + .map(|x| x.into_value()) + .unwrap() else { + unreachable!() + }; // Verify both schema fields are present let struct_fields = result.struct_type().fields(); - assert_eq!(struct_fields[0].name(), "name"); - assert_eq!(struct_fields[1].name(), "age"); + assert_eq!(struct_fields[0].name(), "age"); + assert_eq!(struct_fields[1].name(), "name"); // Verify values - name has value, age is null let items = result.items(); - assert_eq!(items[0], Value::String("Bob".into())); - assert_eq!(items[1], Value::Null); + assert_eq!(items[0], Value::Null); + assert_eq!(items[1], Value::String("Bob".into())); } #[test] @@ -1168,21 +1130,22 @@ mod tests { #[test] fn test_encode_json_array_with_item_type() { let json = json!([1, 2, 3]); - let item_type = Arc::new(ConcreteDataType::int8_datatype()); - let list_type = ListType::new(item_type.clone()); - let concrete_type = ConcreteDataType::List(list_type); + let item_type = Arc::new(ConcreteDataType::uint64_datatype()); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json, Some(&concrete_type)) + .encode_with_type( + json, + Some(&JsonNativeType::Array(Box::new(JsonNativeType::u64()))), + ) .unwrap() .into_json_inner() .unwrap(); if let Value::List(list_value) = result { assert_eq!(list_value.items().len(), 3); - assert_eq!(list_value.items()[0], Value::Int8(1)); - assert_eq!(list_value.items()[1], Value::Int8(2)); - assert_eq!(list_value.items()[2], Value::Int8(3)); + assert_eq!(list_value.items()[0], Value::UInt64(1)); + assert_eq!(list_value.items()[1], Value::UInt64(2)); + 
assert_eq!(list_value.items()[2], Value::UInt64(3)); assert_eq!(list_value.datatype(), item_type); } else { panic!("Expected List value"); @@ -1192,12 +1155,13 @@ mod tests { #[test] fn test_encode_json_array_empty_with_item_type() { let json = json!([]); - let item_type = Arc::new(ConcreteDataType::string_datatype()); - let list_type = ListType::new(item_type.clone()); - let concrete_type = ConcreteDataType::List(list_type); + let item_type = Arc::new(ConcreteDataType::null_datatype()); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json, Some(&concrete_type)) + .encode_with_type( + json, + Some(&JsonNativeType::Array(Box::new(JsonNativeType::Null))), + ) .unwrap() .into_json_inner() .unwrap(); @@ -1212,6 +1176,7 @@ mod tests { #[cfg(test)] mod decode_tests { + use ordered_float::OrderedFloat; use serde_json::json; use super::*; @@ -1466,7 +1431,7 @@ mod tests { // Test encoding JSON number with expected int64 type let json = Json::from(42); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::int64_datatype())) + .encode_with_type(json, Some(&JsonNativeType::i64())) .unwrap() .into_json_inner() .unwrap(); @@ -1475,7 +1440,7 @@ mod tests { // Test encoding JSON string with expected string type let json = Json::String("hello".to_string()); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::string_datatype())) + .encode_with_type(json, Some(&JsonNativeType::String)) .unwrap() .into_json_inner() .unwrap(); @@ -1484,7 +1449,7 @@ mod tests { // Test encoding JSON boolean with expected boolean type let json = Json::Bool(true); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::boolean_datatype())) + .encode_with_type(json, Some(&JsonNativeType::Bool)) .unwrap() .into_json_inner() .unwrap(); @@ -1496,12 +1461,12 @@ mod tests { // Test encoding JSON number with mismatched string type let json = Json::from(42); let settings = JsonStructureSettings::Structured(None); - let result = settings.encode_with_type(json, Some(&ConcreteDataType::string_datatype())); + let result = settings.encode_with_type(json, Some(&JsonNativeType::String)); assert!(result.is_ok()); // Should succeed due to type conversion // Test encoding JSON object with mismatched non-struct type let json = json!({"name": "test"}); - let result = settings.encode_with_type(json, Some(&ConcreteDataType::int64_datatype())); + let result = settings.encode_with_type(json, Some(&JsonNativeType::i64())); assert!(result.is_err()); // Should fail - object can't be converted to int64 } @@ -1509,12 +1474,13 @@ mod tests { fn test_encode_json_array_with_list_type() { let json = json!([1, 2, 3]); let item_type = Arc::new(ConcreteDataType::int64_datatype()); - let list_type = ListType::new(item_type.clone()); - let concrete_type = ConcreteDataType::List(list_type); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json, Some(&concrete_type)) + .encode_with_type( + json, + Some(&JsonNativeType::Array(Box::new(JsonNativeType::i64()))), + ) .unwrap() .into_json_inner() .unwrap(); @@ -1536,7 +1502,7 @@ mod tests { let json = Json::Null; let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json.clone(), Some(&ConcreteDataType::null_datatype())) + .encode_with_type(json.clone(), Some(&JsonNativeType::Null)) .unwrap() .into_json_inner() .unwrap(); @@ -1545,7 +1511,7 @@ mod tests { // Test float with float64 type let json = Json::from(3.15); let result = 
settings - .encode_with_type(json, Some(&ConcreteDataType::float64_datatype())) + .encode_with_type(json, Some(&JsonNativeType::f64())) .unwrap() .into_json_inner() .unwrap(); @@ -1637,20 +1603,11 @@ mod tests { } // Test with encode_with_type (with type) - let struct_type = StructType::new(Arc::new(vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - StructField::new( - "active".to_string(), - ConcreteDataType::boolean_datatype(), - true, - ), + let concrete_type = JsonNativeType::Object(JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ("active".to_string(), JsonNativeType::Bool), ])); - let concrete_type = ConcreteDataType::Struct(struct_type); let result2 = settings .encode_with_type(json, Some(&concrete_type)) @@ -2146,20 +2103,11 @@ mod tests { )])), ); - let decoded_struct = settings.decode_struct(array_struct).unwrap(); - let fields = decoded_struct.struct_type().fields(); - let decoded_fields: Vec<&str> = fields.iter().map(|f| f.name()).collect(); - assert!(decoded_fields.contains(&"value")); - - if let Value::List(list_value) = &decoded_struct.items()[0] { - assert_eq!(list_value.items().len(), 4); - assert_eq!(list_value.items()[0], Value::Int64(1)); - assert_eq!(list_value.items()[1], Value::String("hello".into())); - assert_eq!(list_value.items()[2], Value::Boolean(true)); - assert_eq!(list_value.items()[3], Value::Float64(OrderedFloat(3.15))); - } else { - panic!("Expected array to be decoded as ListValue"); - } + let decoded_struct = settings.decode_struct(array_struct); + assert_eq!( + decoded_struct.unwrap_err().to_string(), + "Invalid JSON: all items in json array must have the same type" + ); } #[test] diff --git a/src/datatypes/src/json/value.rs b/src/datatypes/src/json/value.rs new file mode 100644 index 0000000000..acff194e12 --- /dev/null +++ b/src/datatypes/src/json/value.rs @@ -0,0 +1,691 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; +use std::fmt::{Display, Formatter}; +use std::hash::{Hash, Hasher}; +use std::sync::{Arc, OnceLock}; + +use num_traits::ToPrimitive; +use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; +use serde_json::Number; + +use crate::data_type::ConcreteDataType; +use crate::types::json_type::JsonNativeType; +use crate::types::{JsonType, StructField, StructType}; +use crate::value::{ListValue, ListValueRef, StructValue, StructValueRef, Value, ValueRef}; + +/// Number in json, can be a positive integer, a negative integer, or a floating number. +/// Each of which is represented as `u64`, `i64` and `f64`. +/// +/// This follows how `serde_json` designs number. 
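// Illustrative sketch (not part of this diff): how a `serde_json::Number` maps onto the
// three-variant split defined below. `classify` is a hypothetical helper; it relies only on
// the public serde_json accessors, trying u64 first, then i64, then falling back to f64.
fn classify(n: &serde_json::Number) -> &'static str {
    if n.as_u64().is_some() {
        "PosInt(u64)" // non-negative integers
    } else if n.as_i64().is_some() {
        "NegInt(i64)" // negative integers
    } else {
        "Float(f64)" // everything else is treated as a float
    }
}

fn classify_demo() {
    assert_eq!(classify(&serde_json::Number::from(7u64)), "PosInt(u64)");
    assert_eq!(classify(&serde_json::Number::from(-7i64)), "NegInt(i64)");
    assert_eq!(classify(&serde_json::Number::from_f64(3.15).unwrap()), "Float(f64)");
}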
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum JsonNumber { + PosInt(u64), + NegInt(i64), + Float(OrderedFloat), +} + +impl JsonNumber { + fn as_u64(&self) -> Option { + match self { + JsonNumber::PosInt(n) => Some(*n), + JsonNumber::NegInt(n) => (*n >= 0).then_some(*n as u64), + _ => None, + } + } + + fn as_i64(&self) -> Option { + match self { + JsonNumber::PosInt(n) => (*n <= i64::MAX as u64).then_some(*n as i64), + JsonNumber::NegInt(n) => Some(*n), + _ => None, + } + } + + fn as_f64(&self) -> f64 { + match self { + JsonNumber::PosInt(n) => *n as f64, + JsonNumber::NegInt(n) => *n as f64, + JsonNumber::Float(n) => n.0, + } + } +} + +impl From for JsonNumber { + fn from(i: u64) -> Self { + Self::PosInt(i) + } +} + +impl From for JsonNumber { + fn from(n: i64) -> Self { + Self::NegInt(n) + } +} + +impl From for JsonNumber { + fn from(i: f64) -> Self { + Self::Float(i.into()) + } +} + +impl Display for JsonNumber { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::PosInt(x) => write!(f, "{x}"), + Self::NegInt(x) => write!(f, "{x}"), + Self::Float(x) => write!(f, "{x}"), + } + } +} + +/// Variants of json. +/// +/// This follows how [serde_json::Value] designs except that we only choose to use [BTreeMap] to +/// preserve the fields order by their names in the json object. (By default `serde_json` uses +/// [BTreeMap], too. But it additionally supports "IndexMap" which preserves the order by insertion +/// times of fields.) +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum JsonVariant { + Null, + Bool(bool), + Number(JsonNumber), + String(String), + Array(Vec), + Object(BTreeMap), +} + +impl JsonVariant { + fn native_type(&self) -> JsonNativeType { + match self { + JsonVariant::Null => JsonNativeType::Null, + JsonVariant::Bool(_) => JsonNativeType::Bool, + JsonVariant::Number(n) => match n { + JsonNumber::PosInt(_) => JsonNativeType::u64(), + JsonNumber::NegInt(_) => JsonNativeType::i64(), + JsonNumber::Float(_) => JsonNativeType::f64(), + }, + JsonVariant::String(_) => JsonNativeType::String, + JsonVariant::Array(array) => { + let item_type = if let Some(first) = array.first() { + first.native_type() + } else { + JsonNativeType::Null + }; + JsonNativeType::Array(Box::new(item_type)) + } + JsonVariant::Object(object) => JsonNativeType::Object( + object + .iter() + .map(|(k, v)| (k.clone(), v.native_type())) + .collect(), + ), + } + } + + fn json_type(&self) -> JsonType { + JsonType::new_native(self.native_type()) + } + + fn as_ref(&self) -> JsonVariantRef<'_> { + match self { + JsonVariant::Null => JsonVariantRef::Null, + JsonVariant::Bool(x) => (*x).into(), + JsonVariant::Number(x) => match x { + JsonNumber::PosInt(i) => (*i).into(), + JsonNumber::NegInt(i) => (*i).into(), + JsonNumber::Float(f) => (f.0).into(), + }, + JsonVariant::String(x) => x.as_str().into(), + JsonVariant::Array(array) => { + JsonVariantRef::Array(array.iter().map(|x| x.as_ref()).collect()) + } + JsonVariant::Object(object) => JsonVariantRef::Object( + object + .iter() + .map(|(k, v)| (k.as_str(), v.as_ref())) + .collect(), + ), + } + } +} + +impl From<()> for JsonVariant { + fn from(_: ()) -> Self { + Self::Null + } +} + +impl From for JsonVariant { + fn from(v: bool) -> Self { + Self::Bool(v) + } +} + +impl> From for JsonVariant { + fn from(v: T) -> Self { + Self::Number(v.into()) + } +} + +impl From<&str> for JsonVariant { + fn from(v: &str) -> Self { + Self::String(v.to_string()) + } +} + +impl From for 
JsonVariant { + fn from(v: String) -> Self { + Self::String(v) + } +} + +impl> From<[T; N]> for JsonVariant { + fn from(vs: [T; N]) -> Self { + Self::Array(vs.into_iter().map(|x| x.into()).collect()) + } +} + +impl, V: Into, const N: usize> From<[(K, V); N]> for JsonVariant { + fn from(vs: [(K, V); N]) -> Self { + Self::Object(vs.into_iter().map(|(k, v)| (k.into(), v.into())).collect()) + } +} + +impl Display for JsonVariant { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Null => write!(f, "null"), + Self::Bool(x) => write!(f, "{x}"), + Self::Number(x) => write!(f, "{x}"), + Self::String(x) => write!(f, "{x}"), + Self::Array(array) => write!( + f, + "[{}]", + array + .iter() + .map(|x| x.to_string()) + .collect::>() + .join(", ") + ), + Self::Object(object) => { + write!( + f, + "{{ {} }}", + object + .iter() + .map(|(k, v)| format!("{k}: {v}")) + .collect::>() + .join(", ") + ) + } + } + } +} + +/// Represents any valid JSON value. +#[derive(Debug, Eq, Serialize, Deserialize)] +pub struct JsonValue { + #[serde(skip)] + json_type: OnceLock, + json_variant: JsonVariant, +} + +impl JsonValue { + pub fn null() -> Self { + ().into() + } + + pub(crate) fn new(json_variant: JsonVariant) -> Self { + Self { + json_type: OnceLock::new(), + json_variant, + } + } + + pub(crate) fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::Json(self.json_type().clone()) + } + + pub fn json_type(&self) -> &JsonType { + self.json_type.get_or_init(|| self.json_variant.json_type()) + } + + pub(crate) fn is_null(&self) -> bool { + matches!(self.json_variant, JsonVariant::Null) + } + + /// Check if this JSON value is an empty object. + pub fn is_empty_object(&self) -> bool { + match &self.json_variant { + JsonVariant::Object(object) => object.is_empty(), + _ => false, + } + } + + pub(crate) fn as_i64(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => n.as_i64(), + _ => None, + } + } + + pub(crate) fn as_u64(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => n.as_u64(), + _ => None, + } + } + + pub(crate) fn as_f64(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => Some(n.as_f64()), + _ => None, + } + } + + pub(crate) fn as_f64_lossy(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => Some(match n { + JsonNumber::PosInt(i) => i as f64, + JsonNumber::NegInt(i) => i as f64, + JsonNumber::Float(f) => f.0, + }), + _ => None, + } + } + + pub(crate) fn as_bool(&self) -> Option { + match self.json_variant { + JsonVariant::Bool(b) => Some(b), + _ => None, + } + } + + pub fn as_ref(&self) -> JsonValueRef<'_> { + JsonValueRef { + json_type: OnceLock::new(), + json_variant: self.json_variant.as_ref(), + } + } + + pub fn into_variant(self) -> JsonVariant { + self.json_variant + } + + pub(crate) fn into_value(self) -> Value { + fn helper(v: JsonVariant) -> Value { + match v { + JsonVariant::Null => Value::Null, + JsonVariant::Bool(x) => Value::Boolean(x), + JsonVariant::Number(x) => match x { + JsonNumber::PosInt(i) => Value::UInt64(i), + JsonNumber::NegInt(i) => Value::Int64(i), + JsonNumber::Float(f) => Value::Float64(f), + }, + JsonVariant::String(x) => Value::String(x.into()), + JsonVariant::Array(array) => { + let item_type = if let Some(first) = array.first() { + first.native_type() + } else { + JsonNativeType::Null + }; + Value::List(ListValue::new( + array.into_iter().map(helper).collect(), + Arc::new((&item_type).into()), + )) + } + JsonVariant::Object(object) => { + let mut fields 
= Vec::with_capacity(object.len()); + let mut items = Vec::with_capacity(object.len()); + for (k, v) in object { + fields.push(StructField::new(k, (&v.native_type()).into(), true)); + items.push(helper(v)); + } + Value::Struct(StructValue::new(items, StructType::new(Arc::new(fields)))) + } + } + } + helper(self.json_variant) + } +} + +impl> From for JsonValue { + fn from(v: T) -> Self { + Self { + json_type: OnceLock::new(), + json_variant: v.into(), + } + } +} + +impl From for serde_json::Value { + fn from(v: JsonValue) -> Self { + fn helper(v: JsonVariant) -> serde_json::Value { + match v { + JsonVariant::Null => serde_json::Value::Null, + JsonVariant::Bool(x) => serde_json::Value::Bool(x), + JsonVariant::Number(x) => match x { + JsonNumber::PosInt(i) => serde_json::Value::Number(i.into()), + JsonNumber::NegInt(i) => serde_json::Value::Number(i.into()), + JsonNumber::Float(f) => { + if let Some(x) = Number::from_f64(f.0) { + serde_json::Value::Number(x) + } else { + serde_json::Value::String("NaN".into()) + } + } + }, + JsonVariant::String(x) => serde_json::Value::String(x), + JsonVariant::Array(array) => { + serde_json::Value::Array(array.into_iter().map(helper).collect()) + } + JsonVariant::Object(object) => serde_json::Value::Object( + object.into_iter().map(|(k, v)| (k, helper(v))).collect(), + ), + } + } + helper(v.json_variant) + } +} + +impl Clone for JsonValue { + fn clone(&self) -> Self { + let Self { + json_type: _, + json_variant, + } = self; + Self { + json_type: OnceLock::new(), + json_variant: json_variant.clone(), + } + } +} + +impl PartialEq for JsonValue { + fn eq(&self, other: &JsonValue) -> bool { + let Self { + json_type: _, + json_variant, + } = self; + json_variant.eq(&other.json_variant) + } +} + +impl Hash for JsonValue { + fn hash(&self, state: &mut H) { + let Self { + json_type: _, + json_variant, + } = self; + json_variant.hash(state); + } +} + +impl Display for JsonValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.json_variant) + } +} + +/// References of variants of json. 
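// Illustrative sketch (not part of this diff): because `JsonVariant::Object` is backed by a
// BTreeMap, object entries are always iterated in key order, regardless of insertion order.
// This is why the struct fields produced from a JSON object come out sorted by name in the
// updated tests above ("active" < "age" < "name"). Std-only demo:
use std::collections::BTreeMap;

fn btreemap_order_demo() {
    let mut object = BTreeMap::new();
    object.insert("name".to_string(), "Alice".to_string());
    object.insert("age".to_string(), "25".to_string());
    object.insert("active".to_string(), "true".to_string());
    let keys: Vec<&str> = object.keys().map(String::as_str).collect();
    // Keys come out sorted lexicographically, not in insertion order.
    assert_eq!(keys, vec!["active", "age", "name"]);
}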
+#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub enum JsonVariantRef<'a> { + Null, + Bool(bool), + Number(JsonNumber), + String(&'a str), + Array(Vec>), + Object(BTreeMap<&'a str, JsonVariantRef<'a>>), +} + +impl JsonVariantRef<'_> { + fn json_type(&self) -> JsonType { + fn native_type(v: &JsonVariantRef<'_>) -> JsonNativeType { + match v { + JsonVariantRef::Null => JsonNativeType::Null, + JsonVariantRef::Bool(_) => JsonNativeType::Bool, + JsonVariantRef::Number(n) => match n { + JsonNumber::PosInt(_) => JsonNativeType::u64(), + JsonNumber::NegInt(_) => JsonNativeType::i64(), + JsonNumber::Float(_) => JsonNativeType::f64(), + }, + JsonVariantRef::String(_) => JsonNativeType::String, + JsonVariantRef::Array(array) => { + let item_type = if let Some(first) = array.first() { + native_type(first) + } else { + JsonNativeType::Null + }; + JsonNativeType::Array(Box::new(item_type)) + } + JsonVariantRef::Object(object) => JsonNativeType::Object( + object + .iter() + .map(|(k, v)| (k.to_string(), native_type(v))) + .collect(), + ), + } + } + JsonType::new_native(native_type(self)) + } +} + +impl From<()> for JsonVariantRef<'_> { + fn from(_: ()) -> Self { + Self::Null + } +} + +impl From for JsonVariantRef<'_> { + fn from(v: bool) -> Self { + Self::Bool(v) + } +} + +impl> From for JsonVariantRef<'_> { + fn from(v: T) -> Self { + Self::Number(v.into()) + } +} + +impl<'a> From<&'a str> for JsonVariantRef<'a> { + fn from(v: &'a str) -> Self { + Self::String(v) + } +} + +impl<'a, const N: usize, T: Into>> From<[T; N]> for JsonVariantRef<'a> { + fn from(vs: [T; N]) -> Self { + Self::Array(vs.into_iter().map(|x| x.into()).collect()) + } +} + +impl<'a, V: Into>, const N: usize> From<[(&'a str, V); N]> + for JsonVariantRef<'a> +{ + fn from(vs: [(&'a str, V); N]) -> Self { + Self::Object(vs.into_iter().map(|(k, v)| (k, v.into())).collect()) + } +} + +impl<'a> From>> for JsonVariantRef<'a> { + fn from(v: Vec>) -> Self { + Self::Array(v) + } +} + +impl<'a> From>> for JsonVariantRef<'a> { + fn from(v: BTreeMap<&'a str, JsonVariantRef<'a>>) -> Self { + Self::Object(v) + } +} + +impl From> for JsonVariant { + fn from(v: JsonVariantRef) -> Self { + match v { + JsonVariantRef::Null => Self::Null, + JsonVariantRef::Bool(x) => Self::Bool(x), + JsonVariantRef::Number(x) => Self::Number(x), + JsonVariantRef::String(x) => Self::String(x.to_string()), + JsonVariantRef::Array(array) => { + Self::Array(array.into_iter().map(Into::into).collect()) + } + JsonVariantRef::Object(object) => Self::Object( + object + .into_iter() + .map(|(k, v)| (k.to_string(), v.into())) + .collect(), + ), + } + } +} + +/// Reference to representation of any valid JSON value. 
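// Illustrative sketch (not part of this diff): the `json_type` OnceLock fields above and
// below cache the derived type so it is computed at most once per value, and the cache is
// deliberately excluded from Clone/PartialEq/Hash. `Cached` is a hypothetical stand-in
// showing the same std::sync::OnceLock pattern.
use std::sync::OnceLock;

struct Cached {
    derived: OnceLock<String>,
    raw: i64,
}

impl Cached {
    fn derived(&self) -> &str {
        // Computed on first access only; later calls return the cached value.
        self.derived.get_or_init(|| format!("type-of({})", self.raw))
    }
}

fn cached_demo() {
    let v = Cached { derived: OnceLock::new(), raw: 42 };
    assert_eq!(v.derived(), "type-of(42)");
    assert_eq!(v.derived(), "type-of(42)"); // second call hits the cache
}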
+#[derive(Debug, Serialize)] +pub struct JsonValueRef<'a> { + #[serde(skip)] + json_type: OnceLock, + json_variant: JsonVariantRef<'a>, +} + +impl<'a> JsonValueRef<'a> { + pub fn null() -> Self { + ().into() + } + + pub(crate) fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::Json(self.json_type().clone()) + } + + pub(crate) fn json_type(&self) -> &JsonType { + self.json_type.get_or_init(|| self.json_variant.json_type()) + } + + pub fn into_variant(self) -> JsonVariantRef<'a> { + self.json_variant + } + + pub(crate) fn is_null(&self) -> bool { + matches!(self.json_variant, JsonVariantRef::Null) + } + + pub fn is_object(&self) -> bool { + matches!(self.json_variant, JsonVariantRef::Object(_)) + } + + pub(crate) fn as_f32(&self) -> Option { + match self.json_variant { + JsonVariantRef::Number(JsonNumber::Float(f)) => f.to_f32(), + _ => None, + } + } + + pub(crate) fn as_f64(&self) -> Option { + match self.json_variant { + JsonVariantRef::Number(JsonNumber::Float(f)) => Some(f.0), + _ => None, + } + } + + pub fn as_value_ref(&self) -> ValueRef<'_> { + fn helper<'a>(v: &'a JsonVariantRef) -> ValueRef<'a> { + match v { + JsonVariantRef::Null => ValueRef::Null, + JsonVariantRef::Bool(x) => ValueRef::Boolean(*x), + JsonVariantRef::Number(x) => match x { + JsonNumber::PosInt(i) => ValueRef::UInt64(*i), + JsonNumber::NegInt(i) => ValueRef::Int64(*i), + JsonNumber::Float(f) => ValueRef::Float64(*f), + }, + JsonVariantRef::String(x) => ValueRef::String(x), + JsonVariantRef::Array(array) => { + let val = array.iter().map(helper).collect::>(); + let item_datatype = if let Some(first) = val.first() { + first.data_type() + } else { + ConcreteDataType::null_datatype() + }; + ValueRef::List(ListValueRef::RefList { + val, + item_datatype: Arc::new(item_datatype), + }) + } + JsonVariantRef::Object(object) => { + let mut fields = Vec::with_capacity(object.len()); + let mut val = Vec::with_capacity(object.len()); + for (k, v) in object.iter() { + let v = helper(v); + fields.push(StructField::new(k.to_string(), v.data_type(), true)); + val.push(v); + } + ValueRef::Struct(StructValueRef::RefList { + val, + fields: StructType::new(Arc::new(fields)), + }) + } + } + } + helper(&self.json_variant) + } + + pub(crate) fn data_size(&self) -> usize { + size_of_val(self) + } +} + +impl<'a, T: Into>> From for JsonValueRef<'a> { + fn from(v: T) -> Self { + Self { + json_type: OnceLock::new(), + json_variant: v.into(), + } + } +} + +impl From> for JsonValue { + fn from(v: JsonValueRef<'_>) -> Self { + Self { + json_type: OnceLock::new(), + json_variant: v.json_variant.into(), + } + } +} + +impl PartialEq for JsonValueRef<'_> { + fn eq(&self, other: &Self) -> bool { + let Self { + json_type: _, + json_variant, + } = self; + json_variant == &other.json_variant + } +} + +impl Eq for JsonValueRef<'_> {} + +impl Clone for JsonValueRef<'_> { + fn clone(&self) -> Self { + let Self { + json_type: _, + json_variant, + } = self; + Self { + json_type: OnceLock::new(), + json_variant: json_variant.clone(), + } + } +} diff --git a/src/datatypes/src/lib.rs b/src/datatypes/src/lib.rs index 6b20080380..2c3d4c23bf 100644 --- a/src/datatypes/src/lib.rs +++ b/src/datatypes/src/lib.rs @@ -13,11 +13,13 @@ // limitations under the License. 
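// Illustrative sketch (not part of this diff): what the `box_patterns` nightly feature
// enabled just below allows — matching through a Box directly in a pattern instead of
// dereferencing first. Standalone example; requires a nightly toolchain.
#![feature(box_patterns)]

fn unwrap_or_zero(v: Option<Box<i64>>) -> i64 {
    match v {
        Some(box x) => x, // `box x` moves the boxed value out and binds it
        None => 0,
    }
}

fn main() {
    assert_eq!(unwrap_or_zero(Some(Box::new(7))), 7);
    assert_eq!(unwrap_or_zero(None), 0);
}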
#![feature(assert_matches)] +#![feature(box_patterns)] pub mod arrow_array; pub mod data_type; pub mod duration; pub mod error; +pub mod extension; pub mod interval; pub mod json; pub mod macros; diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index 6bdf321137..812b3c3b22 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -273,8 +273,9 @@ fn collect_fields(column_schemas: &[ColumnSchema]) -> Result { _ => None, }; if let Some(extype) = extype { - let metadata = HashMap::from([(TYPE_KEY.to_string(), extype.to_string())]); - field = field.with_metadata(metadata); + field + .metadata_mut() + .insert(TYPE_KEY.to_string(), extype.to_string()); } fields.push(field); ensure!( diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs index f176350b8c..9272ba4b21 100644 --- a/src/datatypes/src/schema/column_schema.rs +++ b/src/datatypes/src/schema/column_schema.rs @@ -17,12 +17,17 @@ use std::fmt; use std::str::FromStr; use arrow::datatypes::Field; +use arrow_schema::extension::{ + EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY, ExtensionType, +}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, ensure}; use sqlparser_derive::{Visit, VisitMut}; use crate::data_type::{ConcreteDataType, DataType}; -use crate::error::{self, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result}; +use crate::error::{ + self, ArrowMetadataSnafu, Error, InvalidFulltextOptionSnafu, ParseExtendedTypeSnafu, Result, +}; use crate::schema::TYPE_KEY; use crate::schema::constraint::ColumnDefaultConstraint; use crate::value::Value; @@ -391,6 +396,45 @@ impl ColumnSchema { self.metadata.remove(SKIPPING_INDEX_KEY); Ok(()) } + + pub fn extension_type(&self) -> Result> + where + E: ExtensionType, + { + let extension_type_name = self.metadata.get(EXTENSION_TYPE_NAME_KEY); + + if extension_type_name.map(|s| s.as_str()) == Some(E::NAME) { + let extension_metadata = self.metadata.get(EXTENSION_TYPE_METADATA_KEY); + let extension_metadata = + E::deserialize_metadata(extension_metadata.map(|s| s.as_str())) + .context(ArrowMetadataSnafu)?; + + let extension = E::try_new(&self.data_type.as_arrow_type(), extension_metadata) + .context(ArrowMetadataSnafu)?; + Ok(Some(extension)) + } else { + Ok(None) + } + } + + pub fn with_extension_type(&mut self, extension_type: &E) -> Result<()> + where + E: ExtensionType, + { + self.metadata + .insert(EXTENSION_TYPE_NAME_KEY.to_string(), E::NAME.to_string()); + + if let Some(extension_metadata) = extension_type.serialize_metadata() { + self.metadata + .insert(EXTENSION_TYPE_METADATA_KEY.to_string(), extension_metadata); + } + + Ok(()) + } + + pub fn is_indexed(&self) -> bool { + self.is_inverted_indexed() || self.is_fulltext_indexed() || self.is_skipping_indexed() + } } /// Column extended type set in column schema's metadata. 
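// Illustrative sketch (not part of this diff): the new ColumnSchema::with_extension_type /
// extension_type::<E>() above round-trip an Arrow extension type through two well-known
// metadata keys on the column. The literal key strings and the "vendor.example" name are
// assumptions for this demo; only a plain HashMap is used here.
use std::collections::HashMap;

const NAME_KEY: &str = "ARROW:extension:name"; // assumed value of EXTENSION_TYPE_NAME_KEY
const METADATA_KEY: &str = "ARROW:extension:metadata"; // assumed value of EXTENSION_TYPE_METADATA_KEY

fn extension_roundtrip_demo() {
    let mut column_metadata: HashMap<String, String> = HashMap::new();

    // with_extension_type: record the extension name and its serialized metadata.
    column_metadata.insert(NAME_KEY.to_string(), "vendor.example".to_string());
    column_metadata.insert(METADATA_KEY.to_string(), "{}".to_string());

    // extension_type::<E>(): deserialize only when the recorded name matches E::NAME;
    // otherwise the method above reports Ok(None).
    let name_matches =
        column_metadata.get(NAME_KEY).map(String::as_str) == Some("vendor.example");
    assert!(name_matches);
}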
diff --git a/src/datatypes/src/types.rs b/src/datatypes/src/types.rs index 1c7df86249..597bbb673b 100644 --- a/src/datatypes/src/types.rs +++ b/src/datatypes/src/types.rs @@ -20,7 +20,7 @@ mod decimal_type; mod dictionary_type; mod duration_type; mod interval_type; -mod json_type; +pub mod json_type; mod list_type; mod null_type; mod primitive_type; diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 01ec81dd08..4c838b78d1 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; +use std::fmt::{Display, Formatter}; use std::str::FromStr; +use std::sync::Arc; use arrow::datatypes::DataType as ArrowDataType; use common_base::bytes::Bytes; @@ -20,20 +23,146 @@ use serde::{Deserialize, Serialize}; use snafu::ResultExt; use crate::data_type::DataType; -use crate::error::{DeserializeSnafu, InvalidJsonSnafu, InvalidJsonbSnafu, Result}; +use crate::error::{ + DeserializeSnafu, InvalidJsonSnafu, InvalidJsonbSnafu, MergeJsonDatatypeSnafu, Result, +}; use crate::prelude::ConcreteDataType; use crate::scalars::ScalarVectorBuilder; use crate::type_id::LogicalTypeId; +use crate::types::{ListType, StructField, StructType}; use crate::value::Value; +use crate::vectors::json::builder::JsonVectorBuilder; use crate::vectors::{BinaryVectorBuilder, MutableVector}; pub const JSON_TYPE_NAME: &str = "Json"; +const JSON_PLAIN_FIELD_NAME: &str = "__json_plain__"; +const JSON_PLAIN_FIELD_METADATA_KEY: &str = "is_plain_json"; + +pub type JsonObjectType = BTreeMap; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] +pub enum JsonNumberType { + U64, + I64, + F64, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] +pub enum JsonNativeType { + Null, + Bool, + Number(JsonNumberType), + String, + Array(Box), + Object(JsonObjectType), +} + +impl JsonNativeType { + pub fn is_null(&self) -> bool { + matches!(self, JsonNativeType::Null) + } + + pub fn u64() -> Self { + Self::Number(JsonNumberType::U64) + } + + pub fn i64() -> Self { + Self::Number(JsonNumberType::I64) + } + + pub fn f64() -> Self { + Self::Number(JsonNumberType::F64) + } +} + +impl From<&JsonNativeType> for ConcreteDataType { + fn from(value: &JsonNativeType) -> Self { + match value { + JsonNativeType::Null => ConcreteDataType::null_datatype(), + JsonNativeType::Bool => ConcreteDataType::boolean_datatype(), + JsonNativeType::Number(JsonNumberType::U64) => ConcreteDataType::uint64_datatype(), + JsonNativeType::Number(JsonNumberType::I64) => ConcreteDataType::int64_datatype(), + JsonNativeType::Number(JsonNumberType::F64) => ConcreteDataType::float64_datatype(), + JsonNativeType::String => ConcreteDataType::string_datatype(), + JsonNativeType::Array(item_type) => { + ConcreteDataType::List(ListType::new(Arc::new(item_type.as_ref().into()))) + } + JsonNativeType::Object(object) => { + let fields = object + .iter() + .map(|(type_name, field_type)| { + StructField::new(type_name.clone(), field_type.into(), true) + }) + .collect(); + ConcreteDataType::Struct(StructType::new(Arc::new(fields))) + } + } + } +} + +impl From<&ConcreteDataType> for JsonNativeType { + fn from(value: &ConcreteDataType) -> Self { + match value { + ConcreteDataType::Null(_) => JsonNativeType::Null, + ConcreteDataType::Boolean(_) => JsonNativeType::Bool, + 
ConcreteDataType::UInt64(_) + | ConcreteDataType::UInt32(_) + | ConcreteDataType::UInt16(_) + | ConcreteDataType::UInt8(_) => JsonNativeType::u64(), + ConcreteDataType::Int64(_) + | ConcreteDataType::Int32(_) + | ConcreteDataType::Int16(_) + | ConcreteDataType::Int8(_) => JsonNativeType::i64(), + ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) => JsonNativeType::f64(), + ConcreteDataType::String(_) => JsonNativeType::String, + ConcreteDataType::List(list_type) => { + JsonNativeType::Array(Box::new(list_type.item_type().into())) + } + ConcreteDataType::Struct(struct_type) => JsonNativeType::Object( + struct_type + .fields() + .iter() + .map(|field| (field.name().to_string(), field.data_type().into())) + .collect(), + ), + ConcreteDataType::Json(json_type) => json_type.native_type().clone(), + _ => unreachable!(), + } + } +} + +impl Display for JsonNativeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + JsonNativeType::Null => write!(f, "Null"), + JsonNativeType::Bool => write!(f, "Bool"), + JsonNativeType::Number(t) => { + write!(f, "Number({t:?})") + } + JsonNativeType::String => write!(f, "String"), + JsonNativeType::Array(item_type) => { + write!(f, "Array[{}]", item_type) + } + JsonNativeType::Object(object) => { + write!( + f, + "Object{{{}}}", + object + .iter() + .map(|(k, v)| format!(r#""{k}": {v}"#)) + .collect::>() + .join(", ") + ) + } + } + } +} #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)] pub enum JsonFormat { #[default] Jsonb, - Native(Box), + Native(Box), } /// JsonType is a data type for JSON data. It is stored as binary data of jsonb format. @@ -47,11 +176,175 @@ impl JsonType { pub fn new(format: JsonFormat) -> Self { Self { format } } + + pub(crate) fn new_native(native: JsonNativeType) -> Self { + Self { + format: JsonFormat::Native(Box::new(native)), + } + } + + pub(crate) fn native_type(&self) -> &JsonNativeType { + match &self.format { + JsonFormat::Jsonb => &JsonNativeType::String, + JsonFormat::Native(x) => x.as_ref(), + } + } + + pub fn null() -> Self { + Self { + format: JsonFormat::Native(Box::new(JsonNativeType::Null)), + } + } + + /// Make json type a struct type, by: + /// - if the json is an object, its entries are mapped to struct fields, obviously; + /// - if not, the json is one of bool, number, string or array, make it a special field + /// (see [plain_json_struct_type]). + pub(crate) fn as_struct_type(&self) -> StructType { + match &self.format { + JsonFormat::Jsonb => StructType::default(), + JsonFormat::Native(inner) => match ConcreteDataType::from(inner.as_ref()) { + ConcreteDataType::Struct(t) => t.clone(), + x => plain_json_struct_type(x), + }, + } + } + + /// Try to merge this json type with others, error on datatype conflict. + pub fn merge(&mut self, other: &JsonType) -> Result<()> { + match (&self.format, &other.format) { + (JsonFormat::Jsonb, JsonFormat::Jsonb) => Ok(()), + (JsonFormat::Native(this), JsonFormat::Native(that)) => { + let merged = merge(this.as_ref(), that.as_ref())?; + self.format = JsonFormat::Native(Box::new(merged)); + Ok(()) + } + _ => MergeJsonDatatypeSnafu { + reason: "json format not match", + } + .fail(), + } + } + + /// Check if it can merge with `other` json type. 
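// Illustrative sketch (not part of this diff): a simplified stand-in for the merge rules
// used by `JsonType::merge` above and the `merge`/`is_mergeable` helpers below — equal
// types merge to themselves, Null widens to the other side, objects merge field by field,
// and anything else is a conflict. `Mini` is a hypothetical reduced model, not the real
// JsonNativeType.
use std::collections::BTreeMap;

#[derive(Clone, PartialEq, Debug)]
enum Mini {
    Null,
    Int,
    Str,
    Obj(BTreeMap<String, Mini>),
}

fn mini_merge(this: &Mini, that: &Mini) -> Result<Mini, String> {
    match (this, that) {
        (a, b) if a == b => Ok(a.clone()),
        (Mini::Null, x) | (x, Mini::Null) => Ok(x.clone()),
        (Mini::Obj(a), Mini::Obj(b)) => {
            let mut merged = a.clone();
            for (key, b_ty) in b {
                // Fields present on both sides must merge recursively; new fields are added.
                let next = match a.get(key) {
                    Some(a_ty) => mini_merge(a_ty, b_ty)?,
                    None => b_ty.clone(),
                };
                merged.insert(key.clone(), next);
            }
            Ok(Mini::Obj(merged))
        }
        (a, b) => Err(format!("conflict: {a:?} vs {b:?}")),
    }
}

fn mini_merge_demo() {
    let a = Mini::Obj(BTreeMap::from([("x".to_string(), Mini::Int)]));
    let b = Mini::Obj(BTreeMap::from([("y".to_string(), Mini::Str)]));
    // Disjoint objects merge into the union of their fields.
    assert!(matches!(mini_merge(&a, &b), Ok(Mini::Obj(m)) if m.len() == 2));
    // Conflicting scalar types do not merge.
    assert!(mini_merge(&Mini::Int, &Mini::Str).is_err());
}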
+ pub fn is_mergeable(&self, other: &JsonType) -> bool { + match (&self.format, &other.format) { + (JsonFormat::Jsonb, JsonFormat::Jsonb) => true, + (JsonFormat::Native(this), JsonFormat::Native(that)) => { + is_mergeable(this.as_ref(), that.as_ref()) + } + _ => false, + } + } + + /// Check if it includes all fields in `other` json type. + pub fn is_include(&self, other: &JsonType) -> bool { + match (&self.format, &other.format) { + (JsonFormat::Jsonb, JsonFormat::Jsonb) => true, + (JsonFormat::Native(this), JsonFormat::Native(that)) => { + is_include(this.as_ref(), that.as_ref()) + } + _ => false, + } + } +} + +fn is_include(this: &JsonNativeType, that: &JsonNativeType) -> bool { + fn is_include_object(this: &JsonObjectType, that: &JsonObjectType) -> bool { + for (type_name, that_type) in that { + let Some(this_type) = this.get(type_name) else { + return false; + }; + if !is_include(this_type, that_type) { + return false; + } + } + true + } + + match (this, that) { + (this, that) if this == that => true, + (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { + is_include(this.as_ref(), that.as_ref()) + } + (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { + is_include_object(this, that) + } + (_, JsonNativeType::Null) => true, + _ => false, + } +} + +/// A special struct type for denoting "plain"(not object) json value. It has only one field, with +/// fixed name [JSON_PLAIN_FIELD_NAME] and with metadata [JSON_PLAIN_FIELD_METADATA_KEY] = `"true"`. +pub(crate) fn plain_json_struct_type(item_type: ConcreteDataType) -> StructType { + let mut field = StructField::new(JSON_PLAIN_FIELD_NAME.to_string(), item_type, true); + field.insert_metadata(JSON_PLAIN_FIELD_METADATA_KEY, true); + StructType::new(Arc::new(vec![field])) +} + +fn is_mergeable(this: &JsonNativeType, that: &JsonNativeType) -> bool { + fn is_mergeable_object(this: &JsonObjectType, that: &JsonObjectType) -> bool { + for (type_name, that_type) in that { + if let Some(this_type) = this.get(type_name) + && !is_mergeable(this_type, that_type) + { + return false; + } + } + true + } + + match (this, that) { + (this, that) if this == that => true, + (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { + is_mergeable(this.as_ref(), that.as_ref()) + } + (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { + is_mergeable_object(this, that) + } + (JsonNativeType::Null, _) | (_, JsonNativeType::Null) => true, + _ => false, + } +} + +fn merge(this: &JsonNativeType, that: &JsonNativeType) -> Result { + fn merge_object(this: &JsonObjectType, that: &JsonObjectType) -> Result { + let mut this = this.clone(); + // merge "that" into "this" directly: + for (type_name, that_type) in that { + if let Some(this_type) = this.get_mut(type_name) { + let merged_type = merge(this_type, that_type)?; + *this_type = merged_type; + } else { + this.insert(type_name.clone(), that_type.clone()); + } + } + Ok(this) + } + + match (this, that) { + (this, that) if this == that => Ok(this.clone()), + (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { + merge(this.as_ref(), that.as_ref()).map(|x| JsonNativeType::Array(Box::new(x))) + } + (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { + merge_object(this, that).map(JsonNativeType::Object) + } + (JsonNativeType::Null, x) | (x, JsonNativeType::Null) => Ok(x.clone()), + _ => MergeJsonDatatypeSnafu { + reason: format!("datatypes have conflict, this: {this}, that: {that}"), + } + .fail(), + } } impl DataType for JsonType { fn name(&self) -> 
String { - JSON_TYPE_NAME.to_string() + match &self.format { + JsonFormat::Jsonb => JSON_TYPE_NAME.to_string(), + JsonFormat::Native(x) => format!("Json<{x}>"), + } } fn logical_type_id(&self) -> LogicalTypeId { @@ -63,11 +356,17 @@ impl DataType for JsonType { } fn as_arrow_type(&self) -> ArrowDataType { - ArrowDataType::Binary + match self.format { + JsonFormat::Jsonb => ArrowDataType::Binary, + JsonFormat::Native(_) => self.as_struct_type().as_arrow_type(), + } } fn create_mutable_vector(&self, capacity: usize) -> Box { - Box::new(BinaryVectorBuilder::with_capacity(capacity)) + match &self.format { + JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)), + JsonFormat::Native(x) => Box::new(JsonVectorBuilder::new(*x.clone(), capacity)), + } } fn try_cast(&self, from: Value) -> Option { @@ -78,6 +377,12 @@ impl DataType for JsonType { } } +impl Display for JsonType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name()) + } +} + /// Converts a json type value to string pub fn jsonb_to_string(val: &[u8]) -> Result { match jsonb::from_slice(val) { @@ -102,3 +407,291 @@ pub fn parse_string_to_jsonb(s: &str) -> Result> { .map_err(|_| InvalidJsonSnafu { value: s }.build()) .map(|json| json.to_vec()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::json::JsonStructureSettings; + + #[test] + fn test_json_type_include() { + fn test(this: &JsonNativeType, that: &JsonNativeType, expected: bool) { + assert_eq!(is_include(this, that), expected); + } + + test(&JsonNativeType::Null, &JsonNativeType::Null, true); + test(&JsonNativeType::Null, &JsonNativeType::Bool, false); + + test(&JsonNativeType::Bool, &JsonNativeType::Null, true); + test(&JsonNativeType::Bool, &JsonNativeType::Bool, true); + test(&JsonNativeType::Bool, &JsonNativeType::u64(), false); + + test(&JsonNativeType::u64(), &JsonNativeType::Null, true); + test(&JsonNativeType::u64(), &JsonNativeType::u64(), true); + test(&JsonNativeType::u64(), &JsonNativeType::String, false); + + test(&JsonNativeType::String, &JsonNativeType::Null, true); + test(&JsonNativeType::String, &JsonNativeType::String, true); + test( + &JsonNativeType::String, + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + false, + ); + + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Null, + true, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Array(Box::new(JsonNativeType::Null)), + true, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + true, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::String, + false, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Object(JsonObjectType::new()), + false, + ); + + let simple_json_object = &JsonNativeType::Object(JsonObjectType::from([( + "foo".to_string(), + JsonNativeType::String, + )])); + test(simple_json_object, &JsonNativeType::Null, true); + test(simple_json_object, simple_json_object, true); + test(simple_json_object, &JsonNativeType::i64(), false); + test( + simple_json_object, + &JsonNativeType::Object(JsonObjectType::from([( + "bar".to_string(), + JsonNativeType::i64(), + )])), + false, + ); + + let complex_json_object = &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + 
JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::String, + )])), + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])); + test(complex_json_object, &JsonNativeType::Null, true); + test(complex_json_object, &JsonNativeType::String, false); + test(complex_json_object, complex_json_object, true); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([( + "bar".to_string(), + JsonNativeType::i64(), + )])), + true, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Null, + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + true, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::String, + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + false, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::String, + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + false, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::Null, + )])), + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + true, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::Bool, + )])), + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + false, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::String, + )])), + )])), + )])), + )])), + true, + ); + } + + #[test] + fn test_merge_json_type() -> Result<()> { + fn test( + json: &str, + json_type: &mut JsonType, + expected: std::result::Result<&str, &str>, + ) -> Result<()> { + let json: serde_json::Value = serde_json::from_str(json).unwrap(); + + let settings = JsonStructureSettings::Structured(None); + let value = settings.encode(json)?; + let value_type = value.data_type(); + let Some(other) = value_type.as_json() else { + unreachable!() + }; + + let result = json_type.merge(other); + match (result, expected) { + (Ok(()), Ok(expected)) => { + assert_eq!(json_type.name(), expected); + assert!(json_type.is_mergeable(other)); + } + (Err(err), Err(expected)) => { + assert_eq!(err.to_string(), expected); + assert!(!json_type.is_mergeable(other)); + } + _ => unreachable!(), + } + Ok(()) + } + + let json_type = &mut 
JsonType::new_native(JsonNativeType::Null); + + // can merge with json object: + let json = r#"{ + "hello": "world", + "list": [1, 2, 3], + "object": {"a": 1} + }"#; + let expected = r#"Json"#; + test(json, json_type, Ok(expected))?; + + // cannot merge with other non-object json values: + let jsons = [r#""s""#, "1", "[1]"]; + let expects = [ + r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: String"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: Number(I64)"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: Array[Number(I64)]"#, + ]; + for (json, expect) in jsons.into_iter().zip(expects.into_iter()) { + test(json, json_type, Err(expect))?; + } + + // cannot merge with other json object with conflict field datatype: + let json = r#"{ + "hello": 1, + "float": 0.123, + "no": 42 + }"#; + let expected = r#"Failed to merge JSON datatype: datatypes have conflict, this: String, that: Number(I64)"#; + test(json, json_type, Err(expected))?; + + // can merge with another json object: + let json = r#"{ + "hello": "greptime", + "float": 0.123, + "int": 42 + }"#; + let expected = r#"Json"#; + test(json, json_type, Ok(expected))?; + + // can merge with some complex nested json object: + let json = r#"{ + "list": [4], + "object": {"foo": "bar", "l": ["x"], "o": {"key": "value"}}, + "float": 0.456, + "int": 0 + }"#; + let expected = r#"Json"#; + test(json, json_type, Ok(expected))?; + + Ok(()) + } +} diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs index 61677ead4a..fff1d87f00 100644 --- a/src/datatypes/src/types/string_type.rs +++ b/src/datatypes/src/types/string_type.rs @@ -177,7 +177,7 @@ impl DataType for StringType { Value::Duration(v) => Some(Value::String(StringBytes::from(v.to_string()))), Value::Decimal128(v) => Some(Value::String(StringBytes::from(v.to_string()))), - Value::Json(v) => self.try_cast(*v), + Value::Json(v) => serde_json::to_string(v.as_ref()).ok().map(|s| s.into()), // StringBytes is only support for utf-8, Value::Binary and collections are not allowed. Value::Binary(_) | Value::List(_) | Value::Struct(_) => None, diff --git a/src/datatypes/src/types/struct_type.rs b/src/datatypes/src/types/struct_type.rs index 5e3156498f..2cf2a8825d 100644 --- a/src/datatypes/src/types/struct_type.rs +++ b/src/datatypes/src/types/struct_type.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
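// Illustrative sketch (not part of this diff): attaching per-field metadata when converting
// to an Arrow Field, as the updated StructField::to_df_field below does, and as the plain
// JSON marker field uses with JSON_PLAIN_FIELD_NAME ("__json_plain__") and
// JSON_PLAIN_FIELD_METADATA_KEY ("is_plain_json") above. Uses only the public arrow API.
use std::collections::HashMap;
use arrow::datatypes::{DataType, Field};

fn field_metadata_demo() {
    let metadata = HashMap::from([("is_plain_json".to_string(), "true".to_string())]);
    let field = Field::new("__json_plain__", DataType::Utf8, true).with_metadata(metadata);
    // The marker survives on the Arrow field and can be read back later.
    assert_eq!(
        field.metadata().get("is_plain_json").map(String::as_str),
        Some("true")
    );
}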
+use std::collections::BTreeMap; use std::sync::Arc; use arrow::datatypes::{DataType as ArrowDataType, Field}; @@ -46,13 +47,22 @@ impl TryFrom<&Fields> for StructType { } } +impl From<[StructField; N]> for StructType { + fn from(value: [StructField; N]) -> Self { + let value: Box<[StructField]> = Box::new(value); + Self { + fields: Arc::new(value.into_vec()), + } + } +} + impl DataType for StructType { fn name(&self) -> String { format!( "Struct<{}>", self.fields .iter() - .map(|f| f.name()) + .map(|f| format!(r#""{}": {}"#, f.name(), f.data_type())) .collect::>() .join(", ") ) @@ -108,6 +118,7 @@ pub struct StructField { name: String, data_type: ConcreteDataType, nullable: bool, + metadata: BTreeMap, } impl StructField { @@ -116,6 +127,7 @@ impl StructField { name, data_type, nullable, + metadata: BTreeMap::new(), } } @@ -135,11 +147,26 @@ impl StructField { self.nullable } + pub(crate) fn insert_metadata(&mut self, key: impl ToString, value: impl ToString) { + self.metadata.insert(key.to_string(), value.to_string()); + } + + #[expect(unused)] + pub(crate) fn metadata(&self, key: &str) -> Option<&str> { + self.metadata.get(key).map(String::as_str) + } + pub fn to_df_field(&self) -> Field { + let metadata = self + .metadata + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); Field::new( self.name.clone(), self.data_type.as_arrow_type(), self.nullable, ) + .with_metadata(metadata) } } diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index 7acc7073d4..1c7dc35de6 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -36,6 +36,7 @@ use crate::error::{ self, ConvertArrowArrayToScalarsSnafu, ConvertScalarToArrowArraySnafu, Error, InconsistentStructFieldsAndItemsSnafu, Result, TryFromValueSnafu, }; +use crate::json::value::{JsonValue, JsonValueRef}; use crate::prelude::*; use crate::type_id::LogicalTypeId; use crate::types::{IntervalType, ListType, StructType}; @@ -86,7 +87,7 @@ pub enum Value { Struct(StructValue), // Json Logical types: - Json(Box), + Json(Box), } impl Display for Value { @@ -197,7 +198,7 @@ macro_rules! define_data_type_func { $struct::Struct(struct_value) => { ConcreteDataType::struct_datatype(struct_value.struct_type().clone()) } - $struct::Json(v) => ConcreteDataType::json_native_datatype(v.data_type()), + $struct::Json(v) => v.data_type(), } } }; @@ -220,7 +221,6 @@ impl Value { match self { Value::Null => Ok(None), Value::List(v) => Ok(Some(v)), - Value::Json(inner) => inner.as_list(), other => error::CastTypeSnafu { msg: format!("Failed to cast {other:?} to list value"), } @@ -232,7 +232,6 @@ impl Value { match self { Value::Null => Ok(None), Value::Struct(v) => Ok(Some(v)), - Value::Json(inner) => inner.as_struct(), other => error::CastTypeSnafu { msg: format!("Failed to cast {other:?} to struct value"), } @@ -267,7 +266,7 @@ impl Value { Value::Duration(v) => ValueRef::Duration(*v), Value::Decimal128(v) => ValueRef::Decimal128(*v), Value::Struct(v) => ValueRef::Struct(StructValueRef::Ref(v)), - Value::Json(v) => ValueRef::Json(Box::new(v.as_value_ref())), + Value::Json(v) => ValueRef::Json(Box::new((**v).as_ref())), } } @@ -391,7 +390,7 @@ impl Value { /// Extract the inner JSON value from a JSON type. pub fn into_json_inner(self) -> Option { match self { - Value::Json(v) => Some(*v), + Value::Json(v) => Some((*v).into_value()), _ => None, } } @@ -501,7 +500,12 @@ impl Value { let struct_type = output_type.as_struct().unwrap(); struct_value.try_to_scalar_value(struct_type)? 
} - Value::Json(v) => v.try_to_scalar_value(output_type)?, + Value::Json(_) => { + return error::ToScalarValueSnafu { + reason: "unsupported for json value", + } + .fail(); + } }; Ok(scalar_value) @@ -554,13 +558,12 @@ impl Value { Value::IntervalDayTime(x) => Some(Value::IntervalDayTime(x.negative())), Value::IntervalMonthDayNano(x) => Some(Value::IntervalMonthDayNano(x.negative())), - Value::Json(v) => v.try_negative().map(|neg| Value::Json(Box::new(neg))), - Value::Binary(_) | Value::String(_) | Value::Boolean(_) | Value::List(_) - | Value::Struct(_) => None, + | Value::Struct(_) + | Value::Json(_) => None, } } } @@ -873,6 +876,12 @@ impl From<&[u8]> for Value { } } +impl From<()> for Value { + fn from(_: ()) -> Self { + Value::Null + } +} + impl TryFrom for serde_json::Value { type Error = serde_json::Error; @@ -923,7 +932,7 @@ impl TryFrom for serde_json::Value { .collect::>>()?; serde_json::Value::Object(map) } - Value::Json(v) => serde_json::Value::try_from(*v)?, + Value::Json(v) => (*v).into(), }; Ok(json_value) @@ -1257,7 +1266,7 @@ impl From> for Value { ValueRef::List(v) => v.to_value(), ValueRef::Decimal128(v) => Value::Decimal128(v), ValueRef::Struct(v) => v.to_value(), - ValueRef::Json(v) => Value::Json(Box::new(Value::from(*v))), + ValueRef::Json(v) => Value::Json(Box::new(JsonValue::from(*v))), } } } @@ -1301,7 +1310,7 @@ pub enum ValueRef<'a> { List(ListValueRef<'a>), Struct(StructValueRef<'a>), - Json(Box>), + Json(Box>), } macro_rules! impl_as_for_value_ref { @@ -1309,18 +1318,6 @@ macro_rules! impl_as_for_value_ref { match $value { ValueRef::Null => Ok(None), ValueRef::$Variant(v) => Ok(Some(v.clone())), - ValueRef::Json(v) => match v.as_ref() { - ValueRef::Null => Ok(None), - ValueRef::$Variant(v) => Ok(Some(v.clone())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value ref {:?} to {}", - other, - stringify!($Variant) - ), - } - .fail(), - }, other => error::CastTypeSnafu { msg: format!( "Failed to cast value ref {:?} to {}", @@ -1396,7 +1393,7 @@ impl<'a> ValueRef<'a> { match self { ValueRef::Null => Ok(None), ValueRef::Float32(f) => Ok(Some(f.0)), - ValueRef::Json(v) => v.try_into_f32(), + ValueRef::Json(v) => Ok(v.as_f32()), other => error::CastTypeSnafu { msg: format!("Failed to cast value ref {:?} to ValueRef::Float32", other,), } @@ -1408,7 +1405,7 @@ impl<'a> ValueRef<'a> { match self { ValueRef::Null => Ok(None), ValueRef::Float64(f) => Ok(Some(f.0)), - ValueRef::Json(v) => v.try_into_f64(), + ValueRef::Json(v) => Ok(v.as_f64()), other => error::CastTypeSnafu { msg: format!("Failed to cast value ref {:?} to ValueRef::Float64", other,), } @@ -1740,6 +1737,7 @@ pub(crate) mod tests { use num_traits::Float; use super::*; + use crate::json::value::{JsonVariant, JsonVariantRef}; use crate::types::StructField; use crate::vectors::ListVectorBuilder; @@ -2275,19 +2273,48 @@ pub(crate) mod tests { check_type_and_value( &ConcreteDataType::json_native_datatype(ConcreteDataType::boolean_datatype()), - &Value::Json(Box::new(Value::Boolean(true))), + &Value::Json(Box::new(true.into())), ); check_type_and_value( &ConcreteDataType::json_native_datatype(build_list_type()), - &Value::Json(Box::new(Value::List(build_list_value()))), + &Value::Json(Box::new([true].into())), ); check_type_and_value( &ConcreteDataType::json_native_datatype(ConcreteDataType::struct_datatype( - build_struct_type(), + StructType::new(Arc::new(vec![ + StructField::new( + "address".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + 
StructField::new("age".to_string(), ConcreteDataType::uint64_datatype(), true), + StructField::new( + "awards".to_string(), + ConcreteDataType::list_datatype(Arc::new( + ConcreteDataType::boolean_datatype(), + )), + true, + ), + StructField::new("id".to_string(), ConcreteDataType::int64_datatype(), true), + StructField::new( + "name".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + ])), + )), + &Value::Json(Box::new( + [ + ("id", JsonVariant::from(1i64)), + ("name", "Alice".into()), + ("age", 1u64.into()), + ("address", "blah".into()), + ("awards", [true, false].into()), + ] + .into(), )), - &Value::Json(Box::new(Value::Struct(build_struct_value()))), ); } @@ -2429,25 +2456,27 @@ pub(crate) mod tests { // string wrapped in json assert_eq!( - serde_json::Value::try_from(Value::Json(Box::new(Value::String("hello".into())))) - .unwrap(), + serde_json::Value::try_from(Value::Json(Box::new("hello".into()))).unwrap(), serde_json::json!("hello") ); // list wrapped in json assert_eq!( - serde_json::Value::try_from(Value::Json(Box::new(Value::List(ListValue::new( - vec![Value::Int32(1), Value::Int32(2), Value::Int32(3),], - Arc::new(ConcreteDataType::int32_datatype()) - ))))) - .unwrap(), + serde_json::Value::try_from(Value::Json(Box::new([1i64, 2, 3,].into()))).unwrap(), serde_json::json!([1, 2, 3]) ); // struct wrapped in json assert_eq!( - serde_json::Value::try_from(Value::Json(Box::new(Value::Struct(struct_value)))) - .unwrap(), + serde_json::Value::try_from(Value::Json(Box::new( + [ + ("num".to_string(), JsonVariant::from(42i64)), + ("name".to_string(), "tomcat".into()), + ("yes_or_no".to_string(), true.into()), + ] + .into() + ))) + .unwrap(), serde_json::json!({ "num": 42, "name": "tomcat", @@ -2459,7 +2488,7 @@ pub(crate) mod tests { #[test] fn test_null_value() { assert!(Value::Null.is_null()); - assert!(Value::Json(Box::new(Value::Null)).is_null()); + assert!(Value::Json(Box::new(JsonValue::null())).is_null()); assert!(!Value::Boolean(true).is_null()); assert!(Value::Null < Value::Boolean(false)); assert!(Value::Boolean(true) > Value::Null); @@ -2538,13 +2567,6 @@ pub(crate) mod tests { ValueRef::Struct(StructValueRef::Ref(&struct_value)), Value::Struct(struct_value.clone()).as_value_ref() ); - - assert_eq!( - ValueRef::Json(Box::new(ValueRef::Struct(StructValueRef::Ref( - &struct_value - )))), - Value::Json(Box::new(Value::Struct(struct_value.clone()))).as_value_ref() - ); } #[test] @@ -2669,8 +2691,18 @@ pub(crate) mod tests { ); assert_eq!( - Value::Json(Box::new(Value::Struct(build_struct_value()))).to_string(), - "Json({ id: 1, name: tom, age: 25, address: 94038, awards: Boolean[true, false] })" + Value::Json(Box::new( + [ + ("id", JsonVariant::from(1i64)), + ("name", "tom".into()), + ("age", 25u64.into()), + ("address", "94038".into()), + ("awards", [true, false].into()), + ] + .into() + )) + .to_string(), + "Json({ address: 94038, age: 25, awards: [true, false], id: 1, name: tom })" ) } @@ -3161,10 +3193,17 @@ pub(crate) mod tests { ); check_value_ref_size_eq( - &ValueRef::Json(Box::new(ValueRef::Struct(StructValueRef::Ref( - &build_struct_value(), - )))), - 31, + &ValueRef::Json(Box::new( + [ + ("id", JsonVariantRef::from(1i64)), + ("name", "tom".into()), + ("age", 25u64.into()), + ("address", "94038".into()), + ("awards", [true, false].into()), + ] + .into(), + )), + 48, ); } diff --git a/src/datatypes/src/vectors.rs b/src/datatypes/src/vectors.rs index e61b2ca35e..5355b35ff4 100644 --- a/src/datatypes/src/vectors.rs +++ b/src/datatypes/src/vectors.rs @@ -35,6 
+35,7 @@ mod duration; mod eq; mod helper; mod interval; +pub(crate) mod json; mod list; mod null; pub(crate) mod operations; diff --git a/src/datatypes/src/vectors/helper.rs b/src/datatypes/src/vectors/helper.rs index 024a01c6b1..1bc6951ce3 100644 --- a/src/datatypes/src/vectors/helper.rs +++ b/src/datatypes/src/vectors/helper.rs @@ -464,6 +464,14 @@ impl Helper { } } +#[cfg(test)] +pub(crate) fn pretty_print(vector: VectorRef) -> String { + let array = vector.to_arrow_array(); + arrow::util::pretty::pretty_format_columns(&vector.vector_type_name(), &[array]) + .map(|x| x.to_string()) + .unwrap_or_else(|e| e.to_string()) +} + #[cfg(test)] mod tests { use arrow::array::{ diff --git a/src/datatypes/src/vectors/json.rs b/src/datatypes/src/vectors/json.rs new file mode 100644 index 0000000000..83aa1dd2aa --- /dev/null +++ b/src/datatypes/src/vectors/json.rs @@ -0,0 +1,15 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) mod builder; diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs new file mode 100644 index 0000000000..3a32dda171 --- /dev/null +++ b/src/datatypes/src/vectors/json/builder.rs @@ -0,0 +1,484 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::collections::HashMap; +use std::sync::LazyLock; + +use crate::data_type::ConcreteDataType; +use crate::error::{Result, TryFromValueSnafu, UnsupportedOperationSnafu}; +use crate::json::value::JsonValueRef; +use crate::prelude::{ValueRef, Vector, VectorRef}; +use crate::types::json_type::JsonNativeType; +use crate::types::{JsonType, json_type}; +use crate::value::StructValueRef; +use crate::vectors::{MutableVector, StructVectorBuilder}; + +struct JsonStructsBuilder { + json_type: JsonType, + inner: StructVectorBuilder, +} + +impl JsonStructsBuilder { + fn new(json_type: JsonType, capacity: usize) -> Self { + let struct_type = json_type.as_struct_type(); + let inner = StructVectorBuilder::with_type_and_capacity(struct_type, capacity); + Self { json_type, inner } + } + + fn len(&self) -> usize { + self.inner.len() + } + + fn push(&mut self, json: &JsonValueRef) -> Result<()> { + let mut value = json.as_value_ref(); + if !json.is_object() { + let fields = json_type::plain_json_struct_type(value.data_type()); + value = ValueRef::Struct(StructValueRef::RefList { + val: vec![value], + fields, + }) + } + self.inner.try_push_value_ref(&value) + } + + /// Try to merge (and consume the data of) other json vector builder into this one. + /// Note that the other builder's json type must be able to be merged with this one's + /// (this one's json type has all the fields in other one's, and no datatypes conflict). + /// Normally this is guaranteed, as long as json values are pushed through [JsonVectorBuilder]. + fn try_merge(&mut self, other: &mut JsonStructsBuilder) -> Result<()> { + debug_assert!(self.json_type.is_mergeable(&other.json_type)); + + fn helper(this: &mut StructVectorBuilder, that: &mut StructVectorBuilder) -> Result<()> { + let that_len = that.len(); + if let Some(x) = that.mut_null_buffer().finish() { + this.mut_null_buffer().append_buffer(&x) + } else { + this.mut_null_buffer().append_n_non_nulls(that_len); + } + + let that_fields = that.struct_type().fields(); + let mut that_builders = that_fields + .iter() + .zip(that.mut_value_builders().iter_mut()) + .map(|(field, builder)| (field.name(), builder)) + .collect::>(); + + for (field, this_builder) in this + .struct_type() + .fields() + .iter() + .zip(this.mut_value_builders().iter_mut()) + { + if let Some(that_builder) = that_builders.get_mut(field.name()) { + if field.data_type().is_struct() { + let this = this_builder + .as_mut_any() + .downcast_mut::() + // Safety: a struct datatype field must be corresponding to a struct vector builder. + .unwrap(); + + let that = that_builder + .as_mut_any() + .downcast_mut::() + // Safety: other builder with same field name must have same datatype, + // ensured because the two json types are mergeable. + .unwrap(); + helper(this, that)?; + } else { + let vector = that_builder.to_vector(); + this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?; + } + } else { + this_builder.push_nulls(that_len); + } + } + Ok(()) + } + helper(&mut self.inner, &mut other.inner) + } + + /// Same as [JsonStructsBuilder::try_merge], but does not consume the other builder's data. 
+ fn try_merge_cloned(&mut self, other: &JsonStructsBuilder) -> Result<()> { + debug_assert!(self.json_type.is_mergeable(&other.json_type)); + + fn helper(this: &mut StructVectorBuilder, that: &StructVectorBuilder) -> Result<()> { + let that_len = that.len(); + if let Some(x) = that.null_buffer().finish_cloned() { + this.mut_null_buffer().append_buffer(&x) + } else { + this.mut_null_buffer().append_n_non_nulls(that_len); + } + + let that_fields = that.struct_type().fields(); + let that_builders = that_fields + .iter() + .zip(that.value_builders().iter()) + .map(|(field, builder)| (field.name(), builder)) + .collect::>(); + + for (field, this_builder) in this + .struct_type() + .fields() + .iter() + .zip(this.mut_value_builders().iter_mut()) + { + if let Some(that_builder) = that_builders.get(field.name()) { + if field.data_type().is_struct() { + let this = this_builder + .as_mut_any() + .downcast_mut::() + // Safety: a struct datatype field must be corresponding to a struct vector builder. + .unwrap(); + + let that = that_builder + .as_any() + .downcast_ref::() + // Safety: other builder with same field name must have same datatype, + // ensured because the two json types are mergeable. + .unwrap(); + helper(this, that)?; + } else { + let vector = that_builder.to_vector_cloned(); + this_builder.extend_slice_of(vector.as_ref(), 0, vector.len())?; + } + } else { + this_builder.push_nulls(that_len); + } + } + Ok(()) + } + helper(&mut self.inner, &other.inner) + } +} + +/// The vector builder for json type values. +/// +/// Json type are dynamic, to some degree (as long as they can be merged into each other). So are +/// json values. Json values are physically stored in struct vectors, which require the types of +/// struct values to be fixed inside a certain struct vector. So to resolve "dynamic" vs "fixed" +/// datatype problem, in this builder, each type of json value gets its own struct vector builder. +/// Once new json type value is pushing into this builder, it creates a new "child" builder for it. +/// +/// Given the "mixed" nature of the values stored in this builder, to produce the json vector, a +/// "merge" operation is performed. The "merge" is to iterate over all the "child" builders, and fill +/// nulls for missing json fields. The final vector's json type is fixed to be the "merge" of all +/// pushed json types. 
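Editor's note: as a plain-Rust illustration of the merge described in the doc comment above (not part of this patch, and deliberately independent of the crate's builder types): every pushed object contributes its field names to the merged type, and each row is widened with nulls for the fields it lacks. Field values are simplified to `i64` here purely for brevity.

// Conceptual sketch of the "merge" performed by the JSON vector builder:
// the merged type is the union of all field names, and missing fields become nulls.
use std::collections::BTreeMap;

fn merge_rows(
    rows: Vec<BTreeMap<&'static str, i64>>,
) -> (Vec<&'static str>, Vec<Vec<Option<i64>>>) {
    // The merged "json type" is the union of all field names seen so far.
    let mut merged_fields: Vec<&'static str> = Vec::new();
    for row in &rows {
        for &key in row.keys() {
            if !merged_fields.contains(&key) {
                merged_fields.push(key);
            }
        }
    }
    merged_fields.sort();

    // Each row is widened to the merged type; absent fields become None (null).
    let merged_rows = rows
        .iter()
        .map(|row| merged_fields.iter().map(|f| row.get(f).copied()).collect())
        .collect();
    (merged_fields, merged_rows)
}

fn main() {
    let a = BTreeMap::from([("int", 1), ("float", 9)]);
    let b = BTreeMap::from([("int", 2), ("list", 7)]);
    let (fields, rows) = merge_rows(vec![a, b]);
    assert_eq!(fields, vec!["float", "int", "list"]);
    assert_eq!(rows[0], vec![Some(9), Some(1), None]);
    assert_eq!(rows[1], vec![None, Some(2), Some(7)]);
    println!("{fields:?} {rows:?}");
}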
+pub(crate) struct JsonVectorBuilder { + merged_type: JsonType, + capacity: usize, + builders: Vec, +} + +impl JsonVectorBuilder { + pub(crate) fn new(json_type: JsonNativeType, capacity: usize) -> Self { + Self { + merged_type: JsonType::new_native(json_type), + capacity, + builders: vec![], + } + } + + fn try_create_new_builder(&mut self, json_type: &JsonType) -> Result<&mut JsonStructsBuilder> { + self.merged_type.merge(json_type)?; + + let builder = JsonStructsBuilder::new(json_type.clone(), self.capacity); + self.builders.push(builder); + + let len = self.builders.len(); + Ok(&mut self.builders[len - 1]) + } +} + +impl MutableVector for JsonVectorBuilder { + fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::Json(self.merged_type.clone()) + } + + fn len(&self) -> usize { + self.builders.iter().map(|x| x.len()).sum() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn to_vector(&mut self) -> VectorRef { + // Fast path: + if self.builders.len() == 1 { + return self.builders[0].inner.to_vector(); + } + + let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity); + for builder in self.builders.iter_mut() { + unified_jsons + .try_merge(builder) + // Safety: the "unified_jsons" has the merged json type from all the builders, + // so it should merge them without errors. + .unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}")); + } + unified_jsons.inner.to_vector() + } + + fn to_vector_cloned(&self) -> VectorRef { + // Fast path: + if self.builders.len() == 1 { + return self.builders[0].inner.to_vector_cloned(); + } + + let mut unified_jsons = JsonStructsBuilder::new(self.merged_type.clone(), self.capacity); + for builder in self.builders.iter() { + unified_jsons + .try_merge_cloned(builder) + // Safety: the "unified_jsons" has the merged json type from all the builders, + // so it should merge them without errors. + .unwrap_or_else(|e| panic!("failed to merge json builders, error: {e}")); + } + unified_jsons.inner.to_vector_cloned() + } + + fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> { + let ValueRef::Json(value) = value else { + return TryFromValueSnafu { + reason: format!("expected json value, got {value:?}"), + } + .fail(); + }; + let json_type = value.json_type(); + + let builder = match self.builders.last_mut() { + Some(last) => { + // TODO(LFC): use "is_include" and amend json value with nulls + if &last.json_type != json_type { + self.try_create_new_builder(json_type)? + } else { + last + } + } + None => self.try_create_new_builder(json_type)?, + }; + + builder.push(value.as_ref()) + } + + fn push_null(&mut self) { + static NULL_JSON: LazyLock = + LazyLock::new(|| ValueRef::Json(Box::new(JsonValueRef::null()))); + self.try_push_value_ref(&NULL_JSON) + // Safety: learning from the method "try_push_value_ref", a null json value should be + // always able to push into any json vectors. 
+ .unwrap_or_else(|e| panic!("failed to push null json value, error: {e}")); + } + + fn extend_slice_of(&mut self, _: &dyn Vector, _: usize, _: usize) -> Result<()> { + UnsupportedOperationSnafu { + op: "extend_slice_of", + vector_type: "JsonVector", + } + .fail() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::data_type::DataType; + use crate::json::JsonStructureSettings; + use crate::vectors::helper::pretty_print; + + fn push(json: &str, builder: &mut JsonVectorBuilder, expected: std::result::Result<(), &str>) { + let settings = JsonStructureSettings::Structured(None); + let json: serde_json::Value = serde_json::from_str(json).unwrap(); + let value = settings.encode(json).unwrap(); + + let value = value.as_value_ref(); + let result = builder + .try_push_value_ref(&value) + .map_err(|e| e.to_string()); + let expected = expected.map_err(|e| e.to_string()); + assert_eq!(result, expected); + } + + #[test] + fn test_push_plain_jsons() -> Result<()> { + let jsons = vec!["1", "2", r#""s""#, "[true]"]; + let results = vec![ + Ok(()), + Ok(()), + Err( + "Failed to merge JSON datatype: datatypes have conflict, this: Number(I64), that: String", + ), + Err( + "Failed to merge JSON datatype: datatypes have conflict, this: Number(I64), that: Array[Bool]", + ), + ]; + let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); + for (json, result) in jsons.into_iter().zip(results.into_iter()) { + push(json, &mut builder, result); + } + let vector = builder.to_vector(); + let expected = r#" ++---------------------+ +| StructVector | ++---------------------+ +| {__json_plain__: 1} | +| {__json_plain__: 2} | ++---------------------+"#; + assert_eq!(pretty_print(vector), expected.trim()); + Ok(()) + } + + #[test] + fn test_push_json_objects() -> Result<()> { + let jsons = vec![ + r#"{ + "s": "a", + "list": [1, 2, 3] + }"#, + r#"{ + "list": [4], + "s": "b" + }"#, + r#"{ + "s": "c", + "float": 0.9 + }"#, + r#"{ + "float": 0.8, + "s": "d" + }"#, + r#"{ + "float": 0.7, + "int": -1 + }"#, + r#"{ + "int": 0, + "float": 0.6 + }"#, + r#"{ + "int": 1, + "object": {"hello": "world", "timestamp": 1761523200000} + }"#, + r#"{ + "object": {"hello": "greptime", "timestamp": 1761523201000}, + "int": 2 + }"#, + r#"{ + "object": {"timestamp": 1761523202000}, + "nested": {"a": {"b": {"b": {"a": "abba"}}}} + }"#, + r#"{ + "nested": {"a": {"b": {"a": {"b": "abab"}}}}, + "object": {"timestamp": 1761523203000} + }"#, + ]; + let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); + for json in jsons { + push(json, &mut builder, Ok(())); + } + assert_eq!(builder.len(), 10); + + // test children builders: + assert_eq!(builder.builders.len(), 6); + let expect_types = [ + r#"Json"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, + ]; + let expect_vectors = [ + r#" ++-------------------------+ +| StructVector | ++-------------------------+ +| {list: [1, 2, 3], s: a} | +| {list: [4], s: b} | ++-------------------------+"#, + r#" ++--------------------+ +| StructVector | ++--------------------+ +| {float: 0.9, s: c} | +| {float: 0.8, s: d} | ++--------------------+"#, + r#" ++-----------------------+ +| StructVector | ++-----------------------+ +| {float: 0.7, int: -1} | +| {float: 0.6, int: 0} | ++-----------------------+"#, + r#" ++---------------------------------------------------------------+ +| StructVector | ++---------------------------------------------------------------+ +| {int: 1, object: {hello: world, timestamp: 1761523200000}} | +| {int: 2, object: {hello: greptime, 
timestamp: 1761523201000}} | ++---------------------------------------------------------------+"#, + r#" ++------------------------------------------------------------------------+ +| StructVector | ++------------------------------------------------------------------------+ +| {nested: {a: {b: {b: {a: abba}}}}, object: {timestamp: 1761523202000}} | ++------------------------------------------------------------------------+"#, + r#" ++------------------------------------------------------------------------+ +| StructVector | ++------------------------------------------------------------------------+ +| {nested: {a: {b: {a: {b: abab}}}}, object: {timestamp: 1761523203000}} | ++------------------------------------------------------------------------+"#, + ]; + for (builder, (expect_type, expect_vector)) in builder + .builders + .iter() + .zip(expect_types.into_iter().zip(expect_vectors.into_iter())) + { + assert_eq!(builder.json_type.name(), expect_type); + let vector = builder.inner.to_vector_cloned(); + assert_eq!(pretty_print(vector), expect_vector.trim()); + } + + // test final merged json type: + let expected = r#"Json"#; + assert_eq!(builder.data_type().to_string(), expected); + + // test final produced vector: + let expected = r#" ++-------------------------------------------------------------------------------------------------------------------+ +| StructVector | ++-------------------------------------------------------------------------------------------------------------------+ +| {float: , int: , list: [1, 2, 3], nested: , object: , s: a} | +| {float: , int: , list: [4], nested: , object: , s: b} | +| {float: 0.9, int: , list: , nested: , object: , s: c} | +| {float: 0.8, int: , list: , nested: , object: , s: d} | +| {float: 0.7, int: -1, list: , nested: , object: , s: } | +| {float: 0.6, int: 0, list: , nested: , object: , s: } | +| {float: , int: 1, list: , nested: , object: {hello: world, timestamp: 1761523200000}, s: } | +| {float: , int: 2, list: , nested: , object: {hello: greptime, timestamp: 1761523201000}, s: } | +| {float: , int: , list: , nested: {a: {b: {a: , b: {a: abba}}}}, object: {hello: , timestamp: 1761523202000}, s: } | +| {float: , int: , list: , nested: {a: {b: {a: {b: abab}, b: }}}, object: {hello: , timestamp: 1761523203000}, s: } | ++-------------------------------------------------------------------------------------------------------------------+"#; + let vector = builder.to_vector_cloned(); + assert_eq!(pretty_print(vector), expected.trim()); + let vector = builder.to_vector(); + assert_eq!(pretty_print(vector), expected.trim()); + Ok(()) + } +} diff --git a/src/datatypes/src/vectors/struct_vector.rs b/src/datatypes/src/vectors/struct_vector.rs index e4f0fe5b2a..44de9abf5e 100644 --- a/src/datatypes/src/vectors/struct_vector.rs +++ b/src/datatypes/src/vectors/struct_vector.rs @@ -323,6 +323,26 @@ impl StructVectorBuilder { } self.null_buffer.append_null(); } + + pub(crate) fn struct_type(&self) -> &StructType { + &self.fields + } + + pub(crate) fn value_builders(&self) -> &[Box] { + &self.value_builders + } + + pub(crate) fn mut_value_builders(&mut self) -> &mut [Box] { + &mut self.value_builders + } + + pub(crate) fn null_buffer(&self) -> &NullBufferBuilder { + &self.null_buffer + } + + pub(crate) fn mut_null_buffer(&mut self) -> &mut NullBufferBuilder { + &mut self.null_buffer + } } impl MutableVector for StructVectorBuilder { @@ -359,10 +379,8 @@ impl MutableVector for StructVectorBuilder { }, StructValueRef::Ref(val) => self.push_struct_value(val)?, 
StructValueRef::RefList { val, fields } => { - let struct_value = StructValue::try_new( - val.iter().map(|v| Value::from(v.clone())).collect(), - fields.clone(), - )?; + let struct_value = + StructValue::try_new(val.into_iter().map(Value::from).collect(), fields)?; self.push_struct_value(&struct_value)?; } } @@ -409,12 +427,17 @@ impl ScalarVectorBuilder for StructVectorBuilder { .value_builders .iter_mut() .map(|b| b.to_vector().to_arrow_array()) - .collect(); - let struct_array = StructArray::new( - self.fields.as_arrow_fields(), - arrays, - self.null_buffer.finish(), - ); + .collect::>(); + + let struct_array = if arrays.is_empty() { + StructArray::new_empty_fields(self.len(), self.null_buffer.finish()) + } else { + StructArray::new( + self.fields.as_arrow_fields(), + arrays, + self.null_buffer.finish(), + ) + }; StructVector::try_new(self.fields.clone(), struct_array).unwrap() } diff --git a/src/file-engine/src/engine.rs b/src/file-engine/src/engine.rs index d3ec72c1e2..231583beb6 100644 --- a/src/file-engine/src/engine.rs +++ b/src/file-engine/src/engine.rs @@ -27,8 +27,9 @@ use snafu::{OptionExt, ensure}; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, - SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState, - SinglePartitionScanner, SyncManifestResponse, + RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, + SetRegionRoleStateSuccess, SettableRegionRoleState, SinglePartitionScanner, + SyncManifestResponse, }; use store_api::region_request::{ AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest, @@ -150,6 +151,18 @@ impl RegionEngine for FileRegionEngine { Ok(SyncManifestResponse::NotSupported) } + async fn remap_manifests( + &self, + _request: RemapManifestsRequest, + ) -> Result { + Err(BoxedError::new( + UnsupportedSnafu { + operation: "remap_manifests", + } + .build(), + )) + } + fn role(&self, region_id: RegionId) -> Option { self.inner.state(region_id) } diff --git a/src/file-engine/src/error.rs b/src/file-engine/src/error.rs index 2447fe1fde..3179d0d0fd 100644 --- a/src/file-engine/src/error.rs +++ b/src/file-engine/src/error.rs @@ -151,13 +151,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to build stream adapter"))] - BuildStreamAdapter { - #[snafu(implicit)] - location: Location, - source: common_recordbatch::error::Error, - }, - #[snafu(display("Failed to parse file format"))] ParseFileFormat { #[snafu(implicit)] @@ -200,13 +193,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - - #[snafu(display("Missing default value for column: {}", column))] - MissingColumnNoDefault { - column: String, - #[snafu(implicit)] - location: Location, - }, } pub type Result = std::result::Result; @@ -222,13 +208,11 @@ impl ErrorExt for Error { | Unsupported { .. } | InvalidMetadata { .. } | ProjectionOutOfBounds { .. } - | CreateDefault { .. } - | MissingColumnNoDefault { .. } => StatusCode::InvalidArguments, + | CreateDefault { .. } => StatusCode::InvalidArguments, RegionNotFound { .. } => StatusCode::RegionNotFound, BuildBackend { source, .. } => source.status_code(), - BuildStreamAdapter { source, .. } => source.status_code(), ParseFileFormat { source, .. } => source.status_code(), CheckObject { .. 
} diff --git a/src/file-engine/src/manifest.rs b/src/file-engine/src/manifest.rs index 7e8aa7a732..ac2732fe69 100644 --- a/src/file-engine/src/manifest.rs +++ b/src/file-engine/src/manifest.rs @@ -94,7 +94,9 @@ impl FileRegionManifest { builder.push_column_metadata(column.clone()); } builder.primary_key(self.primary_key.clone()); - let metadata = builder.build().context(InvalidMetadataSnafu)?; + let metadata = builder + .build_without_validation() + .context(InvalidMetadataSnafu)?; Ok(Arc::new(metadata)) } @@ -127,3 +129,49 @@ impl FileRegionManifest { .context(MissingRequiredFieldSnafu { name }) } } + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + + use super::*; + + #[test] + fn metadata_allows_internal_column_name() { + let manifest = FileRegionManifest { + region_id: RegionId::new(1, 0), + column_metadatas: vec![ + ColumnMetadata { + column_schema: ColumnSchema::new( + "__primary_key", + ConcreteDataType::string_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }, + ], + primary_key: vec![1], + options: HashMap::default(), + }; + + let metadata = manifest.metadata().unwrap(); + assert!( + metadata + .column_metadatas + .iter() + .any(|c| c.column_schema.name == "__primary_key") + ); + } +} diff --git a/src/file-engine/src/query.rs b/src/file-engine/src/query.rs index b56777d43c..75d40c4608 100644 --- a/src/file-engine/src/query.rs +++ b/src/file-engine/src/query.rs @@ -20,23 +20,26 @@ use std::sync::Arc; use std::task::{Context, Poll}; use common_datasource::object_store::build_backend; -use common_error::ext::BoxedError; use common_recordbatch::adapter::RecordBatchMetrics; -use common_recordbatch::error::{CastVectorSnafu, ExternalSnafu, Result as RecordBatchResult}; -use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream}; +use common_recordbatch::error::{self as recordbatch_error, Result as RecordBatchResult}; +use common_recordbatch::{ + DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream, + SendableRecordBatchStream, +}; use datafusion::logical_expr::utils as df_logical_expr_utils; use datafusion_expr::expr::Expr; -use datatypes::prelude::ConcreteDataType; -use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use datatypes::vectors::VectorRef; +use datatypes::arrow::compute as arrow_compute; +use datatypes::data_type::DataType; +use datatypes::schema::{Schema, SchemaRef}; +use datatypes::vectors::Helper; use futures::Stream; -use snafu::{OptionExt, ResultExt, ensure}; +use snafu::{GenerateImplicitData, ResultExt, ensure}; use store_api::storage::ScanRequest; use self::file_stream::ScanPlanConfig; use crate::error::{ - BuildBackendSnafu, CreateDefaultSnafu, ExtractColumnFromFilterSnafu, - MissingColumnNoDefaultSnafu, ProjectSchemaSnafu, ProjectionOutOfBoundsSnafu, Result, + BuildBackendSnafu, ExtractColumnFromFilterSnafu, ProjectSchemaSnafu, + ProjectionOutOfBoundsSnafu, Result, }; use crate::region::FileRegion; @@ -48,6 +51,16 @@ impl FileRegion { let file_filters = self.filters_pushdown_to_file(&request.filters)?; let file_schema = Arc::new(Schema::new(self.file_options.file_column_schemas.clone())); + let projected_file_schema = if let Some(projection) = &file_projection { + Arc::new( + file_schema + 
.try_project(projection) + .context(ProjectSchemaSnafu)?, + ) + } else { + file_schema.clone() + }; + let file_stream = file_stream::create_stream( &self.format, &ScanPlanConfig { @@ -64,6 +77,7 @@ impl FileRegion { Ok(Box::pin(FileToScanRegionStream::new( scan_schema, + projected_file_schema, file_stream, ))) } @@ -144,7 +158,10 @@ impl FileRegion { struct FileToScanRegionStream { scan_schema: SchemaRef, - file_stream: SendableRecordBatchStream, + file_stream: DfSendableRecordBatchStream, + /// Maps columns in `scan_schema` to their index in the projected file schema. + /// `None` means the column doesn't exist in the file and should be filled with default values. + scan_to_file_projection: Vec>, } impl RecordBatchStream for FileToScanRegionStream { @@ -167,15 +184,49 @@ impl Stream for FileToScanRegionStream { fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.file_stream).poll_next(ctx) { Poll::Pending => Poll::Pending, - Poll::Ready(Some(file_record_batch)) => { - let file_record_batch = file_record_batch?; - let scan_record_batch = if self.schema_eq(&file_record_batch) { - Ok(file_record_batch) - } else { - self.convert_record_batch(&file_record_batch) - }; + Poll::Ready(Some(Ok(file_record_batch))) => { + let num_rows = file_record_batch.num_rows(); + let mut columns = Vec::with_capacity(self.scan_schema.num_columns()); - Poll::Ready(Some(scan_record_batch)) + for (idx, column_schema) in self.scan_schema.column_schemas().iter().enumerate() { + if let Some(file_idx) = self.scan_to_file_projection[idx] { + let expected_arrow_type = column_schema.data_type.as_arrow_type(); + let mut array = file_record_batch.column(file_idx).clone(); + + if array.data_type() != &expected_arrow_type { + array = arrow_compute::cast(array.as_ref(), &expected_arrow_type) + .context(recordbatch_error::ArrowComputeSnafu)?; + } + + let vector = Helper::try_into_vector(array) + .context(recordbatch_error::DataTypesSnafu)?; + columns.push(vector); + } else { + let vector = column_schema + .create_default_vector(num_rows) + .context(recordbatch_error::DataTypesSnafu)? 
+ .ok_or_else(|| { + recordbatch_error::CreateRecordBatchesSnafu { + reason: format!( + "column {} is missing from file source and has no default", + column_schema.name + ), + } + .build() + })?; + columns.push(vector); + } + } + + let record_batch = RecordBatch::new(self.scan_schema.clone(), columns)?; + + Poll::Ready(Some(Ok(record_batch))) + } + Poll::Ready(Some(Err(error))) => { + Poll::Ready(Some(Err(recordbatch_error::Error::PollStream { + error, + location: snafu::Location::generate(), + }))) } Poll::Ready(None) => Poll::Ready(None), } @@ -183,86 +234,21 @@ impl Stream for FileToScanRegionStream { } impl FileToScanRegionStream { - fn new(scan_schema: SchemaRef, file_stream: SendableRecordBatchStream) -> Self { + fn new( + scan_schema: SchemaRef, + file_schema: SchemaRef, + file_stream: DfSendableRecordBatchStream, + ) -> Self { + let scan_to_file_projection = scan_schema + .column_schemas() + .iter() + .map(|column| file_schema.column_index_by_name(&column.name)) + .collect(); + Self { scan_schema, file_stream, + scan_to_file_projection, } } - - fn schema_eq(&self, file_record_batch: &RecordBatch) -> bool { - self.scan_schema - .column_schemas() - .iter() - .all(|scan_column_schema| { - file_record_batch - .column_by_name(&scan_column_schema.name) - .map(|rb| rb.data_type() == scan_column_schema.data_type) - .unwrap_or_default() - }) - } - - /// Converts a RecordBatch from file schema to scan schema. - /// - /// This function performs the following operations: - /// - Projection: Only columns present in scan schema are retained. - /// - Cast Type: Columns present in both file schema and scan schema but with different types are cast to the type in scan schema. - /// - Backfill: Columns present in scan schema but not in file schema are backfilled with default values. - fn convert_record_batch( - &self, - file_record_batch: &RecordBatch, - ) -> RecordBatchResult { - let file_row_count = file_record_batch.num_rows(); - let columns = self - .scan_schema - .column_schemas() - .iter() - .map(|scan_column_schema| { - let file_column = file_record_batch.column_by_name(&scan_column_schema.name); - if let Some(file_column) = file_column { - Self::cast_column_type(file_column, &scan_column_schema.data_type) - } else { - Self::backfill_column(scan_column_schema, file_row_count) - } - }) - .collect::>>()?; - - RecordBatch::new(self.scan_schema.clone(), columns) - } - - fn cast_column_type( - source_column: &VectorRef, - target_data_type: &ConcreteDataType, - ) -> RecordBatchResult { - if &source_column.data_type() == target_data_type { - Ok(source_column.clone()) - } else { - source_column - .cast(target_data_type) - .context(CastVectorSnafu { - from_type: source_column.data_type(), - to_type: target_data_type.clone(), - }) - } - } - - fn backfill_column( - column_schema: &ColumnSchema, - num_rows: usize, - ) -> RecordBatchResult { - Self::create_default_vector(column_schema, num_rows) - .map_err(BoxedError::new) - .context(ExternalSnafu) - } - - fn create_default_vector(column_schema: &ColumnSchema, num_rows: usize) -> Result { - column_schema - .create_default_vector(num_rows) - .with_context(|_| CreateDefaultSnafu { - column: column_schema.name.clone(), - })? 
- .with_context(|| MissingColumnNoDefaultSnafu { - column: column_schema.name.clone(), - }) - } } diff --git a/src/file-engine/src/query/file_stream.rs b/src/file-engine/src/query/file_stream.rs index 1f26c25493..199bb5e0bd 100644 --- a/src/file-engine/src/query/file_stream.rs +++ b/src/file-engine/src/query/file_stream.rs @@ -17,8 +17,6 @@ use std::sync::Arc; use common_datasource::file_format::Format; use common_datasource::file_format::csv::CsvFormat; use common_datasource::file_format::parquet::DefaultParquetFileReaderFactory; -use common_recordbatch::SendableRecordBatchStream; -use common_recordbatch::adapter::RecordBatchStreamAdapter; use datafusion::common::ToDFSchema; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; @@ -28,8 +26,10 @@ use datafusion::datasource::physical_plan::{ use datafusion::datasource::source::DataSourceExec; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_expr::execution_props::ExecutionProps; -use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion::physical_plan::{ + ExecutionPlan, SendableRecordBatchStream as DfSendableRecordBatchStream, +}; use datafusion::prelude::SessionContext; use datafusion_expr::expr::Expr; use datafusion_expr::utils::conjunction; @@ -48,7 +48,7 @@ fn build_record_batch_stream( file_schema: Arc, limit: Option, file_source: Arc, -) -> Result { +) -> Result { let files = scan_plan_config .files .iter() @@ -77,15 +77,13 @@ fn build_record_batch_stream( &ExecutionPlanMetricsSet::new(), ) .context(error::BuildStreamSnafu)?; - let adapter = RecordBatchStreamAdapter::try_new(Box::pin(stream)) - .context(error::BuildStreamAdapterSnafu)?; - Ok(Box::pin(adapter)) + Ok(Box::pin(stream)) } fn new_csv_stream( config: &ScanPlanConfig, format: &CsvFormat, -) -> Result { +) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); // push down limit only if there is no filter @@ -98,7 +96,7 @@ fn new_csv_stream( build_record_batch_stream(config, file_schema, limit, csv_source) } -fn new_json_stream(config: &ScanPlanConfig) -> Result { +fn new_json_stream(config: &ScanPlanConfig) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); // push down limit only if there is no filter @@ -108,7 +106,9 @@ fn new_json_stream(config: &ScanPlanConfig) -> Result build_record_batch_stream(config, file_schema, limit, file_source) } -fn new_parquet_stream_with_exec_plan(config: &ScanPlanConfig) -> Result { +fn new_parquet_stream_with_exec_plan( + config: &ScanPlanConfig, +) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); let ScanPlanConfig { files, @@ -161,12 +161,10 @@ fn new_parquet_stream_with_exec_plan(config: &ScanPlanConfig) -> Result Result { +fn new_orc_stream(config: &ScanPlanConfig) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); // push down limit only if there is no filter @@ -189,7 +187,7 @@ pub struct ScanPlanConfig<'a> { pub fn create_stream( format: &Format, config: &ScanPlanConfig, -) -> Result { +) -> Result { match format { Format::Csv(format) => new_csv_stream(config, format), Format::Json(_) => new_json_stream(config), diff --git a/src/flow/src/adapter.rs b/src/flow/src/adapter.rs index 9721d49040..a8d2482faf 100644 --- a/src/flow/src/adapter.rs +++ b/src/flow/src/adapter.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use api::v1::{RowDeleteRequest, 
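Editor's note: the `scan_to_file_projection` mapping built by the `FileToScanRegionStream` changes above can be illustrated with a small standalone sketch (not part of this patch; column names are hypothetical): each scan column is resolved to its index in the projected file schema, and `None` marks a column that is absent from the file and must be backfilled from its default value.

// Illustration only: resolving scan-schema columns to file-schema positions.
fn scan_to_file_projection(scan_columns: &[&str], file_columns: &[&str]) -> Vec<Option<usize>> {
    scan_columns
        .iter()
        .map(|name| file_columns.iter().position(|c| c == name))
        .collect()
}

fn main() {
    let mapping = scan_to_file_projection(&["ts", "host", "value"], &["value", "ts"]);
    // "host" is not present in the file, so it gets `None` and is filled from the column default.
    assert_eq!(mapping, vec![Some(1), None, Some(0)]);
}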
RowDeleteRequests, RowInsertRequest, RowInsertRequests}; +use common_base::memory_limit::MemoryLimit; use common_config::Configurable; use common_error::ext::BoxedError; use common_meta::key::TableMetadataManagerRef; @@ -132,6 +133,7 @@ impl Default for FlownodeOptions { query: QueryOptions { parallelism: 1, allow_query_fallback: false, + memory_pool_size: MemoryLimit::default(), }, user_provider: None, memory: MemoryOptions::default(), diff --git a/src/flow/src/adapter/node_context.rs b/src/flow/src/adapter/node_context.rs index 2cfad8671e..bcddcbb891 100644 --- a/src/flow/src/adapter/node_context.rs +++ b/src/flow/src/adapter/node_context.rs @@ -199,7 +199,7 @@ impl SourceSender { /// send record batch pub async fn send_record_batch(&self, batch: RecordBatch) -> Result { let row_cnt = batch.num_rows(); - let batch = Batch::from(batch); + let batch = Batch::try_from(batch)?; self.send_buf_row_cnt.fetch_add(row_cnt, Ordering::SeqCst); diff --git a/src/flow/src/adapter/refill.rs b/src/flow/src/adapter/refill.rs index 89b7344c0c..6d66505e89 100644 --- a/src/flow/src/adapter/refill.rs +++ b/src/flow/src/adapter/refill.rs @@ -18,6 +18,7 @@ use std::collections::BTreeSet; use std::sync::Arc; use catalog::CatalogManagerRef; +use client::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_error::ext::BoxedError; use common_meta::key::flow::FlowMetadataManagerRef; use common_recordbatch::{RecordBatch, RecordBatches, SendableRecordBatchStream}; @@ -396,8 +397,8 @@ impl RefillTask { // we don't need information from query context in this query so a default query context is enough let query_ctx = Arc::new( QueryContextBuilder::default() - .current_catalog("greptime".to_string()) - .current_schema("public".to_string()) + .current_catalog(DEFAULT_CATALOG_NAME.to_string()) + .current_schema(DEFAULT_SCHEMA_NAME.to_string()) .build(), ); diff --git a/src/flow/src/batching_mode/frontend_client.rs b/src/flow/src/batching_mode/frontend_client.rs index cba8f896d5..d79c3033e3 100644 --- a/src/flow/src/batching_mode/frontend_client.rs +++ b/src/flow/src/batching_mode/frontend_client.rs @@ -23,7 +23,7 @@ use api::v1::query_request::Query; use api::v1::{CreateTableExpr, QueryRequest}; use client::{Client, Database}; use common_error::ext::{BoxedError, ErrorExt}; -use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; +use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_client_tls_config}; use common_meta::cluster::{NodeInfo, NodeInfoKey, Role}; use common_meta::peer::Peer; use common_meta::rpc::store::RangeRequest; @@ -123,12 +123,10 @@ impl FrontendClient { let cfg = ChannelConfig::new() .connect_timeout(batch_opts.grpc_conn_timeout) .timeout(batch_opts.query_timeout); - if let Some(tls) = &batch_opts.frontend_tls { - let cfg = cfg.client_tls_config(tls.clone()); - ChannelManager::with_tls_config(cfg).context(InvalidClientConfigSnafu)? 
- } else { - ChannelManager::with_config(cfg) - } + + let tls_config = load_client_tls_config(batch_opts.frontend_tls.clone()) + .context(InvalidClientConfigSnafu)?; + ChannelManager::with_config(cfg, tls_config) }, auth, query, diff --git a/src/flow/src/expr.rs b/src/flow/src/expr.rs index c17db3bf7e..5c0359e55f 100644 --- a/src/flow/src/expr.rs +++ b/src/flow/src/expr.rs @@ -25,6 +25,7 @@ mod signature; pub(crate) mod utils; use arrow::compute::FilterBuilder; +use common_recordbatch::RecordBatch; use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::value::Value; use datatypes::vectors::{BooleanVector, Helper, VectorRef}; @@ -38,6 +39,8 @@ pub(crate) use relation::{Accum, Accumulator, AggregateExpr, AggregateFunc}; pub(crate) use scalar::{ScalarExpr, TypedExpr}; use snafu::{ResultExt, ensure}; +use crate::Error; +use crate::error::DatatypesSnafu; use crate::expr::error::{ArrowSnafu, DataTypeSnafu}; use crate::repr::Diff; @@ -55,13 +58,19 @@ pub struct Batch { diffs: Option, } -impl From for Batch { - fn from(value: common_recordbatch::RecordBatch) -> Self { - Self { +impl TryFrom for Batch { + type Error = Error; + + fn try_from(value: RecordBatch) -> Result { + let columns = value.columns(); + let batch = Helper::try_into_vectors(columns).context(DatatypesSnafu { + extra: "failed to convert Arrow array to vector when building Flow batch", + })?; + Ok(Self { row_count: value.num_rows(), - batch: value.columns, + batch, diffs: None, - } + }) } } diff --git a/src/flow/src/heartbeat.rs b/src/flow/src/heartbeat.rs index cc42668f5a..89b37860c5 100644 --- a/src/flow/src/heartbeat.rs +++ b/src/flow/src/heartbeat.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use api::v1::meta::{HeartbeatRequest, Peer}; -use common_config::utils::ResourceSpec; use common_error::ext::BoxedError; use common_meta::heartbeat::handler::{ HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef, @@ -26,6 +25,7 @@ use common_meta::heartbeat::handler::{ use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage}; use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message; use common_meta::key::flow::flow_state::FlowStat; +use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, warn}; use greptime_proto::v1::meta::NodeInfo; use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient}; @@ -69,7 +69,7 @@ pub struct HeartbeatTask { resp_handler_executor: HeartbeatResponseHandlerExecutorRef, running: Arc, query_stat_size: Option, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, } impl HeartbeatTask { @@ -77,11 +77,13 @@ impl HeartbeatTask { self.query_stat_size = Some(query_stat_size); self } + pub fn new( opts: &FlownodeOptions, meta_client: Arc, heartbeat_opts: HeartbeatOptions, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, + resource_stat: ResourceStatRef, ) -> Self { Self { node_id: opts.node_id.unwrap_or(0), @@ -93,7 +95,7 @@ impl HeartbeatTask { resp_handler_executor, running: Arc::new(AtomicBool::new(false)), query_stat_size: None, - resource_spec: Default::default(), + resource_stat, } } @@ -146,6 +148,8 @@ impl HeartbeatTask { heartbeat_request: &HeartbeatRequest, message: Option, latest_report: &Option, + cpu_usage: i64, + memory_usage: i64, ) -> Option { let mailbox_message = match message.map(outgoing_message_to_mailbox_message) { Some(Ok(message)) => Some(message), @@ -170,21 +174,38 @@ impl HeartbeatTask { .collect(), }); - Some(HeartbeatRequest { 
+ let mut heartbeat_request = HeartbeatRequest { mailbox_message, flow_stat, ..heartbeat_request.clone() - }) + }; + + if let Some(info) = heartbeat_request.info.as_mut() { + info.cpu_usage_millicores = cpu_usage; + info.memory_usage_bytes = memory_usage; + } + + Some(heartbeat_request) } - fn build_node_info(start_time_ms: u64, cpus: u32, memory_bytes: u64) -> Option { + #[allow(deprecated)] + fn build_node_info( + start_time_ms: u64, + total_cpu_millicores: i64, + total_memory_bytes: i64, + ) -> Option { let build_info = common_version::build_info(); Some(NodeInfo { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms, - cpus, - memory_bytes, + total_cpu_millicores, + total_memory_bytes, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto. + cpus: total_cpu_millicores as u32, + memory_bytes: total_memory_bytes as u64, hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -203,9 +224,9 @@ impl HeartbeatTask { id: self.node_id, addr: self.peer_addr.clone(), }); - let cpus = self.resource_spec.cpus as u32; - let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes(); - + let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); + let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); + let resource_stat = self.resource_stat.clone(); let query_stat_size = self.query_stat_size.clone(); common_runtime::spawn_hb(async move { @@ -218,7 +239,7 @@ impl HeartbeatTask { let heartbeat_request = HeartbeatRequest { peer: self_peer, node_epoch, - info: Self::build_node_info(node_epoch, cpus, memory_bytes), + info: Self::build_node_info(node_epoch, total_cpu_millicores, total_memory_bytes), ..Default::default() }; @@ -226,7 +247,7 @@ impl HeartbeatTask { let req = tokio::select! 
{ message = outgoing_rx.recv() => { if let Some(message) = message { - Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report) + Self::new_heartbeat_request(&heartbeat_request, Some(message), &latest_report, 0, 0) } else { warn!("Sender has been dropped, exiting the heartbeat loop"); // Receives None that means Sender was dropped, we need to break the current loop @@ -234,7 +255,7 @@ impl HeartbeatTask { } } _ = interval.tick() => { - Self::new_heartbeat_request(&heartbeat_request, None, &latest_report) + Self::new_heartbeat_request(&heartbeat_request, None, &latest_report, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes()) } }; diff --git a/src/flow/src/repr.rs b/src/flow/src/repr.rs index 301431aff5..715f60594b 100644 --- a/src/flow/src/repr.rs +++ b/src/flow/src/repr.rs @@ -17,7 +17,7 @@ mod relation; -use api::helper::{pb_value_to_value_ref, value_to_grpc_value}; +use api::helper::{pb_value_to_value_ref, to_grpc_value}; use api::v1::Row as ProtoRow; use datatypes::data_type::ConcreteDataType; use datatypes::types::cast; @@ -201,11 +201,7 @@ impl From for Row { impl From for ProtoRow { fn from(row: Row) -> Self { - let values = row - .unpack() - .into_iter() - .map(value_to_grpc_value) - .collect_vec(); + let values = row.unpack().into_iter().map(to_grpc_value).collect_vec(); ProtoRow { values } } } diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index b90e4f5eb2..24d9c8c5ff 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -37,6 +37,7 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true common-version.workspace = true diff --git a/src/frontend/src/frontend.rs b/src/frontend/src/frontend.rs index bf2e7a0558..027f8a4254 100644 --- a/src/frontend/src/frontend.rs +++ b/src/frontend/src/frontend.rs @@ -23,7 +23,6 @@ use common_telemetry::logging::{LoggingOptions, SlowQueryOptions, TracingOptions use meta_client::MetaClientOptions; use query::options::QueryOptions; use serde::{Deserialize, Serialize}; -use servers::export_metrics::{ExportMetricsOption, ExportMetricsTask}; use servers::grpc::GrpcOptions; use servers::heartbeat_options::HeartbeatOptions; use servers::http::HttpOptions; @@ -34,7 +33,6 @@ use crate::error; use crate::error::Result; use crate::heartbeat::HeartbeatTask; use crate::instance::Instance; -use crate::instance::prom_store::ExportMetricHandler; use crate::service_config::{ InfluxdbOptions, JaegerOptions, MysqlOptions, OpentsdbOptions, OtlpOptions, PostgresOptions, PromStoreOptions, @@ -45,6 +43,7 @@ use crate::service_config::{ pub struct FrontendOptions { pub node_id: Option, pub default_timezone: Option, + pub default_column_prefix: Option, pub heartbeat: HeartbeatOptions, pub http: HttpOptions, pub grpc: GrpcOptions, @@ -62,7 +61,6 @@ pub struct FrontendOptions { pub logging: LoggingOptions, pub datanode: DatanodeClientOptions, pub user_provider: Option, - pub export_metrics: ExportMetricsOption, pub tracing: TracingOptions, pub query: QueryOptions, pub max_in_flight_write_bytes: Option, @@ -77,6 +75,7 @@ impl Default for FrontendOptions { Self { node_id: None, default_timezone: None, + default_column_prefix: None, heartbeat: HeartbeatOptions::frontend_default(), http: HttpOptions::default(), grpc: GrpcOptions::default(), @@ -92,7 +91,6 @@ impl Default for FrontendOptions { logging: LoggingOptions::default(), 
datanode: DatanodeClientOptions::default(), user_provider: None, - export_metrics: ExportMetricsOption::default(), tracing: TracingOptions::default(), query: QueryOptions::default(), max_in_flight_write_bytes: None, @@ -115,7 +113,6 @@ pub struct Frontend { pub instance: Arc, pub servers: ServerHandlers, pub heartbeat_task: Option, - pub export_metrics_task: Option, } impl Frontend { @@ -124,17 +121,6 @@ impl Frontend { t.start().await?; } - if let Some(t) = self.export_metrics_task.as_ref() { - if t.send_by_handler { - let inserter = self.instance.inserter().clone(); - let statement_executor = self.instance.statement_executor().clone(); - let handler = ExportMetricHandler::new_handler(inserter, statement_executor); - t.start(Some(handler)).context(error::StartServerSnafu)? - } else { - t.start(None).context(error::StartServerSnafu)?; - } - } - self.servers .start_all() .await diff --git a/src/frontend/src/heartbeat.rs b/src/frontend/src/heartbeat.rs index 76fdc3305b..9c3954b0c6 100644 --- a/src/frontend/src/heartbeat.rs +++ b/src/frontend/src/heartbeat.rs @@ -18,12 +18,12 @@ mod tests; use std::sync::Arc; use api::v1::meta::{HeartbeatRequest, NodeInfo, Peer}; -use common_config::utils::ResourceSpec; use common_meta::heartbeat::handler::{ HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef, }; use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage}; use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message; +use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, warn}; use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient}; use servers::addrs; @@ -47,7 +47,7 @@ pub struct HeartbeatTask { retry_interval: Duration, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, start_time_ms: u64, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, } impl HeartbeatTask { @@ -56,6 +56,7 @@ impl HeartbeatTask { meta_client: Arc, heartbeat_opts: HeartbeatOptions, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, + resource_stat: ResourceStatRef, ) -> Self { HeartbeatTask { // if internal grpc is configured, use its address as the peer address @@ -71,7 +72,7 @@ impl HeartbeatTask { retry_interval: heartbeat_opts.retry_interval, resp_handler_executor, start_time_ms: common_time::util::current_time_millis() as u64, - resource_spec: Default::default(), + resource_stat, } } @@ -103,6 +104,9 @@ impl HeartbeatTask { match resp_stream.message().await { Ok(Some(resp)) => { debug!("Receiving heartbeat response: {:?}", resp); + if let Some(message) = &resp.mailbox_message { + info!("Received mailbox message: {message:?}"); + } let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), resp); if let Err(e) = capture_self.handle_response(ctx).await { error!(e; "Error while handling heartbeat response"); @@ -133,6 +137,8 @@ impl HeartbeatTask { fn new_heartbeat_request( heartbeat_request: &HeartbeatRequest, message: Option, + cpu_usage: i64, + memory_usage: i64, ) -> Option { let mailbox_message = match message.map(outgoing_message_to_mailbox_message) { Some(Ok(message)) => Some(message), @@ -143,21 +149,38 @@ impl HeartbeatTask { None => None, }; - Some(HeartbeatRequest { + let mut heartbeat_request = HeartbeatRequest { mailbox_message, ..heartbeat_request.clone() - }) + }; + + if let Some(info) = heartbeat_request.info.as_mut() { + info.memory_usage_bytes = memory_usage; + info.cpu_usage_millicores = cpu_usage; + } + + Some(heartbeat_request) } - fn build_node_info(start_time_ms: u64, 
cpus: u32, memory_bytes: u64) -> Option { + #[allow(deprecated)] + fn build_node_info( + start_time_ms: u64, + total_cpu_millicores: i64, + total_memory_bytes: i64, + ) -> Option { let build_info = common_version::build_info(); Some(NodeInfo { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms, - cpus, - memory_bytes, + total_cpu_millicores, + total_memory_bytes, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto. + cpus: total_cpu_millicores as u32, + memory_bytes: total_memory_bytes as u64, hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -177,16 +200,20 @@ impl HeartbeatTask { id: 0, addr: self.peer_addr.clone(), }); - let cpus = self.resource_spec.cpus as u32; - let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes(); - + let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores(); + let total_memory_bytes = self.resource_stat.get_total_memory_bytes(); + let resource_stat = self.resource_stat.clone(); common_runtime::spawn_hb(async move { let sleep = tokio::time::sleep(Duration::from_millis(0)); tokio::pin!(sleep); let heartbeat_request = HeartbeatRequest { peer: self_peer, - info: Self::build_node_info(start_time_ms, cpus, memory_bytes), + info: Self::build_node_info( + start_time_ms, + total_cpu_millicores, + total_memory_bytes, + ), ..Default::default() }; @@ -194,7 +221,7 @@ impl HeartbeatTask { let req = tokio::select! { message = outgoing_rx.recv() => { if let Some(message) = message { - Self::new_heartbeat_request(&heartbeat_request, Some(message)) + Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0) } else { warn!("Sender has been dropped, exiting the heartbeat loop"); // Receives None that means Sender was dropped, we need to break the current loop @@ -202,8 +229,8 @@ impl HeartbeatTask { } } _ = &mut sleep => { - sleep.as_mut().reset(Instant::now() + report_interval); - Self::new_heartbeat_request(&heartbeat_request, None) + sleep.as_mut().reset(Instant::now() + report_interval); + Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes()) } }; diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs index 7a32e0adcb..ff42ba53f2 100644 --- a/src/frontend/src/instance/builder.rs +++ b/src/frontend/src/instance/builder.rs @@ -32,15 +32,18 @@ use operator::flow::FlowServiceOperator; use operator::insert::Inserter; use operator::procedure::ProcedureServiceOperator; use operator::request::Requester; -use operator::statement::{StatementExecutor, StatementExecutorRef}; +use operator::statement::{ + ExecutorConfigureContext, StatementExecutor, StatementExecutorConfiguratorRef, + StatementExecutorRef, +}; use operator::table::TableMutationOperator; use partition::manager::PartitionRuleManager; use pipeline::pipeline_operator::PipelineOperator; use query::QueryEngineFactory; use query::region_query::RegionQueryHandlerFactoryRef; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; -use crate::error::{self, Result}; +use crate::error::{self, ExternalSnafu, Result}; use crate::events::EventHandlerImpl; use crate::frontend::FrontendOptions; use crate::instance::Instance; @@ -187,10 +190,15 @@ impl FrontendBuilder { Some(process_manager.clone()), ); - #[cfg(feature = "enterprise")] let statement_executor = - if let Some(factory) = plugins.get::() { - 
statement_executor.with_trigger_querier(factory.create(kv_backend.clone())) + if let Some(configurator) = plugins.get::() { + let ctx = ExecutorConfigureContext { + kv_backend: kv_backend.clone(), + }; + configurator + .configure(statement_executor, ctx) + .await + .context(ExternalSnafu)? } else { statement_executor }; diff --git a/src/frontend/src/instance/jaeger.rs b/src/frontend/src/instance/jaeger.rs index 6208866db2..e7f9388538 100644 --- a/src/frontend/src/instance/jaeger.rs +++ b/src/frontend/src/instance/jaeger.rs @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use async_trait::async_trait; use catalog::CatalogManagerRef; -use common_catalog::consts::{TRACE_TABLE_NAME, trace_services_table_name}; +use common_catalog::consts::{ + TRACE_TABLE_NAME, trace_operations_table_name, trace_services_table_name, +}; use common_function::function::FunctionRef; use common_function::scalars::json::json_get::{ JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString, @@ -26,32 +28,36 @@ use common_function::scalars::udf::create_udf; use common_query::{Output, OutputData}; use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_recordbatch::util; +use common_telemetry::warn; use datafusion::dataframe::DataFrame; use datafusion::execution::SessionStateBuilder; use datafusion::execution::context::SessionContext; +use datafusion::functions_window::expr_fn::row_number; use datafusion_expr::select_expr::SelectExpr; -use datafusion_expr::{Expr, SortExpr, col, lit, lit_timestamp_nano, wildcard}; -use datatypes::value::ValueRef; +use datafusion_expr::{Expr, ExprFunctionExt, SortExpr, col, lit, lit_timestamp_nano, wildcard}; use query::QueryEngineRef; use serde_json::Value as JsonValue; use servers::error::{ CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, Result as ServerResult, TableNotFoundSnafu, }; -use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams}; +use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams, TraceUserAgent}; use servers::otlp::trace::{ - DURATION_NANO_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_KIND_COLUMN, - SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, TIMESTAMP_COLUMN, TRACE_ID_COLUMN, + DURATION_NANO_COLUMN, KEY_OTEL_STATUS_ERROR_KEY, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, + SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_ERROR, + TIMESTAMP_COLUMN, TRACE_ID_COLUMN, }; use servers::query_handler::JaegerQueryHandler; use session::context::QueryContextRef; use snafu::{OptionExt, ResultExt}; +use table::TableRef; use table::requests::{TABLE_DATA_MODEL, TABLE_DATA_MODEL_TRACE_V1}; use table::table::adapter::DfTableProviderAdapter; use crate::instance::Instance; const DEFAULT_LIMIT: usize = 2000; +const KEY_RN: &str = "greptime_rn"; #[async_trait] impl JaegerQueryHandler for Instance { @@ -76,8 +82,6 @@ impl JaegerQueryHandler for Instance { ctx: QueryContextRef, service_name: &str, span_kind: Option<&str>, - start_time: Option, - end_time: Option, ) -> ServerResult { let mut filters = vec![col(SERVICE_NAME_COLUMN).eq(lit(service_name))]; @@ -89,16 +93,6 @@ impl JaegerQueryHandler for Instance { )))); } - if let Some(start_time) = start_time { - // Microseconds to nanoseconds. 
- filters.push(col(TIMESTAMP_COLUMN).gt_eq(lit_timestamp_nano(start_time * 1_000))); - } - - if let Some(end_time) = end_time { - // Microseconds to nanoseconds. - filters.push(col(TIMESTAMP_COLUMN).lt_eq(lit_timestamp_nano(end_time * 1_000))); - } - // It's equivalent to the following SQL query: // // ``` @@ -107,8 +101,6 @@ impl JaegerQueryHandler for Instance { // {db}.{trace_table} // WHERE // service_name = '{service_name}' AND - // timestamp >= {start_time} AND - // timestamp <= {end_time} AND // span_kind = '{span_kind}' // ORDER BY // span_name ASC @@ -138,6 +130,7 @@ impl JaegerQueryHandler for Instance { trace_id: &str, start_time: Option, end_time: Option, + limit: Option, ) -> ServerResult { // It's equivalent to the following SQL query: // @@ -172,7 +165,7 @@ impl JaegerQueryHandler for Instance { selects, filters, vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. - Some(DEFAULT_LIMIT), + limit, None, vec![], ) @@ -269,18 +262,41 @@ impl JaegerQueryHandler for Instance { filters.push(col(TIMESTAMP_COLUMN).lt_eq(lit_timestamp_nano(end_time))); } - Ok(query_trace_table( - ctx, - self.catalog_manager(), - self.query_engine(), - vec![wildcard()], - filters, - vec![], - None, - None, - vec![], - ) - .await?) + match query_params.user_agent { + TraceUserAgent::Grafana => { + // grafana only use trace id and timestamp + // clicking the trace id will invoke the query trace api + // so we only need to return 1 span for each trace + let table_name = ctx + .extension(JAEGER_QUERY_TABLE_NAME_KEY) + .unwrap_or(TRACE_TABLE_NAME); + + let table = get_table(ctx.clone(), self.catalog_manager(), table_name).await?; + + Ok(find_traces_rank_3( + table, + self.query_engine(), + filters, + vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. + ) + .await?) + } + _ => { + // query all spans + Ok(query_trace_table( + ctx, + self.catalog_manager(), + self.query_engine(), + vec![wildcard()], + filters, + vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. + None, + None, + vec![], + ) + .await?) + } + } } } @@ -301,12 +317,18 @@ async fn query_trace_table( .unwrap_or(TRACE_TABLE_NAME); // If only select services, use the trace services table. + // If querying operations (distinct by span_name and span_kind), use the trace operations table. let table_name = { if match selects.as_slice() { [SelectExpr::Expression(x)] => x == &col(SERVICE_NAME_COLUMN), _ => false, } { &trace_services_table_name(trace_table_name) + } else if !distincts.is_empty() + && distincts.contains(&col(SPAN_NAME_COLUMN)) + && distincts.contains(&col(SPAN_KIND_COLUMN)) + { + &trace_operations_table_name(trace_table_name) } else { trace_table_name } @@ -328,6 +350,7 @@ async fn query_trace_table( })?; let is_data_model_v1 = table + .clone() .table_info() .meta .options @@ -336,6 +359,14 @@ async fn query_trace_table( .map(|s| s.as_str()) == Some(TABLE_DATA_MODEL_TRACE_V1); + // collect to set + let col_names = table + .table_info() + .meta + .field_column_names() + .map(|s| format!("\"{}\"", s)) + .collect::>(); + let df_context = create_df_context(query_engine)?; let dataframe = df_context @@ -348,7 +379,7 @@ async fn query_trace_table( let dataframe = filters .into_iter() .chain(tags.map_or(Ok(vec![]), |t| { - tags_filters(&dataframe, t, is_data_model_v1) + tags_filters(&dataframe, t, is_data_model_v1, &col_names) })?) 
.try_fold(dataframe, |df, expr| { df.filter(expr).context(DataFusionSnafu) @@ -387,6 +418,84 @@ async fn query_trace_table( Ok(output) } +async fn get_table( + ctx: QueryContextRef, + catalog_manager: &CatalogManagerRef, + table_name: &str, +) -> ServerResult { + catalog_manager + .table( + ctx.current_catalog(), + &ctx.current_schema(), + table_name, + Some(&ctx), + ) + .await + .context(CatalogSnafu)? + .with_context(|| TableNotFoundSnafu { + table: table_name, + catalog: ctx.current_catalog(), + schema: ctx.current_schema(), + }) +} + +async fn find_traces_rank_3( + table: TableRef, + query_engine: &QueryEngineRef, + filters: Vec, + sorts: Vec, +) -> ServerResult { + let df_context = create_df_context(query_engine)?; + + let dataframe = df_context + .read_table(Arc::new(DfTableProviderAdapter::new(table))) + .context(DataFusionSnafu)?; + + let dataframe = dataframe + .select(vec![wildcard()]) + .context(DataFusionSnafu)?; + + // Apply all filters. + let dataframe = filters.into_iter().try_fold(dataframe, |df, expr| { + df.filter(expr).context(DataFusionSnafu) + })?; + + // Apply the sorts if needed. + let dataframe = if !sorts.is_empty() { + dataframe.sort(sorts).context(DataFusionSnafu)? + } else { + dataframe + }; + + // create rank column, for each trace, get the earliest 3 spans + let trace_id_col = vec![col(TRACE_ID_COLUMN)]; + let timestamp_asc = vec![col(TIMESTAMP_COLUMN).sort(true, false)]; + + let dataframe = dataframe + .with_column( + KEY_RN, + row_number() + .partition_by(trace_id_col) + .order_by(timestamp_asc) + .build() + .context(DataFusionSnafu)?, + ) + .context(DataFusionSnafu)?; + + let dataframe = dataframe + .filter(col(KEY_RN).lt_eq(lit(3))) + .context(DataFusionSnafu)?; + + // Execute the query and collect the result. + let stream = dataframe.execute_stream().await.context(DataFusionSnafu)?; + + let output = Output::new_with_stream(Box::pin( + RecordBatchStreamAdapter::try_new(stream).context(CollectRecordbatchSnafu)?, + )); + + Ok(output) +} + // The current implementation registers UDFs during the planning stage, which makes it difficult // to utilize them through DataFrame APIs. To address this limitation, we create a new session // context and register the required UDFs, allowing them to be decoupled from the global context. @@ -478,23 +587,73 @@ fn json_tag_filters( Ok(filters) } -fn flatten_tag_filters(tags: HashMap) -> ServerResult> { +/// Helper function to check if span_key or resource_key exists in col_names and create an expression. +/// If neither exists, logs a warning and returns None. 
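For reference, the window-function plan that `find_traces_rank_3` builds above boils down to a "keep the three earliest spans per trace" query. A minimal sketch of the same shape through DataFusion SQL, assuming a hypothetical registered table named `trace_spans` with `trace_id` and `timestamp` columns (the real table name is resolved from the query context):

use datafusion::error::Result;
use datafusion::prelude::*;

// Rank rows per trace_id by ascending timestamp, then drop everything ranked above 3,
// mirroring the row_number() partition/order/filter steps assembled above.
async fn three_earliest_spans_per_trace(ctx: &SessionContext) -> Result<DataFrame> {
    ctx.sql(
        "SELECT * FROM ( \
            SELECT *, ROW_NUMBER() OVER (PARTITION BY trace_id ORDER BY timestamp ASC) AS greptime_rn \
            FROM trace_spans \
         ) AS ranked \
         WHERE greptime_rn <= 3",
    )
    .await
}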
+#[inline] +fn check_col_and_build_expr( + span_key: String, + resource_key: String, + key: &str, + col_names: &HashSet, + expr_builder: F, +) -> Option +where + F: FnOnce(String) -> Expr, +{ + if col_names.contains(&span_key) { + return Some(expr_builder(span_key)); + } + if col_names.contains(&resource_key) { + return Some(expr_builder(resource_key)); + } + warn!("tag key {} not found in table columns", key); + None +} + +fn flatten_tag_filters( + tags: HashMap, + col_names: &HashSet, +) -> ServerResult> { let filters = tags .into_iter() .filter_map(|(key, value)| { - let key = format!("\"span_attributes.{}\"", key); + if key == KEY_OTEL_STATUS_ERROR_KEY && value == JsonValue::Bool(true) { + return Some(col(SPAN_STATUS_CODE).eq(lit(SPAN_STATUS_ERROR))); + } + + // TODO(shuiyisong): add more precise mapping from key to col name + let span_key = format!("\"span_attributes.{}\"", key); + let resource_key = format!("\"resource_attributes.{}\"", key); match value { - JsonValue::String(value) => Some(col(key).eq(lit(value))), + JsonValue::String(value) => { + check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { + col(k).eq(lit(value)) + }) + } JsonValue::Number(value) => { if value.is_f64() { // safe to unwrap as checked previously - Some(col(key).eq(lit(value.as_f64().unwrap()))) + let value = value.as_f64().unwrap(); + check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { + col(k).eq(lit(value)) + }) } else { - Some(col(key).eq(lit(value.as_i64().unwrap()))) + let value = value.as_i64().unwrap(); + check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { + col(k).eq(lit(value)) + }) } } - JsonValue::Bool(value) => Some(col(key).eq(lit(value))), - JsonValue::Null => Some(col(key).is_null()), + JsonValue::Bool(value) => { + check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { + col(k).eq(lit(value)) + }) + } + JsonValue::Null => { + check_col_and_build_expr(span_key, resource_key, &key, col_names, |k| { + col(k).is_null() + }) + } // not supported at the moment JsonValue::Array(_value) => None, JsonValue::Object(_value) => None, @@ -508,9 +667,10 @@ fn tags_filters( dataframe: &DataFrame, tags: HashMap, is_data_model_v1: bool, + col_names: &HashSet, ) -> ServerResult> { if is_data_model_v1 { - flatten_tag_filters(tags) + flatten_tag_filters(tags, col_names) } else { json_tag_filters(dataframe, tags) } @@ -531,13 +691,10 @@ async fn trace_ids_from_output(output: Output) -> ServerResult> { { let mut trace_ids = vec![]; for recordbatch in recordbatches { - for col in recordbatch.columns().iter() { - for row_idx in 0..recordbatch.num_rows() { - if let ValueRef::String(value) = col.get_ref(row_idx) { - trace_ids.push(value.to_string()); - } - } - } + recordbatch + .iter_column_as_string(0) + .flatten() + .for_each(|x| trace_ids.push(x)); } return Ok(trace_ids); diff --git a/src/frontend/src/instance/promql.rs b/src/frontend/src/instance/promql.rs index 0d754167c7..4236474cf7 100644 --- a/src/frontend/src/instance/promql.rs +++ b/src/frontend/src/instance/promql.rs @@ -20,7 +20,6 @@ use common_catalog::consts::INFORMATION_SCHEMA_NAME; use common_catalog::format_full_table_name; use common_recordbatch::util; use common_telemetry::tracing; -use datatypes::prelude::Value; use promql_parser::label::{MatchOp, Matcher, Matchers}; use query::promql; use query::promql::planner::PromPlanner; @@ -90,15 +89,10 @@ impl Instance { for batch in batches { // Only one column the results, ensured by `prometheus::metric_name_matchers_to_plan`. 
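The `iter_column_as_string(0).flatten()` pattern above replaces the manual per-row value matching when a result set has a single string column. A minimal stand-in, using a plain `Vec<Option<String>>` in place of a record batch column since the iterator's exact signature is not shown here, illustrates why `flatten` alone is enough to skip nulls:

// flatten() drops None (null) entries and yields owned Strings, so no explicit
// match on the value variant is needed.
fn collect_strings(column: Vec<Option<String>>) -> Vec<String> {
    column.into_iter().flatten().collect()
}

fn main() {
    let column = vec![Some("trace-a".to_string()), None, Some("trace-b".to_string())];
    assert_eq!(
        collect_strings(column),
        vec!["trace-a".to_string(), "trace-b".to_string()]
    );
}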
- let names = batch.column(0); - - for i in 0..names.len() { - let Value::String(name) = names.get(i) else { - unreachable!(); - }; - - results.push(name.into_string()); - } + batch + .iter_column_as_string(0) + .flatten() + .for_each(|x| results.push(x)) } Ok(results) @@ -173,11 +167,10 @@ impl Instance { let mut results = Vec::with_capacity(batches.iter().map(|b| b.num_rows()).sum()); for batch in batches { // Only one column in results, ensured by `prometheus::label_values_matchers_to_plan`. - let names = batch.column(0); - - for i in 0..names.len() { - results.push(names.get(i).to_string()); - } + batch + .iter_column_as_string(0) + .flatten() + .for_each(|x| results.push(x)) } Ok(results) diff --git a/src/frontend/src/limiter.rs b/src/frontend/src/limiter.rs index e0e32e6b1b..1055267b2d 100644 --- a/src/frontend/src/limiter.rs +++ b/src/frontend/src/limiter.rs @@ -18,7 +18,8 @@ use api::v1::column::Values; use api::v1::greptime_request::Request; use api::v1::value::ValueData; use api::v1::{ - Decimal128, InsertRequests, IntervalMonthDayNano, RowInsertRequest, RowInsertRequests, + Decimal128, InsertRequests, IntervalMonthDayNano, JsonValue, RowInsertRequest, + RowInsertRequests, json_value, }; use pipeline::ContextReq; use snafu::ResultExt; @@ -229,12 +230,29 @@ impl Limiter { .unwrap_or(0) }) .sum(), - ValueData::JsonValue(inner) => inner - .as_ref() - .value_data - .as_ref() - .map(Self::size_of_value_data) - .unwrap_or(0), + ValueData::JsonValue(v) => { + fn calc(v: &JsonValue) -> usize { + let Some(value) = v.value.as_ref() else { + return 0; + }; + match value { + json_value::Value::Boolean(_) => size_of::(), + json_value::Value::Int(_) => size_of::(), + json_value::Value::Uint(_) => size_of::(), + json_value::Value::Float(_) => size_of::(), + json_value::Value::Str(s) => s.len(), + json_value::Value::Array(array) => array.items.iter().map(calc).sum(), + json_value::Value::Object(object) => object + .entries + .iter() + .flat_map(|entry| { + entry.value.as_ref().map(|v| entry.key.len() + calc(v)) + }) + .sum(), + } + } + calc(v) + } } } } diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs index 6c19109ab2..d70aa3dd49 100644 --- a/src/frontend/src/server.rs +++ b/src/frontend/src/server.rs @@ -36,7 +36,7 @@ use servers::postgres::PostgresServer; use servers::query_handler::grpc::ServerGrpcQueryHandlerAdapter; use servers::query_handler::sql::ServerSqlQueryHandlerAdapter; use servers::server::{Server, ServerHandlers}; -use servers::tls::{ReloadableTlsServerConfig, maybe_watch_tls_config}; +use servers::tls::{ReloadableTlsServerConfig, maybe_watch_server_tls_config}; use snafu::ResultExt; use crate::error::{self, Result, StartServerSnafu, TomlFormatSnafu}; @@ -258,7 +258,7 @@ where ); // will not watch if watch is disabled in tls option - maybe_watch_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; + maybe_watch_server_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; let mysql_server = MysqlServer::create_server( common_runtime::global_runtime(), @@ -287,7 +287,7 @@ where ReloadableTlsServerConfig::try_new(opts.tls.clone()).context(StartServerSnafu)?, ); - maybe_watch_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; + maybe_watch_server_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; let pg_server = Box::new(PostgresServer::new( ServerSqlQueryHandlerAdapter::arc(instance.clone()), diff --git a/src/meta-client/examples/meta_client.rs b/src/meta-client/examples/meta_client.rs index 
fb5125224c..175888f170 100644 --- a/src/meta-client/examples/meta_client.rs +++ b/src/meta-client/examples/meta_client.rs @@ -36,7 +36,7 @@ async fn run() { .timeout(Duration::from_secs(3)) .connect_timeout(Duration::from_secs(5)) .tcp_nodelay(true); - let channel_manager = ChannelManager::with_config(config); + let channel_manager = ChannelManager::with_config(config, None); let mut meta_client = MetaClientBuilder::datanode_default_options(id) .channel_manager(channel_manager) .build(); diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs index 2a66c1570a..d819251597 100644 --- a/src/meta-client/src/client.rs +++ b/src/meta-client/src/client.rs @@ -24,7 +24,9 @@ mod util; use std::fmt::Debug; use std::sync::Arc; -use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role}; +use api::v1::meta::{ + MetasrvNodeInfo, ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role, +}; pub use ask_leader::{AskLeader, LeaderProvider, LeaderProviderRef}; use cluster::Client as ClusterClient; pub use cluster::ClusterKvBackend; @@ -371,7 +373,8 @@ impl ClusterInfo for MetaClient { let mut nodes = if get_metasrv_nodes { let last_activity_ts = -1; // Metasrv does not provide this information. - let (leader, followers) = cluster_client.get_metasrv_peers().await?; + let (leader, followers): (Option, Vec) = + cluster_client.get_metasrv_peers().await?; followers .into_iter() .map(|node| { @@ -383,8 +386,10 @@ impl ClusterInfo for MetaClient { version: node_info.version, git_commit: node_info.git_commit, start_time_ms: node_info.start_time_ms, - cpus: node_info.cpus, - memory_bytes: node_info.memory_bytes, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, hostname: node_info.hostname, } } else { @@ -396,8 +401,10 @@ impl ClusterInfo for MetaClient { version: node.version, git_commit: node.git_commit, start_time_ms: node.start_time_ms, - cpus: node.cpus, - memory_bytes: node.memory_bytes, + total_cpu_millicores: node.cpus as i64, + total_memory_bytes: node.memory_bytes as i64, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "".to_string(), } } @@ -411,8 +418,10 @@ impl ClusterInfo for MetaClient { version: node_info.version, git_commit: node_info.git_commit, start_time_ms: node_info.start_time_ms, - cpus: node_info.cpus, - memory_bytes: node_info.memory_bytes, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, hostname: node_info.hostname, } } else { @@ -424,8 +433,10 @@ impl ClusterInfo for MetaClient { version: node.version, git_commit: node.git_commit, start_time_ms: node.start_time_ms, - cpus: node.cpus, - memory_bytes: node.memory_bytes, + total_cpu_millicores: node.cpus as i64, + total_memory_bytes: node.memory_bytes as i64, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "".to_string(), } } diff --git a/src/meta-client/src/lib.rs b/src/meta-client/src/lib.rs index 47384785e2..5b56b8e181 100644 --- a/src/meta-client/src/lib.rs +++ b/src/meta-client/src/lib.rs @@ -101,7 +101,7 @@ pub async fn create_meta_client( if let MetaClientType::Frontend = client_type { let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout); - builder = 
builder.ddl_channel_manager(ChannelManager::with_config(ddl_config)); + builder = builder.ddl_channel_manager(ChannelManager::with_config(ddl_config, None)); if let Some(plugins) = plugins { let region_follower = plugins.get::(); if let Some(region_follower) = region_follower { @@ -112,8 +112,8 @@ pub async fn create_meta_client( } builder = builder - .channel_manager(ChannelManager::with_config(base_config)) - .heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config)); + .channel_manager(ChannelManager::with_config(base_config, None)) + .heartbeat_channel_manager(ChannelManager::with_config(heartbeat_config, None)); let mut meta_client = builder.build(); diff --git a/src/meta-srv/Cargo.toml b/src/meta-srv/Cargo.toml index 90a4fdc17b..3ed3c5a834 100644 --- a/src/meta-srv/Cargo.toml +++ b/src/meta-srv/Cargo.toml @@ -39,6 +39,7 @@ common-meta.workspace = true common-options.workspace = true common-procedure.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true common-version.workspace = true @@ -61,7 +62,9 @@ hyper-util = { workspace = true, features = ["tokio"] } itertools.workspace = true lazy_static.workspace = true once_cell.workspace = true +ordered-float.workspace = true parking_lot.workspace = true +partition.workspace = true prometheus.workspace = true prost.workspace = true rand.workspace = true @@ -71,7 +74,10 @@ serde.workspace = true serde_json.workspace = true servers.workspace = true snafu.workspace = true -sqlx = { workspace = true, optional = true } +sqlx = { workspace = true, features = [ + "mysql", + "chrono", +], optional = true } store-api.workspace = true strum.workspace = true table.workspace = true diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index 20e5810a90..351853d2bf 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -29,8 +29,7 @@ use common_meta::kv_backend::memory::MemoryKvBackend; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; use common_telemetry::info; use either::Either; -use servers::configurator::ConfiguratorRef; -use servers::export_metrics::ExportMetricsTask; +use servers::configurator::GrpcRouterConfiguratorRef; use servers::http::{HttpServer, HttpServerBuilder}; use servers::metrics_handler::MetricsHandler; use servers::server::Server; @@ -45,6 +44,7 @@ use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef}; #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] use crate::election::CANDIDATE_LEASE_SECS; use crate::election::etcd::EtcdElection; +use crate::error::OtherSnafu; use crate::metasrv::builder::MetasrvBuilder; use crate::metasrv::{ BackendImpl, ElectionRef, Metasrv, MetasrvOptions, SelectTarget, SelectorRef, @@ -70,8 +70,6 @@ pub struct MetasrvInstance { plugins: Plugins, - export_metrics_task: Option, - /// gRPC serving state receiver. Only present if the gRPC server is started. 
serve_state: Arc>>>>, @@ -95,15 +93,12 @@ impl MetasrvInstance { // put metasrv into plugins for later use plugins.insert::>(metasrv.clone()); - let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) - .context(error::InitExportMetricsTaskSnafu)?; Ok(MetasrvInstance { metasrv, http_server: Either::Left(Some(builder)), opts, signal_sender: None, plugins, - export_metrics_task, serve_state: Default::default(), bind_addr: None, }) @@ -131,18 +126,21 @@ impl MetasrvInstance { self.metasrv.try_start().await?; - if let Some(t) = self.export_metrics_task.as_ref() { - t.start(None).context(error::InitExportMetricsTaskSnafu)? - } - let (tx, rx) = mpsc::channel::<()>(1); self.signal_sender = Some(tx); // Start gRPC server with admin services for backward compatibility let mut router = router(self.metasrv.clone()); - if let Some(configurator) = self.metasrv.plugins().get::() { - router = configurator.config_grpc(router); + if let Some(configurator) = self + .metasrv + .plugins() + .get::>() + { + router = configurator + .configure_grpc_router(router, ()) + .await + .context(OtherSnafu)?; } let (serve_state_tx, serve_state_rx) = oneshot::channel(); diff --git a/src/meta-srv/src/discovery.rs b/src/meta-srv/src/discovery.rs index 6151e7afbd..54532ec454 100644 --- a/src/meta-srv/src/discovery.rs +++ b/src/meta-srv/src/discovery.rs @@ -26,6 +26,7 @@ use common_meta::distributed_time_constants::{ use common_meta::error::Result; use common_meta::peer::{Peer, PeerDiscovery, PeerResolver}; use common_meta::{DatanodeId, FlownodeId}; +use common_time::util::DefaultSystemTimer; use snafu::ResultExt; use crate::cluster::MetaPeerClient; @@ -35,6 +36,7 @@ use crate::discovery::lease::{LeaseValueAccessor, LeaseValueType}; impl PeerDiscovery for MetaPeerClient { async fn active_frontends(&self) -> Result> { utils::alive_frontends( + &DefaultSystemTimer, self, Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS), ) @@ -47,20 +49,30 @@ impl PeerDiscovery for MetaPeerClient { &self, filter: Option fn(&'a NodeWorkloads) -> bool>, ) -> Result> { - utils::alive_datanodes(self, Duration::from_secs(DATANODE_LEASE_SECS), filter) - .await - .map_err(BoxedError::new) - .context(common_meta::error::ExternalSnafu) + utils::alive_datanodes( + &DefaultSystemTimer, + self, + Duration::from_secs(DATANODE_LEASE_SECS), + filter, + ) + .await + .map_err(BoxedError::new) + .context(common_meta::error::ExternalSnafu) } async fn active_flownodes( &self, filter: Option fn(&'a NodeWorkloads) -> bool>, ) -> Result> { - utils::alive_flownodes(self, Duration::from_secs(FLOWNODE_LEASE_SECS), filter) - .await - .map_err(BoxedError::new) - .context(common_meta::error::ExternalSnafu) + utils::alive_flownodes( + &DefaultSystemTimer, + self, + Duration::from_secs(FLOWNODE_LEASE_SECS), + filter, + ) + .await + .map_err(BoxedError::new) + .context(common_meta::error::ExternalSnafu) } } diff --git a/src/meta-srv/src/discovery/lease.rs b/src/meta-srv/src/discovery/lease.rs index 46b92c0f1a..7035e3bcad 100644 --- a/src/meta-srv/src/discovery/lease.rs +++ b/src/meta-srv/src/discovery/lease.rs @@ -95,20 +95,22 @@ impl LeaseValueAccessor for MetaPeerClient { #[cfg(test)] mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicI64, Ordering}; use std::time::Duration; - use api::v1::meta::DatanodeWorkloads; use api::v1::meta::heartbeat_request::NodeWorkloads; + use api::v1::meta::{DatanodeWorkloads, FlownodeWorkloads}; use common_meta::cluster::{FrontendStatus, NodeInfo, NodeInfoKey, NodeStatus, Role}; use 
common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS; use common_meta::kv_backend::ResettableKvBackendRef; use common_meta::peer::{Peer, PeerDiscovery}; use common_meta::rpc::store::PutRequest; - use common_time::util::current_time_millis; + use common_time::util::{DefaultSystemTimer, SystemTimer, current_time_millis}; use common_workload::DatanodeWorkloadType; use crate::discovery::utils::{self, accept_ingest_workload}; - use crate::key::{DatanodeLeaseKey, LeaseValue}; + use crate::key::{DatanodeLeaseKey, FlownodeLeaseKey, LeaseValue}; use crate::test_util::create_meta_peer_client; async fn put_lease_value( @@ -126,17 +128,47 @@ mod tests { .unwrap(); } + async fn put_flownode_lease_value( + kv_backend: &ResettableKvBackendRef, + key: FlownodeLeaseKey, + value: LeaseValue, + ) { + kv_backend + .put(PutRequest { + key: key.try_into().unwrap(), + value: value.try_into().unwrap(), + prev_kv: false, + }) + .await + .unwrap(); + } + + struct MockTimer { + current: Arc, + } + + impl SystemTimer for MockTimer { + fn current_time_millis(&self) -> i64 { + self.current.fetch_add(1, Ordering::Relaxed) + } + + fn current_time_rfc3339(&self) -> String { + unimplemented!() + } + } + #[tokio::test] async fn test_alive_datanodes() { let client = create_meta_peer_client(); let in_memory = client.memory_backend(); let lease_secs = 10; + let timer = DefaultSystemTimer; // put a stale lease value for node 1 let key = DatanodeLeaseKey { node_id: 1 }; let value = LeaseValue { // 20s ago - timestamp_millis: current_time_millis() - lease_secs * 2 * 1000, + timestamp_millis: timer.current_time_millis() - lease_secs * 2 * 1000, node_addr: "127.0.0.1:20201".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -147,7 +179,7 @@ mod tests { // put a fresh lease value for node 2 let key = DatanodeLeaseKey { node_id: 2 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20202".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -155,6 +187,37 @@ mod tests { }; put_lease_value(&in_memory, key.clone(), value.clone()).await; let peers = utils::alive_datanodes( + &timer, + client.as_ref(), + Duration::from_secs(lease_secs as u64), + None, + ) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]); + } + + #[tokio::test] + async fn test_alive_datanodes_with_timer() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = MockTimer { + current: Arc::new(AtomicI64::new(current_time_millis())), + }; + + let key = DatanodeLeaseKey { node_id: 2 }; + let value = LeaseValue { + timestamp_millis: timer.current_time_millis(), + node_addr: "127.0.0.1:20202".to_string(), + workloads: NodeWorkloads::Datanode(DatanodeWorkloads { + types: vec![DatanodeWorkloadType::Hybrid as i32], + }), + }; + put_lease_value(&in_memory, key.clone(), value.clone()).await; + let peers = utils::alive_datanodes( + &timer, client.as_ref(), Duration::from_secs(lease_secs as u64), None, @@ -170,12 +233,13 @@ mod tests { let client = create_meta_peer_client(); let in_memory = client.memory_backend(); let lease_secs = 10; + let timer = DefaultSystemTimer; // put a lease value for node 1 without mode info let key = DatanodeLeaseKey { node_id: 1 }; let value = LeaseValue { // 20s ago - 
timestamp_millis: current_time_millis() - 20 * 1000, + timestamp_millis: timer.current_time_millis() - 20 * 1000, node_addr: "127.0.0.1:20201".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -186,7 +250,7 @@ mod tests { // put a lease value for node 2 with mode info let key = DatanodeLeaseKey { node_id: 2 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20202".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -197,7 +261,7 @@ mod tests { // put a lease value for node 3 with mode info let key = DatanodeLeaseKey { node_id: 3 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20203".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![i32::MAX], @@ -208,7 +272,7 @@ mod tests { // put a lease value for node 3 with mode info let key = DatanodeLeaseKey { node_id: 4 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20204".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![i32::MAX], @@ -217,6 +281,7 @@ mod tests { put_lease_value(&in_memory, key, value).await; let peers = utils::alive_datanodes( + &timer, client.as_ref(), Duration::from_secs(lease_secs), Some(accept_ingest_workload), @@ -227,24 +292,92 @@ mod tests { assert!(peers.contains(&Peer::new(2, "127.0.0.1:20202".to_string()))); } + #[tokio::test] + async fn test_alive_flownodes() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = DefaultSystemTimer; + + // put a stale lease value for node 1 + let key = FlownodeLeaseKey { node_id: 1 }; + let value = LeaseValue { + // 20s ago + timestamp_millis: timer.current_time_millis() - lease_secs * 2 * 1000, + node_addr: "127.0.0.1:20201".to_string(), + workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }), + }; + put_flownode_lease_value(&in_memory, key, value).await; + + // put a fresh lease value for node 2 + let key = FlownodeLeaseKey { node_id: 2 }; + let value = LeaseValue { + timestamp_millis: timer.current_time_millis(), + node_addr: "127.0.0.1:20202".to_string(), + workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }), + }; + put_flownode_lease_value(&in_memory, key.clone(), value.clone()).await; + let peers = utils::alive_flownodes( + &timer, + client.as_ref(), + Duration::from_secs(lease_secs as u64), + None, + ) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]); + } + + #[tokio::test] + async fn test_alive_flownodes_with_timer() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = MockTimer { + current: Arc::new(AtomicI64::new(current_time_millis())), + }; + + let key = FlownodeLeaseKey { node_id: 2 }; + let value = LeaseValue { + timestamp_millis: timer.current_time_millis(), + node_addr: "127.0.0.1:20202".to_string(), + workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }), + }; + put_flownode_lease_value(&in_memory, key.clone(), value.clone()).await; + let peers = utils::alive_flownodes( + &timer, + client.as_ref(), + Duration::from_secs(lease_secs as 
u64), + None, + ) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]); + } + #[tokio::test] async fn test_lookup_frontends() { let client = create_meta_peer_client(); let in_memory = client.memory_backend(); let lease_secs = 10; + let timer = DefaultSystemTimer; let active_frontend_node = NodeInfo { peer: Peer { id: 0, addr: "127.0.0.1:20201".to_string(), }, - last_activity_ts: current_time_millis(), + last_activity_ts: timer.current_time_millis(), status: NodeStatus::Frontend(FrontendStatus {}), version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), start_time_ms: current_time_millis() as u64, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; @@ -264,13 +397,15 @@ mod tests { id: 1, addr: "127.0.0.1:20201".to_string(), }, - last_activity_ts: current_time_millis() - 20 * 1000, + last_activity_ts: timer.current_time_millis() - 20 * 1000, status: NodeStatus::Frontend(FrontendStatus {}), version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), start_time_ms: current_time_millis() as u64, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; @@ -283,9 +418,52 @@ mod tests { .await .unwrap(); - let peers = utils::alive_frontends(client.as_ref(), Duration::from_secs(lease_secs)) + let peers = + utils::alive_frontends(&timer, client.as_ref(), Duration::from_secs(lease_secs)) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers[0].id, 0); + } + + #[tokio::test] + async fn test_lookup_frontends_with_timer() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = MockTimer { + current: Arc::new(AtomicI64::new(current_time_millis())), + }; + + let active_frontend_node = NodeInfo { + peer: Peer { + id: 0, + addr: "127.0.0.1:20201".to_string(), + }, + last_activity_ts: timer.current_time_millis(), + status: NodeStatus::Frontend(FrontendStatus {}), + version: "1.0.0".to_string(), + git_commit: "1234567890".to_string(), + start_time_ms: current_time_millis() as u64, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + hostname: "test_hostname".to_string(), + }; + let key_prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend); + in_memory + .put(PutRequest { + key: format!("{}{}", key_prefix, "0").into(), + value: active_frontend_node.try_into().unwrap(), + prev_kv: false, + }) .await .unwrap(); + let peers = + utils::alive_frontends(&timer, client.as_ref(), Duration::from_secs(lease_secs)) + .await + .unwrap(); assert_eq!(peers.len(), 1); assert_eq!(peers[0].id, 0); } @@ -307,8 +485,10 @@ mod tests { version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), start_time_ms: last_activity_ts as u64, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; diff --git a/src/meta-srv/src/discovery/utils.rs b/src/meta-srv/src/discovery/utils.rs index 9a8ec7a82c..317033e0cf 100644 --- a/src/meta-srv/src/discovery/utils.rs +++ b/src/meta-srv/src/discovery/utils.rs @@ -19,7 +19,7 @@ use common_meta::DatanodeId; use common_meta::cluster::NodeInfo; use common_meta::kv_backend::KvBackendRef; use 
common_meta::peer::Peer; -use common_time::util::{DefaultSystemTimer, SystemTimer}; +use common_time::util::SystemTimer; use common_workload::DatanodeWorkloadType; use snafu::ResultExt; @@ -49,16 +49,9 @@ pub trait LastActiveTs { /// Builds a filter closure that checks whether a [`LastActiveTs`] item /// is still within the specified active duration, relative to the /// current time provided by the given [`SystemTimer`]. -/// -/// The returned closure uses the timestamp at the time of building, -/// so the "now" reference point is fixed when this function is called. -pub fn build_active_filter( - timer: impl SystemTimer, - active_duration: Duration, -) -> impl Fn(&T) -> bool { - let now = timer.current_time_millis(); - let active_duration = active_duration.as_millis() as u64; - move |item: &T| { +pub fn build_active_filter(active_duration: Duration) -> impl Fn(i64, &T) -> bool { + move |now: i64, item: &T| { + let active_duration = active_duration.as_millis() as u64; let elapsed = now.saturating_sub(item.last_active_ts()) as u64; elapsed < active_duration } @@ -66,18 +59,19 @@ pub fn build_active_filter( /// Returns the alive datanodes. pub async fn alive_datanodes( + timer: &impl SystemTimer, accessor: &impl LeaseValueAccessor, active_duration: Duration, condition: Option bool>, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); + let active_filter = build_active_filter(active_duration); let condition = condition.unwrap_or(|_| true); - Ok(accessor - .lease_values(LeaseValueType::Datanode) - .await? + let lease_values = accessor.lease_values(LeaseValueType::Datanode).await?; + let now = timer.current_time_millis(); + Ok(lease_values .into_iter() .filter_map(|(peer_id, lease_value)| { - if active_filter(&lease_value) && condition(&lease_value.workloads) { + if active_filter(now, &lease_value) && condition(&lease_value.workloads) { Some(Peer::new(peer_id, lease_value.node_addr)) } else { None @@ -88,18 +82,19 @@ pub async fn alive_datanodes( /// Returns the alive flownodes. pub async fn alive_flownodes( + timer: &impl SystemTimer, accessor: &impl LeaseValueAccessor, active_duration: Duration, condition: Option bool>, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); + let active_filter = build_active_filter(active_duration); let condition = condition.unwrap_or(|_| true); - Ok(accessor - .lease_values(LeaseValueType::Flownode) - .await? + let lease_values = accessor.lease_values(LeaseValueType::Flownode).await?; + let now = timer.current_time_millis(); + Ok(lease_values .into_iter() .filter_map(|(peer_id, lease_value)| { - if active_filter(&lease_value) && condition(&lease_value.workloads) { + if active_filter(now, &lease_value) && condition(&lease_value.workloads) { Some(Peer::new(peer_id, lease_value.node_addr)) } else { None @@ -110,16 +105,17 @@ pub async fn alive_flownodes( /// Returns the alive frontends. pub async fn alive_frontends( + timer: &impl SystemTimer, lister: &impl NodeInfoAccessor, active_duration: Duration, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); - Ok(lister - .node_infos(NodeInfoType::Frontend) - .await? 
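A self-contained sketch of the reworked filter semantics, with a simplified stand-in for the lease type: `now` is passed on every call instead of being captured when the filter is built, which is what lets the tests above drive it with a mock timer.

use std::time::Duration;

// Simplified stand-in for a lease value; only the last-active timestamp matters here.
struct Lease {
    timestamp_millis: i64,
}

// Mirrors the reworked `build_active_filter`: the caller supplies `now`.
fn build_active_filter(active_duration: Duration) -> impl Fn(i64, &Lease) -> bool {
    move |now, lease| {
        let active_ms = active_duration.as_millis() as u64;
        let elapsed = now.saturating_sub(lease.timestamp_millis) as u64;
        elapsed < active_ms
    }
}

fn main() {
    let filter = build_active_filter(Duration::from_secs(10));
    let now = 1_000_000;
    assert!(filter(now, &Lease { timestamp_millis: now - 5_000 })); // 5s old: alive
    assert!(!filter(now, &Lease { timestamp_millis: now - 20_000 })); // 20s old: stale
}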
+ let active_filter = build_active_filter(active_duration); + let node_infos = lister.node_infos(NodeInfoType::Frontend).await?; + let now = timer.current_time_millis(); + Ok(node_infos .into_iter() .filter_map(|(_, node_info)| { - if active_filter(&node_info) { + if active_filter(now, &node_info) { Some(node_info.peer) } else { None @@ -130,15 +126,18 @@ pub async fn alive_frontends( /// Returns the alive datanode peer. pub async fn alive_datanode( + timer: &impl SystemTimer, lister: &impl LeaseValueAccessor, peer_id: u64, active_duration: Duration, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); - let v = lister + let active_filter = build_active_filter(active_duration); + let lease_value = lister .lease_value(LeaseValueType::Datanode, peer_id) - .await? - .filter(|(_, lease)| active_filter(lease)) + .await?; + let now = timer.current_time_millis(); + let v = lease_value + .filter(|(_, lease)| active_filter(now, lease)) .map(|(peer_id, lease)| Peer::new(peer_id, lease.node_addr)); Ok(v) diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/meta-srv/src/election/rds/mysql.rs index a0890969f8..78832e3e11 100644 --- a/src/meta-srv/src/election/rds/mysql.rs +++ b/src/meta-srv/src/election/rds/mysql.rs @@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; -use common_telemetry::{error, warn}; +use common_telemetry::{error, info, warn}; use common_time::Timestamp; use snafu::{OptionExt, ResultExt, ensure}; use sqlx::mysql::{MySqlArguments, MySqlRow}; @@ -645,6 +645,13 @@ impl Election for MySqlElection { } async fn reset_campaign(&self) { + info!("Resetting campaign"); + if self.is_leader.load(Ordering::Relaxed) { + if let Err(err) = self.step_down_without_lock().await { + error!(err; "Failed to step down without lock"); + } + info!("Step down without lock successfully, due to reset campaign"); + } if let Err(err) = self.client.lock().await.reset_client().await { error!(err; "Failed to reset client"); } @@ -1161,8 +1168,10 @@ mod tests { version: "test_version".to_string(), git_commit: "test_git_commit".to_string(), start_time_ms: 0, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; mysql_election.register_candidate(&node_info).await.unwrap(); @@ -1642,6 +1651,41 @@ mod tests { drop_table(&leader_mysql_election.client, table_name).await; } + #[tokio::test] + async fn test_reset_campaign() { + maybe_skip_mysql_integration_test!(); + common_telemetry::init_default_ut_logging(); + let leader_value = "test_leader".to_string(); + let uuid = uuid::Uuid::new_v4().to_string(); + let table_name = "test_reset_campaign_greptime_metakv"; + let candidate_lease_ttl = Duration::from_secs(5); + let meta_lease_ttl = Duration::from_secs(2); + let execution_timeout = Duration::from_secs(10); + let idle_session_timeout = Duration::from_secs(0); + let client = create_mysql_client(Some(table_name), execution_timeout, idle_session_timeout) + .await + .unwrap(); + + let (tx, _) = broadcast::channel(100); + let leader_mysql_election = MySqlElection { + leader_value, + client, + is_leader: AtomicBool::new(false), + leader_infancy: AtomicBool::new(true), + leader_watcher: tx, + store_key_prefix: uuid, + candidate_lease_ttl, + meta_lease_ttl, + sql_set: ElectionSqlFactory::new(table_name).build(), + }; + leader_mysql_election + .is_leader + .store(true, 
Ordering::Relaxed); + leader_mysql_election.reset_campaign().await; + assert!(!leader_mysql_election.is_leader()); + drop_table(&leader_mysql_election.client, table_name).await; + } + #[tokio::test] async fn test_follower_action() { maybe_skip_mysql_integration_test!(); diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/meta-srv/src/election/rds/postgres.rs index 14b2bbb409..77bcd30dfe 100644 --- a/src/meta-srv/src/election/rds/postgres.rs +++ b/src/meta-srv/src/election/rds/postgres.rs @@ -17,7 +17,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; -use common_telemetry::{error, warn}; +use common_telemetry::{error, info, warn}; use common_time::Timestamp; use deadpool_postgres::{Manager, Pool}; use snafu::{OptionExt, ResultExt, ensure}; @@ -477,6 +477,13 @@ impl Election for PgElection { } async fn reset_campaign(&self) { + info!("Resetting campaign"); + if self.is_leader.load(Ordering::Relaxed) { + if let Err(err) = self.step_down_without_lock().await { + error!(err; "Failed to step down without lock"); + } + info!("Step down without lock successfully, due to reset campaign"); + } if let Err(err) = self.pg_client.write().await.reset_client().await { error!(err; "Failed to reset client"); } @@ -774,16 +781,12 @@ impl PgElection { key: key.clone(), ..Default::default() }; - if self - .is_leader - .compare_exchange(true, false, Ordering::AcqRel, Ordering::Acquire) - .is_ok() - && let Err(e) = self - .leader_watcher - .send(LeaderChangeMessage::StepDown(Arc::new(leader_key))) - { - error!(e; "Failed to send leader change message"); - } + send_leader_change_and_set_flags( + &self.is_leader, + &self.leader_infancy, + &self.leader_watcher, + LeaderChangeMessage::StepDown(Arc::new(leader_key)), + ); Ok(()) } @@ -1000,8 +1003,10 @@ mod tests { version: "test_version".to_string(), git_commit: "test_git_commit".to_string(), start_time_ms: 0, - cpus: 0, - memory_bytes: 0, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, hostname: "test_hostname".to_string(), }; pg_election.register_candidate(&node_info).await.unwrap(); @@ -1577,6 +1582,44 @@ mod tests { drop_table(&follower_pg_election, table_name).await; } + #[tokio::test] + async fn test_reset_campaign() { + maybe_skip_postgres_integration_test!(); + let leader_value = "test_leader".to_string(); + let uuid = uuid::Uuid::new_v4().to_string(); + let table_name = "test_reset_campaign_greptime_metakv"; + let candidate_lease_ttl = Duration::from_secs(5); + let execution_timeout = Duration::from_secs(10); + let statement_timeout = Duration::from_secs(10); + let meta_lease_ttl = Duration::from_secs(2); + let idle_session_timeout = Duration::from_secs(0); + let client = create_postgres_client( + Some(table_name), + execution_timeout, + idle_session_timeout, + statement_timeout, + ) + .await + .unwrap(); + + let (tx, _) = broadcast::channel(100); + let leader_pg_election = PgElection { + leader_value, + pg_client: RwLock::new(client), + is_leader: AtomicBool::new(false), + leader_infancy: AtomicBool::new(true), + leader_watcher: tx, + store_key_prefix: uuid, + candidate_lease_ttl, + meta_lease_ttl, + sql_set: ElectionSqlFactory::new(28321, None, table_name).build(), + }; + leader_pg_election.is_leader.store(true, Ordering::Relaxed); + leader_pg_election.reset_campaign().await; + assert!(!leader_pg_election.is_leader()); + drop_table(&leader_pg_election, table_name).await; + } + #[tokio::test] async fn 
test_idle_session_timeout() { maybe_skip_postgres_integration_test!(); diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index 2f4756c2ae..f00ccdeb3a 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -23,6 +23,7 @@ use store_api::storage::RegionId; use table::metadata::TableId; use tokio::sync::mpsc::error::SendError; use tonic::codegen::http; +use uuid::Uuid; use crate::metasrv::SelectTarget; use crate::pubsub::Message; @@ -304,13 +305,6 @@ pub enum Error { source: servers::error::Error, }, - #[snafu(display("Failed to init export metrics task"))] - InitExportMetricsTask { - #[snafu(implicit)] - location: Location, - source: servers::error::Error, - }, - #[snafu(display("Failed to parse address {}", addr))] ParseAddr { addr: String, @@ -989,13 +983,63 @@ pub enum Error { #[snafu(source)] source: common_meta::error::Error, }, + + #[snafu(display( + "Repartition group {} source region missing, region id: {}", + group_id, + region_id + ))] + RepartitionSourceRegionMissing { + group_id: Uuid, + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Repartition group {} target region missing, region id: {}", + group_id, + region_id + ))] + RepartitionTargetRegionMissing { + group_id: Uuid, + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to serialize partition expression: {}", source))] + SerializePartitionExpr { + #[snafu(source)] + source: partition::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Partition expression mismatch, region id: {}, expected: {}, actual: {}", + region_id, + expected, + actual + ))] + PartitionExprMismatch { + region_id: RegionId, + expected: String, + actual: String, + #[snafu(implicit)] + location: Location, + }, } impl Error { /// Returns `true` if the error is retryable. pub fn is_retryable(&self) -> bool { - matches!(self, Error::RetryLater { .. }) - || matches!(self, Error::RetryLaterWithSource { .. }) + matches!( + self, + Error::RetryLater { .. } + | Error::RetryLaterWithSource { .. } + | Error::MailboxTimeout { .. } + ) } } @@ -1044,6 +1088,7 @@ impl ErrorExt for Error { | Error::MailboxChannelClosed { .. } | Error::IsNotLeader { .. } => StatusCode::IllegalState, Error::RetryLaterWithSource { source, .. } => source.status_code(), + Error::SerializePartitionExpr { source, .. } => source.status_code(), Error::Unsupported { .. } => StatusCode::Unsupported, @@ -1061,12 +1106,14 @@ impl ErrorExt for Error { | Error::ParseAddr { .. } | Error::UnsupportedSelectorType { .. } | Error::InvalidArguments { .. } - | Error::InitExportMetricsTask { .. } | Error::ProcedureNotFound { .. } | Error::TooManyPartitions { .. } | Error::TomlFormat { .. } | Error::HandlerNotFound { .. } - | Error::LeaderPeerChanged { .. } => StatusCode::InvalidArguments, + | Error::LeaderPeerChanged { .. } + | Error::RepartitionSourceRegionMissing { .. } + | Error::RepartitionTargetRegionMissing { .. } + | Error::PartitionExprMismatch { .. } => StatusCode::InvalidArguments, Error::LeaseKeyFromUtf8 { .. } | Error::LeaseValueFromUtf8 { .. } | Error::InvalidRegionKeyFromUtf8 { .. 
} diff --git a/src/meta-srv/src/events/region_migration_event.rs b/src/meta-srv/src/events/region_migration_event.rs index 3fc8500599..7e5c5b6fc2 100644 --- a/src/meta-srv/src/events/region_migration_event.rs +++ b/src/meta-srv/src/events/region_migration_event.rs @@ -21,7 +21,7 @@ use common_event_recorder::Event; use common_event_recorder::error::{Result, SerializeEventSnafu}; use serde::Serialize; use snafu::ResultExt; -use store_api::storage::{RegionId, TableId}; +use store_api::storage::RegionId; use crate::procedure::region_migration::{PersistentContext, RegionMigrationTriggerReason}; @@ -37,35 +37,34 @@ pub const EVENTS_TABLE_DST_NODE_ID_COLUMN_NAME: &str = "region_migration_dst_nod pub const EVENTS_TABLE_DST_PEER_ADDR_COLUMN_NAME: &str = "region_migration_dst_peer_addr"; /// RegionMigrationEvent is the event of region migration. -#[derive(Debug, Serialize)] +#[derive(Debug)] pub(crate) struct RegionMigrationEvent { - #[serde(skip)] - region_id: RegionId, - #[serde(skip)] - table_id: TableId, - #[serde(skip)] - region_number: u32, - #[serde(skip)] + // The region ids of the region migration. + region_ids: Vec, + // The trigger reason of the region migration. trigger_reason: RegionMigrationTriggerReason, - #[serde(skip)] + // The source node id of the region migration. src_node_id: u64, - #[serde(skip)] + // The source peer address of the region migration. src_peer_addr: String, - #[serde(skip)] + // The destination node id of the region migration. dst_node_id: u64, - #[serde(skip)] + // The destination peer address of the region migration. dst_peer_addr: String, + // The timeout of the region migration. + timeout: Duration, +} - // The following fields will be serialized as the json payload. +#[derive(Debug, Serialize)] +struct Payload { + #[serde(with = "humantime_serde")] timeout: Duration, } impl RegionMigrationEvent { pub fn from_persistent_ctx(ctx: &PersistentContext) -> Self { Self { - region_id: ctx.region_id, - table_id: ctx.region_id.table_id(), - region_number: ctx.region_id.region_number(), + region_ids: ctx.region_ids.clone(), trigger_reason: ctx.trigger_reason, src_node_id: ctx.from_peer.id, src_peer_addr: ctx.from_peer.addr.clone(), @@ -134,23 +133,31 @@ impl Event for RegionMigrationEvent { ] } - fn extra_row(&self) -> Result { - Ok(Row { - values: vec![ - ValueData::U64Value(self.region_id.as_u64()).into(), - ValueData::U32Value(self.table_id).into(), - ValueData::U32Value(self.region_number).into(), - ValueData::StringValue(self.trigger_reason.to_string()).into(), - ValueData::U64Value(self.src_node_id).into(), - ValueData::StringValue(self.src_peer_addr.clone()).into(), - ValueData::U64Value(self.dst_node_id).into(), - ValueData::StringValue(self.dst_peer_addr.clone()).into(), - ], - }) + fn extra_rows(&self) -> Result> { + let mut extra_rows = Vec::with_capacity(self.region_ids.len()); + for region_id in &self.region_ids { + extra_rows.push(Row { + values: vec![ + ValueData::U64Value(region_id.as_u64()).into(), + ValueData::U32Value(region_id.table_id()).into(), + ValueData::U32Value(region_id.region_number()).into(), + ValueData::StringValue(self.trigger_reason.to_string()).into(), + ValueData::U64Value(self.src_node_id).into(), + ValueData::StringValue(self.src_peer_addr.clone()).into(), + ValueData::U64Value(self.dst_node_id).into(), + ValueData::StringValue(self.dst_peer_addr.clone()).into(), + ], + }); + } + + Ok(extra_rows) } fn json_payload(&self) -> Result { - serde_json::to_string(self).context(SerializeEventSnafu) + serde_json::to_string(&Payload { + 
timeout: self.timeout, + }) + .context(SerializeEventSnafu) } fn as_any(&self) -> &dyn Any { diff --git a/src/meta-srv/src/gc.rs b/src/meta-srv/src/gc.rs new file mode 100644 index 0000000000..3677e72a41 --- /dev/null +++ b/src/meta-srv/src/gc.rs @@ -0,0 +1,37 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// TODO(discord9): remove this once gc scheduler is fully merged +#![allow(unused)] + +use std::collections::{HashMap, HashSet}; + +use common_meta::peer::Peer; +use store_api::storage::RegionId; + +mod candidate; +mod ctx; +mod handler; +mod options; +mod procedure; +mod scheduler; +mod tracker; + +pub use options::GcSchedulerOptions; +pub use procedure::BatchGcProcedure; +pub(crate) use scheduler::{GcScheduler, GcTickerRef}; + +pub type Region2Peers = HashMap)>; + +pub(crate) type Peer2Regions = HashMap>; diff --git a/src/meta-srv/src/gc/candidate.rs b/src/meta-srv/src/gc/candidate.rs new file mode 100644 index 0000000000..7d9ac9558b --- /dev/null +++ b/src/meta-srv/src/gc/candidate.rs @@ -0,0 +1,134 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::time::Instant; + +use common_meta::datanode::{RegionManifestInfo, RegionStat}; +use common_telemetry::{debug, info}; +use ordered_float::OrderedFloat; +use store_api::region_engine::RegionRole; +use store_api::storage::RegionId; +use table::metadata::TableId; + +use crate::error::Result; +use crate::gc::scheduler::GcScheduler; + +/// Represents a region candidate for GC with its priority score. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct GcCandidate { + pub(crate) region_id: RegionId, + pub(crate) score: OrderedFloat, + pub(crate) region_stat: RegionStat, +} + +impl GcCandidate { + fn new(region_id: RegionId, score: f64, region_stat: RegionStat) -> Self { + Self { + region_id, + score: OrderedFloat(score), + region_stat, + } + } + + #[allow(unused)] + fn score_f64(&self) -> f64 { + self.score.into_inner() + } +} + +impl GcScheduler { + /// Calculate GC priority score for a region based on various metrics. + fn calculate_gc_score(&self, region_stat: &RegionStat) -> f64 { + let sst_count_score = region_stat.sst_num as f64 * self.config.sst_count_weight; + + let file_remove_cnt_score = match ®ion_stat.region_manifest { + RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } => *file_removed_cnt as f64 * self.config.file_removed_count_weight, + // Metric engine doesn't have file_removal_rate, also this should be unreachable since metrics engine doesn't support gc + RegionManifestInfo::Metric { .. } => 0.0, + }; + + sst_count_score + file_remove_cnt_score + } + + /// Filter and score regions that are candidates for GC, grouped by table. + pub(crate) async fn select_gc_candidates( + &self, + table_to_region_stats: &HashMap>, + ) -> Result>> { + let mut table_candidates: HashMap> = HashMap::new(); + let now = Instant::now(); + + for (table_id, region_stats) in table_to_region_stats { + let mut candidates = Vec::new(); + let tracker = self.region_gc_tracker.lock().await; + + for region_stat in region_stats { + if region_stat.role != RegionRole::Leader { + continue; + } + + // Skip regions that are too small + if region_stat.approximate_bytes < self.config.min_region_size_threshold { + continue; + } + + // Skip regions that are in cooldown period + if let Some(gc_info) = tracker.get(®ion_stat.id) + && now.duration_since(gc_info.last_gc_time) < self.config.gc_cooldown_period + { + debug!("Skipping region {} due to cooldown", region_stat.id); + continue; + } + + let score = self.calculate_gc_score(region_stat); + + debug!( + "Region {} (table {}) has GC score {:.4}", + region_stat.id, table_id, score + ); + + // Only consider regions with a meaningful score + if score > 0.0 { + candidates.push(GcCandidate::new(region_stat.id, score, region_stat.clone())); + } + } + + // Sort candidates by score in descending order and take top N + candidates.sort_by(|a, b| b.score.cmp(&a.score)); + let top_candidates: Vec = candidates + .into_iter() + .take(self.config.regions_per_table_threshold) + .collect(); + + if !top_candidates.is_empty() { + info!( + "Selected {} GC candidates for table {} (top {} out of all qualified)", + top_candidates.len(), + table_id, + self.config.regions_per_table_threshold + ); + table_candidates.insert(*table_id, top_candidates); + } + } + + info!( + "Selected GC candidates for {} tables", + table_candidates.len() + ); + Ok(table_candidates) + } +} diff --git a/src/meta-srv/src/gc/ctx.rs b/src/meta-srv/src/gc/ctx.rs new file mode 100644 index 0000000000..7b1cfc68e1 --- /dev/null +++ b/src/meta-srv/src/gc/ctx.rs @@ -0,0 +1,380 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
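The candidate score above is a plain weighted sum of the SST count and the removed-file count. A tiny sketch with made-up weights (the real values come from the GC scheduler configuration) shows how a busy region outranks a quiet one:

// Illustrative only: the weighted score computed by `calculate_gc_score` above.
// The weights below are hypothetical; the real ones are read from the scheduler options.
fn gc_score(sst_num: u64, file_removed_cnt: u64, sst_count_weight: f64, file_removed_count_weight: f64) -> f64 {
    sst_num as f64 * sst_count_weight + file_removed_cnt as f64 * file_removed_count_weight
}

fn main() {
    let busy = gc_score(120, 40, 1.0, 0.5); // many SSTs, many removed files
    let quiet = gc_score(8, 0, 1.0, 0.5);
    assert!(busy > quiet);
}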
+ +use std::collections::{HashMap, HashSet}; +use std::time::Duration; + +use api::v1::meta::MailboxMessage; +use common_meta::datanode::RegionStat; +use common_meta::instruction::{ + GcRegions, GetFileRefs, GetFileRefsReply, Instruction, InstructionReply, +}; +use common_meta::key::TableMetadataManagerRef; +use common_meta::key::table_route::PhysicalTableRouteValue; +use common_meta::peer::Peer; +use common_procedure::{ProcedureManagerRef, ProcedureWithId, watcher}; +use common_telemetry::{debug, error, warn}; +use snafu::{OptionExt as _, ResultExt as _}; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; +use table::metadata::TableId; + +use crate::cluster::MetaPeerClientRef; +use crate::error::{self, Result, TableMetadataManagerSnafu, UnexpectedSnafu}; +use crate::gc::Region2Peers; +use crate::gc::procedure::{BatchGcProcedure, GcRegionProcedure}; +use crate::handler::HeartbeatMailbox; +use crate::service::mailbox::{Channel, MailboxRef}; + +#[async_trait::async_trait] +pub(crate) trait SchedulerCtx: Send + Sync { + async fn get_table_to_region_stats(&self) -> Result>>; + + async fn get_table_route( + &self, + table_id: TableId, + ) -> Result<(TableId, PhysicalTableRouteValue)>; + + async fn get_file_references( + &self, + query_regions: &[RegionId], + related_regions: HashMap>, + region_routes: &Region2Peers, + timeout: Duration, + ) -> Result; + + async fn gc_regions( + &self, + peer: Peer, + region_ids: &[RegionId], + file_refs_manifest: &FileRefsManifest, + full_file_listing: bool, + timeout: Duration, + ) -> Result; +} + +pub(crate) struct DefaultGcSchedulerCtx { + /// The metadata manager. + pub(crate) table_metadata_manager: TableMetadataManagerRef, + /// Procedure manager. + pub(crate) procedure_manager: ProcedureManagerRef, + /// For getting `RegionStats`. + pub(crate) meta_peer_client: MetaPeerClientRef, + /// The mailbox to send messages. + pub(crate) mailbox: MailboxRef, + /// The server address. 
+ pub(crate) server_addr: String, +} + +impl DefaultGcSchedulerCtx { + pub fn try_new( + table_metadata_manager: TableMetadataManagerRef, + procedure_manager: ProcedureManagerRef, + meta_peer_client: MetaPeerClientRef, + mailbox: MailboxRef, + server_addr: String, + ) -> Result { + Ok(Self { + table_metadata_manager, + procedure_manager, + meta_peer_client, + mailbox, + server_addr, + }) + } +} + +#[async_trait::async_trait] +impl SchedulerCtx for DefaultGcSchedulerCtx { + async fn get_table_to_region_stats(&self) -> Result>> { + let dn_stats = self.meta_peer_client.get_all_dn_stat_kvs().await?; + let mut table_to_region_stats: HashMap> = HashMap::new(); + for (_dn_id, stats) in dn_stats { + let mut stats = stats.stats; + + let Some(latest_stat) = stats.iter().max_by_key(|s| s.timestamp_millis).cloned() else { + continue; + }; + + for region_stat in latest_stat.region_stats { + table_to_region_stats + .entry(region_stat.id.table_id()) + .or_default() + .push(region_stat); + } + } + Ok(table_to_region_stats) + } + + async fn get_table_route( + &self, + table_id: TableId, + ) -> Result<(TableId, PhysicalTableRouteValue)> { + self.table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await + .context(TableMetadataManagerSnafu) + } + + async fn gc_regions( + &self, + peer: Peer, + region_ids: &[RegionId], + file_refs_manifest: &FileRefsManifest, + full_file_listing: bool, + timeout: Duration, + ) -> Result { + self.gc_regions_inner( + peer, + region_ids, + file_refs_manifest, + full_file_listing, + timeout, + ) + .await + } + + async fn get_file_references( + &self, + query_regions: &[RegionId], + related_regions: HashMap>, + region_routes: &Region2Peers, + timeout: Duration, + ) -> Result { + debug!( + "Getting file references for {} regions", + query_regions.len() + ); + + // Group regions by datanode to minimize RPC calls + let mut datanode2query_regions: HashMap> = HashMap::new(); + + for region_id in query_regions { + if let Some((leader, followers)) = region_routes.get(region_id) { + datanode2query_regions + .entry(leader.clone()) + .or_default() + .push(*region_id); + // also need to send for follower regions for file refs in case query is running on follower + for follower in followers { + datanode2query_regions + .entry(follower.clone()) + .or_default() + .push(*region_id); + } + } else { + return error::UnexpectedSnafu { + violated: format!( + "region_routes: {region_routes:?} does not contain region_id: {region_id}", + ), + } + .fail(); + } + } + let mut datanode2related_regions: HashMap>> = + HashMap::new(); + for (related_region, queries) in related_regions { + if let Some((leader, followers)) = region_routes.get(&related_region) { + datanode2related_regions + .entry(leader.clone()) + .or_default() + .insert(related_region, queries.clone()); + } // since read from manifest, no need to send to followers + } + + // Send GetFileRefs instructions to each datanode + let mut all_file_refs: HashMap> = HashMap::new(); + let mut all_manifest_versions = HashMap::new(); + + for (peer, regions) in datanode2query_regions { + let related_regions = datanode2related_regions.remove(&peer).unwrap_or_default(); + match self + .send_get_file_refs_instruction(&peer, ®ions, related_regions, timeout) + .await + { + Ok(manifest) => { + // TODO(discord9): if other regions provide file refs for one region on other datanode, and no version, + // is it correct to merge manifest_version directly? + // FIXME: follower region how to merge version??? 
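// Editor's note on the merge below: file references reported by every replica
// are unioned per region, while the recorded manifest version for a region is
// the minimum seen across peers, so a peer holding an outdated manifest is
// detected rather than silently trusted.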
+ + for (region_id, file_refs) in manifest.file_refs { + all_file_refs + .entry(region_id) + .or_default() + .extend(file_refs); + } + // region manifest version should be the smallest one among all peers, so outdated region can be detected + for (region_id, version) in manifest.manifest_version { + let entry = all_manifest_versions.entry(region_id).or_insert(version); + *entry = (*entry).min(version); + } + } + Err(e) => { + warn!( + "Failed to get file refs from datanode {}: {}. Skipping regions on this datanode.", + peer, e + ); + // Continue processing other datanodes instead of failing the entire operation + continue; + } + } + } + + Ok(FileRefsManifest { + file_refs: all_file_refs, + manifest_version: all_manifest_versions, + }) + } +} + +impl DefaultGcSchedulerCtx { + async fn gc_regions_inner( + &self, + peer: Peer, + region_ids: &[RegionId], + file_refs_manifest: &FileRefsManifest, + full_file_listing: bool, + timeout: Duration, + ) -> Result { + debug!( + "Sending GC instruction to datanode {} for {} regions (full_file_listing: {})", + peer, + region_ids.len(), + full_file_listing + ); + + let gc_regions = GcRegions { + regions: region_ids.to_vec(), + file_refs_manifest: file_refs_manifest.clone(), + full_file_listing, + }; + let procedure = GcRegionProcedure::new( + self.mailbox.clone(), + self.server_addr.clone(), + peer, + gc_regions, + format!("GC for {} regions", region_ids.len()), + timeout, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + + let id = procedure_with_id.id; + + let mut watcher = self + .procedure_manager + .submit(procedure_with_id) + .await + .context(error::SubmitProcedureSnafu)?; + let res = watcher::wait(&mut watcher) + .await + .context(error::WaitProcedureSnafu)? + .with_context(|| error::UnexpectedSnafu { + violated: format!( + "GC procedure {id} successfully completed but no result returned" + ), + })?; + + let gc_report = GcRegionProcedure::cast_result(res)?; + + Ok(gc_report) + } + + /// TODO(discord9): add support to read manifest of related regions for file refs too + /// (now it's only reading active FileHandles) + async fn send_get_file_refs_instruction( + &self, + peer: &Peer, + query_regions: &[RegionId], + related_regions: HashMap>, + timeout: Duration, + ) -> Result { + debug!( + "Sending GetFileRefs instruction to datanode {} for {} regions", + peer, + query_regions.len() + ); + + let instruction = Instruction::GetFileRefs(GetFileRefs { + query_regions: query_regions.to_vec(), + related_regions, + }); + + let reply = self + .send_instruction(peer, instruction, "Get file references", timeout) + .await?; + + let InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest, + success, + error, + }) = reply + else { + return error::UnexpectedInstructionReplySnafu { + mailbox_message: format!("{:?}", reply), + reason: "Unexpected reply of the GetFileRefs instruction", + } + .fail(); + }; + + if !success { + return error::UnexpectedSnafu { + violated: format!( + "Failed to get file references from datanode {}: {:?}", + peer, error + ), + } + .fail(); + } + + Ok(file_refs_manifest) + } + + async fn send_instruction( + &self, + peer: &Peer, + instruction: Instruction, + description: &str, + timeout: Duration, + ) -> Result { + let msg = MailboxMessage::json_message( + &format!("{}: {}", description, instruction), + &format!("Metasrv@{}", self.server_addr), + &format!("Datanode-{}@{}", peer.id, peer.addr), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| 
error::SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let mailbox_rx = self + .mailbox + .send(&Channel::Datanode(peer.id), msg, timeout) + .await?; + + match mailbox_rx.await { + Ok(reply_msg) => { + let reply = HeartbeatMailbox::json_reply(&reply_msg)?; + Ok(reply) + } + Err(e) => { + error!( + "Failed to receive reply from datanode {} for {}: {}", + peer, description, e + ); + Err(e) + } + } + } +} diff --git a/src/meta-srv/src/gc/handler.rs b/src/meta-srv/src/gc/handler.rs new file mode 100644 index 0000000000..c5574f1adb --- /dev/null +++ b/src/meta-srv/src/gc/handler.rs @@ -0,0 +1,459 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::time::Instant; + +use common_meta::key::table_route::PhysicalTableRouteValue; +use common_meta::peer::Peer; +use common_telemetry::{debug, error, info, warn}; +use futures::StreamExt; +use itertools::Itertools; +use store_api::storage::{FileRefsManifest, GcReport, RegionId}; +use table::metadata::TableId; +use tokio::time::sleep; + +use crate::error::Result; +use crate::gc::candidate::GcCandidate; +use crate::gc::scheduler::{GcJobReport, GcScheduler}; +use crate::gc::tracker::RegionGcInfo; +use crate::region; + +pub(crate) type Region2Peers = HashMap)>; + +pub(crate) type Peer2Regions = HashMap>; + +impl GcScheduler { + /// Iterate through all region stats, find region that might need gc, and send gc instruction to + /// the corresponding datanode with improved parallel processing and retry logic. + pub(crate) async fn trigger_gc(&self) -> Result { + let start_time = Instant::now(); + info!("Starting GC cycle"); + + // Step 1: Get all region statistics + let table_to_region_stats = self.ctx.get_table_to_region_stats().await?; + info!( + "Fetched region stats for {} tables", + table_to_region_stats.len() + ); + + // Step 2: Select GC candidates based on our scoring algorithm + let per_table_candidates = self.select_gc_candidates(&table_to_region_stats).await?; + + if per_table_candidates.is_empty() { + info!("No GC candidates found, skipping GC cycle"); + return Ok(Default::default()); + } + + // Step 3: Aggregate candidates by datanode + let datanode_to_candidates = self + .aggregate_candidates_by_datanode(per_table_candidates) + .await?; + + if datanode_to_candidates.is_empty() { + info!("No valid datanode candidates found, skipping GC cycle"); + return Ok(Default::default()); + } + + // Step 4: Process datanodes concurrently with limited parallelism + let report = self + .parallel_process_datanodes(datanode_to_candidates) + .await; + + let duration = start_time.elapsed(); + info!( + "Finished GC cycle. Processed {} datanodes ({} failed). Duration: {:?}", + report.per_datanode_reports.len(), // Reuse field for datanode count + report.failed_datanodes.len(), + duration + ); + debug!("Detailed GC Job Report: {report:#?}"); + + Ok(report) + } + + /// Find related regions that might share files with the candidate regions. 
+ /// Currently returns the same regions since repartition is not implemented yet. + /// TODO(discord9): When repartition is implemented, this should also find src/dst regions + /// that might share files with the candidate regions. + pub(crate) async fn find_related_regions( + &self, + candidate_region_ids: &[RegionId], + ) -> Result>> { + Ok(candidate_region_ids.iter().map(|&r| (r, vec![r])).collect()) + } + + /// Aggregate GC candidates by their corresponding datanode peer. + pub(crate) async fn aggregate_candidates_by_datanode( + &self, + per_table_candidates: HashMap>, + ) -> Result>> { + let mut datanode_to_candidates: HashMap> = HashMap::new(); + + for (table_id, candidates) in per_table_candidates { + if candidates.is_empty() { + continue; + } + + // Get table route information to map regions to peers + let (phy_table_id, table_peer) = self.ctx.get_table_route(table_id).await?; + + if phy_table_id != table_id { + // Skip logical tables + continue; + } + + let region_to_peer = table_peer + .region_routes + .iter() + .filter_map(|r| { + r.leader_peer + .as_ref() + .map(|peer| (r.region.id, peer.clone())) + }) + .collect::>(); + + for candidate in candidates { + if let Some(peer) = region_to_peer.get(&candidate.region_id) { + datanode_to_candidates + .entry(peer.clone()) + .or_default() + .push((table_id, candidate)); + } else { + warn!( + "Skipping region {} for table {}: no leader peer found", + candidate.region_id, table_id + ); + } + } + } + + info!( + "Aggregated GC candidates for {} datanodes", + datanode_to_candidates.len() + ); + Ok(datanode_to_candidates) + } + + /// Process multiple datanodes concurrently with limited parallelism. + pub(crate) async fn parallel_process_datanodes( + &self, + datanode_to_candidates: HashMap>, + ) -> GcJobReport { + let mut report = GcJobReport::default(); + + // Create a stream of datanode GC tasks with limited concurrency + let results: Vec<_> = futures::stream::iter( + datanode_to_candidates + .into_iter() + .filter(|(_, candidates)| !candidates.is_empty()), + ) + .map(|(peer, candidates)| { + let scheduler = self; + let peer_clone = peer.clone(); + async move { + ( + peer, + scheduler.process_datanode_gc(peer_clone, candidates).await, + ) + } + }) + .buffer_unordered(self.config.max_concurrent_tables) // Reuse table concurrency limit for datanodes + .collect() + .await; + + // Process all datanode GC results and collect regions that need retry from table reports + for (peer, result) in results { + match result { + Ok(dn_report) => { + report.per_datanode_reports.insert(peer.id, dn_report); + } + Err(e) => { + error!("Failed to process datanode GC for peer {}: {:#?}", peer, e); + // Note: We don't have a direct way to map peer to table_id here, + // so we just log the error. The table_reports will contain individual region failures. + report.failed_datanodes.entry(peer.id).or_default().push(e); + } + } + } + + report + } + + /// Process GC for a single datanode with all its candidate regions. + /// Returns the table reports for this datanode. 
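// Editor's note: a self-contained sketch of the bounded fan-out pattern that
// `parallel_process_datanodes` above relies on (`futures::stream::iter` plus
// `buffer_unordered`): work for independent datanodes runs concurrently, but
// at most `max_concurrent` tasks are in flight at once. The peer ids, the
// async body, and the tokio runtime wrapper are illustrative assumptions and
// not part of the patch.
use futures::StreamExt;

#[tokio::main]
async fn main() {
    let peers: Vec<u64> = vec![1, 2, 3, 4, 5];
    let max_concurrent = 2;

    let results: Vec<_> = futures::stream::iter(peers)
        .map(|peer| async move {
            // Stand-in for `process_datanode_gc(peer, candidates)`.
            (peer, Ok::<(), String>(()))
        })
        .buffer_unordered(max_concurrent)
        .collect()
        .await;

    // Results arrive in completion order, not submission order.
    assert_eq!(results.len(), 5);
}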
+ pub(crate) async fn process_datanode_gc( + &self, + peer: Peer, + candidates: Vec<(TableId, GcCandidate)>, + ) -> Result { + info!( + "Starting GC for datanode {} with {} candidate regions", + peer, + candidates.len() + ); + + if candidates.is_empty() { + return Ok(Default::default()); + } + + let all_region_ids: Vec = candidates.iter().map(|(_, c)| c.region_id).collect(); + + let all_related_regions = self.find_related_regions(&all_region_ids).await?; + + let (region_to_peer, _) = self + .discover_datanodes_for_regions(&all_related_regions.keys().cloned().collect_vec()) + .await?; + + // Step 1: Get file references for all regions on this datanode + let file_refs_manifest = self + .ctx + .get_file_references( + &all_region_ids, + all_related_regions, + ®ion_to_peer, + self.config.mailbox_timeout, + ) + .await?; + + // Step 2: Create a single GcRegionProcedure for all regions on this datanode + let (gc_report, fully_listed_regions) = { + // Partition regions into full listing and fast listing in a single pass + + let mut batch_full_listing_decisions = + self.batch_should_use_full_listing(&all_region_ids).await; + + let need_full_list_regions = batch_full_listing_decisions + .iter() + .filter_map( + |(®ion_id, &need_full)| { + if need_full { Some(region_id) } else { None } + }, + ) + .collect_vec(); + let mut fast_list_regions = batch_full_listing_decisions + .iter() + .filter_map( + |(®ion_id, &need_full)| { + if !need_full { Some(region_id) } else { None } + }, + ) + .collect_vec(); + + let mut combined_report = GcReport::default(); + + // First process regions that can fast list + if !fast_list_regions.is_empty() { + match self + .ctx + .gc_regions( + peer.clone(), + &fast_list_regions, + &file_refs_manifest, + false, + self.config.mailbox_timeout, + ) + .await + { + Ok(report) => combined_report.merge(report), + Err(e) => { + error!( + "Failed to GC regions {:?} on datanode {}: {}", + fast_list_regions, peer, e + ); + + // Add to need_retry_regions since it failed + combined_report + .need_retry_regions + .extend(fast_list_regions.clone().into_iter()); + } + } + } + + if !need_full_list_regions.is_empty() { + match self + .ctx + .gc_regions( + peer.clone(), + &need_full_list_regions, + &file_refs_manifest, + true, + self.config.mailbox_timeout, + ) + .await + { + Ok(report) => combined_report.merge(report), + Err(e) => { + error!( + "Failed to GC regions {:?} on datanode {}: {}", + need_full_list_regions, peer, e + ); + + // Add to need_retry_regions since it failed + combined_report + .need_retry_regions + .extend(need_full_list_regions.clone()); + } + } + } + let fully_listed_regions = need_full_list_regions + .into_iter() + .filter(|r| !combined_report.need_retry_regions.contains(r)) + .collect::>(); + + (combined_report, fully_listed_regions) + }; + + // Step 3: Process the combined GC report and update table reports + for region_id in &all_region_ids { + self.update_full_listing_time(*region_id, fully_listed_regions.contains(region_id)) + .await; + } + + info!( + "Completed GC for datanode {}: {} regions processed", + peer, + all_region_ids.len() + ); + + Ok(gc_report) + } + + /// Discover datanodes for the given regions(and it's related regions) by fetching table routes in batches. + /// Returns mappings from region to peer(leader, Vec) and peer to regions. + async fn discover_datanodes_for_regions( + &self, + regions: &[RegionId], + ) -> Result<(Region2Peers, Peer2Regions)> { + let all_related_regions = self + .find_related_regions(regions) + .await? 
+ .into_iter() + .flat_map(|(k, mut v)| { + v.push(k); + v + }) + .collect_vec(); + let mut region_to_peer = HashMap::new(); + let mut peer_to_regions = HashMap::new(); + + // Group regions by table ID for batch processing + let mut table_to_regions: HashMap> = HashMap::new(); + for region_id in all_related_regions { + let table_id = region_id.table_id(); + table_to_regions + .entry(table_id) + .or_default() + .push(region_id); + } + + // Process each table's regions together for efficiency + for (table_id, table_regions) in table_to_regions { + match self.ctx.get_table_route(table_id).await { + Ok((_phy_table_id, table_route)) => { + self.get_table_regions_peer( + &table_route, + &table_regions, + &mut region_to_peer, + &mut peer_to_regions, + ); + } + Err(e) => { + // Continue with other tables instead of failing completely + // TODO(discord9): consider failing here instead + warn!( + "Failed to get table route for table {}: {}, skipping its regions", + table_id, e + ); + continue; + } + } + } + + Ok((region_to_peer, peer_to_regions)) + } + + /// Process regions for a single table to find their current leader peers. + fn get_table_regions_peer( + &self, + table_route: &PhysicalTableRouteValue, + table_regions: &[RegionId], + region_to_peer: &mut Region2Peers, + peer_to_regions: &mut Peer2Regions, + ) { + for ®ion_id in table_regions { + let mut found = false; + + // Find the region in the table route + for region_route in &table_route.region_routes { + if region_route.region.id == region_id + && let Some(leader_peer) = ®ion_route.leader_peer + { + region_to_peer.insert( + region_id, + (leader_peer.clone(), region_route.follower_peers.clone()), + ); + peer_to_regions + .entry(leader_peer.clone()) + .or_default() + .insert(region_id); + found = true; + break; + } + } + + if !found { + warn!( + "Failed to find region {} in table route or no leader peer found", + region_id, + ); + } + } + } + + async fn batch_should_use_full_listing( + &self, + region_ids: &[RegionId], + ) -> HashMap { + let mut result = HashMap::new(); + let mut gc_tracker = self.region_gc_tracker.lock().await; + let now = Instant::now(); + for ®ion_id in region_ids { + let use_full_listing = { + if let Some(gc_info) = gc_tracker.get(®ion_id) { + if let Some(last_full_listing) = gc_info.last_full_listing_time { + // check if pass cooling down interval after last full listing + let elapsed = now.duration_since(last_full_listing); + elapsed >= self.config.full_file_listing_interval + } else { + // Never did full listing for this region, do it now + true + } + } else { + // First time GC for this region, skip doing full listing, for this time + gc_tracker.insert( + region_id, + RegionGcInfo { + last_gc_time: now, + last_full_listing_time: Some(now), + }, + ); + false + } + }; + result.insert(region_id, use_full_listing); + } + result + } +} diff --git a/src/meta-srv/src/gc/options.rs b/src/meta-srv/src/gc/options.rs new file mode 100644 index 0000000000..02ed25323a --- /dev/null +++ b/src/meta-srv/src/gc/options.rs @@ -0,0 +1,171 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use crate::error::{self, Result}; + +/// The interval of the gc ticker. +#[allow(unused)] +pub(crate) const TICKER_INTERVAL: Duration = Duration::from_secs(60 * 5); + +/// Configuration for GC operations. +/// +/// TODO(discord9): not expose most config to users for now, until GC scheduler is fully stable. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(default)] +pub struct GcSchedulerOptions { + /// Whether GC is enabled. Default to false. + /// If set to false, no GC will be performed, and potentially some + /// files from datanodes will never be deleted. + pub enable: bool, + /// Maximum number of tables to process concurrently. + pub max_concurrent_tables: usize, + /// Maximum number of retries per region when GC fails. + pub max_retries_per_region: usize, + /// Concurrency for region GC within a table. + pub region_gc_concurrency: usize, + /// Backoff duration between retries. + pub retry_backoff_duration: Duration, + /// Minimum region size threshold for GC (in bytes). + pub min_region_size_threshold: u64, + /// Weight for SST file count in GC scoring. + pub sst_count_weight: f64, + /// Weight for file removal rate in GC scoring. + pub file_removed_count_weight: f64, + /// Cooldown period between GC operations on the same region. + pub gc_cooldown_period: Duration, + /// Maximum number of regions to select for GC per table. + pub regions_per_table_threshold: usize, + /// Timeout duration for mailbox communication with datanodes. + pub mailbox_timeout: Duration, + /// Interval for performing full file listing during GC to find orphan files. + /// Full file listing is expensive but necessary to clean up orphan files. + /// Set to a larger value (e.g., 24 hours) to balance performance and cleanup. + /// Every Nth GC cycle will use full file listing, where N = full_file_listing_interval / TICKER_INTERVAL. + pub full_file_listing_interval: Duration, + /// Interval for cleaning up stale region entries from the GC tracker. + /// This removes entries for regions that no longer exist (e.g., after table drops). + /// Set to a larger value (e.g., 6 hours) since this is just for memory cleanup. + pub tracker_cleanup_interval: Duration, +} + +impl Default for GcSchedulerOptions { + fn default() -> Self { + Self { + enable: false, + max_concurrent_tables: 10, + max_retries_per_region: 3, + retry_backoff_duration: Duration::from_secs(5), + region_gc_concurrency: 16, + min_region_size_threshold: 100 * 1024 * 1024, // 100MB + sst_count_weight: 1.0, + file_removed_count_weight: 0.5, + gc_cooldown_period: Duration::from_secs(60 * 5), // 5 minutes + regions_per_table_threshold: 20, // Select top 20 regions per table + mailbox_timeout: Duration::from_secs(60), // 60 seconds + // Perform full file listing every 24 hours to find orphan files + full_file_listing_interval: Duration::from_secs(60 * 60 * 24), + // Clean up stale tracker entries every 6 hours + tracker_cleanup_interval: Duration::from_secs(60 * 60 * 6), + } + } +} + +impl GcSchedulerOptions { + /// Validates the configuration options. 
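// Editor's note: a hypothetical usage sketch, not part of the patch. With the
// types defined above in this file, enabling GC while keeping the remaining
// defaults and validating the result would look like:
//
//     let opts = GcSchedulerOptions { enable: true, ..Default::default() };
//     opts.validate()?;
//     // `opts` corresponds to the new `gc` field on `MetasrvOptions` (see metasrv.rs below).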
+ pub fn validate(&self) -> Result<()> { + ensure!( + self.max_concurrent_tables > 0, + error::InvalidArgumentsSnafu { + err_msg: "max_concurrent_tables must be greater than 0", + } + ); + + ensure!( + self.max_retries_per_region > 0, + error::InvalidArgumentsSnafu { + err_msg: "max_retries_per_region must be greater than 0", + } + ); + + ensure!( + self.region_gc_concurrency > 0, + error::InvalidArgumentsSnafu { + err_msg: "region_gc_concurrency must be greater than 0", + } + ); + + ensure!( + !self.retry_backoff_duration.is_zero(), + error::InvalidArgumentsSnafu { + err_msg: "retry_backoff_duration must be greater than 0", + } + ); + + ensure!( + self.sst_count_weight >= 0.0, + error::InvalidArgumentsSnafu { + err_msg: "sst_count_weight must be non-negative", + } + ); + + ensure!( + self.file_removed_count_weight >= 0.0, + error::InvalidArgumentsSnafu { + err_msg: "file_removal_rate_weight must be non-negative", + } + ); + + ensure!( + !self.gc_cooldown_period.is_zero(), + error::InvalidArgumentsSnafu { + err_msg: "gc_cooldown_period must be greater than 0", + } + ); + + ensure!( + self.regions_per_table_threshold > 0, + error::InvalidArgumentsSnafu { + err_msg: "regions_per_table_threshold must be greater than 0", + } + ); + + ensure!( + !self.mailbox_timeout.is_zero(), + error::InvalidArgumentsSnafu { + err_msg: "mailbox_timeout must be greater than 0", + } + ); + + ensure!( + !self.full_file_listing_interval.is_zero(), + error::InvalidArgumentsSnafu { + err_msg: "full_file_listing_interval must be greater than 0", + } + ); + + ensure!( + !self.tracker_cleanup_interval.is_zero(), + error::InvalidArgumentsSnafu { + err_msg: "tracker_cleanup_interval must be greater than 0", + } + ); + + Ok(()) + } +} diff --git a/src/meta-srv/src/gc/procedure.rs b/src/meta-srv/src/gc/procedure.rs new file mode 100644 index 0000000000..4ddd606630 --- /dev/null +++ b/src/meta-srv/src/gc/procedure.rs @@ -0,0 +1,544 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; + +use api::v1::meta::MailboxMessage; +use common_meta::instruction::{self, GcRegions, GetFileRefs, GetFileRefsReply, InstructionReply}; +use common_meta::lock_key::RegionLock; +use common_meta::peer::Peer; +use common_procedure::error::ToJsonSnafu; +use common_procedure::{ + Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, + Result as ProcedureResult, Status, +}; +use common_telemetry::{debug, error, info, warn}; +use itertools::Itertools as _; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt as _; +use store_api::storage::{FileRefsManifest, GcReport, RegionId}; + +use crate::error::{self, Result, SerializeToJsonSnafu}; +use crate::gc::Region2Peers; +use crate::handler::HeartbeatMailbox; +use crate::service::mailbox::{Channel, MailboxRef}; + +/// Helper function to send GetFileRefs instruction and wait for reply. 
+async fn send_get_file_refs( + mailbox: &MailboxRef, + server_addr: &str, + peer: &Peer, + instruction: GetFileRefs, + timeout: Duration, +) -> Result { + let instruction = instruction::Instruction::GetFileRefs(instruction); + let msg = MailboxMessage::json_message( + &format!("Get file references: {}", instruction), + &format!("Metasrv@{}", server_addr), + &format!("Datanode-{}@{}", peer.id, peer.addr), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let mailbox_rx = mailbox + .send(&Channel::Datanode(peer.id), msg, timeout) + .await?; + + let reply = match mailbox_rx.await { + Ok(reply_msg) => HeartbeatMailbox::json_reply(&reply_msg)?, + Err(e) => { + error!( + "Failed to receive reply from datanode {} for GetFileRefs: {}", + peer, e + ); + return Err(e); + } + }; + + let InstructionReply::GetFileRefs(reply) = reply else { + return error::UnexpectedInstructionReplySnafu { + mailbox_message: format!("{:?}", reply), + reason: "Unexpected reply of the GetFileRefs instruction", + } + .fail(); + }; + + Ok(reply) +} + +/// Helper function to send GcRegions instruction and wait for reply. +async fn send_gc_regions( + mailbox: &MailboxRef, + peer: &Peer, + gc_regions: GcRegions, + server_addr: &str, + timeout: Duration, + description: &str, +) -> Result { + let instruction = instruction::Instruction::GcRegions(gc_regions.clone()); + let msg = MailboxMessage::json_message( + &format!("{}: {}", description, instruction), + &format!("Metasrv@{}", server_addr), + &format!("Datanode-{}@{}", peer.id, peer.addr), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let mailbox_rx = mailbox + .send(&Channel::Datanode(peer.id), msg, timeout) + .await?; + + let reply = match mailbox_rx.await { + Ok(reply_msg) => HeartbeatMailbox::json_reply(&reply_msg)?, + Err(e) => { + error!( + "Failed to receive reply from datanode {} for {}: {}", + peer, description, e + ); + return Err(e); + } + }; + + let InstructionReply::GcRegions(reply) = reply else { + return error::UnexpectedInstructionReplySnafu { + mailbox_message: format!("{:?}", reply), + reason: "Unexpected reply of the GcRegions instruction", + } + .fail(); + }; + + let res = reply.result; + match res { + Ok(report) => Ok(report), + Err(e) => { + error!( + "Datanode {} reported error during GC for regions {:?}: {}", + peer, gc_regions, e + ); + error::UnexpectedSnafu { + violated: format!( + "Datanode {} reported error during GC for regions {:?}: {}", + peer, gc_regions, e + ), + } + .fail() + } + } +} + +/// TODO(discord9): another procedure which do both get file refs and gc regions. 
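// Editor's note: both helpers above funnel mailbox replies through the same
// `let ... else` shape, accepting only the expected `InstructionReply` variant
// and turning anything else into an error. A minimal standalone sketch of that
// shape, with `Reply` as an illustrative stand-in enum rather than the real
// instruction types:
#[derive(Debug)]
enum Reply {
    GetFileRefs(String),
    GcRegions(String),
}

fn expect_file_refs(reply: Reply) -> Result<String, String> {
    let Reply::GetFileRefs(manifest) = reply else {
        // Mirrors the "unexpected reply" error path of the real helpers.
        return Err("unexpected reply to GetFileRefs".to_string());
    };
    Ok(manifest)
}

fn main() {
    assert!(expect_file_refs(Reply::GetFileRefs("manifest".into())).is_ok());
    assert!(expect_file_refs(Reply::GcRegions("report".into())).is_err());
}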
+pub struct GcRegionProcedure { + mailbox: MailboxRef, + data: GcRegionData, +} + +#[derive(Serialize, Deserialize)] +pub struct GcRegionData { + server_addr: String, + peer: Peer, + gc_regions: GcRegions, + description: String, + timeout: Duration, +} + +impl GcRegionProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::GcRegionProcedure"; + + pub fn new( + mailbox: MailboxRef, + server_addr: String, + peer: Peer, + gc_regions: GcRegions, + description: String, + timeout: Duration, + ) -> Self { + Self { + mailbox, + data: GcRegionData { + peer, + server_addr, + gc_regions, + description, + timeout, + }, + } + } + + async fn send_gc_instr(&self) -> Result { + send_gc_regions( + &self.mailbox, + &self.data.peer, + self.data.gc_regions.clone(), + &self.data.server_addr, + self.data.timeout, + &self.data.description, + ) + .await + } + + pub fn cast_result(res: Arc) -> Result { + res.downcast_ref::().cloned().ok_or_else(|| { + error::UnexpectedSnafu { + violated: format!( + "Failed to downcast procedure result to GcReport, got {:?}", + std::any::type_name_of_val(&res.as_ref()) + ), + } + .build() + }) + } +} + +#[async_trait::async_trait] +impl Procedure for GcRegionProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + // Send GC instruction to the datanode. This procedure only handle lock&send, results or other kind of + // errors will be reported back via the oneshot channel. + let reply = self + .send_gc_instr() + .await + .map_err(ProcedureError::external)?; + + Ok(Status::done_with_output(reply)) + } + + fn dump(&self) -> ProcedureResult { + serde_json::to_string(&self.data).context(ToJsonSnafu) + } + + /// Read lock all regions involved in this GC procedure. + /// So i.e. region migration won't happen during GC and cause race conditions. + /// + /// only read lock the regions not catatlog/schema because it can run concurrently with other procedures(i.e. drop database/table) + /// TODO:(discord9): integration test to verify this + fn lock_key(&self) -> LockKey { + let lock_key: Vec<_> = self + .data + .gc_regions + .regions + .iter() + .sorted() // sort to have a deterministic lock order + .map(|id| RegionLock::Read(*id).into()) + .collect(); + + LockKey::new(lock_key) + } +} + +/// Procedure to perform get file refs then batch GC for multiple regions, should only be used by admin function +/// for triggering manual gc, as it holds locks for too long and for all regions during the procedure. +pub struct BatchGcProcedure { + mailbox: MailboxRef, + data: BatchGcData, +} + +#[derive(Serialize, Deserialize)] +pub struct BatchGcData { + state: State, + server_addr: String, + /// The regions to be GC-ed + regions: Vec, + full_file_listing: bool, + region_routes: Region2Peers, + /// Related regions (e.g., for shared files). Map: RegionId -> List of related RegionIds. 
+ related_regions: HashMap>, + /// Acquired file references (Populated in Acquiring state) + file_refs: FileRefsManifest, + /// mailbox timeout duration + timeout: Duration, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum State { + /// Initial state + Start, + /// Fetching file references from datanodes + Acquiring, + /// Sending GC instruction to the target datanode + Gcing, +} + +impl BatchGcProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::BatchGcProcedure"; + + pub fn new( + mailbox: MailboxRef, + server_addr: String, + regions: Vec, + full_file_listing: bool, + region_routes: Region2Peers, + related_regions: HashMap>, + timeout: Duration, + ) -> Self { + Self { + mailbox, + data: BatchGcData { + state: State::Start, + server_addr, + regions, + full_file_listing, + region_routes, + related_regions, + file_refs: FileRefsManifest::default(), + timeout, + }, + } + } + + /// Get file references from all datanodes that host the regions + async fn get_file_references(&self) -> Result { + use std::collections::{HashMap, HashSet}; + + let query_regions = &self.data.regions; + let related_regions = &self.data.related_regions; + let region_routes = &self.data.region_routes; + let timeout = self.data.timeout; + + // Group regions by datanode to minimize RPC calls + let mut datanode2query_regions: HashMap> = HashMap::new(); + + for region_id in query_regions { + if let Some((leader, followers)) = region_routes.get(region_id) { + datanode2query_regions + .entry(leader.clone()) + .or_default() + .push(*region_id); + // also need to send for follower regions for file refs in case query is running on follower + for follower in followers { + datanode2query_regions + .entry(follower.clone()) + .or_default() + .push(*region_id); + } + } else { + return error::UnexpectedSnafu { + violated: format!( + "region_routes: {region_routes:?} does not contain region_id: {region_id}", + ), + } + .fail(); + } + } + + let mut datanode2related_regions: HashMap>> = + HashMap::new(); + for (related_region, queries) in related_regions { + if let Some((leader, _followers)) = region_routes.get(related_region) { + datanode2related_regions + .entry(leader.clone()) + .or_default() + .insert(*related_region, queries.clone()); + } // since read from manifest, no need to send to followers + } + + // Send GetFileRefs instructions to each datanode + let mut all_file_refs: HashMap> = + HashMap::new(); + let mut all_manifest_versions = HashMap::new(); + + for (peer, regions) in datanode2query_regions { + let related_regions_for_peer = + datanode2related_regions.remove(&peer).unwrap_or_default(); + + let instruction = GetFileRefs { + query_regions: regions.clone(), + related_regions: related_regions_for_peer, + }; + + let reply = send_get_file_refs( + &self.mailbox, + &self.data.server_addr, + &peer, + instruction, + timeout, + ) + .await?; + + if !reply.success { + return error::UnexpectedSnafu { + violated: format!( + "Failed to get file references from datanode {}: {:?}", + peer, reply.error + ), + } + .fail(); + } + + // Merge the file references from this datanode + for (region_id, file_refs) in reply.file_refs_manifest.file_refs { + all_file_refs + .entry(region_id) + .or_default() + .extend(file_refs); + } + + // region manifest version should be the smallest one among all peers, so outdated region can be detected + for (region_id, version) in reply.file_refs_manifest.manifest_version { + let entry = all_manifest_versions.entry(region_id).or_insert(version); + *entry = 
(*entry).min(version); + } + } + + Ok(FileRefsManifest { + file_refs: all_file_refs, + manifest_version: all_manifest_versions, + }) + } + + /// Send GC instruction to all datanodes that host the regions, + /// returns regions that need retry. + async fn send_gc_instructions(&self) -> Result> { + let regions = &self.data.regions; + let region_routes = &self.data.region_routes; + let file_refs = &self.data.file_refs; + let timeout = self.data.timeout; + + // Group regions by datanode + let mut datanode2regions: HashMap> = HashMap::new(); + + for region_id in regions { + if let Some((leader, _followers)) = region_routes.get(region_id) { + datanode2regions + .entry(leader.clone()) + .or_default() + .push(*region_id); + } else { + return error::UnexpectedSnafu { + violated: format!( + "region_routes: {region_routes:?} does not contain region_id: {region_id}", + ), + } + .fail(); + } + } + + let mut all_need_retry = HashSet::new(); + // Send GC instructions to each datanode + for (peer, regions_for_peer) in datanode2regions { + let gc_regions = GcRegions { + regions: regions_for_peer.clone(), + // file_refs_manifest can be large; cloning for each datanode is acceptable here since this is an admin-only operation. + file_refs_manifest: file_refs.clone(), + full_file_listing: self.data.full_file_listing, + }; + + let report = send_gc_regions( + &self.mailbox, + &peer, + gc_regions, + self.data.server_addr.as_str(), + timeout, + "Batch GC", + ) + .await?; + + let success = report.deleted_files.keys().collect_vec(); + let need_retry = report.need_retry_regions.iter().cloned().collect_vec(); + + if need_retry.is_empty() { + info!( + "GC report from datanode {}: successfully deleted files for regions {:?}", + peer, success + ); + } else { + warn!( + "GC report from datanode {}: successfully deleted files for regions {:?}, need retry for regions {:?}", + peer, success, need_retry + ); + } + all_need_retry.extend(report.need_retry_regions); + } + + Ok(all_need_retry.into_iter().collect()) + } +} + +#[async_trait::async_trait] +impl Procedure for BatchGcProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + match self.data.state { + State::Start => { + // Transition to Acquiring state + self.data.state = State::Acquiring; + Ok(Status::executing(false)) + } + State::Acquiring => { + // Get file references from all datanodes + match self.get_file_references().await { + Ok(file_refs) => { + self.data.file_refs = file_refs; + self.data.state = State::Gcing; + Ok(Status::executing(false)) + } + Err(e) => { + error!("Failed to get file references: {}", e); + Err(ProcedureError::external(e)) + } + } + } + State::Gcing => { + // Send GC instructions to all datanodes + // TODO(discord9): handle need-retry regions + match self.send_gc_instructions().await { + Ok(_) => { + info!( + "Batch GC completed successfully for regions {:?}", + self.data.regions + ); + Ok(Status::done()) + } + Err(e) => { + error!("Failed to send GC instructions: {}", e); + Err(ProcedureError::external(e)) + } + } + } + } + } + + fn dump(&self) -> ProcedureResult { + serde_json::to_string(&self.data).context(ToJsonSnafu) + } + + /// Read lock all regions involved in this GC procedure. + /// So i.e. region migration won't happen during GC and cause race conditions. 
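// Editor's note: a self-contained sketch of the three-step flow that
// `BatchGcProcedure::execute` above walks through (Start -> Acquiring ->
// Gcing), returning an "executing" status after each step so the procedure
// framework drives it to completion. The enum and the no-op "work" below are
// illustrative stand-ins only.
#[derive(Debug, Clone, Copy, PartialEq)]
enum SketchState {
    Start,
    Acquiring,
    Gcing,
    Done,
}

fn step(state: SketchState) -> SketchState {
    match state {
        SketchState::Start => SketchState::Acquiring, // nothing to do, just transition
        SketchState::Acquiring => SketchState::Gcing, // fetch file references here
        SketchState::Gcing => SketchState::Done,      // send GC instructions here
        SketchState::Done => SketchState::Done,
    }
}

fn main() {
    let mut state = SketchState::Start;
    let mut steps = 0;
    while state != SketchState::Done {
        state = step(state);
        steps += 1;
    }
    // Three executions: Start -> Acquiring -> Gcing -> Done.
    assert_eq!(steps, 3);
}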
+ fn lock_key(&self) -> LockKey { + let lock_key: Vec<_> = self + .data + .regions + .iter() + .sorted() // sort to have a deterministic lock order + .map(|id| RegionLock::Read(*id).into()) + .collect(); + + LockKey::new(lock_key) + } +} diff --git a/src/meta-srv/src/gc/scheduler.rs b/src/meta-srv/src/gc/scheduler.rs new file mode 100644 index 0000000000..e3ed3834bb --- /dev/null +++ b/src/meta-srv/src/gc/scheduler.rs @@ -0,0 +1,162 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Instant; + +use common_meta::DatanodeId; +use common_meta::key::TableMetadataManagerRef; +use common_procedure::ProcedureManagerRef; +use common_telemetry::{error, info}; +use store_api::storage::GcReport; +use tokio::sync::Mutex; +use tokio::sync::mpsc::{Receiver, Sender}; + +use crate::cluster::MetaPeerClientRef; +use crate::define_ticker; +use crate::error::{Error, Result}; +use crate::gc::ctx::{DefaultGcSchedulerCtx, SchedulerCtx}; +use crate::gc::options::{GcSchedulerOptions, TICKER_INTERVAL}; +use crate::gc::tracker::RegionGcTracker; +use crate::service::mailbox::MailboxRef; + +/// Report for a GC job. +#[derive(Debug, Default)] +pub struct GcJobReport { + pub per_datanode_reports: HashMap, + pub failed_datanodes: HashMap>, +} +impl GcJobReport { + pub fn merge(&mut self, mut other: GcJobReport) { + // merge per_datanode_reports&failed_datanodes + for (dn_id, report) in other.per_datanode_reports { + let self_report = self.per_datanode_reports.entry(dn_id).or_default(); + self_report.merge(report); + } + let all_failed_dn_ids = self + .failed_datanodes + .keys() + .cloned() + .chain(other.failed_datanodes.keys().cloned()) + .collect::>(); + for dn_id in all_failed_dn_ids { + let entry = self.failed_datanodes.entry(dn_id).or_default(); + if let Some(other_errors) = other.failed_datanodes.remove(&dn_id) { + entry.extend(other_errors); + } + } + self.failed_datanodes + .retain(|dn_id, _| !self.per_datanode_reports.contains_key(dn_id)); + } +} + +/// [`Event`] represents various types of events that can be processed by the gc ticker. +/// +/// Variants: +/// - `Tick`: This event is used to trigger gc periodically. +pub(crate) enum Event { + Tick, +} + +#[allow(unused)] +pub(crate) type GcTickerRef = Arc; + +define_ticker!( + /// [GcTicker] is used to trigger gc periodically. + GcTicker, + event_type = Event, + event_value = Event::Tick +); + +/// [`GcScheduler`] is used to periodically trigger garbage collection on datanodes. +pub struct GcScheduler { + pub(crate) ctx: Arc, + /// The receiver of events. + pub(crate) receiver: Receiver, + /// GC configuration. + pub(crate) config: GcSchedulerOptions, + /// Tracks the last GC time for regions. + pub(crate) region_gc_tracker: Arc>, + /// Last time the tracker was cleaned up. + pub(crate) last_tracker_cleanup: Arc>, +} + +impl GcScheduler { + /// Creates a new [`GcScheduler`] with custom configuration. 
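// Editor's note: a minimal sketch of the merge rule in `GcJobReport::merge`
// above: per-datanode reports are combined, error lists are concatenated, and
// a datanode is removed from the failed set once the merged result contains a
// successful report for it. The simplified types below are stand-ins for
// `GcReport` and the real error type.
use std::collections::HashMap;

#[derive(Default, Debug)]
struct SketchReport {
    per_datanode: HashMap<u64, Vec<&'static str>>, // datanode id -> processed regions
    failed: HashMap<u64, Vec<&'static str>>,       // datanode id -> error messages
}

impl SketchReport {
    fn merge(&mut self, other: SketchReport) {
        for (dn, regions) in other.per_datanode {
            self.per_datanode.entry(dn).or_default().extend(regions);
        }
        for (dn, errors) in other.failed {
            self.failed.entry(dn).or_default().extend(errors);
        }
        // Drop failure entries for datanodes that ended up with a report.
        let succeeded: Vec<u64> = self.per_datanode.keys().copied().collect();
        self.failed.retain(|dn, _| !succeeded.contains(dn));
    }
}

fn main() {
    let mut first = SketchReport::default();
    first.failed.insert(1, vec!["mailbox timeout"]);

    let mut second = SketchReport::default();
    second.per_datanode.insert(1, vec!["region-1", "region-2"]);

    first.merge(second);
    assert!(first.failed.is_empty());
    assert_eq!(first.per_datanode[&1].len(), 2);
}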
+ pub(crate) fn new_with_config( + table_metadata_manager: TableMetadataManagerRef, + procedure_manager: ProcedureManagerRef, + meta_peer_client: MetaPeerClientRef, + mailbox: MailboxRef, + server_addr: String, + config: GcSchedulerOptions, + ) -> Result<(Self, GcTicker)> { + // Validate configuration before creating the scheduler + config.validate()?; + + let (tx, rx) = Self::channel(); + let gc_ticker = GcTicker::new(TICKER_INTERVAL, tx); + let gc_trigger = Self { + ctx: Arc::new(DefaultGcSchedulerCtx::try_new( + table_metadata_manager, + procedure_manager, + meta_peer_client, + mailbox, + server_addr, + )?), + receiver: rx, + config, + region_gc_tracker: Arc::new(Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(Mutex::new(Instant::now())), + }; + Ok((gc_trigger, gc_ticker)) + } + + pub(crate) fn channel() -> (Sender, Receiver) { + tokio::sync::mpsc::channel(8) + } + + /// Starts the gc trigger. + pub fn try_start(mut self) -> Result<()> { + common_runtime::spawn_global(async move { self.run().await }); + info!("GC trigger started"); + Ok(()) + } + + pub(crate) async fn run(&mut self) { + while let Some(event) = self.receiver.recv().await { + match event { + Event::Tick => { + info!("Received gc tick"); + if let Err(e) = self.handle_tick().await { + error!("Failed to handle gc tick: {}", e); + } + } + } + } + } + + pub(crate) async fn handle_tick(&self) -> Result { + info!("Start to trigger gc"); + let report = self.trigger_gc().await?; + + // Periodically clean up stale tracker entries + self.cleanup_tracker_if_needed().await?; + + info!("Finished gc trigger"); + + Ok(report) + } +} diff --git a/src/meta-srv/src/gc/tracker.rs b/src/meta-srv/src/gc/tracker.rs new file mode 100644 index 0000000000..a5d6757c2c --- /dev/null +++ b/src/meta-srv/src/gc/tracker.rs @@ -0,0 +1,129 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::time::Instant; + +use common_telemetry::info; +use store_api::storage::RegionId; + +use crate::error::Result; +use crate::gc::scheduler::GcScheduler; + +/// Tracks GC timing information for a region. +#[derive(Debug, Clone)] +pub(crate) struct RegionGcInfo { + /// Last time a regular GC was performed on this region. + pub(crate) last_gc_time: Instant, + /// Last time a full file listing GC was performed on this region. + pub(crate) last_full_listing_time: Option, +} + +impl RegionGcInfo { + pub(crate) fn new(last_gc_time: Instant) -> Self { + Self { + last_gc_time, + last_full_listing_time: None, + } + } +} + +/// Tracks the last GC time for regions to implement cooldown. +pub(crate) type RegionGcTracker = HashMap; + +impl GcScheduler { + /// Clean up stale entries from the region GC tracker if enough time has passed. + /// This removes entries for regions that no longer exist in the current table routes. 
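// Editor's note: the interval checks driven by this tracker (the GC cooldown
// used in candidate selection above, the full-file-listing interval, and the
// tracker cleanup below) all reduce to the same Instant arithmetic: an action
// is due once the time elapsed since the last occurrence reaches the
// configured interval. A tiny standalone sketch with illustrative durations:
use std::time::{Duration, Instant};

fn is_due(last_time: Instant, interval: Duration, now: Instant) -> bool {
    now.duration_since(last_time) >= interval
}

fn main() {
    let cooldown = Duration::from_secs(5 * 60);
    let last_gc = Instant::now();
    // Right after a GC run the region is still inside its cooldown window.
    assert!(!is_due(last_gc, cooldown, Instant::now()));
    // Once the clock has advanced past the cooldown, the region is due again.
    let later = last_gc + cooldown;
    assert!(is_due(last_gc, cooldown, later));
}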
+ pub(crate) async fn cleanup_tracker_if_needed(&self) -> Result<()> { + let mut last_cleanup = *self.last_tracker_cleanup.lock().await; + let now = Instant::now(); + + // Check if enough time has passed since last cleanup + if now.duration_since(last_cleanup) < self.config.tracker_cleanup_interval { + return Ok(()); + } + + info!("Starting region GC tracker cleanup"); + let cleanup_start = Instant::now(); + + // Get all current region IDs from table routes + let table_to_region_stats = self.ctx.get_table_to_region_stats().await?; + let mut current_regions = HashSet::new(); + for region_stats in table_to_region_stats.values() { + for region_stat in region_stats { + current_regions.insert(region_stat.id); + } + } + + // Remove stale entries from tracker + let mut tracker = self.region_gc_tracker.lock().await; + let initial_count = tracker.len(); + tracker.retain(|region_id, _| current_regions.contains(region_id)); + let removed_count = initial_count - tracker.len(); + + *self.last_tracker_cleanup.lock().await = now; + + info!( + "Completed region GC tracker cleanup: removed {} stale entries out of {} total (retained {}). Duration: {:?}", + removed_count, + initial_count, + tracker.len(), + cleanup_start.elapsed() + ); + + Ok(()) + } + + /// Determine if full file listing should be used for a region based on the last full listing time. + pub(crate) async fn should_use_full_listing(&self, region_id: RegionId) -> bool { + let gc_tracker = self.region_gc_tracker.lock().await; + let now = Instant::now(); + + if let Some(gc_info) = gc_tracker.get(®ion_id) { + if let Some(last_full_listing) = gc_info.last_full_listing_time { + let elapsed = now.duration_since(last_full_listing); + elapsed >= self.config.full_file_listing_interval + } else { + // Never did full listing for this region, do it now + true + } + } else { + // First time GC for this region, do full listing + true + } + } + + pub(crate) async fn update_full_listing_time( + &self, + region_id: RegionId, + did_full_listing: bool, + ) { + let mut gc_tracker = self.region_gc_tracker.lock().await; + let now = Instant::now(); + + gc_tracker + .entry(region_id) + .and_modify(|info| { + if did_full_listing { + info.last_full_listing_time = Some(now); + } + info.last_gc_time = now; + }) + .or_insert_with(|| RegionGcInfo { + last_gc_time: now, + // prevent need to full listing on the first GC + last_full_listing_time: Some(now), + }); + } +} diff --git a/src/meta-srv/src/handler/collect_cluster_info_handler.rs b/src/meta-srv/src/handler/collect_cluster_info_handler.rs index f144f3edc5..c96229f9cf 100644 --- a/src/meta-srv/src/handler/collect_cluster_info_handler.rs +++ b/src/meta-srv/src/handler/collect_cluster_info_handler.rs @@ -52,8 +52,10 @@ impl HeartbeatHandler for CollectFrontendClusterInfoHandler { version: info.version, git_commit: info.git_commit, start_time_ms: info.start_time_ms, - cpus: info.cpus, - memory_bytes: info.memory_bytes, + total_cpu_millicores: info.total_cpu_millicores, + total_memory_bytes: info.total_memory_bytes, + cpu_usage_millicores: info.cpu_usage_millicores, + memory_usage_bytes: info.memory_usage_bytes, hostname: info.hostname, }; @@ -88,8 +90,10 @@ impl HeartbeatHandler for CollectFlownodeClusterInfoHandler { version: info.version, git_commit: info.git_commit, start_time_ms: info.start_time_ms, - cpus: info.cpus, - memory_bytes: info.memory_bytes, + total_cpu_millicores: info.total_cpu_millicores, + total_memory_bytes: info.total_memory_bytes, + cpu_usage_millicores: info.cpu_usage_millicores, + 
memory_usage_bytes: info.memory_usage_bytes, hostname: info.hostname, }; @@ -142,8 +146,10 @@ impl HeartbeatHandler for CollectDatanodeClusterInfoHandler { version: info.version, git_commit: info.git_commit, start_time_ms: info.start_time_ms, - cpus: info.cpus, - memory_bytes: info.memory_bytes, + total_cpu_millicores: info.total_cpu_millicores, + total_memory_bytes: info.total_memory_bytes, + cpu_usage_millicores: info.cpu_usage_millicores, + memory_usage_bytes: info.memory_usage_bytes, hostname: info.hostname, }; diff --git a/src/meta-srv/src/handler/collect_leader_region_handler.rs b/src/meta-srv/src/handler/collect_leader_region_handler.rs index fc81143b82..ddb4cd0ea3 100644 --- a/src/meta-srv/src/handler/collect_leader_region_handler.rs +++ b/src/meta-srv/src/handler/collect_leader_region_handler.rs @@ -73,6 +73,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version, flushed_entry_id: 0, + file_removed_cnt: 0, }, rcus: 0, wcus: 0, diff --git a/src/meta-srv/src/handler/failure_handler.rs b/src/meta-srv/src/handler/failure_handler.rs index 7039678654..eb79a1c30d 100644 --- a/src/meta-srv/src/handler/failure_handler.rs +++ b/src/meta-srv/src/handler/failure_handler.rs @@ -102,6 +102,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, diff --git a/src/meta-srv/src/handler/persist_stats_handler.rs b/src/meta-srv/src/handler/persist_stats_handler.rs index 1dc81f49eb..75281f982a 100644 --- a/src/meta-srv/src/handler/persist_stats_handler.rs +++ b/src/meta-srv/src/handler/persist_stats_handler.rs @@ -77,6 +77,7 @@ struct PersistRegionStat<'a> { sst_size: u64, write_bytes_delta: u64, #[col( + // This col name is for the information schema table, so we don't touch it name = "greptime_timestamp", semantic = "Timestamp", datatype = "TimestampMillisecond" @@ -293,6 +294,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 1, flushed_entry_id: 100, + file_removed_cnt: 0, }, written_bytes, data_topic_latest_entry_id: 200, diff --git a/src/meta-srv/src/handler/region_lease_handler.rs b/src/meta-srv/src/handler/region_lease_handler.rs index d7c2466e07..d0e9757742 100644 --- a/src/meta-srv/src/handler/region_lease_handler.rs +++ b/src/meta-srv/src/handler/region_lease_handler.rs @@ -19,6 +19,7 @@ use api::v1::meta::{HeartbeatRequest, RegionLease, Role}; use async_trait::async_trait; use common_meta::key::TableMetadataManagerRef; use common_meta::region_keeper::MemoryRegionKeeperRef; +use common_telemetry::error; use store_api::region_engine::GrantedRegion; use store_api::storage::RegionId; @@ -83,36 +84,44 @@ impl HeartbeatHandler for RegionLeaseHandler { let regions = stat.regions(); let datanode_id = stat.id; - let RenewRegionLeasesResponse { - non_exists, - renewed, - } = self + match self .region_lease_keeper .renew_region_leases(datanode_id, ®ions) - .await?; + .await + { + Ok(RenewRegionLeasesResponse { + non_exists, + renewed, + }) => { + let renewed = if let Some(renewer) = &self.customized_region_lease_renewer { + renewer + .renew(ctx, renewed) + .into_iter() + .map(|region| region.into()) + .collect() + } else { + renewed + .into_iter() + .map(|(region_id, region_lease_info)| { + GrantedRegion::new(region_id, region_lease_info.role).into() + }) + .collect::>() + }; - let renewed = if let Some(renewer) = &self.customized_region_lease_renewer { - renewer - .renew(ctx, renewed) - .into_iter() - 
.map(|region| region.into()) - .collect() - } else { - renewed - .into_iter() - .map(|(region_id, region_lease_info)| { - GrantedRegion::new(region_id, region_lease_info.role).into() - }) - .collect::>() - }; - - acc.region_lease = Some(RegionLease { - regions: renewed, - duration_since_epoch: req.duration_since_epoch, - lease_seconds: self.region_lease_seconds, - closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(), - }); - acc.inactive_region_ids = non_exists; + acc.region_lease = Some(RegionLease { + regions: renewed, + duration_since_epoch: req.duration_since_epoch, + lease_seconds: self.region_lease_seconds, + closeable_region_ids: non_exists.iter().map(|region| region.as_u64()).collect(), + }); + acc.inactive_region_ids = non_exists; + } + Err(e) => { + error!(e; "Failed to renew region leases for datanode: {datanode_id:?}, regions: {:?}", regions); + // If we throw error here, the datanode will be marked as failure by region failure handler. + // So we only log the error and continue. + } + } Ok(HandleControl::Continue) } @@ -120,6 +129,7 @@ impl HeartbeatHandler for RegionLeaseHandler { #[cfg(test)] mod test { + use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -129,6 +139,7 @@ mod test { use common_meta::key::table_route::TableRouteValue; use common_meta::key::test_utils::new_test_table_info; use common_meta::kv_backend::memory::MemoryKvBackend; + use common_meta::kv_backend::test_util::MockKvBackendBuilder; use common_meta::peer::Peer; use common_meta::region_keeper::MemoryRegionKeeper; use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; @@ -164,6 +175,7 @@ mod test { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, @@ -405,4 +417,58 @@ mod test { assert_eq!(granted, expected); } + + #[tokio::test] + async fn test_handle_renew_region_lease_failure() { + common_telemetry::init_default_ut_logging(); + let kv = MockKvBackendBuilder::default() + .batch_get_fn(Arc::new(|_| { + common_meta::error::UnexpectedSnafu { + err_msg: "mock err", + } + .fail() + }) as _) + .build() + .unwrap(); + let kvbackend = Arc::new(kv); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kvbackend)); + + let datanode_id = 1; + let region_number = 1u32; + let table_id = 10; + let region_id = RegionId::new(table_id, region_number); + let another_region_id = RegionId::new(table_id, region_number + 1); + let no_exist_region_id = RegionId::new(table_id, region_number + 2); + let peer = Peer::empty(datanode_id); + + let builder = MetasrvBuilder::new(); + let metasrv = builder.build().await.unwrap(); + let ctx = &mut metasrv.new_ctx(); + + let req = HeartbeatRequest { + duration_since_epoch: 1234, + ..Default::default() + }; + + let acc = &mut HeartbeatAccumulator::default(); + acc.stat = Some(Stat { + id: peer.id, + region_stats: vec![ + new_empty_region_stat(region_id, RegionRole::Leader), + new_empty_region_stat(another_region_id, RegionRole::Leader), + new_empty_region_stat(no_exist_region_id, RegionRole::Leader), + ], + ..Default::default() + }); + let handler = RegionLeaseHandler::new( + distributed_time_constants::REGION_LEASE_SECS, + table_metadata_manager.clone(), + Default::default(), + None, + ); + handler.handle(&req, ctx, acc).await.unwrap(); + + assert!(acc.region_lease.is_none()); + assert!(acc.inactive_region_ids.is_empty()); + } } diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs index 
71d57ca83f..c67bc32b40 100644 --- a/src/meta-srv/src/lib.rs +++ b/src/meta-srv/src/lib.rs @@ -25,6 +25,7 @@ pub mod election; pub mod error; pub mod events; mod failure_detector; +pub mod gc; pub mod handler; pub mod key; pub mod metasrv; diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 4c2c7fcf53..c454bc1ca5 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -22,7 +22,6 @@ use std::time::Duration; use clap::ValueEnum; use common_base::Plugins; use common_base::readable_size::ReadableSize; -use common_config::utils::ResourceSpec; use common_config::{Configurable, DEFAULT_DATA_HOME}; use common_event_recorder::EventRecorderOptions; use common_greptimedb_telemetry::GreptimeDBTelemetryTask; @@ -47,11 +46,12 @@ use common_options::datanode::DatanodeClientOptions; use common_options::memory::MemoryOptions; use common_procedure::ProcedureManagerRef; use common_procedure::options::ProcedureConfig; +use common_stat::ResourceStatRef; use common_telemetry::logging::{LoggingOptions, TracingOptions}; use common_telemetry::{error, info, warn}; +use common_time::util::DefaultSystemTimer; use common_wal::config::MetasrvWalConfig; use serde::{Deserialize, Serialize}; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::http::HttpOptions; use servers::tls::TlsOption; @@ -67,6 +67,7 @@ use crate::error::{ StartTelemetryTaskSnafu, StopProcedureManagerSnafu, }; use crate::failure_detector::PhiAccrualFailureDetectorOptions; +use crate::gc::{GcSchedulerOptions, GcTickerRef}; use crate::handler::{HeartbeatHandlerGroupBuilder, HeartbeatHandlerGroupRef}; use crate::procedure::ProcedureManagerListenerAdapter; use crate::procedure::region_migration::manager::RegionMigrationManagerRef; @@ -168,8 +169,6 @@ pub struct MetasrvOptions { pub data_home: String, /// The WAL options. pub wal: MetasrvWalConfig, - /// The metrics export options. - pub export_metrics: ExportMetricsOption, /// The store key prefix. If it is not empty, all keys in the store will be prefixed with it. /// This is useful when multiple metasrv clusters share the same store. pub store_key_prefix: String, @@ -209,6 +208,8 @@ pub struct MetasrvOptions { pub event_recorder: EventRecorderOptions, /// The stats persistence options. pub stats_persistence: StatsPersistenceOptions, + /// The GC scheduler options. 
+ pub gc: GcSchedulerOptions, } impl fmt::Debug for MetasrvOptions { @@ -233,7 +234,6 @@ impl fmt::Debug for MetasrvOptions { .field("enable_telemetry", &self.enable_telemetry) .field("data_home", &self.data_home) .field("wal", &self.wal) - .field("export_metrics", &self.export_metrics) .field("store_key_prefix", &self.store_key_prefix) .field("max_txn_ops", &self.max_txn_ops) .field("flush_stats_factor", &self.flush_stats_factor) @@ -291,7 +291,6 @@ impl Default for MetasrvOptions { enable_telemetry: true, data_home: DEFAULT_DATA_HOME.to_string(), wal: MetasrvWalConfig::default(), - export_metrics: ExportMetricsOption::default(), store_key_prefix: String::new(), max_txn_ops: 128, flush_stats_factor: 3, @@ -307,6 +306,7 @@ impl Default for MetasrvOptions { node_max_idle_time: Duration::from_secs(24 * 60 * 60), event_recorder: EventRecorderOptions::default(), stats_persistence: StatsPersistenceOptions::default(), + gc: GcSchedulerOptions::default(), } } } @@ -372,12 +372,18 @@ pub struct MetasrvNodeInfo { pub git_commit: String, // The node start timestamp in milliseconds pub start_time_ms: u64, - // The node cpus + // The node total cpu millicores #[serde(default)] - pub cpus: u32, - // The node memory bytes + pub total_cpu_millicores: i64, + // The node total memory bytes #[serde(default)] - pub memory_bytes: u64, + pub total_memory_bytes: i64, + /// The node build cpu usage millicores + #[serde(default)] + pub cpu_usage_millicores: i64, + /// The node build memory usage bytes + #[serde(default)] + pub memory_usage_bytes: i64, // The node hostname #[serde(default)] pub hostname: String, @@ -397,15 +403,19 @@ impl From for api::v1::meta::MetasrvNodeInfo { version: node_info.version.clone(), git_commit: node_info.git_commit.clone(), start_time_ms: node_info.start_time_ms, - cpus: node_info.cpus, - memory_bytes: node_info.memory_bytes, + cpus: node_info.total_cpu_millicores as u32, + memory_bytes: node_info.total_memory_bytes as u64, // The canonical location for node information. 
info: Some(api::v1::meta::NodeInfo { version: node_info.version, git_commit: node_info.git_commit, start_time_ms: node_info.start_time_ms, - cpus: node_info.cpus, - memory_bytes: node_info.memory_bytes, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, + cpus: node_info.total_cpu_millicores as u32, + memory_bytes: node_info.total_memory_bytes as u64, hostname: node_info.hostname, }), } @@ -517,7 +527,8 @@ pub struct Metasrv { region_flush_ticker: Option, table_id_sequence: SequenceRef, reconciliation_manager: ReconciliationManagerRef, - resource_spec: ResourceSpec, + resource_stat: ResourceStatRef, + gc_ticker: Option, plugins: Plugins, } @@ -578,6 +589,9 @@ impl Metasrv { if let Some(region_flush_trigger) = &self.region_flush_ticker { leadership_change_notifier.add_listener(region_flush_trigger.clone() as _); } + if let Some(gc_ticker) = &self.gc_ticker { + leadership_change_notifier.add_listener(gc_ticker.clone() as _); + } if let Some(customizer) = self.plugins.get::() { customizer.customize(&mut leadership_change_notifier); } @@ -699,8 +713,8 @@ impl Metasrv { self.start_time_ms } - pub fn resource_spec(&self) -> &ResourceSpec { - &self.resource_spec + pub fn resource_stat(&self) -> &ResourceStatRef { + &self.resource_stat } pub fn node_info(&self) -> MetasrvNodeInfo { @@ -710,8 +724,10 @@ impl Metasrv { version: build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms: self.start_time_ms(), - cpus: self.resource_spec().cpus as u32, - memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(), + total_cpu_millicores: self.resource_stat.get_total_cpu_millicores(), + total_memory_bytes: self.resource_stat.get_total_memory_bytes(), + cpu_usage_millicores: self.resource_stat.get_cpu_usage_millicores(), + memory_usage_bytes: self.resource_stat.get_memory_usage_bytes(), hostname: hostname::get() .unwrap_or_default() .to_string_lossy() @@ -723,6 +739,7 @@ impl Metasrv { /// A datanode is considered alive when it's still within the lease period. 
pub(crate) async fn lookup_datanode_peer(&self, peer_id: u64) -> Result> { discovery::utils::alive_datanode( + &DefaultSystemTimer, self.meta_peer_client.as_ref(), peer_id, Duration::from_secs(distributed_time_constants::DATANODE_LEASE_SECS), @@ -848,3 +865,18 @@ impl Metasrv { } } } + +#[cfg(test)] +mod tests { + use crate::metasrv::MetasrvNodeInfo; + + #[test] + fn test_deserialize_metasrv_node_info() { + let str = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#; + let node_info: MetasrvNodeInfo = serde_json::from_str(str).unwrap(); + assert_eq!(node_info.addr, "127.0.0.1:4002"); + assert_eq!(node_info.version, "0.1.0"); + assert_eq!(node_info.git_commit, "1234567890"); + assert_eq!(node_info.start_time_ms, 1715145600); + } +} diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs index 9cc0b8cc72..cbefb79cfa 100644 --- a/src/meta-srv/src/metasrv/builder.rs +++ b/src/meta-srv/src/metasrv/builder.rs @@ -28,7 +28,7 @@ use common_meta::ddl::table_meta::{TableMetadataAllocator, TableMetadataAllocato use common_meta::ddl::{ DdlContext, NoopRegionFailureDetectorControl, RegionFailureDetectorControllerRef, }; -use common_meta::ddl_manager::DdlManager; +use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef}; use common_meta::distributed_time_constants::{self}; use common_meta::key::TableMetadataManager; use common_meta::key::flow::FlowMetadataManager; @@ -46,6 +46,7 @@ use common_meta::stats::topic::TopicStatsRegistry; use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator}; use common_procedure::ProcedureManagerRef; use common_procedure::local::{LocalManager, ManagerConfig}; +use common_stat::ResourceStatImpl; use common_telemetry::{info, warn}; use snafu::{ResultExt, ensure}; use store_api::storage::MAX_REGION_SEQ; @@ -53,8 +54,9 @@ use store_api::storage::MAX_REGION_SEQ; use crate::bootstrap::build_default_meta_peer_client; use crate::cache_invalidator::MetasrvCacheInvalidator; use crate::cluster::MetaPeerClientRef; -use crate::error::{self, BuildWalOptionsAllocatorSnafu, Result}; +use crate::error::{self, BuildWalOptionsAllocatorSnafu, OtherSnafu, Result}; use crate::events::EventHandlerImpl; +use crate::gc::GcScheduler; use crate::greptimedb_telemetry::get_greptimedb_telemetry_task; use crate::handler::failure_handler::RegionFailureHandler; use crate::handler::flow_state_handler::FlowStateHandler; @@ -372,7 +374,8 @@ impl MetasrvBuilder { runtime_switch_manager.clone(), meta_peer_client.clone(), leader_cached_kv_backend.clone(), - ); + ) + .with_state(state.clone()); Some(RegionFailureHandler::new( region_supervisor, @@ -399,13 +402,23 @@ impl MetasrvBuilder { let procedure_manager_c = procedure_manager.clone(); let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager_c, true) .context(error::InitDdlManagerSnafu)?; - #[cfg(feature = "enterprise")] - let ddl_manager = { - let trigger_ddl_manager = plugins.as_ref().and_then(|plugins| { - plugins.get::() - }); - ddl_manager.with_trigger_ddl_manager(trigger_ddl_manager) + + let ddl_manager = if let Some(configurator) = plugins + .as_ref() + .and_then(|p| p.get::>()) + { + let ctx = DdlManagerConfigureContext { + kv_backend: kv_backend.clone(), + meta_peer_client: meta_peer_client.clone(), + }; + configurator + .configure(ddl_manager, ctx) + .await + .context(OtherSnafu)? 
+ } else { + ddl_manager }; + let ddl_manager = Arc::new(ddl_manager); let region_flush_ticker = if is_remote_wal { @@ -456,6 +469,22 @@ impl MetasrvBuilder { None }; + let gc_ticker = if options.gc.enable { + let (gc_scheduler, gc_ticker) = GcScheduler::new_with_config( + table_metadata_manager.clone(), + procedure_manager.clone(), + meta_peer_client.clone(), + mailbox.clone(), + options.grpc.server_addr.clone(), + options.gc.clone(), + )?; + gc_scheduler.try_start()?; + + Some(Arc::new(gc_ticker)) + } else { + None + }; + let customized_region_lease_renewer = plugins .as_ref() .and_then(|plugins| plugins.get::()); @@ -517,6 +546,9 @@ impl MetasrvBuilder { .try_start() .context(error::InitReconciliationManagerSnafu)?; + let mut resource_stat = ResourceStatImpl::default(); + resource_stat.start_collect_cpu_usage(); + Ok(Metasrv { state, started: Arc::new(AtomicBool::new(false)), @@ -556,7 +588,8 @@ impl MetasrvBuilder { table_id_sequence, reconciliation_manager, topic_stats_registry, - resource_spec: Default::default(), + resource_stat: Arc::new(resource_stat), + gc_ticker, }) } } @@ -605,3 +638,9 @@ impl Default for MetasrvBuilder { Self::new() } } + +/// The context for [`DdlManagerConfiguratorRef`]. +pub struct DdlManagerConfigureContext { + pub kv_backend: KvBackendRef, + pub meta_peer_client: MetaPeerClientRef, +} diff --git a/src/meta-srv/src/mocks.rs b/src/meta-srv/src/mocks.rs index 6c2f0d3892..c805f8ea1b 100644 --- a/src/meta-srv/src/mocks.rs +++ b/src/meta-srv/src/mocks.rs @@ -134,7 +134,7 @@ pub async fn mock( .timeout(Duration::from_secs(10)) .connect_timeout(Duration::from_secs(10)) .tcp_nodelay(true); - let channel_manager = ChannelManager::with_config(config); + let channel_manager = ChannelManager::with_config(config, None); // Move client to an option so we can _move_ the inner value // on the first attempt to connect. All other attempts will fail. 
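Editor's note: the new resource fields on `MetasrvNodeInfo` above (`total_cpu_millicores`, `total_memory_bytes`, `cpu_usage_millicores`, `memory_usage_bytes`) are all marked `#[serde(default)]`, so node-info payloads written by older binaries that only carried `cpus`/`memory_bytes` still deserialize, with the missing numbers falling back to zero; that is what the `test_deserialize_metasrv_node_info` test added to metasrv.rs exercises. A minimal, self-contained sketch of the same pattern, using a hypothetical `NodeInfoCompat` stand-in rather than the real struct (requires the `serde` and `serde_json` crates):

use serde::Deserialize;

// Hypothetical stand-in for illustration only; the real struct is MetasrvNodeInfo.
#[derive(Debug, Deserialize)]
struct NodeInfoCompat {
    addr: String,
    version: String,
    git_commit: String,
    start_time_ms: u64,
    // Missing in payloads from older nodes; `default` keeps them deserializable.
    #[serde(default)]
    total_cpu_millicores: i64,
    #[serde(default)]
    total_memory_bytes: i64,
    #[serde(default)]
    cpu_usage_millicores: i64,
    #[serde(default)]
    memory_usage_bytes: i64,
}

fn main() {
    // JSON shaped like what a pre-upgrade metasrv would have stored.
    let old = r#"{"addr":"127.0.0.1:4002","version":"0.1.0","git_commit":"1234567890","start_time_ms":1715145600}"#;
    let info: NodeInfoCompat = serde_json::from_str(old).unwrap();
    assert_eq!(info.total_cpu_millicores, 0);
    assert_eq!(info.memory_usage_bytes, 0);
}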
diff --git a/src/meta-srv/src/procedure.rs b/src/meta-srv/src/procedure.rs index 88869d8482..da1a1b00e7 100644 --- a/src/meta-srv/src/procedure.rs +++ b/src/meta-srv/src/procedure.rs @@ -19,6 +19,7 @@ use common_procedure::ProcedureManagerRef; use snafu::ResultExt; pub mod region_migration; +pub mod repartition; #[cfg(any(test, feature = "testing"))] pub mod test_util; #[cfg(test)] diff --git a/src/meta-srv/src/procedure/region_migration.rs b/src/meta-srv/src/procedure/region_migration.rs index cfef9158ef..3613fd0894 100644 --- a/src/meta-srv/src/procedure/region_migration.rs +++ b/src/meta-srv/src/procedure/region_migration.rs @@ -24,8 +24,10 @@ pub(crate) mod open_candidate_region; pub mod test_util; pub(crate) mod update_metadata; pub(crate) mod upgrade_candidate_region; +pub(crate) mod utils; use std::any::Any; +use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Display}; use std::sync::Arc; use std::time::Duration; @@ -36,12 +38,11 @@ use common_meta::cache_invalidator::CacheInvalidatorRef; use common_meta::ddl::RegionFailureDetectorControllerRef; use common_meta::instruction::CacheIdent; use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue}; -use common_meta::key::table_info::TableInfoValue; use common_meta::key::table_route::TableRouteValue; use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey}; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; -use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock}; +use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock}; use common_meta::peer::Peer; use common_meta::region_keeper::{MemoryRegionKeeperRef, OperatingRegionGuard}; use common_procedure::error::{ @@ -56,9 +57,9 @@ pub use manager::{ RegionMigrationManagerRef, RegionMigrationProcedureTask, RegionMigrationProcedureTracker, RegionMigrationTriggerReason, }; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize}; use snafu::{OptionExt, ResultExt}; -use store_api::storage::RegionId; +use store_api::storage::{RegionId, TableId}; use tokio::time::Instant; use self::migration_start::RegionMigrationStart; @@ -73,6 +74,25 @@ use crate::service::mailbox::MailboxRef; /// The default timeout for region migration. pub const DEFAULT_REGION_MIGRATION_TIMEOUT: Duration = Duration::from_secs(120); +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum SingleOrMultiple<T> { + Single(T), + Multiple(Vec<T>), +} + +fn single_or_multiple_from<'de, D, T>(deserializer: D) -> std::result::Result<Vec<T>, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de>, +{ + let helper = SingleOrMultiple::<T>::deserialize(deserializer)?; + Ok(match helper { + SingleOrMultiple::Single(x) => vec![x], + SingleOrMultiple::Multiple(xs) => xs, + }) +} + /// It's shared in each step and available even after recovering. /// /// It will only be updated/stored after the Red node has succeeded. @@ -81,15 +101,23 @@ pub const DEFAULT_REGION_MIGRATION_TIMEOUT: Duration = Duration::from_secs(120); #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistentContext { /// The table catalog. - pub(crate) catalog: String, + #[deprecated(note = "use `catalog_and_schema` instead")] + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) catalog: Option<String>, /// The table schema.
- pub(crate) schema: String, + #[deprecated(note = "use `catalog_and_schema` instead")] + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) schema: Option, + /// The catalog and schema of the regions. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub(crate) catalog_and_schema: Vec<(String, String)>, /// The [Peer] of migration source. pub(crate) from_peer: Peer, /// The [Peer] of migration destination. pub(crate) to_peer: Peer, /// The [RegionId] of migration region. - pub(crate) region_id: RegionId, + #[serde(deserialize_with = "single_or_multiple_from", alias = "region_id")] + pub(crate) region_ids: Vec, /// The timeout for downgrading leader region and upgrading candidate region operations. #[serde(with = "humantime_serde", default = "default_timeout")] pub(crate) timeout: Duration, @@ -98,20 +126,80 @@ pub struct PersistentContext { pub(crate) trigger_reason: RegionMigrationTriggerReason, } +impl PersistentContext { + pub fn new( + catalog_and_schema: Vec<(String, String)>, + from_peer: Peer, + to_peer: Peer, + region_ids: Vec, + timeout: Duration, + trigger_reason: RegionMigrationTriggerReason, + ) -> Self { + #[allow(deprecated)] + Self { + catalog: None, + schema: None, + catalog_and_schema, + from_peer, + to_peer, + region_ids, + timeout, + trigger_reason, + } + } +} + fn default_timeout() -> Duration { Duration::from_secs(10) } impl PersistentContext { pub fn lock_key(&self) -> Vec { - let region_id = self.region_id; - let lock_key = vec![ - CatalogLock::Read(&self.catalog).into(), - SchemaLock::read(&self.catalog, &self.schema).into(), - RegionLock::Write(region_id).into(), - ]; + let mut lock_keys = + Vec::with_capacity(self.region_ids.len() + 2 + self.catalog_and_schema.len() * 2); + #[allow(deprecated)] + if let (Some(catalog), Some(schema)) = (&self.catalog, &self.schema) { + lock_keys.push(CatalogLock::Read(catalog).into()); + lock_keys.push(SchemaLock::read(catalog, schema).into()); + } + for (catalog, schema) in self.catalog_and_schema.iter() { + lock_keys.push(CatalogLock::Read(catalog).into()); + lock_keys.push(SchemaLock::read(catalog, schema).into()); + } - lock_key + // Sort the region ids to ensure the same order of region ids. + let mut region_ids = self.region_ids.clone(); + region_ids.sort_unstable(); + for region_id in region_ids { + lock_keys.push(RegionLock::Write(region_id).into()); + } + lock_keys + } + + /// Returns the table ids of the regions. + /// + /// The return value is a set of table ids. + pub fn region_table_ids(&self) -> Vec { + self.region_ids + .iter() + .map(|region_id| region_id.table_id()) + .collect::>() + .into_iter() + .collect() + } + + /// Returns the table regions map. + /// + /// The key is the table id, the value is the region ids of the table. + pub fn table_regions(&self) -> HashMap> { + let mut table_regions = HashMap::new(); + for region_id in &self.region_ids { + table_regions + .entry(region_id.table_id()) + .or_insert_with(Vec::new) + .push(*region_id); + } + table_regions } } @@ -227,25 +315,18 @@ pub struct VolatileContext { /// `opening_region_guard` will be set after the /// [OpenCandidateRegion](crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion) step. /// - /// `opening_region_guard` should be consumed after + /// `opening_region_guards` should be consumed after /// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region /// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue). 
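Editor's note on the backward-compatibility mechanism above: `region_ids` is deserialized through the untagged `SingleOrMultiple` helper and carries `alias = "region_id"`, so procedure state persisted by an older metasrv (a single region id) loads into the new multi-region vector unchanged. A standalone sketch of that pattern, with illustrative names (`OneOrMany`, `Ctx`) instead of the real ones, using `serde` and `serde_json`:

use serde::{Deserialize, Deserializer};

#[derive(Deserialize)]
#[serde(untagged)]
enum OneOrMany<T> {
    One(T),
    Many(Vec<T>),
}

// Accept either a bare value or an array and normalize to a Vec.
fn one_or_many<'de, D, T>(deserializer: D) -> Result<Vec<T>, D::Error>
where
    D: Deserializer<'de>,
    T: Deserialize<'de>,
{
    Ok(match OneOrMany::<T>::deserialize(deserializer)? {
        OneOrMany::One(x) => vec![x],
        OneOrMany::Many(xs) => xs,
    })
}

#[derive(Deserialize)]
struct Ctx {
    // Old payloads used the singular key; new payloads use the plural one.
    #[serde(deserialize_with = "one_or_many", alias = "region_id")]
    region_ids: Vec<u64>,
}

fn main() {
    let old: Ctx = serde_json::from_str(r#"{"region_id":4398046511105}"#).unwrap();
    let new: Ctx = serde_json::from_str(r#"{"region_ids":[1,2,3]}"#).unwrap();
    assert_eq!(old.region_ids, vec![4398046511105]);
    assert_eq!(new.region_ids, vec![1, 2, 3]);
}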
- opening_region_guard: Option, - /// `table_route` is stored via previous steps for future use. - table_route: Option>, - /// `datanode_table` is stored via previous steps for future use. - from_peer_datanode_table: Option, - /// `table_info` is stored via previous steps for future use. - /// - /// `table_info` should remain unchanged during the procedure; - /// no other DDL procedure executed concurrently for the current table. - table_info: Option>, + opening_region_guards: Vec, /// The deadline of leader region lease. leader_region_lease_deadline: Option, - /// The last_entry_id of leader region. - leader_region_last_entry_id: Option, - /// The last_entry_id of leader metadata region (Only used for metric engine). - leader_region_metadata_last_entry_id: Option, + /// The datanode table values. + from_peer_datanode_table_values: Option>, + /// The last_entry_ids of leader regions. + leader_region_last_entry_ids: HashMap, + /// The last_entry_ids of leader metadata regions (Only used for metric engine). + leader_region_metadata_last_entry_ids: HashMap, /// Metrics of region migration. metrics: Metrics, } @@ -264,13 +345,15 @@ impl VolatileContext { } /// Sets the `leader_region_last_entry_id`. - pub fn set_last_entry_id(&mut self, last_entry_id: u64) { - self.leader_region_last_entry_id = Some(last_entry_id) + pub fn set_last_entry_id(&mut self, region_id: RegionId, last_entry_id: u64) { + self.leader_region_last_entry_ids + .insert(region_id, last_entry_id); } /// Sets the `leader_region_metadata_last_entry_id`. - pub fn set_metadata_last_entry_id(&mut self, last_entry_id: u64) { - self.leader_region_metadata_last_entry_id = Some(last_entry_id); + pub fn set_metadata_last_entry_id(&mut self, region_id: RegionId, last_entry_id: u64) { + self.leader_region_metadata_last_entry_ids + .insert(region_id, last_entry_id); } } @@ -319,7 +402,7 @@ impl DefaultContextFactory { impl ContextFactory for DefaultContextFactory { fn new_context(self, persistent_ctx: PersistentContext) -> Context { Context { - persistent_ctx: Arc::new(persistent_ctx), + persistent_ctx, volatile_ctx: self.volatile_ctx, in_memory: self.in_memory_key, table_metadata_manager: self.table_metadata_manager, @@ -334,7 +417,7 @@ impl ContextFactory for DefaultContextFactory { /// The context of procedure execution. pub struct Context { - persistent_ctx: Arc, + persistent_ctx: PersistentContext, volatile_ctx: VolatileContext, in_memory: KvBackendRef, table_metadata_manager: TableMetadataManagerRef, @@ -393,35 +476,135 @@ impl Context { &self.server_addr } + /// Returns the table ids of the regions. + pub fn region_table_ids(&self) -> Vec { + self.persistent_ctx + .region_ids + .iter() + .map(|region_id| region_id.table_id()) + .collect::>() + .into_iter() + .collect() + } + + /// Returns the `table_routes` of [VolatileContext] if any. + /// Otherwise, returns the value retrieved from remote. + /// + /// Retry: + /// - Failed to retrieve the metadata of table. 
+ pub async fn get_table_route_values( + &self, + ) -> Result>> { + let table_ids = self.persistent_ctx.region_table_ids(); + let table_routes = self + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .batch_get_with_raw_bytes(&table_ids) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get table routes: {table_ids:?}"), + })?; + let table_routes = table_ids + .into_iter() + .zip(table_routes) + .filter_map(|(table_id, table_route)| { + table_route.map(|table_route| (table_id, table_route)) + }) + .collect::>(); + Ok(table_routes) + } + /// Returns the `table_route` of [VolatileContext] if any. /// Otherwise, returns the value retrieved from remote. /// /// Retry: /// - Failed to retrieve the metadata of table. pub async fn get_table_route_value( - &mut self, - ) -> Result<&DeserializedValueWithBytes> { - let table_route_value = &mut self.volatile_ctx.table_route; + &self, + table_id: TableId, + ) -> Result> { + let table_route_value = self + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get_with_raw_bytes(table_id) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get table routes: {table_id:}"), + })? + .context(error::TableRouteNotFoundSnafu { table_id })?; + Ok(table_route_value) + } - if table_route_value.is_none() { - let table_id = self.persistent_ctx.region_id.table_id(); - let table_route = self + /// Returns the `from_peer_datanode_table_values` of [VolatileContext] if any. + /// Otherwise, returns the value retrieved from remote. + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. + pub async fn get_from_peer_datanode_table_values( + &mut self, + ) -> Result<&HashMap> { + let from_peer_datanode_table_values = + &mut self.volatile_ctx.from_peer_datanode_table_values; + if from_peer_datanode_table_values.is_none() { + let table_ids = self.persistent_ctx.region_table_ids(); + let datanode_table_keys = table_ids + .iter() + .map(|table_id| DatanodeTableKey { + datanode_id: self.persistent_ctx.from_peer.id, + table_id: *table_id, + }) + .collect::>(); + let datanode_table_values = self .table_metadata_manager - .table_route_manager() - .table_route_storage() - .get_with_raw_bytes(table_id) + .datanode_table_manager() + .batch_get(&datanode_table_keys) .await .context(error::TableMetadataManagerSnafu) .map_err(BoxedError::new) .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to get TableRoute: {table_id}"), + reason: format!("Failed to get DatanodeTable: {table_ids:?}"), })? - .context(error::TableRouteNotFoundSnafu { table_id })?; - - *table_route_value = Some(table_route); + .into_iter() + .map(|(k, v)| (k.table_id, v)) + .collect(); + *from_peer_datanode_table_values = Some(datanode_table_values); } + Ok(from_peer_datanode_table_values.as_ref().unwrap()) + } - Ok(table_route_value.as_ref().unwrap()) + /// Returns the `from_peer_datanode_table_value` of [VolatileContext] if any. + /// Otherwise, returns the value retrieved from remote. + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. 
+ pub async fn get_from_peer_datanode_table_value( + &self, + table_id: TableId, + ) -> Result { + let datanode_table_value = self + .table_metadata_manager + .datanode_table_manager() + .get(&DatanodeTableKey { + datanode_id: self.persistent_ctx.from_peer.id, + table_id, + }) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get DatanodeTable: {table_id}"), + })? + .context(error::DatanodeTableNotFoundSnafu { + table_id, + datanode_id: self.persistent_ctx.from_peer.id, + })?; + Ok(datanode_table_value) } /// Notifies the RegionSupervisor to register failure detectors of failed region. @@ -430,11 +613,18 @@ impl Context { /// Now, we need to register the failure detector for the failed region again. pub async fn register_failure_detectors(&self) { let datanode_id = self.persistent_ctx.from_peer.id; - let region_id = self.persistent_ctx.region_id; - + let region_ids = &self.persistent_ctx.region_ids; + let detecting_regions = region_ids + .iter() + .map(|region_id| (datanode_id, *region_id)) + .collect::>(); self.region_failure_detector_controller - .register_failure_detectors(vec![(datanode_id, region_id)]) + .register_failure_detectors(detecting_regions) .await; + info!( + "Registered failure detectors after migration failures for datanode {}, regions {:?}", + datanode_id, region_ids + ); } /// Notifies the RegionSupervisor to deregister failure detectors. @@ -443,10 +633,14 @@ impl Context { /// We need to deregister the failure detectors for the original region if the procedure is finished. pub async fn deregister_failure_detectors(&self) { let datanode_id = self.persistent_ctx.from_peer.id; - let region_id = self.persistent_ctx.region_id; + let region_ids = &self.persistent_ctx.region_ids; + let detecting_regions = region_ids + .iter() + .map(|region_id| (datanode_id, *region_id)) + .collect::>(); self.region_failure_detector_controller - .deregister_failure_detectors(vec![(datanode_id, region_id)]) + .deregister_failure_detectors(detecting_regions) .await; } @@ -456,118 +650,52 @@ impl Context { /// so we need to deregister the failure detectors for the candidate region if the procedure is aborted. pub async fn deregister_failure_detectors_for_candidate_region(&self) { let to_peer_id = self.persistent_ctx.to_peer.id; - let region_id = self.persistent_ctx.region_id; + let region_ids = &self.persistent_ctx.region_ids; + let detecting_regions = region_ids + .iter() + .map(|region_id| (to_peer_id, *region_id)) + .collect::>(); self.region_failure_detector_controller - .deregister_failure_detectors(vec![(to_peer_id, region_id)]) + .deregister_failure_detectors(detecting_regions) .await; } - /// Removes the `table_route` of [VolatileContext], returns true if any. - pub fn remove_table_route_value(&mut self) -> bool { - let value = self.volatile_ctx.table_route.take(); - value.is_some() - } - - /// Returns the `table_info` of [VolatileContext] if any. - /// Otherwise, returns the value retrieved from remote. - /// - /// Retry: - /// - Failed to retrieve the metadata of table. 
- pub async fn get_table_info_value( - &mut self, - ) -> Result<&DeserializedValueWithBytes> { - let table_info_value = &mut self.volatile_ctx.table_info; - - if table_info_value.is_none() { - let table_id = self.persistent_ctx.region_id.table_id(); - let table_info = self - .table_metadata_manager - .table_info_manager() - .get(table_id) - .await - .context(error::TableMetadataManagerSnafu) - .map_err(BoxedError::new) - .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to get TableInfo: {table_id}"), - })? - .context(error::TableInfoNotFoundSnafu { table_id })?; - - *table_info_value = Some(table_info); - } - - Ok(table_info_value.as_ref().unwrap()) - } - - /// Returns the `table_info` of [VolatileContext] if any. - /// Otherwise, returns the value retrieved from remote. - /// - /// Retry: - /// - Failed to retrieve the metadata of datanode. - pub async fn get_from_peer_datanode_table_value(&mut self) -> Result<&DatanodeTableValue> { - let datanode_value = &mut self.volatile_ctx.from_peer_datanode_table; - - if datanode_value.is_none() { - let table_id = self.persistent_ctx.region_id.table_id(); - let datanode_id = self.persistent_ctx.from_peer.id; - - let datanode_table = self - .table_metadata_manager - .datanode_table_manager() - .get(&DatanodeTableKey { - datanode_id, - table_id, - }) - .await - .context(error::TableMetadataManagerSnafu) - .map_err(BoxedError::new) - .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to get DatanodeTable: ({datanode_id},{table_id})"), - })? - .context(error::DatanodeTableNotFoundSnafu { - table_id, - datanode_id, - })?; - - *datanode_value = Some(datanode_table); - } - - Ok(datanode_value.as_ref().unwrap()) - } - - /// Fetches the replay checkpoint for the given topic. - pub async fn fetch_replay_checkpoint(&self, topic: &str) -> Result> { - let region_id = self.region_id(); - let topic_region_key = TopicRegionKey::new(region_id, topic); - let value = self + /// Fetches the replay checkpoints for the given topic region keys. + pub async fn get_replay_checkpoints( + &self, + topic_region_keys: Vec>, + ) -> Result> { + let topic_region_values = self .table_metadata_manager .topic_region_manager() - .get(topic_region_key) + .batch_get(topic_region_keys) .await .context(error::TableMetadataManagerSnafu)?; - Ok(value.and_then(|value| value.checkpoint)) - } + let replay_checkpoints = topic_region_values + .into_iter() + .flat_map(|(key, value)| value.checkpoint.map(|value| (key, value))) + .collect::>(); - /// Returns the [RegionId]. - pub fn region_id(&self) -> RegionId { - self.persistent_ctx.region_id + Ok(replay_checkpoints) } /// Broadcasts the invalidate table cache message. pub async fn invalidate_table_cache(&self) -> Result<()> { - let table_id = self.region_id().table_id(); + let table_ids = self.region_table_ids(); + let mut cache_idents = Vec::with_capacity(table_ids.len()); + for table_id in &table_ids { + cache_idents.push(CacheIdent::TableId(*table_id)); + } // ignore the result let ctx = common_meta::cache_invalidator::Context::default(); - let _ = self - .cache_invalidator - .invalidate(&ctx, &[CacheIdent::TableId(table_id)]) - .await; + let _ = self.cache_invalidator.invalidate(&ctx, &cache_idents).await; Ok(()) } /// Returns the [PersistentContext] of the procedure. 
- pub fn persistent_ctx(&self) -> Arc { + pub fn persistent_ctx(&self) -> PersistentContext { self.persistent_ctx.clone() } } @@ -609,7 +737,7 @@ pub struct RegionMigrationData<'a> { pub(crate) struct RegionMigrationProcedure { state: Box, context: Context, - _guard: Option, + _guards: Vec, } impl RegionMigrationProcedure { @@ -618,22 +746,22 @@ impl RegionMigrationProcedure { pub fn new( persistent_context: PersistentContext, context_factory: impl ContextFactory, - guard: Option, + guards: Vec, ) -> Self { let state = Box::new(RegionMigrationStart {}); - Self::new_inner(state, persistent_context, context_factory, guard) + Self::new_inner(state, persistent_context, context_factory, guards) } fn new_inner( state: Box, persistent_context: PersistentContext, context_factory: impl ContextFactory, - guard: Option, + guards: Vec, ) -> Self { Self { state, context: context_factory.new_context(persistent_context), - _guard: guard, + _guards: guards, } } @@ -646,47 +774,52 @@ impl RegionMigrationProcedure { persistent_ctx, state, } = serde_json::from_str(json).context(FromJsonSnafu)?; + let guards = persistent_ctx + .region_ids + .iter() + .flat_map(|region_id| { + tracker.insert_running_procedure(&RegionMigrationProcedureTask { + region_id: *region_id, + from_peer: persistent_ctx.from_peer.clone(), + to_peer: persistent_ctx.to_peer.clone(), + timeout: persistent_ctx.timeout, + trigger_reason: persistent_ctx.trigger_reason, + }) + }) + .collect::>(); - let guard = tracker.insert_running_procedure(&RegionMigrationProcedureTask { - region_id: persistent_ctx.region_id, - from_peer: persistent_ctx.from_peer.clone(), - to_peer: persistent_ctx.to_peer.clone(), - timeout: persistent_ctx.timeout, - trigger_reason: persistent_ctx.trigger_reason, - }); let context = context_factory.new_context(persistent_ctx); Ok(Self { state, context, - _guard: guard, + _guards: guards, }) } - async fn rollback_inner(&mut self) -> Result<()> { + async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> { let _timer = METRIC_META_REGION_MIGRATION_EXECUTE .with_label_values(&["rollback"]) .start_timer(); - - let table_id = self.context.region_id().table_id(); - let region_id = self.context.region_id(); - self.context.remove_table_route_value(); - let table_metadata_manager = self.context.table_metadata_manager.clone(); - let table_route = self.context.get_table_route_value().await?; - - // Safety: It must be a physical table route. 
- let downgraded = table_route - .region_routes() - .unwrap() - .iter() - .filter(|route| route.region.id == region_id) - .any(|route| route.is_leader_downgrading()); - - if downgraded { - info!("Rollbacking downgraded region leader table route, region: {region_id}"); - table_metadata_manager - .update_leader_region_status(table_id, table_route, |route| { - if route.region.id == region_id { + let ctx = &self.context; + let table_regions = ctx.persistent_ctx.table_regions(); + for (table_id, regions) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; + let table_route = ctx.get_table_route_value(table_id).await?; + let region_routes = table_route.region_routes().unwrap(); + let downgraded = region_routes + .iter() + .filter(|route| regions.contains(&route.region.id)) + .any(|route| route.is_leader_downgrading()); + if downgraded { + info!( + "Rollbacking downgraded region leader table route, table: {table_id}, regions: {regions:?}" + ); + let table_metadata_manager = &ctx.table_metadata_manager; + table_metadata_manager + .update_leader_region_status(table_id, &table_route, |route| { + if regions.contains(&route.region.id) { Some(None) } else { None @@ -696,10 +829,13 @@ impl RegionMigrationProcedure { .context(error::TableMetadataManagerSnafu) .map_err(BoxedError::new) .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), + reason: format!("Failed to update the table route during the rollback downgraded leader region: {regions:?}"), })?; + } } - + self.context + .deregister_failure_detectors_for_candidate_region() + .await; self.context.register_failure_detectors().await; Ok(()) @@ -712,8 +848,8 @@ impl Procedure for RegionMigrationProcedure { Self::TYPE_NAME } - async fn rollback(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<()> { - self.rollback_inner() + async fn rollback(&mut self, ctx: &ProcedureContext) -> ProcedureResult<()> { + self.rollback_inner(ctx) .await .map_err(ProcedureError::external) } @@ -742,14 +878,14 @@ impl Procedure for RegionMigrationProcedure { Err(ProcedureError::retry_later(e)) } else { // Consumes the opening region guard before deregistering the failure detectors. 
- self.context.volatile_ctx.opening_region_guard.take(); + self.context.volatile_ctx.opening_region_guards.clear(); self.context .deregister_failure_detectors_for_candidate_region() .await; error!( e; - "Region migration procedure failed, region_id: {}, from_peer: {}, to_peer: {}, {}", - self.context.region_id(), + "Region migration procedure failed, regions: {:?}, from_peer: {}, to_peer: {}, {}", + self.context.persistent_ctx.region_ids, self.context.persistent_ctx.from_peer, self.context.persistent_ctx.to_peer, self.context.volatile_ctx.metrics, @@ -776,7 +912,7 @@ impl Procedure for RegionMigrationProcedure { } fn user_metadata(&self) -> Option { - Some(UserMetadata::new(self.context.persistent_ctx())) + Some(UserMetadata::new(Arc::new(self.context.persistent_ctx()))) } } @@ -790,7 +926,6 @@ mod tests { use common_meta::key::test_utils::new_test_table_info; use common_meta::rpc::router::{Region, RegionRoute}; - use super::update_metadata::UpdateMetadata; use super::*; use crate::handler::HeartbeatMailbox; use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion; @@ -813,7 +948,7 @@ mod tests { let env = TestingEnv::new(); let context = env.context_factory(); - let procedure = RegionMigrationProcedure::new(persistent_context, context, None); + let procedure = RegionMigrationProcedure::new(persistent_context, context, vec![]); let key = procedure.lock_key(); let keys = key.keys_to_lock().cloned().collect::>(); @@ -830,16 +965,27 @@ mod tests { let env = TestingEnv::new(); let context = env.context_factory(); - let procedure = RegionMigrationProcedure::new(persistent_context, context, None); + let procedure = RegionMigrationProcedure::new(persistent_context, context, vec![]); let serialized = procedure.dump().unwrap(); - let expected = r#"{"persistent_ctx":{"catalog":"greptime","schema":"public","from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105,"timeout":"10s","trigger_reason":"Unknown"},"state":{"region_migration_state":"RegionMigrationStart"}}"#; + let expected = r#"{"persistent_ctx":{"catalog_and_schema":[["greptime","public"]],"from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_ids":[4398046511105],"timeout":"10s","trigger_reason":"Unknown"},"state":{"region_migration_state":"RegionMigrationStart"}}"#; assert_eq!(expected, serialized); } #[test] fn test_backward_compatibility() { - let persistent_ctx = test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)); + let persistent_ctx = PersistentContext { + #[allow(deprecated)] + catalog: Some("greptime".into()), + #[allow(deprecated)] + schema: Some("public".into()), + catalog_and_schema: vec![], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + region_ids: vec![RegionId::new(1024, 1)], + timeout: Duration::from_secs(10), + trigger_reason: RegionMigrationTriggerReason::default(), + }; // NOTES: Changes it will break backward compatibility. 
let serialized = r#"{"catalog":"greptime","schema":"public","from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105}"#; let deserialized: PersistentContext = serde_json::from_str(serialized).unwrap(); @@ -874,7 +1020,7 @@ mod tests { let persistent_context = new_persistent_context(); let context_factory = env.context_factory(); let state = Box::::default(); - RegionMigrationProcedure::new_inner(state, persistent_context, context_factory, None) + RegionMigrationProcedure::new_inner(state, persistent_context, context_factory, vec![]) } let ctx = TestingEnv::procedure_context(); @@ -897,7 +1043,9 @@ mod tests { let mut procedure = RegionMigrationProcedure::from_json(&serialized, context_factory, tracker.clone()) .unwrap(); - assert!(tracker.contains(procedure.context.persistent_ctx.region_id)); + for region_id in &procedure.context.persistent_ctx.region_ids { + assert!(tracker.contains(*region_id)); + } for _ in 1..3 { status = Some(procedure.execute(&ctx).await.unwrap()); @@ -937,9 +1085,34 @@ mod tests { vec![ // MigrationStart Step::next( - "Should be the update metadata for downgrading", + "Should be the open candidate region", None, - Assertion::simple(assert_update_metadata_downgrade, assert_need_persist), + Assertion::simple(assert_open_candidate_region, assert_need_persist), + ), + // OpenCandidateRegion + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + to_peer_id, + Arc::new(|id| Ok(new_open_region_reply(id, true, None))), + )), + Assertion::simple(assert_flush_leader_region, assert_no_persist), + ), + // Flush Leader Region + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + from_peer_id, + Arc::new(move |id| { + Ok(new_flush_region_reply_for_region( + id, + RegionId::new(1024, 1), + true, + None, + )) + }), + )), + Assertion::simple(assert_update_metadata_downgrade, assert_no_persist), ), // UpdateMetadata::Downgrade Step::next( @@ -998,7 +1171,7 @@ mod tests { let to_peer_id = persistent_context.to_peer.id; let from_peer = persistent_context.from_peer.clone(); let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), @@ -1025,61 +1198,6 @@ mod tests { runner.suite.verify_table_metadata().await; } - #[tokio::test] - async fn test_procedure_flow_idempotent() { - common_telemetry::init_default_ut_logging(); - - let persistent_context = test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)); - let state = Box::new(RegionMigrationStart); - - // The table metadata. 
- let from_peer_id = persistent_context.from_peer.id; - let to_peer_id = persistent_context.to_peer.id; - let from_peer = persistent_context.from_peer.clone(); - let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; - let table_info = new_test_table_info(1024, vec![1]).into(); - let region_routes = vec![RegionRoute { - region: Region::new_test(region_id), - leader_peer: Some(from_peer), - follower_peers: vec![to_peer], - ..Default::default() - }]; - - let suite = ProcedureMigrationTestSuite::new(persistent_context, state); - suite.init_table_metadata(table_info, region_routes).await; - - let steps = procedure_flow_steps(from_peer_id, to_peer_id); - let setup_to_latest_persisted_state = Step::setup( - "Sets state to UpdateMetadata::Downgrade", - merge_before_test_fn(vec![ - setup_state(Arc::new(|| Box::new(UpdateMetadata::Downgrade))), - Arc::new(reset_volatile_ctx), - ]), - ); - - let steps = [ - steps.clone(), - vec![setup_to_latest_persisted_state.clone()], - steps.clone()[1..].to_vec(), - vec![setup_to_latest_persisted_state], - steps.clone()[1..].to_vec(), - ] - .concat(); - let timer = Instant::now(); - - // Run the table tests. - let runner = ProcedureMigrationSuiteRunner::new(suite) - .steps(steps.clone()) - .run_once() - .await; - - // Ensure it didn't run into the slow path. - assert!(timer.elapsed().as_secs() < REGION_LEASE_SECS / 2); - - runner.suite.verify_table_metadata().await; - } - #[tokio::test] async fn test_procedure_flow_open_candidate_region_retryable_error() { common_telemetry::init_default_ut_logging(); @@ -1090,7 +1208,7 @@ mod tests { // The table metadata. let to_peer_id = persistent_context.to_peer.id; let from_peer = persistent_context.from_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), @@ -1178,13 +1296,12 @@ mod tests { let from_peer_id = persistent_context.from_peer.id; let to_peer_id = persistent_context.to_peer.id; let from_peer = persistent_context.from_peer.clone(); - let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), leader_peer: Some(from_peer), - follower_peers: vec![to_peer], + follower_peers: vec![], ..Default::default() }]; @@ -1194,9 +1311,34 @@ mod tests { let steps = vec![ // MigrationStart Step::next( - "Should be the update metadata for downgrading", + "Should be the open candidate region", None, - Assertion::simple(assert_update_metadata_downgrade, assert_need_persist), + Assertion::simple(assert_open_candidate_region, assert_need_persist), + ), + // OpenCandidateRegion + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + to_peer_id, + Arc::new(|id| Ok(new_open_region_reply(id, true, None))), + )), + Assertion::simple(assert_flush_leader_region, assert_no_persist), + ), + // Flush Leader Region + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + from_peer_id, + Arc::new(move |id| { + Ok(new_flush_region_reply_for_region( + id, + RegionId::new(1024, 1), + true, + None, + )) + }), + )), + Assertion::simple(assert_update_metadata_downgrade, assert_no_persist), ), // UpdateMetadata::Downgrade Step::next( @@ -1240,9 
+1382,9 @@ mod tests { ]; let setup_to_latest_persisted_state = Step::setup( - "Sets state to UpdateMetadata::Downgrade", + "Sets state to OpenCandidateRegion", merge_before_test_fn(vec![ - setup_state(Arc::new(|| Box::new(UpdateMetadata::Downgrade))), + setup_state(Arc::new(|| Box::new(OpenCandidateRegion))), Arc::new(reset_volatile_ctx), ]), ); @@ -1274,7 +1416,7 @@ mod tests { let to_peer_id = persistent_context.to_peer.id; let from_peer_id = persistent_context.from_peer.id; let from_peer = persistent_context.from_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), diff --git a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs index 5a8beb7ca4..c20c7fede2 100644 --- a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs +++ b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs @@ -19,7 +19,6 @@ use api::v1::meta::MailboxMessage; use common_meta::RegionIdent; use common_meta::distributed_time_constants::REGION_LEASE_SECS; use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; -use common_meta::key::datanode_table::RegionInfo; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::{info, warn}; use serde::{Deserialize, Serialize}; @@ -47,12 +46,12 @@ impl State for CloseDowngradedRegion { ) -> Result<(Box, Status)> { if let Err(err) = self.close_downgraded_leader_region(ctx).await { let downgrade_leader_datanode = &ctx.persistent_ctx.from_peer; - let region_id = ctx.region_id(); - warn!(err; "Failed to close downgraded leader region: {region_id} on datanode {:?}", downgrade_leader_datanode); + let region_ids = &ctx.persistent_ctx.region_ids; + warn!(err; "Failed to close downgraded leader regions: {region_ids:?} on datanode {:?}", downgrade_leader_datanode); } info!( - "Region migration is finished: region_id: {}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", - ctx.region_id(), + "Region migration is finished: regions: {:?}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", + ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer, ctx.persistent_ctx.to_peer, ctx.persistent_ctx.trigger_reason, @@ -74,28 +73,30 @@ impl CloseDowngradedRegion { async fn build_close_region_instruction(&self, ctx: &mut Context) -> Result { let pc = &ctx.persistent_ctx; let downgrade_leader_datanode_id = pc.from_peer.id; - let table_id = pc.region_id.table_id(); - let region_number = pc.region_id.region_number(); - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?; + let region_ids = &ctx.persistent_ctx.region_ids; + let mut idents = Vec::with_capacity(region_ids.len()); - let RegionInfo { engine, .. } = datanode_table_value.region_info.clone(); + for region_id in region_ids { + idents.push(RegionIdent { + datanode_id: downgrade_leader_datanode_id, + table_id: region_id.table_id(), + region_number: region_id.region_number(), + // The `engine` field is not used for closing region. + engine: String::new(), + }); + } - Ok(Instruction::CloseRegions(vec![RegionIdent { - datanode_id: downgrade_leader_datanode_id, - table_id, - region_number, - engine, - }])) + Ok(Instruction::CloseRegions(idents)) } /// Closes the downgraded leader region. 
async fn close_downgraded_leader_region(&self, ctx: &mut Context) -> Result<()> { let close_instruction = self.build_close_region_instruction(ctx).await?; - let region_id = ctx.region_id(); + let region_ids = &ctx.persistent_ctx.region_ids; let pc = &ctx.persistent_ctx; let downgrade_leader_datanode = &pc.from_peer; let msg = MailboxMessage::json_message( - &format!("Close downgraded region: {}", region_id), + &format!("Close downgraded regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!( "Datanode-{}@{}", @@ -118,8 +119,8 @@ impl CloseDowngradedRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received close downgraded leade region reply: {:?}, region: {}", - reply, region_id + "Received close downgraded leade region reply: {:?}, region: {:?}", + reply, region_ids ); let InstructionReply::CloseRegions(SimpleReply { result, error }) = reply else { return error::UnexpectedInstructionReplySnafu { @@ -134,7 +135,7 @@ impl CloseDowngradedRegion { } else { error::UnexpectedSnafu { violated: format!( - "Failed to close downgraded leader region: {region_id} on datanode {:?}, error: {error:?}", + "Failed to close downgraded leader region: {region_ids:?} on datanode {:?}, error: {error:?}", downgrade_leader_datanode, ), } diff --git a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs index ad805ae680..d10220098f 100644 --- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs @@ -19,10 +19,10 @@ use api::v1::meta::MailboxMessage; use common_error::ext::BoxedError; use common_meta::distributed_time_constants::REGION_LEASE_SECS; use common_meta::instruction::{ - DowngradeRegion, DowngradeRegionReply, Instruction, InstructionReply, + DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply, }; use common_procedure::{Context as ProcedureContext, Status}; -use common_telemetry::{error, info, warn}; +use common_telemetry::{debug, error, info, warn}; use common_time::util::current_time_millis; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -70,30 +70,30 @@ impl State for DowngradeLeaderRegion { Ok(_) => { // Do nothing info!( - "Downgraded region leader success, region: {}", - ctx.persistent_ctx.region_id + "Downgraded region leader success, region: {:?}", + ctx.persistent_ctx.region_ids ); } Err(error::Error::ExceededDeadline { .. 
}) => { info!( - "Downgrade region leader exceeded deadline, region: {}", - ctx.persistent_ctx.region_id + "Downgrade region leader exceeded deadline, region: {:?}", + ctx.persistent_ctx.region_ids ); // Rollbacks the metadata if procedure is timeout return Ok((Box::new(UpdateMetadata::Rollback), Status::executing(false))); } Err(err) => { - error!(err; "Occurs non-retryable error, region: {}", ctx.persistent_ctx.region_id); + error!(err; "Occurs non-retryable error, region: {:?}", ctx.persistent_ctx.region_ids); if let Some(deadline) = ctx.volatile_ctx.leader_region_lease_deadline.as_ref() { info!( - "Running into the downgrade region leader slow path, region: {}, sleep until {:?}", - ctx.persistent_ctx.region_id, deadline + "Running into the downgrade region leader slow path, region: {:?}, sleep until {:?}", + ctx.persistent_ctx.region_ids, deadline ); tokio::time::sleep_until(*deadline).await; } else { warn!( - "Leader region lease deadline is not set, region: {}", - ctx.persistent_ctx.region_id + "Leader region lease deadline is not set, region: {:?}", + ctx.persistent_ctx.region_ids ); } } @@ -118,12 +118,76 @@ impl DowngradeLeaderRegion { ctx: &Context, flush_timeout: Duration, ) -> Instruction { - let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; - Instruction::DowngradeRegion(DowngradeRegion { + let region_ids = &ctx.persistent_ctx.region_ids; + let mut downgrade_regions = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + downgrade_regions.push(DowngradeRegion { + region_id: *region_id, + flush_timeout: Some(flush_timeout), + }); + } + + Instruction::DowngradeRegions(downgrade_regions) + } + + fn handle_downgrade_region_reply( + &self, + ctx: &mut Context, + reply: &DowngradeRegionReply, + now: &Instant, + ) -> Result<()> { + let leader = &ctx.persistent_ctx.from_peer; + let DowngradeRegionReply { region_id, - flush_timeout: Some(flush_timeout), - }) + last_entry_id, + metadata_last_entry_id, + exists, + error, + } = reply; + + if error.is_some() { + return error::RetryLaterSnafu { + reason: format!( + "Failed to downgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}", + region_id, leader, error, now.elapsed() + ), + } + .fail(); + } + + if !exists { + warn!( + "Trying to downgrade the region {} on datanode {:?}, but region doesn't exist!, elapsed: {:?}", + region_id, + leader, + now.elapsed() + ); + } else { + info!( + "Region {} leader is downgraded on datanode {:?}, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}", + region_id, + leader, + last_entry_id, + metadata_last_entry_id, + now.elapsed() + ); + } + + if let Some(last_entry_id) = last_entry_id { + debug!( + "set last_entry_id: {:?}, region_id: {:?}", + last_entry_id, region_id + ); + ctx.volatile_ctx + .set_last_entry_id(*region_id, *last_entry_id); + } + + if let Some(metadata_last_entry_id) = metadata_last_entry_id { + ctx.volatile_ctx + .set_metadata_last_entry_id(*region_id, *metadata_last_entry_id); + } + + Ok(()) } /// Tries to downgrade a leader region. @@ -140,7 +204,7 @@ impl DowngradeLeaderRegion { /// - [ExceededDeadline](error::Error::ExceededDeadline) /// - Invalid JSON. 
async fn downgrade_region(&self, ctx: &mut Context) -> Result<()> { - let region_id = ctx.persistent_ctx.region_id; + let region_ids = &ctx.persistent_ctx.region_ids; let operation_timeout = ctx.next_operation_timeout() .context(error::ExceededDeadlineSnafu { @@ -150,7 +214,7 @@ impl DowngradeLeaderRegion { let leader = &ctx.persistent_ctx.from_peer; let msg = MailboxMessage::json_message( - &format!("Downgrade leader region: {}", region_id), + &format!("Downgrade leader regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", leader.id, leader.addr), common_time::util::current_time_millis(), @@ -168,17 +232,12 @@ impl DowngradeLeaderRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received downgrade region reply: {:?}, region: {}, elapsed: {:?}", + "Received downgrade region reply: {:?}, region: {:?}, elapsed: {:?}", reply, - region_id, + region_ids, now.elapsed() ); - let InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id, - metadata_last_entry_id, - exists, - error, - }) = reply + let InstructionReply::DowngradeRegions(DowngradeRegionsReply { replies }) = reply else { return error::UnexpectedInstructionReplySnafu { mailbox_message: msg.to_string(), @@ -187,48 +246,14 @@ impl DowngradeLeaderRegion { .fail(); }; - if error.is_some() { - return error::RetryLaterSnafu { - reason: format!( - "Failed to downgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}", - region_id, leader, error, now.elapsed() - ), - } - .fail(); + for reply in replies { + self.handle_downgrade_region_reply(ctx, &reply, &now)?; } - - if !exists { - warn!( - "Trying to downgrade the region {} on datanode {:?}, but region doesn't exist!, elapsed: {:?}", - region_id, - leader, - now.elapsed() - ); - } else { - info!( - "Region {} leader is downgraded on datanode {:?}, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}", - region_id, - leader, - last_entry_id, - metadata_last_entry_id, - now.elapsed() - ); - } - - if let Some(last_entry_id) = last_entry_id { - ctx.volatile_ctx.set_last_entry_id(last_entry_id); - } - - if let Some(metadata_last_entry_id) = metadata_last_entry_id { - ctx.volatile_ctx - .set_metadata_last_entry_id(metadata_last_entry_id); - } - Ok(()) } Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( - "Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}, elapsed: {:?}", + "Mailbox received timeout for downgrade leader region {region_ids:?} on datanode {:?}, elapsed: {:?}", leader, now.elapsed() ); @@ -244,7 +269,7 @@ impl DowngradeLeaderRegion { let last_connection_at = match find_datanode_lease_value(&ctx.in_memory, leader.id).await { Ok(lease_value) => lease_value.map(|lease_value| lease_value.timestamp_millis), Err(err) => { - error!(err; "Failed to find datanode lease value for datanode: {}, during region migration, region: {}", leader, ctx.persistent_ctx.region_id); + error!(err; "Failed to find datanode lease value for datanode: {}, during region migration, region: {:?}", leader, ctx.persistent_ctx.region_ids); return; } }; @@ -262,8 +287,8 @@ impl DowngradeLeaderRegion { if elapsed >= (REGION_LEASE_SECS * 1000) as i64 { ctx.volatile_ctx.reset_leader_region_lease_deadline(); info!( - "Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {}", - leader, last_connection_at, region_lease, ctx.persistent_ctx.region_id + "Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {:?}", + leader, last_connection_at, region_lease, ctx.persistent_ctx.region_ids ); } else if elapsed > 0 { // `now - last_connection_at` < REGION_LEASE_SECS * 1000 @@ -273,23 +298,23 @@ impl DowngradeLeaderRegion { ctx.volatile_ctx .set_leader_region_lease_deadline(lease_timeout); info!( - "Datanode {}({}) last connected {:?} ago, updated leader region lease deadline to {:?}, region: {}", + "Datanode {}({}) last connected {:?} ago, updated leader region lease deadline to {:?}, region: {:?}", leader, last_connection_at, elapsed, ctx.volatile_ctx.leader_region_lease_deadline, - ctx.persistent_ctx.region_id + ctx.persistent_ctx.region_ids ); } else { warn!( - "Datanode {} has invalid last connection timestamp: {} (which is after current time: {}), region: {}", - leader, last_connection_at, now, ctx.persistent_ctx.region_id + "Datanode {} has invalid last connection timestamp: {} (which is after current time: {}), region: {:?}", + leader, last_connection_at, now, ctx.persistent_ctx.region_ids ) } } else { warn!( - "Failed to find last connection time for datanode {}, unable to update region lease deadline, region: {}", - leader, ctx.persistent_ctx.region_id + "Failed to find last connection time for datanode {}, unable to update region lease deadline, region: {:?}", + leader, ctx.persistent_ctx.region_ids ) } } @@ -314,19 +339,20 @@ impl DowngradeLeaderRegion { retry += 1; // Throws the error immediately if the procedure exceeded the deadline. if matches!(err, error::Error::ExceededDeadline { .. }) { - error!(err; "Failed to downgrade region leader, region: {}, exceeded deadline", ctx.persistent_ctx.region_id); + error!(err; "Failed to downgrade region leader, regions: {:?}, exceeded deadline", ctx.persistent_ctx.region_ids); return Err(err); } else if matches!(err, error::Error::PusherNotFound { .. }) { // Throws the error immediately if the datanode is unreachable. 
- error!(err; "Failed to downgrade region leader, region: {}, datanode({}) is unreachable(PusherNotFound)", ctx.persistent_ctx.region_id, ctx.persistent_ctx.from_peer.id); + error!(err; "Failed to downgrade region leader, regions: {:?}, datanode({}) is unreachable(PusherNotFound)", ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer.id); self.update_leader_region_lease_deadline(ctx).await; return Err(err); } else if err.is_retryable() && retry < self.optimistic_retry { - error!(err; "Failed to downgrade region leader, region: {}, retry later", ctx.persistent_ctx.region_id); + error!(err; "Failed to downgrade region leader, regions: {:?}, retry later", ctx.persistent_ctx.region_ids); sleep(self.retry_initial_interval).await; } else { return Err(BoxedError::new(err)).context(error::DowngradeLeaderSnafu { - region_id: ctx.persistent_ctx.region_id, + // TODO(weny): handle multiple regions. + region_id: ctx.persistent_ctx.region_ids[0], })?; } } else { @@ -363,22 +389,21 @@ mod tests { }; fn new_persistent_context() -> PersistentContext { - PersistentContext { - catalog: "greptime".into(), - schema: "public".into(), - from_peer: Peer::empty(1), - to_peer: Peer::empty(2), - region_id: RegionId::new(1024, 1), - timeout: Duration::from_millis(1000), - trigger_reason: RegionMigrationTriggerReason::Manual, - } + PersistentContext::new( + vec![("greptime".into(), "public".into())], + Peer::empty(1), + Peer::empty(2), + vec![RegionId::new(1024, 1)], + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + ) } async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap) { - let table_info = - new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into(); + let region_id = ctx.persistent_ctx.region_ids[0]; + let table_info = new_test_table_info(region_id.table_id(), vec![1]).into(); let region_routes = vec![RegionRoute { - region: Region::new_test(ctx.persistent_ctx.region_id), + region: Region::new_test(region_id), leader_peer: Some(ctx.persistent_ctx.from_peer.clone()), follower_peers: vec![ctx.persistent_ctx.to_peer.clone()], ..Default::default() @@ -586,7 +611,13 @@ mod tests { }); state.downgrade_region_with_retry(&mut ctx).await.unwrap(); - assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, Some(1)); + assert_eq!( + ctx.volatile_ctx + .leader_region_last_entry_ids + .get(&RegionId::new(0, 0)) + .cloned(), + Some(1) + ); assert!(ctx.volatile_ctx.leader_region_lease_deadline.is_none()); } @@ -632,7 +663,7 @@ mod tests { .await .unwrap_err(); assert_matches!(err, error::Error::DowngradeLeader { .. }); - assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, None); + // assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, None); // Should remain no change. 
assert_eq!( ctx.volatile_ctx.leader_region_lease_deadline.unwrap(), @@ -667,7 +698,13 @@ mod tests { let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap(); let elapsed = timer.elapsed().as_secs(); assert!(elapsed < REGION_LEASE_SECS / 2); - assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, Some(1)); + assert_eq!( + ctx.volatile_ctx + .leader_region_last_entry_ids + .get(&RegionId::new(0, 0)) + .cloned(), + Some(1) + ); assert!(ctx.volatile_ctx.leader_region_lease_deadline.is_none()); let _ = next diff --git a/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs b/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs index b5cc1a955c..f9e5900cbb 100644 --- a/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs @@ -15,7 +15,7 @@ use std::any::Any; use api::v1::meta::MailboxMessage; -use common_meta::instruction::{FlushRegions, Instruction, InstructionReply}; +use common_meta::instruction::{FlushErrorStrategy, FlushRegions, Instruction, InstructionReply}; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::{info, warn}; use serde::{Deserialize, Serialize}; @@ -64,8 +64,10 @@ impl PreFlushRegion { /// Builds flush leader region instruction. fn build_flush_leader_region_instruction(&self, ctx: &Context) -> Instruction { let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; - Instruction::FlushRegions(FlushRegions::sync_single(region_id)) + Instruction::FlushRegions(FlushRegions::sync_batch( + pc.region_ids.clone(), + FlushErrorStrategy::TryAll, + )) } /// Tries to flush a leader region. @@ -88,11 +90,11 @@ impl PreFlushRegion { operation: "Flush leader region", })?; let flush_instruction = self.build_flush_leader_region_instruction(ctx); - let region_id = ctx.persistent_ctx.region_id; + let region_ids = &ctx.persistent_ctx.region_ids; let leader = &ctx.persistent_ctx.from_peer; let msg = MailboxMessage::json_message( - &format!("Flush leader region: {}", region_id), + &format!("Flush leader region: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", leader.id, leader.addr), common_time::util::current_time_millis(), @@ -111,32 +113,42 @@ impl PreFlushRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received flush leader region reply: {:?}, region: {}, elapsed: {:?}", + "Received flush leader region reply: {:?}, region: {:?}, elapsed: {:?}", reply, - region_id, + region_ids, now.elapsed() ); let reply_result = match reply { InstructionReply::FlushRegions(flush_reply) => { - if flush_reply.results.len() != 1 { + if flush_reply.results.len() != region_ids.len() { return error::UnexpectedInstructionReplySnafu { mailbox_message: msg.to_string(), - reason: "expect single region flush result", + reason: format!( + "expect {} region flush result, but got {}", + region_ids.len(), + flush_reply.results.len() + ), } .fail(); } - let (reply_region_id, result) = &flush_reply.results[0]; - if *reply_region_id != region_id { - return error::UnexpectedInstructionReplySnafu { - mailbox_message: msg.to_string(), - reason: "flush reply region ID mismatch", - } - .fail(); - } - match result { - Ok(()) => (true, None), - Err(err) => (false, Some(err.clone())), + + match flush_reply.overall_success { + true => (true, None), + false => ( + false, + Some( + flush_reply + .results + .iter() + .filter_map(|(region_id, result)| match result { + Ok(_) => None, + Err(e) => 
Some(format!("{}: {}", region_id, e)), + }) + .collect::>() + .join("; "), + ), + ), } } _ => { @@ -149,15 +161,15 @@ impl PreFlushRegion { }; let (result, error) = reply_result; - if error.is_some() { + if let Some(error) = error { warn!( - "Failed to flush leader region {} on datanode {:?}, error: {:?}. Skip flush operation.", - region_id, leader, error + "Failed to flush leader regions {:?} on datanode {:?}, error: {}. Skip flush operation.", + region_ids, leader, &error ); } else if result { info!( - "The flush leader region {} on datanode {:?} is successful, elapsed: {:?}", - region_id, + "The flush leader regions {:?} on datanode {:?} is successful, elapsed: {:?}", + region_ids, leader, now.elapsed() ); @@ -166,15 +178,15 @@ impl PreFlushRegion { Ok(()) } Err(Error::MailboxTimeout { .. }) => error::ExceededDeadlineSnafu { - operation: "Flush leader region", + operation: "Flush leader regions", } .fail(), Err(err) => Err(err), }, Err(Error::PusherNotFound { .. }) => { warn!( - "Failed to flush leader region({}), the datanode({}) is unreachable(PusherNotFound). Skip flush operation.", - region_id, leader + "Failed to flush leader regions({:?}), the datanode({}) is unreachable(PusherNotFound). Skip flush operation.", + region_ids, leader ); Ok(()) } @@ -268,7 +280,7 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let mut env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); let mailbox_ctx = env.mailbox_context(); @@ -297,7 +309,7 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let mut env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); let mailbox_ctx = env.mailbox_context(); diff --git a/src/meta-srv/src/procedure/region_migration/manager.rs b/src/meta-srv/src/procedure/region_migration/manager.rs index 563b0f290d..fcd8f7a6e6 100644 --- a/src/meta-srv/src/procedure/region_migration/manager.rs +++ b/src/meta-srv/src/procedure/region_migration/manager.rs @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -31,6 +31,9 @@ use table::table_name::TableName; use crate::error::{self, Result}; use crate::metrics::{METRIC_META_REGION_MIGRATION_DATANODES, METRIC_META_REGION_MIGRATION_FAIL}; +use crate::procedure::region_migration::utils::{ + RegionMigrationAnalysis, RegionMigrationTaskBatch, analyze_region_migration_task, +}; use crate::procedure::region_migration::{ DefaultContextFactory, PersistentContext, RegionMigrationProcedure, }; @@ -99,6 +102,7 @@ impl Drop for RegionMigrationProcedureGuard { } } +/// A task of region migration procedure. #[derive(Debug, Clone)] pub struct RegionMigrationProcedureTask { pub(crate) region_id: RegionId, @@ -151,6 +155,25 @@ impl Display for RegionMigrationProcedureTask { } } +/// The result of submitting a region migration task. 
+#[derive(Debug, Default, PartialEq, Eq)] +pub struct SubmitRegionMigrationTaskResult { + /// Regions already migrated to the `to_peer`. + pub migrated: Vec<RegionId>, + /// Regions where the leader peer has changed. + pub leader_changed: Vec<RegionId>, + /// Regions where `to_peer` is already a follower (conflict). + pub peer_conflict: Vec<RegionId>, + /// Regions whose table is not found. + pub table_not_found: Vec<RegionId>, + /// Regions still pending migration. + pub migrating: Vec<RegionId>, + /// Regions that have been submitted for migration. + pub submitted: Vec<RegionId>, + /// The procedure id of the region migration procedure. + pub procedure_id: Option<ProcedureId>, +} + impl RegionMigrationManager { /// Returns new [`RegionMigrationManager`] pub(crate) fn new( @@ -332,6 +355,168 @@ impl RegionMigrationManager { Ok(()) } + /// Extracts regions from the migration task that are already running migration procedures. + /// + /// Returns a tuple containing those region ids that are already running and the newly created procedure guards. + /// The regions that are already running will be removed from the [`RegionMigrationTaskBatch`]. + fn extract_running_regions( + &self, + task: &mut RegionMigrationTaskBatch, + ) -> (Vec<RegionId>, Vec<RegionMigrationProcedureGuard>) { + let mut migrating_region_ids = Vec::new(); + let mut procedure_guards = Vec::with_capacity(task.region_ids.len()); + + for region_id in &task.region_ids { + let Some(guard) = self.insert_running_procedure(&RegionMigrationProcedureTask::new( + *region_id, + task.from_peer.clone(), + task.to_peer.clone(), + task.timeout, + task.trigger_reason, + )) else { + migrating_region_ids.push(*region_id); + continue; + }; + procedure_guards.push(guard); + } + + let migrating_set = migrating_region_ids.iter().cloned().collect::<HashSet<_>>(); + task.region_ids.retain(|id| !migrating_set.contains(id)); + + (migrating_region_ids, procedure_guards) + } + + pub async fn submit_region_migration_task( + &self, + mut task: RegionMigrationTaskBatch, + ) -> Result<SubmitRegionMigrationTaskResult> { + let (migrating_region_ids, procedure_guards) = self.extract_running_regions(&mut task); + let RegionMigrationAnalysis { + migrated, + leader_changed, + peer_conflict, + mut table_not_found, + pending, + } = analyze_region_migration_task(&task, &self.context_factory.table_metadata_manager) + .await?; + if pending.is_empty() { + return Ok(SubmitRegionMigrationTaskResult { + migrated, + leader_changed, + peer_conflict, + table_not_found, + migrating: migrating_region_ids, + submitted: vec![], + procedure_id: None, + }); + } + + // Updates the region ids to the pending region ids. + task.region_ids = pending; + let table_regions = task.table_regions(); + let table_ids = table_regions.keys().cloned().collect::<Vec<_>>(); + let table_info_values = self + .context_factory + .table_metadata_manager + .table_info_manager() + .batch_get(&table_ids) + .await + .context(error::TableMetadataManagerSnafu)?; + let mut catalog_and_schema = Vec::with_capacity(table_info_values.len()); + for (table_id, regions) in table_regions { + match table_info_values.get(&table_id) { + Some(table_info) => { + let TableName { + catalog_name, + schema_name, + ..
+ } = table_info.table_name(); + catalog_and_schema.push((catalog_name, schema_name)); + } + None => { + task.region_ids.retain(|id| id.table_id() != table_id); + table_not_found.extend(regions); + } + } + } + if task.region_ids.is_empty() { + return Ok(SubmitRegionMigrationTaskResult { + migrated, + leader_changed, + peer_conflict, + table_not_found, + migrating: migrating_region_ids, + submitted: vec![], + procedure_id: None, + }); + } + + let submitting_region_ids = task.region_ids.clone(); + let procedure_id = self + .submit_procedure_inner(task, procedure_guards, catalog_and_schema) + .await?; + Ok(SubmitRegionMigrationTaskResult { + migrated, + leader_changed, + peer_conflict, + table_not_found, + migrating: migrating_region_ids, + submitted: submitting_region_ids, + procedure_id: Some(procedure_id), + }) + } + + async fn submit_procedure_inner( + &self, + task: RegionMigrationTaskBatch, + procedure_guards: Vec<RegionMigrationProcedureGuard>, + catalog_and_schema: Vec<(String, String)>, + ) -> Result<ProcedureId> { + let procedure = RegionMigrationProcedure::new( + PersistentContext::new( + catalog_and_schema, + task.from_peer.clone(), + task.to_peer.clone(), + task.region_ids.clone(), + task.timeout, + task.trigger_reason, + ), + self.context_factory.clone(), + procedure_guards, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + let procedure_id = procedure_with_id.id; + info!("Starting region migration procedure {procedure_id} for {task}"); + let procedure_manager = self.procedure_manager.clone(); + let num_region = task.region_ids.len(); + + common_runtime::spawn_global(async move { + let watcher = &mut match procedure_manager.submit(procedure_with_id).await { + Ok(watcher) => watcher, + Err(e) => { + error!(e; "Failed to submit region migration procedure {procedure_id} for {task}"); + return; + } + }; + METRIC_META_REGION_MIGRATION_DATANODES + .with_label_values(&["src", &task.from_peer.id.to_string()]) + .inc_by(num_region as u64); + METRIC_META_REGION_MIGRATION_DATANODES + .with_label_values(&["desc", &task.to_peer.id.to_string()]) + .inc_by(num_region as u64); + + if let Err(e) = watcher::wait(watcher).await { + error!(e; "Failed to wait region migration procedure {procedure_id} for {task}"); + METRIC_META_REGION_MIGRATION_FAIL.inc(); + return; + } + + info!("Region migration procedure {procedure_id} for {task} is finished successfully!"); + }); + + Ok(procedure_id) + } + /// Submits a new region migration procedure. pub async fn submit_procedure( &self, @@ -384,17 +569,16 @@ impl RegionMigrationManager { trigger_reason, } = task.clone(); let procedure = RegionMigrationProcedure::new( - PersistentContext { - catalog: catalog_name, - schema: schema_name, - region_id, + PersistentContext::new( + vec![(catalog_name, schema_name)], from_peer, to_peer, + vec![region_id], timeout, trigger_reason, - }, + ), self.context_factory.clone(), - Some(guard), + vec![guard], ); let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); let procedure_id = procedure_with_id.id; @@ -645,4 +829,162 @@ mod test { assert_matches!(err, error::Error::Unexpected { ..
}); } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_invalid_task() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let task = RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1024, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(1), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + + let err = manager + .submit_region_migration_task(task) + .await + .unwrap_err(); + assert_matches!(err, error::Error::InvalidArguments { .. }); + } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_no_region_to_migrate() { + common_telemetry::init_default_ut_logging(); + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(2)), + ..Default::default() + }]; + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + + assert_eq!( + result, + SubmitRegionMigrationTaskResult { + migrated: vec![region_id], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_leader_peer_changed() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(RegionId::new(1024, 1)), + leader_peer: Some(Peer::empty(3)), + ..Default::default() + }]; + + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + assert_eq!( + result, + SubmitRegionMigrationTaskResult { + leader_changed: vec![region_id], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_peer_conflict() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id], + from_peer: Peer::empty(3), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(3)), + follower_peers: vec![Peer::empty(2)], + 
..Default::default() + }]; + + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + assert_eq!( + result, + SubmitRegionMigrationTaskResult { + peer_conflict: vec![region_id], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_running_regions() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id, RegionId::new(1024, 2)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + // Inserts one + manager.tracker.running_procedures.write().unwrap().insert( + region_id, + RegionMigrationProcedureTask::new( + region_id, + task.from_peer.clone(), + task.to_peer.clone(), + task.timeout, + task.trigger_reason, + ), + ); + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(RegionId::new(1024, 2)), + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + assert_eq!(result.migrating, vec![region_id]); + assert_eq!(result.submitted, vec![RegionId::new(1024, 2)]); + assert!(result.procedure_id.is_some()); + } } diff --git a/src/meta-srv/src/procedure/region_migration/migration_abort.rs b/src/meta-srv/src/procedure/region_migration/migration_abort.rs index a25443c815..f3ad8052de 100644 --- a/src/meta-srv/src/procedure/region_migration/migration_abort.rs +++ b/src/meta-srv/src/procedure/region_migration/migration_abort.rs @@ -44,9 +44,9 @@ impl State for RegionMigrationAbort { _procedure_ctx: &ProcedureContext, ) -> Result<(Box, Status)> { warn!( - "Region migration is aborted: {}, region_id: {}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", + "Region migration is aborted: {}, regions: {:?}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", self.reason, - ctx.region_id(), + ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer, ctx.persistent_ctx.to_peer, ctx.persistent_ctx.trigger_reason, diff --git a/src/meta-srv/src/procedure/region_migration/migration_start.rs b/src/meta-srv/src/procedure/region_migration/migration_start.rs index e544adbf4c..99d2972aa8 100644 --- a/src/meta-srv/src/procedure/region_migration/migration_start.rs +++ b/src/meta-srv/src/procedure/region_migration/migration_start.rs @@ -20,22 +20,18 @@ use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::info; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; -use store_api::storage::RegionId; use crate::error::{self, Result}; use crate::procedure::region_migration::migration_abort::RegionMigrationAbort; use crate::procedure::region_migration::migration_end::RegionMigrationEnd; use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion; -use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::{Context, State}; /// The behaviors: /// -/// If the expected leader region has been opened on `to_peer`, go to the [RegionMigrationEnd] state. 
-/// -/// If the candidate region has been opened on `to_peer`, go to the [UpdateMetadata::Downgrade] state. -/// -/// Otherwise go to the [OpenCandidateRegion] state. +/// - If all regions have been migrated, transitions to [RegionMigrationEnd]. +/// - If any of the region leaders is not the `from_peer`, transitions to [RegionMigrationAbort]. +/// - Otherwise, continues with [OpenCandidateRegion] to initiate the candidate region. #[derive(Debug, Serialize, Deserialize)] pub struct RegionMigrationStart; @@ -44,44 +40,62 @@ pub struct RegionMigrationStart; impl State for RegionMigrationStart { /// Yields next [State]. /// - /// If the expected leader region has been opened on `to_peer`, go to the [RegionMigrationEnd] state. + /// Determines the next [State] for region migration: /// - /// If the candidate region has been opened on `to_peer`, go to the [UpdateMetadata::Downgrade] state. - /// - /// Otherwise go to the [OpenCandidateRegion] state. + /// - If all regions have been migrated, transitions to [RegionMigrationEnd]. + /// - If any of the region leaders is not the `from_peer`, transitions to [RegionMigrationAbort]. + /// - Otherwise, continues with [OpenCandidateRegion] to initiate the candidate region. async fn next( &mut self, ctx: &mut Context, _procedure_ctx: &ProcedureContext, ) -> Result<(Box, Status)> { - let region_id = ctx.persistent_ctx.region_id; - let region_route = self.retrieve_region_route(ctx, region_id).await?; + let mut region_routes = self.retrieve_region_routes(ctx).await?; let to_peer = &ctx.persistent_ctx.to_peer; let from_peer = &ctx.persistent_ctx.from_peer; + let region_ids = &ctx.persistent_ctx.region_ids; - if self.has_migrated(®ion_route, to_peer)? { + self.filter_unmigrated_regions(&mut region_routes, to_peer); + + // No region to migrate, skip the migration. + if region_routes.is_empty() { info!( - "Region has been migrated, region: {:?}, to_peer: {:?}", - region_route.region.id, to_peer + "All regions have been migrated, regions: {:?}, to_peer: {:?}", + region_ids, to_peer ); - Ok((Box::new(RegionMigrationEnd), Status::done())) - } else if self.invalid_leader_peer(®ion_route, from_peer)? { - info!( - "Abort region migration, region:{:?}, unexpected leader peer: {:?}, expected: {:?}", - region_route.region.id, region_route.leader_peer, from_peer, - ); - Ok(( - Box::new(RegionMigrationAbort::new(&format!( - "Invalid region leader peer: {from_peer:?}, expected: {:?}", - region_route.leader_peer.as_ref().unwrap(), - ))), - Status::done(), - )) - } else if self.check_candidate_region_on_peer(®ion_route, to_peer) { - Ok((Box::new(UpdateMetadata::Downgrade), Status::executing(true))) - } else { - Ok((Box::new(OpenCandidateRegion), Status::executing(true))) + return Ok((Box::new(RegionMigrationEnd), Status::done())); } + + // Updates the region ids to the unmigrated regions. + if region_routes.len() != region_ids.len() { + let unmigrated_region_ids = region_routes.iter().map(|route| route.region.id).collect(); + info!( + "Some of the regions have been migrated, only migrate the following regions: {:?}, to_peer: {:?}", + unmigrated_region_ids, to_peer + ); + ctx.persistent_ctx.region_ids = unmigrated_region_ids; + } + + // Checks if any of the region leaders is not the `from_peer`. + for region_route in ®ion_routes { + if self.invalid_leader_peer(region_route, from_peer)? 
{ + info!( + "Abort region migration, region:{}, unexpected leader peer: {:?}, expected: {:?}", + region_route.region.id, region_route.leader_peer, from_peer, + ); + return Ok(( + Box::new(RegionMigrationAbort::new(&format!( + "Invalid region leader peer: {:?}, expected: {:?}", + region_route.leader_peer.as_ref().unwrap(), + from_peer, + ))), + Status::done(), + )); + } + } + + // If all checks pass, open the candidate region. + Ok((Box::new(OpenCandidateRegion), Status::executing(true))) } fn as_any(&self) -> &dyn Any { @@ -90,7 +104,7 @@ impl State for RegionMigrationStart { } impl RegionMigrationStart { - /// Retrieves region route. + /// Retrieves region routes for multiple regions. /// /// Abort(non-retry): /// - TableRoute is not found. @@ -98,39 +112,32 @@ /// /// Retry: /// - Failed to retrieve the metadata of table. - async fn retrieve_region_route( - &self, - ctx: &mut Context, - region_id: RegionId, - ) -> Result<RegionRoute> { - let table_id = region_id.table_id(); - let table_route = ctx.get_table_route_value().await?; + async fn retrieve_region_routes(&self, ctx: &mut Context) -> Result<Vec<RegionRoute>> { + let region_ids = &ctx.persistent_ctx.region_ids; + let table_route_values = ctx.get_table_route_values().await?; + let mut region_routes = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + let table_id = region_id.table_id(); + let region_route = table_route_values + .get(&table_id) + .context(error::TableRouteNotFoundSnafu { table_id })? + .region_routes() + .with_context(|_| error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."), + })? + .iter() + .find(|route| route.region.id == *region_id) + .cloned() + .with_context(|| error::UnexpectedSnafu { + violated: format!( + "RegionRoute({}) is not found in TableRoute({})", + region_id, table_id + ), + })?; + region_routes.push(region_route); + } - let region_route = table_route - .region_routes() - .context(error::UnexpectedLogicalRouteTableSnafu { - err_msg: format!("{self:?} is a non-physical TableRouteValue."), - })? - .iter() - .find(|route| route.region.id == region_id) - .cloned() - .context(error::UnexpectedSnafu { - violated: format!( - "RegionRoute({}) is not found in TableRoute({})", - region_id, table_id - ), - })?; - - Ok(region_route) - } - - /// Checks whether the candidate region on region has been opened. - /// Returns true if it's been opened. - fn check_candidate_region_on_peer(&self, region_route: &RegionRoute, to_peer: &Peer) -> bool { - region_route - .follower_peers - .iter() - .any(|peer| peer.id == to_peer.id) + Ok(region_routes) } /// Returns true if the region leader is not the `from_peer`. @@ -143,7 +150,7 @@ impl RegionMigrationStart { let is_invalid_leader_peer = region_route .leader_peer .as_ref() - .context(error::UnexpectedSnafu { + .with_context(|| error::UnexpectedSnafu { violated: format!("Leader peer is not found in TableRoute({})", region_id), })? .id @@ -151,6 +158,12 @@ impl RegionMigrationStart { Ok(is_invalid_leader_peer) } + /// Filters out the regions that have already been migrated, keeping only the unmigrated ones. + fn filter_unmigrated_regions(&self, region_routes: &mut Vec<RegionRoute>, to_peer: &Peer) { + region_routes + .retain(|region_route| !self.has_migrated(region_route, to_peer).unwrap_or(false)); + } + /// Checks whether the region has been migrated. /// Returns true if it's.
/// @@ -162,7 +175,7 @@ impl RegionMigrationStart { let region_migrated = region_route .leader_peer .as_ref() - .context(error::UnexpectedSnafu { + .with_context(|| error::UnexpectedSnafu { violated: format!("Leader peer is not found in TableRoute({})", region_id), })? .id @@ -173,6 +186,7 @@ impl RegionMigrationStart { #[cfg(test)] mod tests { + use std::assert_matches::assert_matches; use common_meta::key::test_utils::new_test_table_info; @@ -183,7 +197,6 @@ mod tests { use super::*; use crate::error::Error; use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context}; - use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::{ContextFactory, PersistentContext}; fn new_persistent_context() -> PersistentContext { @@ -196,14 +209,8 @@ mod tests { let env = TestingEnv::new(); let persistent_context = new_persistent_context(); let mut ctx = env.context_factory().new_context(persistent_context); - - let err = state - .retrieve_region_route(&mut ctx, RegionId::new(1024, 1)) - .await - .unwrap_err(); - + let err = state.retrieve_region_routes(&mut ctx).await.unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. }); - assert!(!err.is_retryable()); } @@ -216,56 +223,20 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_info = new_test_table_info(1024, vec![1]).into(); + let table_info = new_test_table_info(1024, vec![3]).into(); let region_route = RegionRoute { - region: Region::new_test(RegionId::new(1024, 1)), + region: Region::new_test(RegionId::new(1024, 3)), leader_peer: Some(from_peer.clone()), ..Default::default() }; env.create_physical_table_metadata(table_info, vec![region_route]) .await; - - let err = state - .retrieve_region_route(&mut ctx, RegionId::new(1024, 3)) - .await - .unwrap_err(); - + let err = state.retrieve_region_routes(&mut ctx).await.unwrap_err(); assert_matches!(err, Error::Unexpected { .. 
}); assert!(!err.is_retryable()); } - #[tokio::test] - async fn test_next_update_metadata_downgrade_state() { - let mut state = Box::new(RegionMigrationStart); - // from_peer: 1 - // to_peer: 2 - let persistent_context = new_persistent_context(); - let from_peer_id = persistent_context.from_peer.id; - let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; - - let env = TestingEnv::new(); - let mut ctx = env.context_factory().new_context(persistent_context); - - let table_info = new_test_table_info(1024, vec![1]).into(); - let region_routes = vec![RegionRoute { - region: Region::new_test(region_id), - leader_peer: Some(Peer::empty(from_peer_id)), - follower_peers: vec![to_peer], - ..Default::default() - }]; - - env.create_physical_table_metadata(table_info, region_routes) - .await; - let procedure_ctx = new_procedure_context(); - let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap(); - - let update_metadata = next.as_any().downcast_ref::<UpdateMetadata>().unwrap(); - - assert_matches!(update_metadata, UpdateMetadata::Downgrade); - } - #[tokio::test] async fn test_next_migration_end_state() { let mut state = Box::new(RegionMigrationStart); @@ -274,7 +245,7 @@ mod tests { let persistent_context = new_persistent_context(); let to_peer = persistent_context.to_peer.clone(); let from_peer = persistent_context.from_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); @@ -302,7 +273,7 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); @@ -327,12 +298,12 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); let table_info = new_test_table_info(1024, vec![1]).into(); - let region_routes = vec![RegionRoute { + let region_routes: Vec<RegionRoute> = vec![RegionRoute { region: Region::new_test(region_id), leader_peer: Some(Peer::empty(1024)), ..Default::default() diff --git a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs index f3e767cfd9..67e1bfb857 100644 --- a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::any::Any; +use std::ops::Div; use std::time::Duration; use api::v1::meta::MailboxMessage; @@ -65,33 +66,43 @@ impl OpenCandidateRegion { /// Abort(non-retry): /// - Datanode Table is not found.
async fn build_open_region_instruction(&self, ctx: &mut Context) -> Result { - let pc = &ctx.persistent_ctx; - let table_id = pc.region_id.table_id(); - let region_number = pc.region_id.region_number(); - let candidate_id = pc.to_peer.id; - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?; + let region_ids = ctx.persistent_ctx.region_ids.clone(); + let from_peer_id = ctx.persistent_ctx.from_peer.id; + let to_peer_id = ctx.persistent_ctx.to_peer.id; + let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?; + let mut open_regions = Vec::with_capacity(region_ids.len()); - let RegionInfo { - region_storage_path, - region_options, - region_wal_options, - engine, - } = datanode_table_value.region_info.clone(); - - let open_instruction = Instruction::OpenRegions(vec![OpenRegion::new( - RegionIdent { - datanode_id: candidate_id, - table_id, - region_number, + for region_id in region_ids { + let table_id = region_id.table_id(); + let region_number = region_id.region_number(); + let datanode_table_value = datanode_table_values.get(&table_id).context( + error::DatanodeTableNotFoundSnafu { + table_id, + datanode_id: from_peer_id, + }, + )?; + let RegionInfo { + region_storage_path, + region_options, + region_wal_options, engine, - }, - ®ion_storage_path, - region_options, - region_wal_options, - true, - )]); + } = datanode_table_value.region_info.clone(); - Ok(open_instruction) + open_regions.push(OpenRegion::new( + RegionIdent { + datanode_id: to_peer_id, + table_id, + region_number, + engine, + }, + ®ion_storage_path, + region_options, + region_wal_options, + true, + )); + } + + Ok(Instruction::OpenRegions(open_regions)) } /// Opens the candidate region. @@ -111,25 +122,27 @@ impl OpenCandidateRegion { ) -> Result<()> { let pc = &ctx.persistent_ctx; let vc = &mut ctx.volatile_ctx; - let region_id = pc.region_id; + let region_ids = &pc.region_ids; let candidate = &pc.to_peer; // This method might be invoked multiple times. // Only registers the guard if `opening_region_guard` is absent. - if vc.opening_region_guard.is_none() { - // Registers the opening region. - let guard = ctx - .opening_region_keeper - .register(candidate.id, region_id) - .context(error::RegionOpeningRaceSnafu { - peer_id: candidate.id, - region_id, - })?; - vc.opening_region_guard = Some(guard); + if vc.opening_region_guards.is_empty() { + for region_id in region_ids { + // Registers the opening region. 
+ let guard = ctx + .opening_region_keeper + .register(candidate.id, *region_id) + .context(error::RegionOpeningRaceSnafu { + peer_id: candidate.id, + region_id: *region_id, + })?; + vc.opening_region_guards.push(guard); + } } let msg = MailboxMessage::json_message( - &format!("Open candidate region: {}", region_id), + &format!("Open candidate regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", candidate.id, candidate.addr), common_time::util::current_time_millis(), @@ -139,20 +152,23 @@ impl OpenCandidateRegion { input: open_instruction.to_string(), })?; + let operation_timeout = + ctx.next_operation_timeout() + .context(error::ExceededDeadlineSnafu { + operation: "Open candidate region", + })?; + let operation_timeout = operation_timeout.div(2).max(OPEN_CANDIDATE_REGION_TIMEOUT); let ch = Channel::Datanode(candidate.id); let now = Instant::now(); - let receiver = ctx - .mailbox - .send(&ch, msg, OPEN_CANDIDATE_REGION_TIMEOUT) - .await?; + let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?; match receiver.await { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received open region reply: {:?}, region: {}, elapsed: {:?}", + "Received open region reply: {:?}, region: {:?}, elapsed: {:?}", reply, - region_id, + region_ids, now.elapsed() ); let InstructionReply::OpenRegions(SimpleReply { result, error }) = reply else { @@ -168,7 +184,7 @@ impl OpenCandidateRegion { } else { error::RetryLaterSnafu { reason: format!( - "Region {region_id} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}", + "Region {region_ids:?} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}", candidate, now.elapsed() ), @@ -178,7 +194,7 @@ impl OpenCandidateRegion { } Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( - "Mailbox received timeout for open candidate region {region_id} on datanode {:?}, elapsed: {:?}", + "Mailbox received timeout for open candidate region {region_ids:?} on datanode {:?}, elapsed: {:?}", candidate, now.elapsed() ); @@ -251,7 +267,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); @@ -272,7 +288,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let env = TestingEnv::new(); @@ -298,7 +314,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); @@ -331,7 +347,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); @@ -366,7 +382,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); @@ -406,14 +422,14 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); // Prepares table let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { - region: Region::new_test(persistent_context.region_id), + region: Region::new_test(region_id), leader_peer: Some(Peer::empty(from_peer_id)), ..Default::default() }]; @@ -441,10 +457,7 @@ mod tests { let procedure_ctx = new_procedure_context(); let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap(); let vc = ctx.volatile_ctx; - assert_eq!( - vc.opening_region_guard.unwrap().info(), - (to_peer_id, region_id) - ); + assert_eq!(vc.opening_region_guards[0].info(), (to_peer_id, region_id)); let flush_leader_region = next.as_any().downcast_ref::().unwrap(); assert_matches!(flush_leader_region, PreFlushRegion); diff --git a/src/meta-srv/src/procedure/region_migration/test_util.rs b/src/meta-srv/src/procedure/region_migration/test_util.rs index a44c3a20c6..c039fc441d 100644 --- a/src/meta-srv/src/procedure/region_migration/test_util.rs +++ b/src/meta-srv/src/procedure/region_migration/test_util.rs @@ -185,15 +185,14 @@ impl TestingEnv { /// Generates a [PersistentContext]. 
pub fn new_persistent_context(from: u64, to: u64, region_id: RegionId) -> PersistentContext { - PersistentContext { - catalog: "greptime".into(), - schema: "public".into(), - from_peer: Peer::empty(from), - to_peer: Peer::empty(to), - region_id, - timeout: Duration::from_secs(10), - trigger_reason: RegionMigrationTriggerReason::default(), - } + PersistentContext::new( + vec![("greptime".into(), "public".into())], + Peer::empty(from), + Peer::empty(to), + vec![region_id], + Duration::from_secs(10), + RegionMigrationTriggerReason::default(), + ) } /// The test suite for region migration procedure. @@ -306,37 +305,38 @@ impl ProcedureMigrationTestSuite { /// Verifies table metadata after region migration. pub(crate) async fn verify_table_metadata(&self) { - let region_id = self.context.persistent_ctx.region_id; - let table_route = self - .env - .table_metadata_manager - .table_route_manager() - .table_route_storage() - .get(region_id.table_id()) - .await - .unwrap() - .unwrap(); - let region_routes = table_route.region_routes().unwrap(); + for region_id in &self.context.persistent_ctx.region_ids { + let table_route = self + .env + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get(region_id.table_id()) + .await + .unwrap() + .unwrap(); + let region_routes = table_route.region_routes().unwrap(); - let expected_leader_id = self.context.persistent_ctx.to_peer.id; - let removed_follower_id = self.context.persistent_ctx.from_peer.id; + let expected_leader_id = self.context.persistent_ctx.to_peer.id; + let removed_follower_id = self.context.persistent_ctx.from_peer.id; - let region_route = region_routes - .iter() - .find(|route| route.region.id == region_id) - .unwrap(); - - assert!(!region_route.is_leader_downgrading()); - assert_eq!( - region_route.leader_peer.as_ref().unwrap().id, - expected_leader_id - ); - assert!( - !region_route - .follower_peers + let region_route = region_routes .iter() - .any(|route| route.id == removed_follower_id) - ) + .find(|route| route.region.id == *region_id) + .unwrap(); + + assert!(!region_route.is_leader_downgrading()); + assert_eq!( + region_route.leader_peer.as_ref().unwrap().id, + expected_leader_id + ); + assert!( + !region_route + .follower_peers + .iter() + .any(|route| route.id == removed_follower_id) + ) + } } } diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata.rs b/src/meta-srv/src/procedure/region_migration/update_metadata.rs index 8e7b2d4d3b..e96a025c5d 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata.rs @@ -18,7 +18,6 @@ pub(crate) mod upgrade_candidate_region; use std::any::Any; -use common_meta::lock_key::TableLock; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::warn; use serde::{Deserialize, Serialize}; @@ -48,12 +47,10 @@ impl State for UpdateMetadata { ctx: &mut Context, procedure_ctx: &ProcedureContext, ) -> Result<(Box, Status)> { - let table_id = TableLock::Write(ctx.region_id().table_id()).into(); - let _guard = procedure_ctx.provider.acquire_lock(&table_id).await; - match self { UpdateMetadata::Downgrade => { - self.downgrade_leader_region(ctx).await?; + self.downgrade_leader_region(ctx, &procedure_ctx.provider) + .await?; Ok(( Box::::default(), @@ -61,7 +58,8 @@ impl State for UpdateMetadata { )) } UpdateMetadata::Upgrade => { - self.upgrade_candidate_region(ctx).await?; + self.upgrade_candidate_region(ctx, &procedure_ctx.provider) + .await?; if let Err(err) = 
ctx.invalidate_table_cache().await { warn!( @@ -71,7 +69,8 @@ impl State for UpdateMetadata { Ok((Box::new(CloseDowngradedRegion), Status::executing(false))) } UpdateMetadata::Rollback => { - self.rollback_downgraded_region(ctx).await?; + self.rollback_downgraded_region(ctx, &procedure_ctx.provider) + .await?; if let Err(err) = ctx.invalidate_table_cache().await { warn!( diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs index 28633efa56..05e29c9b08 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs @@ -13,7 +13,10 @@ // limitations under the License. use common_error::ext::BoxedError; +use common_meta::lock_key::TableLock; use common_meta::rpc::router::LeaderState; +use common_procedure::ContextProviderRef; +use common_telemetry::{error, info}; use snafu::ResultExt; use crate::error::{self, Result}; @@ -37,40 +40,48 @@ impl UpdateMetadata { /// It will only update **other region** info. Therefore, It's safe to retry after failure. /// /// - There is no other DDL procedure executed concurrently for the current table. - pub async fn downgrade_leader_region(&self, ctx: &mut Context) -> Result<()> { + pub async fn downgrade_leader_region( + &self, + ctx: &mut Context, + ctx_provider: &ContextProviderRef, + ) -> Result<()> { let table_metadata_manager = ctx.table_metadata_manager.clone(); let from_peer_id = ctx.persistent_ctx.from_peer.id; - let region_id = ctx.region_id(); - let table_id = region_id.table_id(); - let current_table_route_value = ctx.get_table_route_value().await?; + let table_regions = ctx.persistent_ctx.table_regions(); - // TODO(weny): ensures the leader region peer is the `from_peer`. 
- if let Err(err) = table_metadata_manager - .update_leader_region_status(table_id, current_table_route_value, |route| { - if route.region.id == region_id - && route - .leader_peer - .as_ref() - .is_some_and(|leader_peer| leader_peer.id == from_peer_id) - { - Some(Some(LeaderState::Downgrading)) - } else { - None - } - }) - .await - .context(error::TableMetadataManagerSnafu) - { - ctx.remove_table_route_value(); - return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { - reason: format!( - "Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}" - ), - }); + for (table_id, regions) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = ctx_provider.acquire_lock(&table_lock).await; + + let current_table_route_value = ctx.get_table_route_value(table_id).await?; + if let Err(err) = table_metadata_manager + .update_leader_region_status(table_id, ¤t_table_route_value, |route| { + if regions.contains(&route.region.id) + && route + .leader_peer + .as_ref() + .is_some_and(|leader_peer| leader_peer.id == from_peer_id) + { + Some(Some(LeaderState::Downgrading)) + } else { + None + } + }) + .await + .context(error::TableMetadataManagerSnafu) + { + error!(err; "Failed to update the table route during the downgrading leader region, regions: {regions:?}, from_peer_id: {from_peer_id}"); + return Err(BoxedError::new(err)).with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to update the table route during the downgrading leader region, regions: {regions:?}, from_peer_id: {from_peer_id}" + ), + }); + } + info!( + "Downgrading leader region table route success, table_id: {table_id}, regions: {regions:?}, from_peer_id: {from_peer_id}" + ); } - ctx.remove_table_route_value(); - Ok(()) } } @@ -78,10 +89,13 @@ impl UpdateMetadata { #[cfg(test)] mod tests { use std::assert_matches::assert_matches; + use std::collections::HashMap; + use std::sync::Arc; use common_meta::key::test_utils::new_test_table_info; use common_meta::peer::Peer; - use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; + use common_meta::rpc::router::{Region, RegionRoute}; + use common_procedure_test::MockContextProvider; use store_api::storage::RegionId; use crate::error::Error; @@ -107,71 +121,18 @@ mod tests { let env = TestingEnv::new(); let persistent_context = new_persistent_context(); let mut ctx = env.context_factory().new_context(persistent_context); + let provider = Arc::new(MockContextProvider::new(HashMap::new())) as _; - let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err(); + let err = state + .downgrade_leader_region(&mut ctx, &provider) + .await + .unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. 
}); assert!(!err.is_retryable()); } - #[tokio::test] - async fn test_failed_to_update_table_route_error() { - let state = UpdateMetadata::Downgrade; - let persistent_context = new_persistent_context(); - let from_peer = persistent_context.from_peer.clone(); - - let env = TestingEnv::new(); - let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); - - let table_info = new_test_table_info(1024, vec![1, 2]).into(); - let region_routes = vec![ - RegionRoute { - region: Region::new_test(RegionId::new(1024, 1)), - leader_peer: Some(from_peer.clone()), - ..Default::default() - }, - RegionRoute { - region: Region::new_test(RegionId::new(1024, 2)), - leader_peer: Some(Peer::empty(4)), - ..Default::default() - }, - ]; - - env.create_physical_table_metadata(table_info, region_routes) - .await; - - let table_metadata_manager = env.table_metadata_manager(); - let original_table_route = table_metadata_manager - .table_route_manager() - .table_route_storage() - .get_with_raw_bytes(table_id) - .await - .unwrap() - .unwrap(); - - // modifies the table route. - table_metadata_manager - .update_leader_region_status(table_id, &original_table_route, |route| { - if route.region.id == RegionId::new(1024, 2) { - Some(Some(LeaderState::Downgrading)) - } else { - None - } - }) - .await - .unwrap(); - - // sets the old table route. - ctx.volatile_ctx.table_route = Some(original_table_route); - - let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err(); - assert!(ctx.volatile_ctx.table_route.is_none()); - assert!(err.is_retryable()); - assert!(format!("{err:?}").contains("Failed to update the table route")); - } - #[tokio::test] async fn test_only_downgrade_from_peer() { let mut state = Box::new(UpdateMetadata::Downgrade); @@ -179,7 +140,7 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); + let table_id = ctx.persistent_ctx.region_ids[0].table_id(); let table_info = new_test_table_info(1024, vec![1, 2]).into(); let region_routes = vec![RegionRoute { @@ -212,7 +173,6 @@ mod tests { // It should remain unchanged. assert_eq!(latest_table_route.version().unwrap(), 0); assert!(!latest_table_route.region_routes().unwrap()[0].is_leader_downgrading()); - assert!(ctx.volatile_ctx.table_route.is_none()); } #[tokio::test] @@ -223,7 +183,7 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); + let table_id = ctx.persistent_ctx.region_ids[0].table_id(); let table_info = new_test_table_info(1024, vec![1, 2]).into(); let region_routes = vec![RegionRoute { @@ -254,6 +214,5 @@ mod tests { .unwrap(); assert!(latest_table_route.region_routes().unwrap()[0].is_leader_downgrading()); - assert!(ctx.volatile_ctx.table_route.is_none()); } } diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs index 7c4a7b713e..fc32e37672 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs @@ -13,6 +13,9 @@ // limitations under the License. 
use common_error::ext::BoxedError; +use common_meta::lock_key::TableLock; +use common_procedure::ContextProviderRef; +use common_telemetry::{error, info}; use snafu::ResultExt; use crate::error::{self, Result}; @@ -28,31 +31,40 @@ impl UpdateMetadata { /// Retry: /// - Failed to update [TableRouteValue](common_meta::key::table_region::TableRegionValue). /// - Failed to retrieve the metadata of table. - pub async fn rollback_downgraded_region(&self, ctx: &mut Context) -> Result<()> { + pub async fn rollback_downgraded_region( + &self, + ctx: &mut Context, + ctx_provider: &ContextProviderRef, + ) -> Result<()> { let table_metadata_manager = ctx.table_metadata_manager.clone(); - let region_id = ctx.region_id(); - let table_id = region_id.table_id(); - let current_table_route_value = ctx.get_table_route_value().await?; + let table_regions = ctx.persistent_ctx.table_regions(); - if let Err(err) = table_metadata_manager - .update_leader_region_status(table_id, current_table_route_value, |route| { - if route.region.id == region_id { - Some(None) - } else { - None - } - }) - .await - .context(error::TableMetadataManagerSnafu) - { - ctx.remove_table_route_value(); - return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { - reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), - }); + for (table_id, regions) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = ctx_provider.acquire_lock(&table_lock).await; + + let current_table_route_value = ctx.get_table_route_value(table_id).await?; + if let Err(err) = table_metadata_manager + .update_leader_region_status(table_id, ¤t_table_route_value, |route| { + if regions.contains(&route.region.id) { + Some(None) + } else { + None + } + }) + .await + .context(error::TableMetadataManagerSnafu) + { + error!(err; "Failed to update the table route during the rollback downgraded leader regions: {regions:?}"); + return Err(BoxedError::new(err)).with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to update the table route during the rollback downgraded leader regions: {regions:?}"), + }); + } + info!( + "Rolling back downgraded leader region table route success, table_id: {table_id}, regions: {regions:?}" + ); } - ctx.register_failure_detectors().await; - ctx.remove_table_route_value(); Ok(()) } @@ -61,11 +73,13 @@ impl UpdateMetadata { #[cfg(test)] mod tests { use std::assert_matches::assert_matches; + use std::collections::HashMap; use std::sync::Arc; use common_meta::key::test_utils::new_test_table_info; use common_meta::peer::Peer; use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; + use common_procedure_test::MockContextProvider; use store_api::storage::RegionId; use crate::error::Error; @@ -73,7 +87,6 @@ mod tests { use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context}; use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::{ContextFactory, PersistentContext, State}; - use crate::region::supervisor::RegionFailureDetectorControl; fn new_persistent_context() -> PersistentContext { test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)) @@ -86,108 +99,17 @@ mod tests { let persistent_context = new_persistent_context(); let mut ctx = env.context_factory().new_context(persistent_context); - let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err(); + let provider = 
Arc::new(MockContextProvider::new(HashMap::new())) as _; + let err = state + .rollback_downgraded_region(&mut ctx, &provider) + .await + .unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. }); assert!(!err.is_retryable()); } - #[tokio::test] - async fn test_update_table_route_with_retry() { - let state = UpdateMetadata::Rollback; - let persistent_context = new_persistent_context(); - let from_peer = persistent_context.from_peer.clone(); - - let env = TestingEnv::new(); - let mut ctx = env.context_factory().new_context(persistent_context); - let (tx, mut rx) = tokio::sync::mpsc::channel(8); - ctx.region_failure_detector_controller = Arc::new(RegionFailureDetectorControl::new(tx)); - let table_id = ctx.region_id().table_id(); - - let table_info = new_test_table_info(1024, vec![1, 2, 3]).into(); - let region_routes = vec![ - RegionRoute { - region: Region::new_test(RegionId::new(1024, 1)), - leader_peer: Some(from_peer.clone()), - leader_state: Some(LeaderState::Downgrading), - ..Default::default() - }, - RegionRoute { - region: Region::new_test(RegionId::new(1024, 2)), - leader_peer: Some(Peer::empty(4)), - leader_state: Some(LeaderState::Downgrading), - ..Default::default() - }, - RegionRoute { - region: Region::new_test(RegionId::new(1024, 3)), - leader_peer: Some(Peer::empty(5)), - ..Default::default() - }, - ]; - - let expected_region_routes = { - let mut region_routes = region_routes.clone(); - region_routes[0].leader_state = None; - region_routes[1].leader_state = None; - region_routes - }; - - env.create_physical_table_metadata(table_info, region_routes) - .await; - - let table_metadata_manager = env.table_metadata_manager(); - let old_table_route = table_metadata_manager - .table_route_manager() - .table_route_storage() - .get_with_raw_bytes(table_id) - .await - .unwrap() - .unwrap(); - - // modifies the table route. 
- table_metadata_manager - .update_leader_region_status(table_id, &old_table_route, |route| { - if route.region.id == RegionId::new(1024, 2) { - Some(None) - } else { - None - } - }) - .await - .unwrap(); - - ctx.volatile_ctx.table_route = Some(old_table_route); - - let err = state - .rollback_downgraded_region(&mut ctx) - .await - .unwrap_err(); - assert!(ctx.volatile_ctx.table_route.is_none()); - assert!(err.is_retryable()); - assert!(format!("{err:?}").contains("Failed to update the table route")); - assert_eq!(rx.len(), 0); - state.rollback_downgraded_region(&mut ctx).await.unwrap(); - let event = rx.try_recv().unwrap(); - let detecting_regions = event.into_region_failure_detectors(); - assert_eq!( - detecting_regions, - vec![(from_peer.id, ctx.persistent_ctx.region_id)] - ); - - let table_route = table_metadata_manager - .table_route_manager() - .table_route_storage() - .get(table_id) - .await - .unwrap() - .unwrap(); - assert_eq!( - &expected_region_routes, - table_route.region_routes().unwrap() - ); - } - #[tokio::test] async fn test_next_migration_end_state() { let mut state = Box::new(UpdateMetadata::Rollback); @@ -196,7 +118,7 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); + let table_id = ctx.persistent_ctx.region_ids[0].table_id(); let table_info = new_test_table_info(1024, vec![1, 2, 3]).into(); let region_routes = vec![ @@ -238,8 +160,6 @@ mod tests { .downcast_ref::() .unwrap(); - assert!(ctx.volatile_ctx.table_route.is_none()); - let table_route = table_metadata_manager .table_route_manager() .table_route_storage() diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs index 7352336d86..0e545f5d92 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs @@ -14,9 +14,12 @@ use common_error::ext::BoxedError; use common_meta::key::datanode_table::RegionInfo; +use common_meta::lock_key::TableLock; use common_meta::rpc::router::{RegionRoute, region_distribution}; -use common_telemetry::{info, warn}; +use common_procedure::ContextProviderRef; +use common_telemetry::{error, info, warn}; use snafu::{OptionExt, ResultExt, ensure}; +use store_api::storage::RegionId; use crate::error::{self, Result}; use crate::procedure::region_migration::Context; @@ -24,104 +27,114 @@ use crate::procedure::region_migration::update_metadata::UpdateMetadata; impl UpdateMetadata { /// Returns new [Vec]. - async fn build_upgrade_candidate_region_metadata( + fn build_upgrade_candidate_region_metadata( &self, ctx: &mut Context, + region_ids: &[RegionId], + mut region_routes: Vec, ) -> Result> { - let region_id = ctx.region_id(); - let table_route_value = ctx.get_table_route_value().await?.clone(); + let old_leader_peer = &ctx.persistent_ctx.from_peer; + let new_leader_peer = &ctx.persistent_ctx.to_peer; + for region_id in region_ids { + // Find the RegionRoute for this region_id. 
+ let region_route = region_routes + .iter_mut() + .find(|route| route.region.id == *region_id) + .context(error::RegionRouteNotFoundSnafu { + region_id: *region_id, + })?; - let mut region_routes = table_route_value - .region_routes() - .context(error::UnexpectedLogicalRouteTableSnafu { - err_msg: format!("{self:?} is a non-physical TableRouteValue."), - })? - .clone(); - let region_route = region_routes - .iter_mut() - .find(|route| route.region.id == region_id) - .context(error::RegionRouteNotFoundSnafu { region_id })?; + // Remove any "downgraded leader" state. + region_route.set_leader_state(None); - // Removes downgraded status. - region_route.set_leader_state(None); - - let candidate = &ctx.persistent_ctx.to_peer; - let expected_old_leader = &ctx.persistent_ctx.from_peer; - - // Upgrades candidate to leader. - ensure!( - region_route - .leader_peer - .take_if(|old_leader| old_leader.id == expected_old_leader.id) - .is_some(), - error::UnexpectedSnafu { - violated: format!( - "Unexpected region leader: {:?} during the upgrading candidate metadata, expected: {:?}", - region_route.leader_peer, expected_old_leader - ), - } - ); - - region_route.leader_peer = Some(candidate.clone()); - info!( - "Upgrading candidate region to leader region: {:?} for region: {}", - candidate, region_id - ); - - // Removes the candidate region in followers. - let removed = region_route - .follower_peers - .extract_if(.., |peer| peer.id == candidate.id) - .collect::>(); - - if removed.len() > 1 { - warn!( - "Removes duplicated regions: {removed:?} during the upgrading candidate metadata for region: {region_id}" - ); - } - - Ok(region_routes) - } - - /// Returns true if region metadata has been updated. - async fn check_metadata_updated(&self, ctx: &mut Context) -> Result { - let region_id = ctx.region_id(); - let table_route_value = ctx.get_table_route_value().await?.clone(); - - let region_routes = table_route_value - .region_routes() - .context(error::UnexpectedLogicalRouteTableSnafu { - err_msg: format!("{self:?} is a non-physical TableRouteValue."), - })? - .clone(); - let region_route = region_routes - .into_iter() - .find(|route| route.region.id == region_id) - .context(error::RegionRouteNotFoundSnafu { region_id })?; - - let leader_peer = region_route - .leader_peer - .as_ref() - .context(error::UnexpectedSnafu { - violated: format!("The leader peer of region {region_id} is not found during the update metadata for upgrading"), - })?; - - let candidate_peer_id = ctx.persistent_ctx.to_peer.id; - - if leader_peer.id == candidate_peer_id { + // Check old leader matches expectation before upgrading to new leader. ensure!( - !region_route.is_leader_downgrading(), + region_route + .leader_peer + .take_if(|old_leader| old_leader.id == old_leader_peer.id) + .is_some(), error::UnexpectedSnafu { violated: format!( - "Unexpected intermediate state is found during the update metadata for upgrading region {region_id}" + "Unexpected region leader: {:?} during the candidate-to-leader upgrade; expected: {:?}", + region_route.leader_peer, old_leader_peer ), } ); - Ok(true) - } else { - Ok(false) + // Set new leader. + region_route.leader_peer = Some(new_leader_peer.clone()); + + // Remove new leader from followers (avoids duplicate leader/follower). + let removed = region_route + .follower_peers + .extract_if(.., |peer| peer.id == new_leader_peer.id) + .collect::>(); + + // Warn if more than one follower with the new leader id was present. 
+ if removed.len() > 1 { + warn!( + "Removed duplicate followers: {removed:?} during candidate-to-leader upgrade for region: {region_id}" + ); + } } + + info!( + "Building metadata for upgrading candidate region to new leader: {:?} for regions: {:?}", + new_leader_peer, region_ids, + ); + + Ok(region_routes) + } + + /// Checks if metadata has been upgraded for a list of regions by verifying if their + /// leader peers have been switched to a specified peer ID (`to_peer_id`) and that + /// no region is in a leader downgrading state. + /// + /// Returns: + /// - `Ok(true)` if all regions' leader is the target peer and no downgrading occurs. + /// - `Ok(false)` if any region's leader is not the target peer. + /// - Error if region route or leader peer cannot be found, or an unexpected state is detected. + fn check_metadata_updated( + &self, + ctx: &mut Context, + region_ids: &[RegionId], + region_routes: &[RegionRoute], + ) -> Result { + // Iterate through each provided region ID + for region_id in region_ids { + // Find the route info for this region + let region_route = region_routes + .iter() + .find(|route| route.region.id == *region_id) + .context(error::RegionRouteNotFoundSnafu { + region_id: *region_id, + })?; + + // Get the leader peer for the region, error if not found + let leader_peer = region_route.leader_peer.as_ref().with_context(||error::UnexpectedSnafu { + violated: format!( + "The leader peer of region {region_id} is not found during the metadata upgrade check" + ), + })?; + + // If the leader is not the expected peer, return false (i.e., not yet upgraded) + if leader_peer.id != ctx.persistent_ctx.to_peer.id { + return Ok(false); + } else { + // If leader matches but region is in leader downgrading state, error (unexpected state) + ensure!( + !region_route.is_leader_downgrading(), + error::UnexpectedSnafu { + violated: format!( + "Unexpected intermediate state is found during the metadata upgrade check for region {region_id}" + ), + } + ); + } + } + + // All regions' leader match expected peer and are not downgrading; considered upgraded + Ok(true) } /// Upgrades the candidate region. @@ -133,57 +146,77 @@ impl UpdateMetadata { /// Retry: /// - Failed to update [TableRouteValue](common_meta::key::table_region::TableRegionValue). /// - Failed to retrieve the metadata of table. - pub async fn upgrade_candidate_region(&self, ctx: &mut Context) -> Result<()> { - let region_id = ctx.region_id(); + pub async fn upgrade_candidate_region( + &self, + ctx: &mut Context, + ctx_provider: &ContextProviderRef, + ) -> Result<()> { let table_metadata_manager = ctx.table_metadata_manager.clone(); + let table_regions = ctx.persistent_ctx.table_regions(); + let from_peer_id = ctx.persistent_ctx.from_peer.id; + let to_peer_id = ctx.persistent_ctx.to_peer.id; - if self.check_metadata_updated(ctx).await? { - return Ok(()); + for (table_id, region_ids) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = ctx_provider.acquire_lock(&table_lock).await; + + let table_route_value = ctx.get_table_route_value(table_id).await?; + let region_routes = table_route_value.region_routes().with_context(|_| { + error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."), + } + })?; + if self.check_metadata_updated(ctx, ®ion_ids, region_routes)? 
{ + continue; + } + let datanode_table_value = ctx.get_from_peer_datanode_table_value(table_id).await?; + let RegionInfo { + region_storage_path, + region_options, + region_wal_options, + engine, + } = datanode_table_value.region_info.clone(); + let new_region_routes = self.build_upgrade_candidate_region_metadata( + ctx, + ®ion_ids, + region_routes.clone(), + )?; + let region_distribution = region_distribution(region_routes); + info!( + "Trying to update region routes to {:?} for table: {}", + region_distribution, table_id, + ); + + if let Err(err) = table_metadata_manager + .update_table_route( + table_id, + RegionInfo { + engine: engine.clone(), + region_storage_path: region_storage_path.clone(), + region_options: region_options.clone(), + region_wal_options: region_wal_options.clone(), + }, + &table_route_value, + new_region_routes, + ®ion_options, + ®ion_wal_options, + ) + .await + .context(error::TableMetadataManagerSnafu) + { + error!(err; "Failed to update the table route during the upgrading candidate region: {region_ids:?}, from_peer_id: {from_peer_id}"); + return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { + reason: format!("Failed to update the table route during the upgrading candidate region: {table_id}"), + }); + }; + info!( + "Upgrading candidate region table route success, table_id: {table_id}, regions: {region_ids:?}, to_peer_id: {to_peer_id}" + ); } - let region_routes = self.build_upgrade_candidate_region_metadata(ctx).await?; - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?; - let RegionInfo { - region_storage_path, - region_options, - region_wal_options, - engine, - } = datanode_table_value.region_info.clone(); - let table_route_value = ctx.get_table_route_value().await?; - - let region_distribution = region_distribution(®ion_routes); - info!( - "Trying to update region routes to {:?} for table: {}", - region_distribution, - region_id.table_id() - ); - if let Err(err) = table_metadata_manager - .update_table_route( - region_id.table_id(), - RegionInfo { - engine: engine.clone(), - region_storage_path: region_storage_path.clone(), - region_options: region_options.clone(), - region_wal_options: region_wal_options.clone(), - }, - table_route_value, - region_routes, - ®ion_options, - ®ion_wal_options, - ) - .await - .context(error::TableMetadataManagerSnafu) - { - ctx.remove_table_route_value(); - return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { - reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"), - }); - }; - - ctx.remove_table_route_value(); ctx.deregister_failure_detectors().await; // Consumes the guard. - ctx.volatile_ctx.opening_region_guard.take(); + ctx.volatile_ctx.opening_region_guards.clear(); Ok(()) } @@ -212,16 +245,11 @@ mod tests { #[tokio::test] async fn test_table_route_is_not_found_error() { - let state = UpdateMetadata::Upgrade; - let env = TestingEnv::new(); let persistent_context = new_persistent_context(); - let mut ctx = env.context_factory().new_context(persistent_context); + let ctx = env.context_factory().new_context(persistent_context); - let err = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await - .unwrap_err(); + let err = ctx.get_table_route_value(1024).await.unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. 
}); assert!(!err.is_retryable()); @@ -240,13 +268,20 @@ mod tests { leader_peer: Some(Peer::empty(4)), ..Default::default() }]; - env.create_physical_table_metadata(table_info, region_routes) .await; + let table_route_value = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_route_value + .into_inner() + .into_physical_table_route() + .region_routes; let err = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await + .build_upgrade_candidate_region_metadata( + &mut ctx, + &[RegionId::new(1024, 1)], + region_routes, + ) .unwrap_err(); assert_matches!(err, Error::RegionRouteNotFound { .. }); @@ -270,9 +305,17 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; + let table_route_value = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_route_value + .into_inner() + .into_physical_table_route() + .region_routes; let err = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await + .build_upgrade_candidate_region_metadata( + &mut ctx, + &[RegionId::new(1024, 1)], + region_routes, + ) .unwrap_err(); assert_matches!(err, Error::Unexpected { .. }); @@ -299,9 +342,17 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; + let table_route_value = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_route_value + .into_inner() + .into_physical_table_route() + .region_routes; let new_region_routes = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await + .build_upgrade_candidate_region_metadata( + &mut ctx, + &[RegionId::new(1024, 1)], + region_routes, + ) .unwrap(); assert!(!new_region_routes[0].is_leader_downgrading()); @@ -310,71 +361,6 @@ mod tests { assert_eq!(new_region_routes[0].leader_peer.as_ref().unwrap().id, 2); } - #[tokio::test] - async fn test_failed_to_update_table_route_error() { - let state = UpdateMetadata::Upgrade; - let env = TestingEnv::new(); - let persistent_context = new_persistent_context(); - let mut ctx = env.context_factory().new_context(persistent_context); - let opening_keeper = MemoryRegionKeeper::default(); - - let table_id = 1024; - let table_info = new_test_table_info(table_id, vec![1]).into(); - let region_routes = vec![ - RegionRoute { - region: Region::new_test(RegionId::new(table_id, 1)), - leader_peer: Some(Peer::empty(1)), - follower_peers: vec![Peer::empty(5), Peer::empty(3)], - leader_state: Some(LeaderState::Downgrading), - leader_down_since: Some(current_time_millis()), - }, - RegionRoute { - region: Region::new_test(RegionId::new(table_id, 2)), - leader_peer: Some(Peer::empty(4)), - leader_state: Some(LeaderState::Downgrading), - ..Default::default() - }, - ]; - - env.create_physical_table_metadata(table_info, region_routes) - .await; - - let table_metadata_manager = env.table_metadata_manager(); - let original_table_route = table_metadata_manager - .table_route_manager() - .table_route_storage() - .get_with_raw_bytes(table_id) - .await - .unwrap() - .unwrap(); - - // modifies the table route. - table_metadata_manager - .update_leader_region_status(table_id, &original_table_route, |route| { - if route.region.id == RegionId::new(1024, 2) { - // Removes the status. - Some(None) - } else { - None - } - }) - .await - .unwrap(); - - // sets the old table route. 
- ctx.volatile_ctx.table_route = Some(original_table_route); - let guard = opening_keeper - .register(2, RegionId::new(table_id, 1)) - .unwrap(); - ctx.volatile_ctx.opening_region_guard = Some(guard); - let err = state.upgrade_candidate_region(&mut ctx).await.unwrap_err(); - - assert!(ctx.volatile_ctx.table_route.is_none()); - assert!(ctx.volatile_ctx.opening_region_guard.is_some()); - assert!(err.is_retryable()); - assert!(format!("{err:?}").contains("Failed to update the table route")); - } - #[tokio::test] async fn test_check_metadata() { let state = UpdateMetadata::Upgrade; @@ -394,8 +380,11 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; - - let updated = state.check_metadata_updated(&mut ctx).await.unwrap(); + let table_routes = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_routes.region_routes().unwrap(); + let updated = state + .check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes) + .unwrap(); assert!(!updated); } @@ -419,7 +408,11 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; - let updated = state.check_metadata_updated(&mut ctx).await.unwrap(); + let table_routes = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_routes.region_routes().unwrap(); + let updated = state + .check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes) + .unwrap(); assert!(updated); } @@ -443,7 +436,11 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; - let err = state.check_metadata_updated(&mut ctx).await.unwrap_err(); + let table_routes = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_routes.region_routes().unwrap(); + let err = state + .check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes) + .unwrap_err(); assert_matches!(err, Error::Unexpected { .. }); assert!(err.to_string().contains("intermediate state")); } @@ -468,7 +465,7 @@ mod tests { let guard = opening_keeper .register(2, RegionId::new(table_id, 1)) .unwrap(); - ctx.volatile_ctx.opening_region_guard = Some(guard); + ctx.volatile_ctx.opening_region_guards.push(guard); env.create_physical_table_metadata(table_info, region_routes) .await; @@ -492,8 +489,7 @@ mod tests { .unwrap(); let region_routes = table_route.region_routes().unwrap(); - assert!(ctx.volatile_ctx.table_route.is_none()); - assert!(ctx.volatile_ctx.opening_region_guard.is_none()); + assert!(ctx.volatile_ctx.opening_region_guards.is_empty()); assert_eq!(region_routes.len(), 1); assert!(!region_routes[0].is_leader_downgrading()); assert!(region_routes[0].follower_peers.is_empty()); diff --git a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs index 2dfaa21a89..0390ddf0da 100644 --- a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs @@ -13,15 +13,19 @@ // limitations under the License. 
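For reference, the idempotency check performed by the reworked `check_metadata_updated` above can be summarized with a small, self-contained sketch: a batch counts as upgraded only if every route's leader is already the `to_peer` and none of them is still marked as downgrading. `RouteState` and `check_all_upgraded` are illustrative names; the real code walks `RegionRoute` values and reports snafu errors instead of strings.

```rust
/// Simplified route model: leader peer id (if any) and whether the leader is
/// currently marked as downgrading.
#[derive(Debug)]
struct RouteState {
    leader_peer_id: Option<u64>,
    leader_downgrading: bool,
}

/// Mirrors the decision order of the reworked check:
/// - any route whose leader is not `to_peer_id` => Ok(false) (not yet upgraded)
/// - a route with no leader, or one led by `to_peer_id` but still marked
///   downgrading => Err (unexpected state)
/// - otherwise => Ok(true)
fn check_all_upgraded(routes: &[RouteState], to_peer_id: u64) -> Result<bool, String> {
    for route in routes {
        let leader = route
            .leader_peer_id
            .ok_or_else(|| "leader peer is not found".to_string())?;
        if leader != to_peer_id {
            return Ok(false);
        }
        if route.leader_downgrading {
            return Err("unexpected intermediate state".to_string());
        }
    }
    Ok(true)
}

fn main() {
    let done = vec![
        RouteState { leader_peer_id: Some(2), leader_downgrading: false },
        RouteState { leader_peer_id: Some(2), leader_downgrading: false },
    ];
    let pending = vec![RouteState { leader_peer_id: Some(1), leader_downgrading: false }];

    assert_eq!(check_all_upgraded(&done, 2), Ok(true));
    assert_eq!(check_all_upgraded(&pending, 2), Ok(false));
}
```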
use std::any::Any; +use std::collections::HashSet; use std::time::Duration; use api::v1::meta::MailboxMessage; use common_meta::ddl::utils::parse_region_wal_options; -use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply}; +use common_meta::instruction::{ + Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply, +}; +use common_meta::key::topic_region::TopicRegionKey; use common_meta::lock_key::RemoteWalLock; use common_meta::wal_options_allocator::extract_topic_from_wal_options; use common_procedure::{Context as ProcedureContext, Status}; -use common_telemetry::{error, warn}; +use common_telemetry::{error, info}; use common_wal::options::WalOptions; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, ensure}; @@ -64,17 +68,9 @@ impl State for UpgradeCandidateRegion { ) -> Result<(Box, Status)> { let now = Instant::now(); - let region_wal_option = self.get_region_wal_option(ctx).await?; - let region_id = ctx.persistent_ctx.region_id; - if region_wal_option.is_none() { - warn!( - "Region {} wal options not found, during upgrade candidate region", - region_id - ); - } - + let topics = self.get_kafka_topics(ctx).await?; if self - .upgrade_region_with_retry(ctx, procedure_ctx, region_wal_option.as_ref()) + .upgrade_region_with_retry(ctx, procedure_ctx, topics) .await { ctx.update_upgrade_candidate_region_elapsed(now); @@ -91,24 +87,32 @@ impl State for UpgradeCandidateRegion { } impl UpgradeCandidateRegion { - async fn get_region_wal_option(&self, ctx: &mut Context) -> Result> { - let region_id = ctx.persistent_ctx.region_id; - match ctx.get_from_peer_datanode_table_value().await { - Ok(datanode_table_value) => { - let region_wal_options = - parse_region_wal_options(&datanode_table_value.region_info.region_wal_options) - .context(error::ParseWalOptionsSnafu)?; - Ok(region_wal_options.get(®ion_id.region_number()).cloned()) + async fn get_kafka_topics(&self, ctx: &mut Context) -> Result> { + let table_regions = ctx.persistent_ctx.table_regions(); + let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?; + let mut topics = HashSet::new(); + for (table_id, regions) in table_regions { + let Some(datanode_table_value) = datanode_table_values.get(&table_id) else { + continue; + }; + + let region_wal_options = + parse_region_wal_options(&datanode_table_value.region_info.region_wal_options) + .context(error::ParseWalOptionsSnafu)?; + + for region_id in regions { + let Some(WalOptions::Kafka(kafka_wal_options)) = + region_wal_options.get(®ion_id.region_number()) + else { + continue; + }; + if !topics.contains(&kafka_wal_options.topic) { + topics.insert(kafka_wal_options.topic.clone()); + } } - Err(error::Error::DatanodeTableNotFound { datanode_id, .. }) => { - warn!( - "Datanode table not found, during upgrade candidate region, the target region might already been migrated, region_id: {}, datanode_id: {}", - region_id, datanode_id - ); - Ok(None) - } - Err(e) => Err(e), } + + Ok(topics) } /// Builds upgrade region instruction. @@ -117,35 +121,105 @@ impl UpgradeCandidateRegion { ctx: &mut Context, replay_timeout: Duration, ) -> Result { - let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; - let last_entry_id = ctx.volatile_ctx.leader_region_last_entry_id; - let metadata_last_entry_id = ctx.volatile_ctx.leader_region_metadata_last_entry_id; - // Try our best to retrieve replay checkpoint. 
- let datanode_table_value = ctx.get_from_peer_datanode_table_value().await.ok(); - let checkpoint = if let Some(topic) = datanode_table_value.as_ref().and_then(|v| { - extract_topic_from_wal_options(region_id, &v.region_info.region_wal_options) - }) { - ctx.fetch_replay_checkpoint(&topic).await.ok().flatten() - } else { - None - }; + let region_ids = ctx.persistent_ctx.region_ids.clone(); + let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?; + let mut region_topic = Vec::with_capacity(region_ids.len()); + for region_id in region_ids.iter() { + let table_id = region_id.table_id(); + if let Some(datanode_table_value) = datanode_table_values.get(&table_id) + && let Some(topic) = extract_topic_from_wal_options( + *region_id, + &datanode_table_value.region_info.region_wal_options, + ) + { + region_topic.push((*region_id, topic)); + } + } - let upgrade_instruction = Instruction::UpgradeRegion( - UpgradeRegion { + let replay_checkpoints = ctx + .get_replay_checkpoints( + region_topic + .iter() + .map(|(region_id, topic)| TopicRegionKey::new(*region_id, topic)) + .collect(), + ) + .await?; + // Build upgrade regions instruction. + let mut upgrade_regions = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + let last_entry_id = ctx + .volatile_ctx + .leader_region_last_entry_ids + .get(®ion_id) + .copied(); + let metadata_last_entry_id = ctx + .volatile_ctx + .leader_region_metadata_last_entry_ids + .get(®ion_id) + .copied(); + let checkpoint = replay_checkpoints.get(®ion_id).copied(); + upgrade_regions.push(UpgradeRegion { region_id, last_entry_id, metadata_last_entry_id, - replay_timeout: Some(replay_timeout), + replay_timeout, location_id: Some(ctx.persistent_ctx.from_peer.id), - replay_entry_id: None, - metadata_replay_entry_id: None, + replay_entry_id: checkpoint.map(|c| c.entry_id), + metadata_replay_entry_id: checkpoint.and_then(|c| c.metadata_entry_id), + }); + } + + Ok(Instruction::UpgradeRegions(upgrade_regions)) + } + + fn handle_upgrade_region_reply( + &self, + ctx: &mut Context, + UpgradeRegionReply { + region_id, + ready, + exists, + error, + }: &UpgradeRegionReply, + now: &Instant, + ) -> Result<()> { + let candidate = &ctx.persistent_ctx.to_peer; + if error.is_some() { + return error::RetryLaterSnafu { + reason: format!( + "Failed to upgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}", + region_id, + candidate, + error, + now.elapsed() + ), + } + .fail(); + } + + ensure!( + exists, + error::UnexpectedSnafu { + violated: format!( + "Candidate region {} doesn't exist on datanode {:?}", + region_id, candidate + ) } - .with_replay_entry_id(checkpoint.map(|c| c.entry_id)) - .with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)), ); - Ok(upgrade_instruction) + if self.require_ready && !ready { + return error::RetryLaterSnafu { + reason: format!( + "Candidate region {} still replaying the wal on datanode {:?}, elapsed: {:?}", + region_id, + candidate, + now.elapsed() + ), + } + .fail(); + } + + Ok(()) } /// Tries to upgrade a candidate region. 
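The per-region reply handling introduced above (`handle_upgrade_region_reply`) applies its checks in a fixed order: a datanode-side error is retryable, a missing candidate region is fatal, and a region still replaying the WAL is retryable only when readiness is required. A minimal sketch of that ordering follows; `ReplySketch`, `Outcome`, and `classify` are illustrative stand-ins, while the `ready`/`exists`/`error` fields and the `require_ready` flag mirror the ones in the diff.

```rust
/// Minimal model of a single region's upgrade reply; the real type is
/// common_meta::instruction::UpgradeRegionReply.
#[derive(Debug)]
struct ReplySketch {
    region_id: u64,
    ready: bool,
    exists: bool,
    error: Option<String>,
}

#[derive(Debug, PartialEq)]
enum Outcome {
    Ok,
    RetryLater(String),
    Abort(String),
}

/// Mirrors the order of checks in `handle_upgrade_region_reply`:
/// 1. a datanode-side error => retry later,
/// 2. the candidate region does not exist => unrecoverable,
/// 3. not ready while readiness is required => retry later.
fn classify(reply: &ReplySketch, require_ready: bool) -> Outcome {
    if let Some(err) = &reply.error {
        return Outcome::RetryLater(format!("region {} failed: {err}", reply.region_id));
    }
    if !reply.exists {
        return Outcome::Abort(format!("candidate region {} does not exist", reply.region_id));
    }
    if require_ready && !reply.ready {
        return Outcome::RetryLater(format!("region {} still replaying the WAL", reply.region_id));
    }
    Outcome::Ok
}

fn main() {
    let ok = ReplySketch { region_id: 1, ready: true, exists: true, error: None };
    let replaying = ReplySketch { region_id: 2, ready: false, exists: true, error: None };
    assert_eq!(classify(&ok, true), Outcome::Ok);
    assert!(matches!(classify(&replaying, true), Outcome::RetryLater(_)));
}
```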
@@ -173,11 +247,11 @@ impl UpgradeCandidateRegion { .await?; let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; + let region_ids = &pc.region_ids; let candidate = &pc.to_peer; let msg = MailboxMessage::json_message( - &format!("Upgrade candidate region: {}", region_id), + &format!("Upgrade candidate regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", candidate.id, candidate.addr), common_time::util::current_time_millis(), @@ -190,14 +264,17 @@ impl UpgradeCandidateRegion { let ch = Channel::Datanode(candidate.id); let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?; + let now = Instant::now(); match receiver.await { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; - let InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready, - exists, - error, - }) = reply + info!( + "Received upgrade region reply: {:?}, regions: {:?}, elapsed: {:?}", + reply, + region_ids, + now.elapsed() + ); + let InstructionReply::UpgradeRegions(UpgradeRegionsReply { replies }) = reply else { return error::UnexpectedInstructionReplySnafu { mailbox_message: msg.to_string(), @@ -205,44 +282,16 @@ impl UpgradeCandidateRegion { } .fail(); }; - - // Notes: The order of handling is important. - if error.is_some() { - return error::RetryLaterSnafu { - reason: format!( - "Failed to upgrade the region {} on datanode {:?}, error: {:?}", - region_id, candidate, error - ), - } - .fail(); + for reply in replies { + self.handle_upgrade_region_reply(ctx, &reply, &now)?; } - - ensure!( - exists, - error::UnexpectedSnafu { - violated: format!( - "Candidate region {} doesn't exist on datanode {:?}", - region_id, candidate - ) - } - ); - - if self.require_ready && !ready { - return error::RetryLaterSnafu { - reason: format!( - "Candidate region {} still replaying the wal on datanode {:?}", - region_id, candidate - ), - } - .fail(); - } - Ok(()) } Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( - "Mailbox received timeout for upgrade candidate region {region_id} on datanode {:?}", + "Mailbox received timeout for upgrade candidate regions {region_ids:?} on datanode {:?}, elapsed: {:?}", candidate, + now.elapsed() ); error::RetryLaterSnafu { reason }.fail() } @@ -257,26 +306,24 @@ impl UpgradeCandidateRegion { &self, ctx: &mut Context, procedure_ctx: &ProcedureContext, - wal_options: Option<&WalOptions>, + topics: HashSet, ) -> bool { let mut retry = 0; let mut upgraded = false; + let mut guards = Vec::with_capacity(topics.len()); loop { let timer = Instant::now(); // If using Kafka WAL, acquire a read lock on the topic to prevent WAL pruning during the upgrade. 
- let _guard = if let Some(WalOptions::Kafka(kafka_wal_options)) = wal_options { - Some( + for topic in &topics { + guards.push( procedure_ctx .provider - .acquire_lock( - &(RemoteWalLock::Read(kafka_wal_options.topic.clone()).into()), - ) + .acquire_lock(&(RemoteWalLock::Read(topic.clone()).into())) .await, - ) - } else { - None - }; + ); + } + if let Err(err) = self.upgrade_region(ctx).await { retry += 1; ctx.update_operations_elapsed(timer); @@ -322,22 +369,21 @@ mod tests { }; fn new_persistent_context() -> PersistentContext { - PersistentContext { - catalog: "greptime".into(), - schema: "public".into(), - from_peer: Peer::empty(1), - to_peer: Peer::empty(2), - region_id: RegionId::new(1024, 1), - timeout: Duration::from_millis(1000), - trigger_reason: RegionMigrationTriggerReason::Manual, - } + PersistentContext::new( + vec![("greptime".into(), "public".into())], + Peer::empty(1), + Peer::empty(2), + vec![RegionId::new(1024, 1)], + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + ) } async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap) { - let table_info = - new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into(); + let region_id = ctx.persistent_ctx.region_ids[0]; + let table_info = new_test_table_info(region_id.table_id(), vec![1]).into(); let region_routes = vec![RegionRoute { - region: Region::new_test(ctx.persistent_ctx.region_id), + region: Region::new_test(region_id), leader_peer: Some(ctx.persistent_ctx.from_peer.clone()), follower_peers: vec![ctx.persistent_ctx.to_peer.clone()], ..Default::default() diff --git a/src/meta-srv/src/procedure/region_migration/utils.rs b/src/meta-srv/src/procedure/region_migration/utils.rs new file mode 100644 index 0000000000..09921ee0d6 --- /dev/null +++ b/src/meta-srv/src/procedure/region_migration/utils.rs @@ -0,0 +1,487 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::Display; +use std::time::Duration; + +use common_meta::key::TableMetadataManagerRef; +use common_meta::peer::Peer; +use common_meta::rpc::router::RegionRoute; +use itertools::Itertools; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::{RegionId, TableId}; + +use crate::error::{self, Result}; +use crate::procedure::region_migration::{ + DEFAULT_REGION_MIGRATION_TIMEOUT, RegionMigrationProcedureTask, RegionMigrationTriggerReason, +}; + +/// A migration task describing how regions are intended to move between peers. +#[derive(Debug, Clone)] +pub struct RegionMigrationTaskBatch { + /// Region ids involved in this migration. + pub region_ids: Vec, + /// Source peer where regions currently reside. + pub from_peer: Peer, + /// Destination peer to migrate regions to. + pub to_peer: Peer, + /// Timeout for migration. + pub timeout: Duration, + /// Reason why this migration was triggered. 
+ pub trigger_reason: RegionMigrationTriggerReason, +} + +impl RegionMigrationTaskBatch { + /// Constructs a [`RegionMigrationTaskBatch`] from a vector of region migration procedure tasks. + /// + /// Aggregates region IDs, determines source and destination peers, sets an appropriate timeout, + /// and assigns the trigger reason for the migration batch. + /// + /// # Panic + /// if the `tasks` are empty. + pub fn from_tasks(tasks: Vec<(RegionMigrationProcedureTask, u32)>) -> Self { + let max_count = tasks.iter().map(|(_, count)| *count).max().unwrap_or(1); + let region_ids = tasks.iter().map(|(r, _)| r.region_id).collect::>(); + let from_peer = tasks[0].0.from_peer.clone(); + let to_peer = tasks[0].0.to_peer.clone(); + let timeout = DEFAULT_REGION_MIGRATION_TIMEOUT * max_count; + let trigger_reason = RegionMigrationTriggerReason::Failover; + Self { + region_ids, + from_peer, + to_peer, + timeout, + trigger_reason, + } + } +} + +impl Display for RegionMigrationTaskBatch { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "RegionMigrationTask {{ region_ids: {:?}, from_peer: {:?}, to_peer: {:?}, timeout: {:?}, trigger_reason: {:?} }}", + self.region_ids, self.from_peer, self.to_peer, self.timeout, self.trigger_reason + ) + } +} + +impl RegionMigrationTaskBatch { + /// Returns the table regions map. + /// + /// The key is the table id, the value is the region ids of the table. + pub(crate) fn table_regions(&self) -> HashMap> { + let mut table_regions = HashMap::new(); + for region_id in &self.region_ids { + table_regions + .entry(region_id.table_id()) + .or_insert_with(Vec::new) + .push(*region_id); + } + table_regions + } +} + +/// Represents the result of analyzing a migration task. +#[derive(Debug, Clone, Default, PartialEq)] +pub(crate) struct RegionMigrationAnalysis { + /// Regions already migrated to the `to_peer`. + pub(crate) migrated: Vec, + /// Regions where the leader peer has changed. + pub(crate) leader_changed: Vec, + /// Regions where `to_peer` is already a follower (conflict). + pub(crate) peer_conflict: Vec, + /// Regions whose table is not found. + pub(crate) table_not_found: Vec, + /// Regions still pending migration. + pub(crate) pending: Vec, +} + +fn leader_peer(region_route: &RegionRoute) -> Result<&Peer> { + region_route + .leader_peer + .as_ref() + .with_context(|| error::UnexpectedSnafu { + violated: format!( + "Region route leader peer is not found in region({})", + region_route.region.id + ), + }) +} + +/// Returns true if the region has already been migrated to `to_peer`. +fn has_migrated(region_route: &RegionRoute, to_peer_id: u64) -> Result { + if region_route.is_leader_downgrading() { + return Ok(false); + } + + let leader_peer = leader_peer(region_route)?; + Ok(leader_peer.id == to_peer_id) +} + +/// Returns true if the leader peer of the region has changed. +fn has_leader_changed(region_route: &RegionRoute, from_peer_id: u64) -> Result { + let leader_peer = leader_peer(region_route)?; + + Ok(leader_peer.id != from_peer_id) +} + +/// Returns true if `to_peer` is already a follower of the region (conflict). +fn has_peer_conflict(region_route: &RegionRoute, to_peer_id: u64) -> bool { + region_route + .follower_peers + .iter() + .map(|p| p.id) + .contains(&to_peer_id) +} + +/// Updates the verification result based on a single region route. 
+fn update_result_with_region_route( + result: &mut RegionMigrationAnalysis, + region_route: &RegionRoute, + from_peer_id: u64, + to_peer_id: u64, +) -> Result<()> { + if has_migrated(region_route, to_peer_id)? { + result.migrated.push(region_route.region.id); + return Ok(()); + } + if has_leader_changed(region_route, from_peer_id)? { + result.leader_changed.push(region_route.region.id); + return Ok(()); + } + if has_peer_conflict(region_route, to_peer_id) { + result.peer_conflict.push(region_route.region.id); + return Ok(()); + } + result.pending.push(region_route.region.id); + Ok(()) +} + +/// Analyzes the migration task and categorizes regions by their current state. +/// +/// Returns a [`RegionMigrationAnalysis`] describing the migration status. +pub async fn analyze_region_migration_task( + task: &RegionMigrationTaskBatch, + table_metadata_manager: &TableMetadataManagerRef, +) -> Result { + if task.to_peer.id == task.from_peer.id { + return error::InvalidArgumentsSnafu { + err_msg: format!( + "The `from_peer_id`({}) can't equal `to_peer_id`({})", + task.from_peer.id, task.to_peer.id + ), + } + .fail(); + } + let table_regions = task.table_regions(); + let table_ids = table_regions.keys().cloned().collect::>(); + let mut result = RegionMigrationAnalysis::default(); + + let table_routes = table_metadata_manager + .table_route_manager() + .table_route_storage() + .batch_get_with_raw_bytes(&table_ids) + .await + .context(error::TableMetadataManagerSnafu)?; + + for (table_id, table_route) in table_ids.into_iter().zip(table_routes) { + let region_ids = table_regions.get(&table_id).unwrap(); + let Some(table_route) = table_route else { + result.table_not_found.extend(region_ids); + continue; + }; + // Throws error if the table route is not a physical table route. + let region_routes = table_route.region_routes().with_context(|_| { + error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."), + } + })?; + for region_route in region_routes + .iter() + .filter(|r| region_ids.contains(&r.region.id)) + { + update_result_with_region_route( + &mut result, + region_route, + task.from_peer.id, + task.to_peer.id, + )?; + } + } + + Ok(result) +} + +#[cfg(test)] +mod tests { + + use std::assert_matches::assert_matches; + use std::sync::Arc; + use std::time::Duration; + + use common_meta::key::TableMetadataManager; + use common_meta::key::table_route::{ + LogicalTableRouteValue, PhysicalTableRouteValue, TableRouteValue, + }; + use common_meta::kv_backend::TxnService; + use common_meta::kv_backend::memory::MemoryKvBackend; + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use store_api::storage::RegionId; + + use crate::error::Error; + use crate::procedure::region_migration::RegionMigrationTriggerReason; + use crate::procedure::region_migration::utils::{ + RegionMigrationAnalysis, RegionMigrationTaskBatch, analyze_region_migration_task, + update_result_with_region_route, + }; + + #[test] + fn test_update_result_with_region_route() { + // The region is already migrated to the to_peer. 
+ let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 2, 1).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + migrated: vec![region_id], + ..Default::default() + } + ); + + // Test region leader changed. + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 2, 3).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + leader_changed: vec![region_id], + ..Default::default() + } + ); + + // Test region peer conflict. + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![Peer::empty(2)], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 1, 2).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + peer_conflict: vec![region_id], + ..Default::default() + } + ); + + // Test normal case. + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 1, 3).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + pending: vec![region_id], + ..Default::default() + } + ); + + // Test leader peer not set + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: None, + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + let err = update_result_with_region_route(&mut result, ®ion_route, 1, 3).unwrap_err(); + assert_matches!(err, Error::Unexpected { .. }); + } + + #[tokio::test] + async fn test_analyze_region_migration_task_invalid_task() { + let task = &RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(1), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let err = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap_err(); + assert_matches!(err, Error::InvalidArguments { .. 
}); + } + + #[tokio::test] + async fn test_analyze_region_migration_table_not_found() { + let task = &RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let result = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + table_not_found: vec![RegionId::new(1, 1)], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_analyze_region_migration_unexpected_logical_table() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let (txn, _) = table_metadata_manager + .table_route_manager() + .table_route_storage() + .build_create_txn( + 1024, + &TableRouteValue::Logical(LogicalTableRouteValue::new( + 1024, + vec![RegionId::new(1023, 1)], + )), + ) + .unwrap(); + kv_backend.txn(txn).await.unwrap(); + let task = &RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1024, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let err = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap_err(); + assert_matches!(err, Error::UnexpectedLogicalRouteTable { .. }); + } + + #[tokio::test] + async fn test_analyze_region_migration_normal_case() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let (txn, _) = table_metadata_manager + .table_route_manager() + .table_route_storage() + .build_create_txn( + 1024, + &TableRouteValue::Physical(PhysicalTableRouteValue::new(vec![ + // Already migrated to the to_peer. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 1)), + leader_peer: Some(Peer::empty(2)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + // Leader peer changed. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 2)), + leader_peer: Some(Peer::empty(3)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + // Peer conflict. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 3)), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![Peer::empty(2)], + leader_state: None, + leader_down_since: None, + }, + // Normal case. 
+ RegionRoute { + region: Region::new_test(RegionId::new(1024, 4)), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + ])), + ) + .unwrap(); + + kv_backend.txn(txn).await.unwrap(); + let task = &RegionMigrationTaskBatch { + region_ids: vec![ + RegionId::new(1024, 1), + RegionId::new(1024, 2), + RegionId::new(1024, 3), + RegionId::new(1024, 4), + RegionId::new(1025, 1), + ], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let result = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + pending: vec![RegionId::new(1024, 4)], + migrated: vec![RegionId::new(1024, 1)], + leader_changed: vec![RegionId::new(1024, 2)], + peer_conflict: vec![RegionId::new(1024, 3)], + table_not_found: vec![RegionId::new(1025, 1)], + } + ); + } +} diff --git a/src/meta-srv/src/procedure/repartition.rs b/src/meta-srv/src/procedure/repartition.rs new file mode 100644 index 0000000000..f55d349df5 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition.rs @@ -0,0 +1,19 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod group; +pub mod plan; + +#[cfg(test)] +pub mod test_util; diff --git a/src/meta-srv/src/procedure/repartition/group.rs b/src/meta-srv/src/procedure/repartition/group.rs new file mode 100644 index 0000000000..7c3ee14e64 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group.rs @@ -0,0 +1,284 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
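Before moving on to the new repartition module, here is a compact model of the per-route decision that `update_result_with_region_route` in `utils.rs` makes when `analyze_region_migration_task` categorizes a batch: migrated is checked first, then a changed leader, then a follower conflict, and only otherwise is the region pending. `RouteSketch` and `Category` are simplified stand-ins; the real code operates on `RegionRoute` and collects the ids into `RegionMigrationAnalysis`.

```rust
/// Minimal model of a region route for the analysis sketch.
#[derive(Debug)]
struct RouteSketch {
    region_id: u64,
    leader_peer_id: Option<u64>,
    follower_peer_ids: Vec<u64>,
    leader_downgrading: bool,
}

#[derive(Debug, PartialEq)]
enum Category {
    Migrated,
    LeaderChanged,
    PeerConflict,
    Pending,
}

/// Mirrors the per-route decision order used by `update_result_with_region_route`.
fn classify(route: &RouteSketch, from_peer_id: u64, to_peer_id: u64) -> Result<Category, String> {
    let leader = route
        .leader_peer_id
        .ok_or_else(|| format!("leader peer is not found in region({})", route.region_id))?;
    // A downgrading leader never counts as "already migrated".
    if !route.leader_downgrading && leader == to_peer_id {
        return Ok(Category::Migrated);
    }
    if leader != from_peer_id {
        return Ok(Category::LeaderChanged);
    }
    if route.follower_peer_ids.contains(&to_peer_id) {
        return Ok(Category::PeerConflict);
    }
    Ok(Category::Pending)
}

fn main() {
    let pending = RouteSketch {
        region_id: 4,
        leader_peer_id: Some(1),
        follower_peer_ids: vec![],
        leader_downgrading: false,
    };
    assert_eq!(classify(&pending, 1, 2), Ok(Category::Pending));
}
```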
+ +pub(crate) mod repartition_start; +pub(crate) mod update_metadata; + +use std::any::Any; +use std::fmt::Debug; + +use common_error::ext::BoxedError; +use common_meta::DatanodeId; +use common_meta::cache_invalidator::CacheInvalidatorRef; +use common_meta::instruction::CacheIdent; +use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue, RegionInfo}; +use common_meta::key::table_route::TableRouteValue; +use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; +use common_meta::rpc::router::RegionRoute; +use common_procedure::{Context as ProcedureContext, Status}; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::{RegionId, TableId}; +use uuid::Uuid; + +use crate::error::{self, Result}; +use crate::procedure::repartition::plan::RegionDescriptor; + +pub type GroupId = Uuid; + +pub struct RepartitionGroupProcedure {} + +pub struct Context { + pub persistent_ctx: PersistentContext, + + pub cache_invalidator: CacheInvalidatorRef, + + pub table_metadata_manager: TableMetadataManagerRef, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct GroupPrepareResult { + pub source_routes: Vec, + pub target_routes: Vec, + pub central_region: RegionId, + pub central_region_datanode_id: DatanodeId, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistentContext { + pub group_id: GroupId, + /// The table id of the repartition group. + pub table_id: TableId, + /// The source regions of the repartition group. + pub sources: Vec, + /// The target regions of the repartition group. + pub targets: Vec, + /// The result of group prepare. + /// The value will be set in [RepartitionStart](crate::procedure::repartition::group::repartition_start::RepartitionStart) state. + pub group_prepare_result: Option, +} + +impl Context { + /// Retrieves the table route value for the given table id. + /// + /// Retry: + /// - Failed to retrieve the metadata of table. + /// + /// Abort: + /// - Table route not found. + pub async fn get_table_route_value( + &self, + ) -> Result> { + let table_id = self.persistent_ctx.table_id; + let group_id = self.persistent_ctx.group_id; + let table_route_value = self + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get_with_raw_bytes(table_id) + .await + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to get table route for table: {}, repartition group: {}", + table_id, group_id + ), + })? + .context(error::TableRouteNotFoundSnafu { table_id })?; + + Ok(table_route_value) + } + + /// Returns the `datanode_table_value` + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. + pub async fn get_datanode_table_value( + &self, + table_id: TableId, + datanode_id: u64, + ) -> Result { + let datanode_table_value = self + .table_metadata_manager + .datanode_table_manager() + .get(&DatanodeTableKey { + datanode_id, + table_id, + }) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get DatanodeTable: {table_id}"), + })? + .context(error::DatanodeTableNotFoundSnafu { + table_id, + datanode_id, + })?; + Ok(datanode_table_value) + } + + /// Broadcasts the invalidate table cache message. 
+ pub async fn invalidate_table_cache(&self) -> Result<()> { + let table_id = self.persistent_ctx.table_id; + let group_id = self.persistent_ctx.group_id; + let subject = format!( + "Invalidate table cache for repartition table, group: {}, table: {}", + group_id, table_id, + ); + let ctx = common_meta::cache_invalidator::Context { + subject: Some(subject), + }; + let _ = self + .cache_invalidator + .invalidate(&ctx, &[CacheIdent::TableId(table_id)]) + .await; + Ok(()) + } + + /// Updates the table route. + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. + /// + /// Abort: + /// - Table route not found. + /// - Failed to update the table route. + pub async fn update_table_route( + &self, + current_table_route_value: &DeserializedValueWithBytes, + new_region_routes: Vec, + ) -> Result<()> { + let table_id = self.persistent_ctx.table_id; + // Safety: prepare result is set in [RepartitionStart] state. + let prepare_result = self.persistent_ctx.group_prepare_result.as_ref().unwrap(); + let central_region_datanode_table_value = self + .get_datanode_table_value(table_id, prepare_result.central_region_datanode_id) + .await?; + let RegionInfo { + region_options, + region_wal_options, + .. + } = ¢ral_region_datanode_table_value.region_info; + + self.table_metadata_manager + .update_table_route( + table_id, + central_region_datanode_table_value.region_info.clone(), + current_table_route_value, + new_region_routes, + region_options, + region_wal_options, + ) + .await + .context(error::TableMetadataManagerSnafu) + } +} + +/// Returns the region routes of the given table route value. +/// +/// Abort: +/// - Table route value is not physical. +pub fn region_routes( + table_id: TableId, + table_route_value: &TableRouteValue, +) -> Result<&Vec> { + table_route_value + .region_routes() + .with_context(|_| error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!( + "TableRoute({:?}) is a non-physical TableRouteValue.", + table_id + ), + }) +} + +#[async_trait::async_trait] +#[typetag::serde(tag = "repartition_group_state")] +pub(crate) trait State: Sync + Send + Debug { + fn name(&self) -> &'static str { + let type_name = std::any::type_name::(); + // short name + type_name.split("::").last().unwrap_or(type_name) + } + + /// Yields the next [State] and [Status]. + async fn next( + &mut self, + ctx: &mut Context, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)>; + + fn as_any(&self) -> &dyn Any; +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::sync::Arc; + + use common_meta::key::TableMetadataManager; + use common_meta::kv_backend::test_util::MockKvBackendBuilder; + + use crate::error::Error; + use crate::procedure::repartition::test_util::{TestingEnv, new_persistent_context}; + + #[tokio::test] + async fn test_get_table_route_value_not_found_error() { + let env = TestingEnv::new(); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context); + let err = ctx.get_table_route_value().await.unwrap_err(); + assert_matches!(err, Error::TableRouteNotFound { .. 
}); + assert!(!err.is_retryable()); + } + + #[tokio::test] + async fn test_get_table_route_value_retry_error() { + let kv = MockKvBackendBuilder::default() + .range_fn(Arc::new(|_| { + common_meta::error::UnexpectedSnafu { + err_msg: "mock err", + } + .fail() + })) + .build() + .unwrap(); + let mut env = TestingEnv::new(); + env.table_metadata_manager = Arc::new(TableMetadataManager::new(Arc::new(kv))); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context); + let err = ctx.get_table_route_value().await.unwrap_err(); + assert!(err.is_retryable()); + } + + #[tokio::test] + async fn test_get_datanode_table_value_retry_error() { + let kv = MockKvBackendBuilder::default() + .range_fn(Arc::new(|_| { + common_meta::error::UnexpectedSnafu { + err_msg: "mock err", + } + .fail() + })) + .build() + .unwrap(); + let mut env = TestingEnv::new(); + env.table_metadata_manager = Arc::new(TableMetadataManager::new(Arc::new(kv))); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context); + let err = ctx.get_datanode_table_value(1024, 1).await.unwrap_err(); + assert!(err.is_retryable()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs new file mode 100644 index 0000000000..5e72ce613c --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs @@ -0,0 +1,273 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::HashMap; + +use common_meta::rpc::router::RegionRoute; +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::debug; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, ensure}; + +use crate::error::{self, Result}; +use crate::procedure::repartition::group::{ + Context, GroupId, GroupPrepareResult, State, region_routes, +}; +use crate::procedure::repartition::plan::RegionDescriptor; + +#[derive(Debug, Serialize, Deserialize)] +pub struct RepartitionStart; + +/// Ensures that the partition expression of the region route matches the partition expression of the region descriptor. +fn ensure_region_route_expr_match( + region_route: &RegionRoute, + region_descriptor: &RegionDescriptor, +) -> Result { + let actual = ®ion_route.region.partition_expr; + let expected = region_descriptor + .partition_expr + .as_json_str() + .context(error::SerializePartitionExprSnafu)?; + ensure!( + actual == &expected, + error::PartitionExprMismatchSnafu { + region_id: region_route.region.id, + expected, + actual, + } + ); + Ok(region_route.clone()) +} + +impl RepartitionStart { + /// Ensures that both source and target regions are present in the region routes. + /// + /// Both source and target regions must be present in the region routes (target regions should be allocated before repartitioning). 
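// --- Illustrative aside (not part of the patch) -------------------------------------
// A small sketch of the invariant `ensure_region_route_expr_match` enforces: the
// partition expression stored on the route (a JSON string) must equal the serialized
// expression of the corresponding `RegionDescriptor`. It reuses the `range_expr`
// helper and route builders from the test module; imports are elided and the concrete
// values are arbitrary.
fn expr_match_example() -> Result<RegionRoute> {
    let descriptor = RegionDescriptor {
        region_id: RegionId::new(1024, 1),
        partition_expr: range_expr("x", 0, 100),
    };
    let route = RegionRoute {
        region: Region {
            id: RegionId::new(1024, 1),
            // Matches `descriptor.partition_expr`, so the check passes.
            partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
            ..Default::default()
        },
        leader_peer: Some(Peer::empty(1)),
        ..Default::default()
    };
    // Had the route carried e.g. `range_expr("x", 0, 5)`, this would return
    // `Error::PartitionExprMismatch` and abort the procedure.
    ensure_region_route_expr_match(&route, &descriptor)
}
// -------------------------------------------------------------------------------------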
+    #[allow(dead_code)]
+    fn ensure_route_present(
+        group_id: GroupId,
+        region_routes: &[RegionRoute],
+        sources: &[RegionDescriptor],
+        targets: &[RegionDescriptor],
+    ) -> Result<GroupPrepareResult> {
+        ensure!(
+            !sources.is_empty(),
+            error::UnexpectedSnafu {
+                violated: "Sources are empty"
+            }
+        );
+
+        let region_routes_map = region_routes
+            .iter()
+            .map(|r| (r.region.id, r))
+            .collect::<HashMap<_, _>>();
+        let source_region_routes = sources
+            .iter()
+            .map(|s| {
+                region_routes_map
+                    .get(&s.region_id)
+                    .context(error::RepartitionSourceRegionMissingSnafu {
+                        group_id,
+                        region_id: s.region_id,
+                    })
+                    .and_then(|r| ensure_region_route_expr_match(r, s))
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let target_region_routes = targets
+            .iter()
+            .map(|t| {
+                region_routes_map
+                    .get(&t.region_id)
+                    .context(error::RepartitionTargetRegionMissingSnafu {
+                        group_id,
+                        region_id: t.region_id,
+                    })
+                    .map(|r| (*r).clone())
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let central_region = sources[0].region_id;
+        let central_region_datanode_id = source_region_routes[0]
+            .leader_peer
+            .as_ref()
+            .context(error::UnexpectedSnafu {
+                violated: format!(
+                    "Leader peer is not set for central region: {}",
+                    central_region
+                ),
+            })?
+            .id;
+
+        Ok(GroupPrepareResult {
+            source_routes: source_region_routes,
+            target_routes: target_region_routes,
+            central_region,
+            central_region_datanode_id,
+        })
+    }
+
+    #[allow(dead_code)]
+    fn next_state() -> (Box<dyn State>, Status) {
+        // TODO(weny): change it later.
+        (Box::new(RepartitionStart), Status::executing(true))
+    }
+}
+
+#[async_trait::async_trait]
+#[typetag::serde]
+impl State for RepartitionStart {
+    /// Captures the group prepare result.
+    ///
+    /// Retry:
+    /// - Failed to get the table route.
+    ///
+    /// Abort:
+    /// - Table route not found.
+    /// - Table route is not physical.
+    /// - Failed to ensure the route is present.
+    /// - Failed to capture the group prepare result.
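// --- Illustrative aside (not part of the patch) -------------------------------------
// A rough sketch of how a repartition group procedure is expected to advance through
// `State::next` until the returned `Status` reports completion. The loop below is an
// assumption for illustration; the real driver is the procedure framework, which also
// persists `PersistentContext` between steps. `Status::is_done()` is assumed here as
// the completion check.
async fn drive_states(
    mut state: Box<dyn State>,
    ctx: &mut Context,
    procedure_ctx: &ProcedureContext,
) -> Result<()> {
    loop {
        // Each state hands back its successor plus the status reported to the framework.
        let (next, status) = state.next(ctx, procedure_ctx).await?;
        if status.is_done() {
            return Ok(());
        }
        state = next;
    }
}
// -------------------------------------------------------------------------------------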
+ async fn next( + &mut self, + ctx: &mut Context, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + if ctx.persistent_ctx.group_prepare_result.is_some() { + return Ok(Self::next_state()); + } + let table_id = ctx.persistent_ctx.table_id; + let group_id = ctx.persistent_ctx.group_id; + let table_route_value = ctx.get_table_route_value().await?.into_inner(); + let region_routes = region_routes(table_id, &table_route_value)?; + let group_prepare_result = Self::ensure_route_present( + group_id, + region_routes, + &ctx.persistent_ctx.sources, + &ctx.persistent_ctx.targets, + )?; + ctx.persistent_ctx.group_prepare_result = Some(group_prepare_result); + debug!( + "Repartition group {}: captured {} sources, {} targets", + group_id, + ctx.persistent_ctx.sources.len(), + ctx.persistent_ctx.targets.len() + ); + + Ok(Self::next_state()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use store_api::storage::RegionId; + use uuid::Uuid; + + use crate::error::Error; + use crate::procedure::repartition::group::repartition_start::RepartitionStart; + use crate::procedure::repartition::plan::RegionDescriptor; + use crate::procedure::repartition::test_util::range_expr; + + #[test] + fn test_ensure_route_present_missing_source_region() { + let source_region = RegionDescriptor { + region_id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(1024, 2), + partition_expr: range_expr("x", 0, 10), + }; + let region_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(1024, 2), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let err = RepartitionStart::ensure_route_present( + Uuid::new_v4(), + ®ion_routes, + &[source_region], + &[target_region], + ) + .unwrap_err(); + assert_matches!(err, Error::RepartitionSourceRegionMissing { .. }); + } + + #[test] + fn test_ensure_route_present_partition_expr_mismatch() { + let source_region = RegionDescriptor { + region_id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(1024, 2), + partition_expr: range_expr("x", 0, 10), + }; + let region_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 5).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let err = RepartitionStart::ensure_route_present( + Uuid::new_v4(), + ®ion_routes, + &[source_region], + &[target_region], + ) + .unwrap_err(); + assert_matches!(err, Error::PartitionExprMismatch { .. 
});
+    }
+
+    #[test]
+    fn test_ensure_route_present_missing_target_region() {
+        let source_region = RegionDescriptor {
+            region_id: RegionId::new(1024, 1),
+            partition_expr: range_expr("x", 0, 100),
+        };
+        let target_region = RegionDescriptor {
+            region_id: RegionId::new(1024, 2),
+            partition_expr: range_expr("x", 0, 10),
+        };
+        let region_routes = vec![RegionRoute {
+            region: Region {
+                id: RegionId::new(1024, 1),
+                partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }];
+        let err = RepartitionStart::ensure_route_present(
+            Uuid::new_v4(),
+            &region_routes,
+            &[source_region],
+            &[target_region],
+        )
+        .unwrap_err();
+        assert_matches!(err, Error::RepartitionTargetRegionMissing { .. });
+    }
+}
diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata.rs
new file mode 100644
index 0000000000..8f42ff8432
--- /dev/null
+++ b/src/meta-srv/src/procedure/repartition/group/update_metadata.rs
@@ -0,0 +1,80 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub(crate) mod apply_staging_region;
+pub(crate) mod rollback_staging_region;
+
+use std::any::Any;
+
+use common_procedure::{Context as ProcedureContext, Status};
+use common_telemetry::warn;
+use serde::{Deserialize, Serialize};
+
+use crate::error::Result;
+use crate::procedure::repartition::group::repartition_start::RepartitionStart;
+use crate::procedure::repartition::group::{Context, State};
+
+#[derive(Debug, Serialize, Deserialize)]
+pub enum UpdateMetadata {
+    /// Applies the new partition expressions for staging regions.
+    ApplyStaging,
+    /// Rolls back the new partition expressions for staging regions.
+    RollbackStaging,
+}
+
+impl UpdateMetadata {
+    #[allow(dead_code)]
+    fn next_state() -> (Box<dyn State>, Status) {
+        // TODO(weny): change it later.
+        (Box::new(RepartitionStart), Status::executing(true))
+    }
+}
+
+#[async_trait::async_trait]
+#[typetag::serde]
+impl State for UpdateMetadata {
+    async fn next(
+        &mut self,
+        ctx: &mut Context,
+        _procedure_ctx: &ProcedureContext,
+    ) -> Result<(Box<dyn State>, Status)> {
+        match self {
+            UpdateMetadata::ApplyStaging => {
+                // TODO(weny): If all metadata have already been updated, skip applying staging regions.
+ self.apply_staging_regions(ctx).await?; + + if let Err(err) = ctx.invalidate_table_cache().await { + warn!( + "Failed to broadcast the invalidate table cache message during the apply staging regions, error: {err:?}" + ); + }; + Ok(Self::next_state()) + } + UpdateMetadata::RollbackStaging => { + self.rollback_staging_regions(ctx).await?; + + if let Err(err) = ctx.invalidate_table_cache().await { + warn!( + "Failed to broadcast the invalidate table cache message during the rollback staging regions, error: {err:?}" + ); + }; + Ok(Self::next_state()) + } + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs new file mode 100644 index 0000000000..6f342931a8 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs @@ -0,0 +1,181 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use common_error::ext::BoxedError; +use common_meta::rpc::router::RegionRoute; +use common_telemetry::error; +use snafu::{OptionExt, ResultExt}; + +use crate::error::{self, Result}; +use crate::procedure::repartition::group::update_metadata::UpdateMetadata; +use crate::procedure::repartition::group::{Context, GroupId, region_routes}; +use crate::procedure::repartition::plan::RegionDescriptor; + +impl UpdateMetadata { + /// Applies the new partition expressions for staging regions. + /// + /// Abort: + /// - Target region not found. + /// - Source region not found. + fn apply_staging_region_routes( + group_id: GroupId, + sources: &[RegionDescriptor], + targets: &[RegionDescriptor], + current_region_routes: &[RegionRoute], + ) -> Result> { + let mut region_routes = current_region_routes.to_vec(); + let mut region_routes_map = region_routes + .iter_mut() + .map(|route| (route.region.id, route)) + .collect::>(); + + for target in targets { + let region_route = region_routes_map.get_mut(&target.region_id).context( + error::RepartitionTargetRegionMissingSnafu { + group_id, + region_id: target.region_id, + }, + )?; + region_route.region.partition_expr = target + .partition_expr + .as_json_str() + .context(error::SerializePartitionExprSnafu)?; + region_route.set_leader_staging(); + } + + for source in sources { + let region_route = region_routes_map.get_mut(&source.region_id).context( + error::RepartitionSourceRegionMissingSnafu { + group_id, + region_id: source.region_id, + }, + )?; + region_route.set_leader_staging(); + } + + Ok(region_routes) + } + + /// Applies the new partition expressions for staging regions. + /// + /// Abort: + /// - Table route is not physical. + /// - Target region not found. + /// - Source region not found. + /// - Failed to update the table route. + /// - Central region datanode table value not found. 
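// --- Illustrative aside (not part of the patch) -------------------------------------
// A condensed sketch of the route transformation performed by
// `apply_staging_region_routes`: target regions receive their new partition expressions
// and every involved route is flipped to the leader-staging state. Region ids and
// expressions are arbitrary example values; `range_expr` comes from the test utilities.
fn apply_staging_example(current_routes: &[RegionRoute]) -> Result<Vec<RegionRoute>> {
    let group_id = Uuid::new_v4();
    let source = RegionDescriptor {
        region_id: RegionId::new(1024, 1),
        // The source expression stays untouched during the apply step.
        partition_expr: range_expr("x", 0, 100),
    };
    let target = RegionDescriptor {
        region_id: RegionId::new(1024, 2),
        // This expression is written onto the target route as JSON.
        partition_expr: range_expr("x", 0, 10),
    };
    let staged =
        UpdateMetadata::apply_staging_region_routes(group_id, &[source], &[target], current_routes)?;
    // After this call both the source route (region 1) and the target route (region 2)
    // report `is_leader_staging() == true`; untouched routes keep their previous state.
    Ok(staged)
}
// -------------------------------------------------------------------------------------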
+ #[allow(dead_code)] + pub(crate) async fn apply_staging_regions(&self, ctx: &mut Context) -> Result<()> { + let table_id = ctx.persistent_ctx.table_id; + let group_id = ctx.persistent_ctx.group_id; + let current_table_route_value = ctx.get_table_route_value().await?; + let region_routes = region_routes(table_id, current_table_route_value.get_inner_ref())?; + let new_region_routes = Self::apply_staging_region_routes( + group_id, + &ctx.persistent_ctx.sources, + &ctx.persistent_ctx.targets, + region_routes, + )?; + + if let Err(err) = ctx + .update_table_route(¤t_table_route_value, new_region_routes) + .await + { + error!(err; "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}"); + return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}" + ), + }); + }; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use store_api::storage::RegionId; + use uuid::Uuid; + + use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::plan::RegionDescriptor; + use crate::procedure::repartition::test_util::range_expr; + + #[test] + fn test_generate_region_routes() { + let group_id = Uuid::new_v4(); + let table_id = 1024; + let region_routes = vec![ + RegionRoute { + region: Region { + id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 2), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }, + ]; + let source_region = RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 0, 10), + }; + + let new_region_routes = UpdateMetadata::apply_staging_region_routes( + group_id, + &[source_region], + &[target_region], + ®ion_routes, + ) + .unwrap(); + assert!(new_region_routes[0].is_leader_staging()); + assert_eq!( + new_region_routes[0].region.partition_expr, + range_expr("x", 0, 100).as_json_str().unwrap() + ); + assert_eq!( + new_region_routes[1].region.partition_expr, + range_expr("x", 0, 10).as_json_str().unwrap() + ); + assert!(new_region_routes[1].is_leader_staging()); + assert!(!new_region_routes[2].is_leader_staging()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs new file mode 100644 index 0000000000..3d147d82ad --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs @@ -0,0 +1,187 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use common_error::ext::BoxedError; +use common_meta::rpc::router::RegionRoute; +use common_telemetry::error; +use snafu::{OptionExt, ResultExt}; + +use crate::error::{self, Result}; +use crate::procedure::repartition::group::update_metadata::UpdateMetadata; +use crate::procedure::repartition::group::{Context, GroupId, region_routes}; + +impl UpdateMetadata { + /// Rolls back the staging regions. + /// + /// Abort: + /// - Source region not found. + /// - Target region not found. + #[allow(dead_code)] + fn rollback_staging_region_routes( + group_id: GroupId, + source_routes: &[RegionRoute], + target_routes: &[RegionRoute], + current_region_routes: &[RegionRoute], + ) -> Result> { + let mut region_routes = current_region_routes.to_vec(); + let mut region_routes_map = region_routes + .iter_mut() + .map(|route| (route.region.id, route)) + .collect::>(); + + for source in source_routes { + let region_route = region_routes_map.get_mut(&source.region.id).context( + error::RepartitionSourceRegionMissingSnafu { + group_id, + region_id: source.region.id, + }, + )?; + region_route.region.partition_expr = source.region.partition_expr.clone(); + region_route.clear_leader_staging(); + } + + for target in target_routes { + let region_route = region_routes_map.get_mut(&target.region.id).context( + error::RepartitionTargetRegionMissingSnafu { + group_id, + region_id: target.region.id, + }, + )?; + region_route.clear_leader_staging(); + } + + Ok(region_routes) + } + + /// Rolls back the metadata for staging regions. + /// + /// Abort: + /// - Table route is not physical. + /// - Source region not found. + /// - Target region not found. + /// - Failed to update the table route. + /// - Central region datanode table value not found. + #[allow(dead_code)] + pub(crate) async fn rollback_staging_regions(&self, ctx: &mut Context) -> Result<()> { + let table_id = ctx.persistent_ctx.table_id; + let group_id = ctx.persistent_ctx.group_id; + let current_table_route_value = ctx.get_table_route_value().await?; + let region_routes = region_routes(table_id, current_table_route_value.get_inner_ref())?; + // Safety: prepare result is set in [RepartitionStart] state. 
+ let prepare_result = ctx.persistent_ctx.group_prepare_result.as_ref().unwrap(); + let new_region_routes = Self::rollback_staging_region_routes( + group_id, + &prepare_result.source_routes, + &prepare_result.target_routes, + region_routes, + )?; + + if let Err(err) = ctx + .update_table_route(¤t_table_route_value, new_region_routes) + .await + { + error!(err; "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}"); + return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}" + ), + }); + }; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_meta::peer::Peer; + use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; + use store_api::storage::RegionId; + use uuid::Uuid; + + use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::test_util::range_expr; + + #[test] + fn test_rollback_staging_region_routes() { + let group_id = Uuid::new_v4(); + let table_id = 1024; + let region_routes = vec![ + RegionRoute { + region: Region { + id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state: Some(LeaderState::Staging), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 2), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state: Some(LeaderState::Staging), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state: Some(LeaderState::Downgrading), + ..Default::default() + }, + ]; + let source_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let target_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let new_region_routes = UpdateMetadata::rollback_staging_region_routes( + group_id, + &source_routes, + &target_routes, + ®ion_routes, + ) + .unwrap(); + assert!(!new_region_routes[0].is_leader_staging()); + assert_eq!( + new_region_routes[0].region.partition_expr, + range_expr("x", 0, 20).as_json_str().unwrap(), + ); + assert!(!new_region_routes[1].is_leader_staging()); + assert!(new_region_routes[2].is_leader_downgrading()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/plan.rs b/src/meta-srv/src/procedure/repartition/plan.rs new file mode 100644 index 0000000000..6d753a044c --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/plan.rs @@ -0,0 +1,26 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use partition::expr::PartitionExpr; +use serde::{Deserialize, Serialize}; +use store_api::storage::RegionId; + +/// Metadata describing a region involved in the plan. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RegionDescriptor { + /// The region id of the region involved in the plan. + pub region_id: RegionId, + /// The new partition expression of the region. + pub partition_expr: PartitionExpr, +} diff --git a/src/meta-srv/src/procedure/repartition/test_util.rs b/src/meta-srv/src/procedure/repartition/test_util.rs new file mode 100644 index 0000000000..3c0ebee58a --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/test_util.rs @@ -0,0 +1,91 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; +use common_meta::kv_backend::memory::MemoryKvBackend; +use common_meta::sequence::SequenceBuilder; +use datatypes::value::Value; +use partition::expr::{PartitionExpr, col}; +use store_api::storage::TableId; +use uuid::Uuid; + +use crate::cache_invalidator::MetasrvCacheInvalidator; +use crate::metasrv::MetasrvInfo; +use crate::procedure::repartition::group::{Context, PersistentContext}; +use crate::procedure::repartition::plan::RegionDescriptor; +use crate::procedure::test_util::MailboxContext; + +/// `TestingEnv` provides components during the tests. 
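// --- Illustrative aside (not part of the patch) -------------------------------------
// A sketch of how a repartition plan might describe splitting one region covering
// x in [0, 100) into two target regions, using the `range_expr` test helper defined
// below. The region ids are arbitrary example values.
fn example_split_plan() -> (Vec<RegionDescriptor>, Vec<RegionDescriptor>) {
    let sources = vec![RegionDescriptor {
        region_id: RegionId::new(1024, 1),
        partition_expr: range_expr("x", 0, 100),
    }];
    let targets = vec![
        RegionDescriptor {
            region_id: RegionId::new(1024, 2),
            partition_expr: range_expr("x", 0, 50),
        },
        RegionDescriptor {
            region_id: RegionId::new(1024, 3),
            partition_expr: range_expr("x", 50, 100),
        },
    ];
    (sources, targets)
}
// -------------------------------------------------------------------------------------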
+pub struct TestingEnv { + pub table_metadata_manager: TableMetadataManagerRef, + pub mailbox_ctx: MailboxContext, +} + +impl Default for TestingEnv { + fn default() -> Self { + Self::new() + } +} + +impl TestingEnv { + pub fn new() -> Self { + let kv_backend = Arc::new(MemoryKvBackend::new()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let mailbox_sequence = + SequenceBuilder::new("test_heartbeat_mailbox", kv_backend.clone()).build(); + let mailbox_ctx = MailboxContext::new(mailbox_sequence); + + Self { + table_metadata_manager, + mailbox_ctx, + } + } + + pub fn create_context(self, persistent_context: PersistentContext) -> Context { + let cache_invalidator = Arc::new(MetasrvCacheInvalidator::new( + self.mailbox_ctx.mailbox().clone(), + MetasrvInfo { + server_addr: String::new(), + }, + )); + + Context { + persistent_ctx: persistent_context, + table_metadata_manager: self.table_metadata_manager.clone(), + cache_invalidator, + } + } +} + +pub fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { + col(col_name) + .gt_eq(Value::Int64(start)) + .and(col(col_name).lt(Value::Int64(end))) +} + +pub fn new_persistent_context( + table_id: TableId, + sources: Vec, + targets: Vec, +) -> PersistentContext { + PersistentContext { + group_id: Uuid::new_v4(), + table_id, + sources, + targets, + group_prepare_result: None, + } +} diff --git a/src/meta-srv/src/procedure/test_util.rs b/src/meta-srv/src/procedure/test_util.rs index 8197087351..1586ad5f5f 100644 --- a/src/meta-srv/src/procedure/test_util.rs +++ b/src/meta-srv/src/procedure/test_util.rs @@ -17,7 +17,8 @@ use std::collections::HashMap; use api::v1::meta::mailbox_message::Payload; use api::v1::meta::{HeartbeatResponse, MailboxMessage}; use common_meta::instruction::{ - DowngradeRegionReply, FlushRegionReply, InstructionReply, SimpleReply, UpgradeRegionReply, + DowngradeRegionReply, DowngradeRegionsReply, FlushRegionReply, InstructionReply, SimpleReply, + UpgradeRegionReply, UpgradeRegionsReply, }; use common_meta::key::TableMetadataManagerRef; use common_meta::key::table_route::TableRouteValue; @@ -183,12 +184,15 @@ pub fn new_downgrade_region_reply( to: "meta".to_string(), timestamp_millis: current_time_millis(), payload: Some(Payload::Json( - serde_json::to_string(&InstructionReply::DowngradeRegion(DowngradeRegionReply { - last_entry_id, - metadata_last_entry_id: None, - exists: exist, - error, - })) + serde_json::to_string(&InstructionReply::DowngradeRegions( + DowngradeRegionsReply::new(vec![DowngradeRegionReply { + region_id: RegionId::new(0, 0), + last_entry_id, + metadata_last_entry_id: None, + exists: exist, + error, + }]), + )) .unwrap(), )), } @@ -208,11 +212,14 @@ pub fn new_upgrade_region_reply( to: "meta".to_string(), timestamp_millis: current_time_millis(), payload: Some(Payload::Json( - serde_json::to_string(&InstructionReply::UpgradeRegion(UpgradeRegionReply { - ready, - exists, - error, - })) + serde_json::to_string(&InstructionReply::UpgradeRegions( + UpgradeRegionsReply::single(UpgradeRegionReply { + region_id: RegionId::new(0, 0), + ready, + exists, + error, + }), + )) .unwrap(), )), } diff --git a/src/meta-srv/src/region/supervisor.rs b/src/meta-srv/src/region/supervisor.rs index cb198b6787..866431dec1 100644 --- a/src/meta-srv/src/region/supervisor.rs +++ b/src/meta-srv/src/region/supervisor.rs @@ -32,7 +32,6 @@ use common_meta::rpc::store::RangeRequest; use common_runtime::JoinHandle; use common_telemetry::{debug, error, info, warn}; use 
common_time::util::current_time_millis; -use error::Error::{LeaderPeerChanged, MigrationRunning, RegionMigrated, TableRouteNotFound}; use futures::{StreamExt, TryStreamExt}; use snafu::{ResultExt, ensure}; use store_api::storage::RegionId; @@ -45,13 +44,15 @@ use crate::error::{self, Result}; use crate::failure_detector::PhiAccrualFailureDetectorOptions; use crate::metasrv::{RegionStatAwareSelectorRef, SelectTarget, SelectorContext, SelectorRef}; use crate::procedure::region_migration::manager::{ - RegionMigrationManagerRef, RegionMigrationTriggerReason, + RegionMigrationManagerRef, RegionMigrationTriggerReason, SubmitRegionMigrationTaskResult, }; +use crate::procedure::region_migration::utils::RegionMigrationTaskBatch; use crate::procedure::region_migration::{ DEFAULT_REGION_MIGRATION_TIMEOUT, RegionMigrationProcedureTask, }; use crate::region::failure_detector::RegionFailureDetector; use crate::selector::SelectorOptions; +use crate::state::StateRef; /// `DatanodeHeartbeat` represents the heartbeat signal sent from a datanode. /// It includes identifiers for the cluster and datanode, a list of regions being monitored, @@ -100,16 +101,6 @@ pub(crate) enum Event { Dump(tokio::sync::oneshot::Sender), } -#[cfg(test)] -impl Event { - pub(crate) fn into_region_failure_detectors(self) -> Vec { - match self { - Self::RegisterFailureDetectors(detecting_regions) => detecting_regions, - _ => unreachable!(), - } - } -} - impl Debug for Event { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -139,6 +130,9 @@ pub struct RegionSupervisorTicker { /// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`]. tick_handle: Mutex>>, + /// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`]. + initialization_handle: Mutex>>, + /// The interval of tick. tick_interval: Duration, @@ -182,6 +176,7 @@ impl RegionSupervisorTicker { ); Self { tick_handle: Mutex::new(None), + initialization_handle: Mutex::new(None), tick_interval, initialization_delay, initialization_retry_period, @@ -202,7 +197,7 @@ impl RegionSupervisorTicker { self.initialization_retry_period, ); initialization_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - common_runtime::spawn_global(async move { + let initialization_handler = common_runtime::spawn_global(async move { loop { initialization_interval.tick().await; let (tx, rx) = oneshot::channel(); @@ -218,6 +213,7 @@ impl RegionSupervisorTicker { } } }); + *self.initialization_handle.lock().unwrap() = Some(initialization_handler); let sender = self.sender.clone(); let ticker_loop = tokio::spawn(async move { @@ -247,6 +243,11 @@ impl RegionSupervisorTicker { handle.abort(); info!("The tick loop is stopped."); } + let initialization_handler = self.initialization_handle.lock().unwrap().take(); + if let Some(initialization_handler) = initialization_handler { + initialization_handler.abort(); + info!("The initialization loop is stopped."); + } } } @@ -290,6 +291,8 @@ pub struct RegionSupervisor { peer_resolver: PeerResolverRef, /// The kv backend. kv_backend: KvBackendRef, + /// The meta state, used to check if the current metasrv is the leader. + state: Option, } /// Controller for managing failure detectors for regions. @@ -373,12 +376,29 @@ impl RegionSupervisor { runtime_switch_manager, peer_resolver, kv_backend, + state: None, } } + /// Sets the meta state. 
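// --- Illustrative aside (not part of the patch) -------------------------------------
// A minimal sketch of the leader gate added to the supervisor event loop: when a meta
// `StateRef` has been injected via `with_state`, events are only processed while the
// current metasrv is the leader. The helper name is illustrative; the real check lives
// inline in `run()`.
fn should_process_event(state: &Option<StateRef>) -> bool {
    match state {
        // Mirrors the check in `run()`: non-leaders drop supervisor events.
        Some(state) => state.read().unwrap().is_leader(),
        // Without an injected state (e.g. in tests), all events are processed.
        None => true,
    }
}
// -------------------------------------------------------------------------------------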
+ pub(crate) fn with_state(mut self, state: StateRef) -> Self { + self.state = Some(state); + self + } + /// Runs the main loop. pub(crate) async fn run(&mut self) { while let Some(event) = self.receiver.recv().await { + if let Some(state) = self.state.as_ref() + && !state.read().unwrap().is_leader() + { + warn!( + "The current metasrv is not the leader, ignore {:?} event", + event + ); + continue; + } + match event { Event::InitializeAllRegions(sender) => { match self.is_maintenance_mode_enabled().await { @@ -413,7 +433,10 @@ impl RegionSupervisor { self.deregister_failure_detectors(detecting_regions).await } Event::HeartbeatArrived(heartbeat) => self.on_heartbeat_arrived(heartbeat), - Event::Clear => self.clear(), + Event::Clear => { + self.clear(); + info!("Region supervisor is initialized."); + } #[cfg(test)] Event::Dump(sender) => { let _ = sender.send(self.failure_detector.dump()); @@ -552,11 +575,22 @@ impl RegionSupervisor { .await { Ok(tasks) => { + let mut grouped_tasks: HashMap<(u64, u64), Vec<_>> = HashMap::new(); for (task, count) in tasks { - let region_id = task.region_id; - let datanode_id = task.from_peer.id; - if let Err(err) = self.do_failover(task, count).await { - error!(err; "Failed to execute region failover for region: {}, datanode: {}", region_id, datanode_id); + grouped_tasks + .entry((task.from_peer.id, task.to_peer.id)) + .or_default() + .push((task, count)); + } + + for ((from_peer_id, to_peer_id), tasks) in grouped_tasks { + if tasks.is_empty() { + continue; + } + let task = RegionMigrationTaskBatch::from_tasks(tasks); + let region_ids = task.region_ids.clone(); + if let Err(err) = self.do_failover_tasks(task).await { + error!(err; "Failed to execute region failover for regions: {:?}, from_peer: {}, to_peer: {}", region_ids, from_peer_id, to_peer_id); } } } @@ -665,56 +699,92 @@ impl RegionSupervisor { Ok(tasks) } - async fn do_failover(&mut self, task: RegionMigrationProcedureTask, count: u32) -> Result<()> { + async fn do_failover_tasks(&mut self, task: RegionMigrationTaskBatch) -> Result<()> { let from_peer_id = task.from_peer.id; let to_peer_id = task.to_peer.id; - let region_id = task.region_id; + let timeout = task.timeout; + let trigger_reason = task.trigger_reason; + let result = self + .region_migration_manager + .submit_region_migration_task(task) + .await?; + self.handle_submit_region_migration_task_result( + from_peer_id, + to_peer_id, + timeout, + trigger_reason, + result, + ) + .await + } - info!( - "Failover for region: {}, from_peer: {}, to_peer: {}, timeout: {:?}, tries: {}", - task.region_id, task.from_peer, task.to_peer, task.timeout, count - ); - - if let Err(err) = self.region_migration_manager.submit_procedure(task).await { - return match err { - RegionMigrated { .. } => { - info!( - "Region has been migrated to target peer: {}, removed failover detector for region: {}, datanode: {}", - to_peer_id, region_id, from_peer_id - ); - self.deregister_failure_detectors(vec![(from_peer_id, region_id)]) - .await; - Ok(()) - } - // Returns Ok if it's running or table is dropped. - MigrationRunning { .. } => { - info!( - "Another region migration is running, skip failover for region: {}, datanode: {}", - region_id, from_peer_id - ); - Ok(()) - } - TableRouteNotFound { .. } => { - self.deregister_failure_detectors(vec![(from_peer_id, region_id)]) - .await; - info!( - "Table route is not found, the table is dropped, removed failover detector for region: {}, datanode: {}", - region_id, from_peer_id - ); - Ok(()) - } - LeaderPeerChanged { .. 
} => { - self.deregister_failure_detectors(vec![(from_peer_id, region_id)]) - .await; - info!( - "Region's leader peer changed, removed failover detector for region: {}, datanode: {}", - region_id, from_peer_id - ); - Ok(()) - } - err => Err(err), - }; - }; + async fn handle_submit_region_migration_task_result( + &mut self, + from_peer_id: DatanodeId, + to_peer_id: DatanodeId, + timeout: Duration, + trigger_reason: RegionMigrationTriggerReason, + result: SubmitRegionMigrationTaskResult, + ) -> Result<()> { + if !result.migrated.is_empty() { + let detecting_regions = result + .migrated + .iter() + .map(|region_id| (from_peer_id, *region_id)) + .collect::>(); + self.deregister_failure_detectors(detecting_regions).await; + info!( + "Region has been migrated to target peer: {}, removed failover detectors for regions: {:?}", + to_peer_id, result.migrated, + ) + } + if !result.migrating.is_empty() { + info!( + "Region is still migrating, skipping failover for regions: {:?}", + result.migrating + ); + } + if !result.table_not_found.is_empty() { + let detecting_regions = result + .table_not_found + .iter() + .map(|region_id| (from_peer_id, *region_id)) + .collect::>(); + self.deregister_failure_detectors(detecting_regions).await; + info!( + "Table is not found, removed failover detectors for regions: {:?}", + result.table_not_found + ); + } + if !result.leader_changed.is_empty() { + let detecting_regions = result + .leader_changed + .iter() + .map(|region_id| (from_peer_id, *region_id)) + .collect::>(); + self.deregister_failure_detectors(detecting_regions).await; + info!( + "Region's leader peer changed, removed failover detectors for regions: {:?}", + result.leader_changed + ); + } + if !result.peer_conflict.is_empty() { + info!( + "Region has peer conflict, ignore failover for regions: {:?}", + result.peer_conflict + ); + } + if !result.submitted.is_empty() { + info!( + "Failover for regions: {:?}, from_peer: {}, to_peer: {}, procedure_id: {:?}, timeout: {:?}, trigger_reason: {:?}", + result.submitted, + from_peer_id, + to_peer_id, + result.procedure_id, + timeout, + trigger_reason, + ); + } Ok(()) } @@ -790,7 +860,10 @@ pub(crate) mod tests { use tokio::time::sleep; use super::RegionSupervisorSelector; - use crate::procedure::region_migration::manager::RegionMigrationManager; + use crate::procedure::region_migration::RegionMigrationTriggerReason; + use crate::procedure::region_migration::manager::{ + RegionMigrationManager, SubmitRegionMigrationTaskResult, + }; use crate::procedure::region_migration::test_util::TestingEnv; use crate::region::supervisor::{ DatanodeHeartbeat, Event, RegionFailureDetectorControl, RegionSupervisor, @@ -906,6 +979,7 @@ pub(crate) mod tests { let (tx, mut rx) = tokio::sync::mpsc::channel(128); let ticker = RegionSupervisorTicker { tick_handle: Mutex::new(None), + initialization_handle: Mutex::new(None), tick_interval: Duration::from_millis(10), initialization_delay: Duration::from_millis(100), initialization_retry_period: Duration::from_millis(100), @@ -923,6 +997,8 @@ pub(crate) mod tests { Event::Tick | Event::Clear | Event::InitializeAllRegions(_) ); } + assert!(ticker.initialization_handle.lock().unwrap().is_none()); + assert!(ticker.tick_handle.lock().unwrap().is_none()); } } @@ -932,6 +1008,7 @@ pub(crate) mod tests { let (tx, mut rx) = tokio::sync::mpsc::channel(128); let ticker = RegionSupervisorTicker { tick_handle: Mutex::new(None), + initialization_handle: Mutex::new(None), tick_interval: Duration::from_millis(1000), initialization_delay: 
Duration::from_millis(50), initialization_retry_period: Duration::from_millis(50), @@ -1060,4 +1137,172 @@ pub(crate) mod tests { sender.send(Event::Dump(tx)).await.unwrap(); assert!(rx.await.unwrap().is_empty()); } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_migrated() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + migrated: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(!supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.is_empty()); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_migrating() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + migrating: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.contains_key(&detecting_region)); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_table_not_found() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + table_not_found: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(!supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.is_empty()); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_leader_changed() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + leader_changed: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(!supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.is_empty()); + } + + #[tokio::test] + async fn 
test_handle_submit_region_migration_task_result_peer_conflict() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + peer_conflict: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.contains_key(&detecting_region)); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_submitted() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + submitted: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.contains_key(&detecting_region)); + } } diff --git a/src/meta-srv/src/selector/weight_compute.rs b/src/meta-srv/src/selector/weight_compute.rs index 4e651e4ecc..6508f78efe 100644 --- a/src/meta-srv/src/selector/weight_compute.rs +++ b/src/meta-srv/src/selector/weight_compute.rs @@ -195,6 +195,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, @@ -224,6 +225,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, @@ -253,6 +255,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, diff --git a/src/meta-srv/src/service/admin/heartbeat.rs b/src/meta-srv/src/service/admin/heartbeat.rs index cb13764d30..35ada0d3ae 100644 --- a/src/meta-srv/src/service/admin/heartbeat.rs +++ b/src/meta-srv/src/service/admin/heartbeat.rs @@ -254,7 +254,7 @@ mod tests { assert_eq!(status, http::StatusCode::OK); assert_eq!( body, - "[[{\"timestamp_millis\":3,\"id\":0,\"addr\":\"127.0.0.1:3001\",\"rcus\":0,\"wcus\":0,\"region_num\":0,\"region_stats\":[],\"topic_stats\":[],\"node_epoch\":0,\"datanode_workloads\":{\"types\":[]}}]]" + "[[{\"timestamp_millis\":3,\"id\":0,\"addr\":\"127.0.0.1:3001\",\"rcus\":0,\"wcus\":0,\"region_num\":0,\"region_stats\":[],\"topic_stats\":[],\"node_epoch\":0,\"datanode_workloads\":{\"types\":[]},\"gc_stat\":null}]]" ); } } diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs index e39337c374..5c0ae4c71f 100644 --- a/src/meta-srv/src/service/cluster.rs +++ b/src/meta-srv/src/service/cluster.rs @@ -97,8 +97,10 @@ impl Metasrv { version: 
build_info.version.to_string(), git_commit: build_info.commit_short.to_string(), start_time_ms: self.start_time_ms(), - cpus: self.resource_spec().cpus as u32, - memory_bytes: self.resource_spec().memory.unwrap_or_default().as_bytes(), + total_cpu_millicores: self.resource_stat().get_total_cpu_millicores(), + total_memory_bytes: self.resource_stat().get_total_memory_bytes(), + cpu_usage_millicores: self.resource_stat().get_cpu_usage_millicores(), + memory_usage_bytes: self.resource_stat().get_memory_usage_bytes(), hostname: hostname::get() .unwrap_or_default() .to_string_lossy() diff --git a/src/meta-srv/src/service/heartbeat.rs b/src/meta-srv/src/service/heartbeat.rs index 6b63116de6..046318def9 100644 --- a/src/meta-srv/src/service/heartbeat.rs +++ b/src/meta-srv/src/service/heartbeat.rs @@ -79,6 +79,7 @@ impl heartbeat_server::Heartbeat for Metasrv { let res = handler_group .handle(req, ctx.clone()) .await + .inspect_err(|e| warn!(e; "Failed to handle heartbeat request, pusher: {pusher_id:?}", )) .map_err(|e| e.into()); is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader()); diff --git a/src/meta-srv/src/state.rs b/src/meta-srv/src/state.rs index 4d062c0b2e..e5edc5f169 100644 --- a/src/meta-srv/src/state.rs +++ b/src/meta-srv/src/state.rs @@ -75,6 +75,12 @@ impl State { }) } + /// Returns true if the current state is a leader. + pub fn is_leader(&self) -> bool { + matches!(self, State::Leader(_)) + } + + /// Returns true if the leader cache is enabled. pub fn enable_leader_cache(&self) -> bool { match &self { State::Leader(leader) => leader.enable_leader_cache, diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml index 53bafb89f3..c601c10912 100644 --- a/src/metric-engine/Cargo.toml +++ b/src/metric-engine/Cargo.toml @@ -46,6 +46,7 @@ tracing.workspace = true common-meta = { workspace = true, features = ["testing"] } common-test-util.workspace = true mito2 = { workspace = true, features = ["test"] } +common-wal = { workspace = true } [package.metadata.cargo-udeps.ignore] normal = ["aquamarine"] diff --git a/src/metric-engine/src/config.rs b/src/metric-engine/src/config.rs index 20df8fa739..e342cd9d73 100644 --- a/src/metric-engine/src/config.rs +++ b/src/metric-engine/src/config.rs @@ -17,14 +17,15 @@ use std::time::Duration; use common_telemetry::warn; use serde::{Deserialize, Serialize}; -/// The default flush interval of the metadata region. +/// The default flush interval of the metadata region. pub(crate) const DEFAULT_FLUSH_METADATA_REGION_INTERVAL: Duration = Duration::from_secs(30); /// Configuration for the metric engine. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct EngineConfig { - /// Experimental feature to use sparse primary key encoding. - pub experimental_sparse_primary_key_encoding: bool, + /// Whether to use sparse primary key encoding. + #[serde(default = "EngineConfig::default_sparse_primary_key_encoding")] + pub sparse_primary_key_encoding: bool, /// The flush interval of the metadata region. #[serde( with = "humantime_serde", @@ -37,7 +38,7 @@ impl Default for EngineConfig { fn default() -> Self { Self { flush_metadata_region_interval: DEFAULT_FLUSH_METADATA_REGION_INTERVAL, - experimental_sparse_primary_key_encoding: false, + sparse_primary_key_encoding: Self::default_sparse_primary_key_encoding(), } } } @@ -47,6 +48,10 @@ impl EngineConfig { DEFAULT_FLUSH_METADATA_REGION_INTERVAL } + fn default_sparse_primary_key_encoding() -> bool { + true + } + /// Sanitizes the configuration. 
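// --- Illustrative aside (not part of the patch) -------------------------------------
// A small sketch of the effect of the new serde default: sparse primary key encoding is
// now on unless explicitly disabled. The first assertion follows directly from the
// `Default` impl above; the `toml` deserialization lines assume the remaining
// `EngineConfig` fields also carry serde defaults and that the `toml` crate is used,
// which is an assumption for illustration only.
fn sparse_encoding_defaults_on() {
    let config = EngineConfig::default();
    assert!(config.sparse_primary_key_encoding);

    let config: EngineConfig = toml::from_str("").unwrap();
    assert!(config.sparse_primary_key_encoding);

    let config: EngineConfig = toml::from_str("sparse_primary_key_encoding = false").unwrap();
    assert!(!config.sparse_primary_key_encoding);
}
// -------------------------------------------------------------------------------------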
pub fn sanitize(&mut self) { if self.flush_metadata_region_interval.is_zero() { diff --git a/src/metric-engine/src/data_region.rs b/src/metric-engine/src/data_region.rs index 5056cd0352..9bc22e1102 100644 --- a/src/metric-engine/src/data_region.rs +++ b/src/metric-engine/src/data_region.rs @@ -20,7 +20,7 @@ use snafu::ResultExt; use store_api::metadata::ColumnMetadata; use store_api::region_engine::RegionEngine; use store_api::region_request::{ - AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionPutRequest, RegionRequest, + AddColumn, AffectedRows, AlterKind, RegionAlterRequest, RegionRequest, }; use store_api::storage::consts::ReservedColumnId; use store_api::storage::{ConcreteDataType, RegionId}; @@ -183,11 +183,11 @@ impl DataRegion { pub async fn write_data( &self, region_id: RegionId, - request: RegionPutRequest, + request: RegionRequest, ) -> Result { let region_id = utils::to_data_region_id(region_id); self.mito - .handle_request(region_id, RegionRequest::Put(request)) + .handle_request(region_id, request) .await .context(MitoWriteOperationSnafu) .map(|result| result.affected_rows) @@ -240,6 +240,7 @@ impl DataRegion { #[cfg(test)] mod test { + use common_query::prelude::{greptime_timestamp, greptime_value}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -300,8 +301,8 @@ mod test { .map(|c| &c.column_schema.name) .collect::>(); let expected = vec![ - "greptime_timestamp", - "greptime_value", + greptime_timestamp(), + greptime_value(), "__table_id", "__tsid", "job", diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index eabd98b135..3e5a1e3c48 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -23,6 +23,7 @@ mod options; mod put; mod read; mod region_metadata; +mod staging; mod state; mod sync; @@ -37,21 +38,26 @@ use common_error::status_code::StatusCode; use common_runtime::RepeatedTask; use mito2::engine::MitoEngine; pub(crate) use options::IndexOptions; -use snafu::ResultExt; +use snafu::{OptionExt, ResultExt}; pub(crate) use state::MetricEngineState; use store_api::metadata::RegionMetadataRef; use store_api::metric_engine_consts::METRIC_ENGINE_NAME; use store_api::region_engine::{ BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, - RegionStatistic, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, - SettableRegionRoleState, SyncManifestResponse, + RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, + SetRegionRoleStateSuccess, SettableRegionRoleState, SyncManifestResponse, +}; +use store_api::region_request::{ + BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest, }; -use store_api::region_request::{BatchRegionDdlRequest, RegionOpenRequest, RegionRequest}; use store_api::storage::{RegionId, ScanRequest, SequenceNumber}; use crate::config::EngineConfig; use crate::data_region::DataRegion; -use crate::error::{self, Error, Result, StartRepeatedTaskSnafu, UnsupportedRegionRequestSnafu}; +use crate::error::{ + self, Error, Result, StartRepeatedTaskSnafu, UnsupportedRegionRequestSnafu, + UnsupportedRemapManifestsRequestSnafu, +}; use crate::metadata_region::MetadataRegion; use crate::repeated_task::FlushMetadataRegionTask; use crate::row_modifier::RowModifier; @@ -142,6 +148,17 @@ impl RegionEngine for MetricEngine { .map_err(BoxedError::new) } + async fn handle_batch_catchup_requests( + &self, + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> 
Result { + self.inner + .handle_batch_catchup_requests(parallelism, requests) + .await + .map_err(BoxedError::new) + } + async fn handle_batch_ddl_requests( &self, batch_request: BatchRegionDdlRequest, @@ -195,6 +212,13 @@ impl RegionEngine for MetricEngine { let mut extension_return_value = HashMap::new(); let result = match request { + RegionRequest::EnterStaging(_) => { + if self.inner.is_physical_region(region_id) { + self.handle_enter_staging_request(region_id, request).await + } else { + UnsupportedRegionRequestSnafu { request }.fail() + } + } RegionRequest::Put(put) => self.inner.put_region(region_id, put).await, RegionRequest::Create(create) => { self.inner @@ -235,19 +259,26 @@ impl RegionEngine for MetricEngine { } } RegionRequest::Truncate(_) => UnsupportedRegionRequestSnafu { request }.fail(), - RegionRequest::Delete(_) => { - if self.inner.is_physical_region(region_id) { - self.inner - .mito - .handle_request(region_id, request) - .await - .context(error::MitoDeleteOperationSnafu) - .map(|response| response.affected_rows) - } else { - UnsupportedRegionRequestSnafu { request }.fail() - } + RegionRequest::Delete(delete) => self.inner.delete_region(region_id, delete).await, + RegionRequest::Catchup(_) => { + let mut response = self + .inner + .handle_batch_catchup_requests( + 1, + vec![(region_id, RegionCatchupRequest::default())], + ) + .await + .map_err(BoxedError::new)?; + debug_assert_eq!(response.len(), 1); + let (resp_region_id, response) = response + .pop() + .context(error::UnexpectedRequestSnafu { + reason: "expected 1 response, but got zero responses", + }) + .map_err(BoxedError::new)?; + debug_assert_eq!(region_id, resp_region_id); + return response; } - RegionRequest::Catchup(req) => self.inner.catchup_region(region_id, req).await, RegionRequest::BulkInserts(_) => { // todo(hl): find a way to support bulk inserts in metric engine. 
UnsupportedRegionRequestSnafu { request }.fail() @@ -330,6 +361,20 @@ impl RegionEngine for MetricEngine { .map_err(BoxedError::new) } + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result { + let region_id = request.region_id; + if self.inner.is_physical_region(region_id) { + self.inner.mito.remap_manifests(request).await + } else { + Err(BoxedError::new( + UnsupportedRemapManifestsRequestSnafu { region_id }.build(), + )) + } + } + async fn set_region_role_state_gracefully( &self, region_id: RegionId, @@ -496,13 +541,17 @@ mod test { use std::collections::HashMap; use common_telemetry::info; + use common_wal::options::{KafkaWalOptions, WalOptions}; use mito2::sst::location::region_dir_from_table_dir; + use mito2::test_util::{kafka_log_store_factory, prepare_test_for_kafka_log_store}; use store_api::metric_engine_consts::PHYSICAL_TABLE_METADATA_KEY; + use store_api::mito_engine_options::WAL_OPTIONS_KEY; use store_api::region_request::{ PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest, }; use super::*; + use crate::maybe_skip_kafka_log_store_integration_test; use crate::test_util::TestEnv; #[tokio::test] @@ -683,4 +732,128 @@ mod test { .unwrap_err(); assert_eq!(err.status_code(), StatusCode::RegionNotFound); } + + #[tokio::test] + async fn test_catchup_regions() { + common_telemetry::init_default_ut_logging(); + maybe_skip_kafka_log_store_integration_test!(); + let kafka_log_store_factory = kafka_log_store_factory().unwrap(); + let mito_env = mito2::test_util::TestEnv::new() + .await + .with_log_store_factory(kafka_log_store_factory.clone()); + let env = TestEnv::with_mito_env(mito_env).await; + let table_dir = |region_id| format!("table/{region_id}"); + let mut physical_region_ids = vec![]; + let mut logical_region_ids = vec![]; + + let num_topics = 3; + let num_physical_regions = 8; + let num_logical_regions = 16; + let parallelism = 2; + let mut topics = Vec::with_capacity(num_topics); + for _ in 0..num_topics { + let topic = prepare_test_for_kafka_log_store(&kafka_log_store_factory) + .await + .unwrap(); + topics.push(topic); + } + + let topic_idx = |id| (id as usize) % num_topics; + // Creates physical regions + for i in 0..num_physical_regions { + let physical_region_id = RegionId::new(1, i); + physical_region_ids.push(physical_region_id); + + let wal_options = WalOptions::Kafka(KafkaWalOptions { + topic: topics[topic_idx(i)].clone(), + }); + env.create_physical_region( + physical_region_id, + &table_dir(physical_region_id), + vec![( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&wal_options).unwrap(), + )], + ) + .await; + // Creates logical regions for each physical region + for j in 0..num_logical_regions { + let logical_region_id = RegionId::new(1024 + i, j); + logical_region_ids.push(logical_region_id); + env.create_logical_region(physical_region_id, logical_region_id) + .await; + } + } + + let metric_engine = env.metric(); + // Closes all regions + for region_id in logical_region_ids.iter().chain(physical_region_ids.iter()) { + metric_engine + .handle_request(*region_id, RegionRequest::Close(RegionCloseRequest {})) + .await + .unwrap(); + } + + // Opens all regions and skip the wal + let requests = physical_region_ids + .iter() + .enumerate() + .map(|(idx, region_id)| { + let mut options = HashMap::new(); + let wal_options = WalOptions::Kafka(KafkaWalOptions { + topic: topics[topic_idx(idx as u32)].clone(), + }); + options.insert(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new()); + options.insert( + 
WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&wal_options).unwrap(), + ); + ( + *region_id, + RegionOpenRequest { + engine: METRIC_ENGINE_NAME.to_string(), + table_dir: table_dir(*region_id), + path_type: PathType::Bare, + options: options.clone(), + skip_wal_replay: true, + checkpoint: None, + }, + ) + }) + .collect::>(); + info!("Open batch regions with parallelism: {parallelism}"); + metric_engine + .handle_batch_open_requests(parallelism, requests) + .await + .unwrap(); + { + let state = metric_engine.inner.state.read().unwrap(); + for logical_region in &logical_region_ids { + assert!(!state.logical_regions().contains_key(logical_region)); + } + } + + let catch_requests = physical_region_ids + .iter() + .map(|region_id| { + ( + *region_id, + RegionCatchupRequest { + set_writable: true, + ..Default::default() + }, + ) + }) + .collect::>(); + metric_engine + .handle_batch_catchup_requests(parallelism, catch_requests) + .await + .unwrap(); + { + let state = metric_engine.inner.state.read().unwrap(); + for logical_region in &logical_region_ids { + assert!(state.logical_regions().contains_key(logical_region)); + } + } + } } diff --git a/src/metric-engine/src/engine/alter.rs b/src/metric-engine/src/engine/alter.rs index 1c4cb93639..4b6b67f31b 100644 --- a/src/metric-engine/src/engine/alter.rs +++ b/src/metric-engine/src/engine/alter.rs @@ -15,7 +15,7 @@ mod extract_new_columns; mod validate; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use extract_new_columns::extract_new_columns; use snafu::{OptionExt, ResultExt, ensure}; @@ -143,16 +143,20 @@ impl MetricEngineInner { }; let data_region_id = to_data_region_id(physical_region_id); - let mut write_guards = HashMap::with_capacity(requests.len()); - for (region_id, _) in requests.iter() { - if write_guards.contains_key(region_id) { - continue; - } - let _write_guard = self - .metadata_region - .write_lock_logical_region(*region_id) - .await?; - write_guards.insert(*region_id, _write_guard); + // Acquire logical region locks in a deterministic order to avoid deadlocks when multiple + // alter operations target overlapping regions concurrently. 
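// The comment above describes taking the logical-region locks in a deterministic
// order. As a minimal standalone sketch (plain u64 ids, not the engine's real
// lock or request types): collecting the ids into a BTreeSet both de-duplicates
// them and yields a sorted, stable acquisition order, so two concurrent alters
// over overlapping region sets cannot grab the same pair of locks in opposite
// orders and deadlock.
use std::collections::BTreeSet;

fn ordered_lock_targets(requests: &[(u64, &str)]) -> Vec<u64> {
    // BTreeSet iteration is ascending, so every caller sees the same order.
    requests
        .iter()
        .map(|(region_id, _payload)| *region_id)
        .collect::<BTreeSet<_>>()
        .into_iter()
        .collect()
}

fn main() {
    let a = ordered_lock_targets(&[(3, "alter"), (1, "alter"), (3, "alter")]);
    let b = ordered_lock_targets(&[(1, "alter"), (3, "alter")]);
    assert_eq!(a, b); // both callers lock region 1 before region 3
    println!("{a:?}");
}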
+ let region_ids = requests + .iter() + .map(|(region_id, _)| *region_id) + .collect::>(); + + let mut write_guards = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + write_guards.push( + self.metadata_region + .write_lock_logical_region(region_id) + .await?, + ); } self.data_region @@ -224,6 +228,7 @@ mod test { use api::v1::SemanticType; use common_meta::ddl::test_util::assert_column_name_and_id; use common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; + use common_query::prelude::{greptime_timestamp, greptime_value}; use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY; use store_api::region_engine::RegionEngine; use store_api::region_request::{ @@ -295,7 +300,7 @@ mod test { .unwrap(); assert_eq!(semantic_type, SemanticType::Tag); let timestamp_index = metadata_region - .column_semantic_type(physical_region_id, logical_region_id, "greptime_timestamp") + .column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp()) .await .unwrap() .unwrap(); @@ -305,8 +310,8 @@ mod test { assert_column_name_and_id( &column_metadatas, &[ - ("greptime_timestamp", 0), - ("greptime_value", 1), + (greptime_timestamp(), 0), + (greptime_value(), 1), ("__table_id", ReservedColumnId::table_id()), ("__tsid", ReservedColumnId::tsid()), ("job", 2), @@ -323,9 +328,9 @@ mod test { let physical_region_id2 = RegionId::new(1024, 1); let logical_region_id1 = RegionId::new(1025, 0); let logical_region_id2 = RegionId::new(1025, 1); - env.create_physical_region(physical_region_id1, "/test_dir1") + env.create_physical_region(physical_region_id1, "/test_dir1", vec![]) .await; - env.create_physical_region(physical_region_id2, "/test_dir2") + env.create_physical_region(physical_region_id2, "/test_dir2", vec![]) .await; let region_create_request1 = crate::test_util::create_logical_region_request( @@ -364,8 +369,8 @@ mod test { assert_column_name_and_id( &column_metadatas, &[ - ("greptime_timestamp", 0), - ("greptime_value", 1), + (greptime_timestamp(), 0), + (greptime_value(), 1), ("__table_id", ReservedColumnId::table_id()), ("__tsid", ReservedColumnId::tsid()), ("job", 2), diff --git a/src/metric-engine/src/engine/catchup.rs b/src/metric-engine/src/engine/catchup.rs index 6ae4560228..61e7591893 100644 --- a/src/metric-engine/src/engine/catchup.rs +++ b/src/metric-engine/src/engine/catchup.rs @@ -12,51 +12,45 @@ // See the License for the specific language governing permissions and // limitations under the License. 
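// The batch catchup path that follows fans each physical region out into two
// requests: one for its metadata region and one for its data region. A rough
// standalone sketch of that fan-out; `to_metadata_region_id`, `to_data_region_id`
// and `CatchupReq` here are simplified stand-ins for the engine's real helpers
// and request type, not the actual encoding.
#[derive(Debug, Clone)]
struct CatchupReq {
    set_writable: bool,
    entry_id: Option<u64>,
}

fn to_metadata_region_id(physical: u64) -> u64 {
    physical | (1u64 << 63) // placeholder transformation, not the real region-group encoding
}

fn to_data_region_id(physical: u64) -> u64 {
    physical // placeholder: the data region shares the physical id in this sketch
}

fn fan_out(requests: Vec<(u64, CatchupReq)>) -> Vec<(u64, CatchupReq)> {
    let mut all = Vec::with_capacity(requests.len() * 2);
    for (physical, req) in requests {
        all.push((to_metadata_region_id(physical), req.clone()));
        all.push((to_data_region_id(physical), req));
    }
    all
}

fn main() {
    let out = fan_out(vec![(42, CatchupReq { set_writable: true, entry_id: None })]);
    assert_eq!(out.len(), 2); // one metadata-region and one data-region request
    println!("{out:?}");
}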
-use common_telemetry::debug; +use std::collections::HashMap; + +use common_error::ext::BoxedError; use snafu::{OptionExt, ResultExt}; -use store_api::region_engine::RegionEngine; -use store_api::region_request::{ - AffectedRows, RegionCatchupRequest, RegionRequest, ReplayCheckpoint, -}; +use store_api::region_engine::{BatchResponses, RegionEngine}; +use store_api::region_request::{RegionCatchupRequest, ReplayCheckpoint}; use store_api::storage::RegionId; use crate::engine::MetricEngineInner; -use crate::error::{ - MitoCatchupOperationSnafu, PhysicalRegionNotFoundSnafu, Result, UnsupportedRegionRequestSnafu, -}; +use crate::error::{BatchCatchupMitoRegionSnafu, PhysicalRegionNotFoundSnafu, Result}; use crate::utils; impl MetricEngineInner { - pub async fn catchup_region( + pub async fn handle_batch_catchup_requests( &self, - region_id: RegionId, - req: RegionCatchupRequest, - ) -> Result { - if !self.is_physical_region(region_id) { - return UnsupportedRegionRequestSnafu { - request: RegionRequest::Catchup(req), - } - .fail(); - } - let data_region_id = utils::to_data_region_id(region_id); - let physical_region_options = *self - .state - .read() - .unwrap() - .physical_region_states() - .get(&data_region_id) - .context(PhysicalRegionNotFoundSnafu { - region_id: data_region_id, - })? - .options(); + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result { + let mut all_requests = Vec::with_capacity(requests.len() * 2); + let mut physical_region_options_list = Vec::with_capacity(requests.len()); - let metadata_region_id = utils::to_metadata_region_id(region_id); - // TODO(weny): improve the catchup, we can read the wal entries only once. - debug!("Catchup metadata region {metadata_region_id}"); - self.mito - .handle_request( + for (region_id, req) in requests { + let metadata_region_id = utils::to_metadata_region_id(region_id); + let data_region_id = utils::to_data_region_id(region_id); + + let physical_region_options = *self + .state + .read() + .unwrap() + .physical_region_states() + .get(&data_region_id) + .context(PhysicalRegionNotFoundSnafu { + region_id: data_region_id, + })? + .options(); + physical_region_options_list.push((data_region_id, physical_region_options)); + all_requests.push(( metadata_region_id, - RegionRequest::Catchup(RegionCatchupRequest { + RegionCatchupRequest { set_writable: req.set_writable, entry_id: req.metadata_entry_id, metadata_entry_id: None, @@ -65,16 +59,11 @@ impl MetricEngineInner { entry_id: c.metadata_entry_id.unwrap_or_default(), metadata_entry_id: None, }), - }), - ) - .await - .context(MitoCatchupOperationSnafu)?; - - debug!("Catchup data region {data_region_id}"); - self.mito - .handle_request( + }, + )); + all_requests.push(( data_region_id, - RegionRequest::Catchup(RegionCatchupRequest { + RegionCatchupRequest { set_writable: req.set_writable, entry_id: req.entry_id, metadata_entry_id: None, @@ -83,14 +72,45 @@ impl MetricEngineInner { entry_id: c.entry_id, metadata_entry_id: None, }), - }), - ) - .await - .context(MitoCatchupOperationSnafu) - .map(|response| response.affected_rows)?; + }, + )); + } - self.recover_states(region_id, physical_region_options) - .await?; - Ok(0) + let mut results = self + .mito + .handle_batch_catchup_requests(parallelism, all_requests) + .await + .context(BatchCatchupMitoRegionSnafu {})? 
+ .into_iter() + .collect::>(); + + let mut responses = Vec::with_capacity(physical_region_options_list.len()); + for (physical_region_id, physical_region_options) in physical_region_options_list { + let metadata_region_id = utils::to_metadata_region_id(physical_region_id); + let data_region_id = utils::to_data_region_id(physical_region_id); + let metadata_region_result = results.remove(&metadata_region_id); + let data_region_result = results.remove(&data_region_id); + + // Pass the optional `metadata_region_result` and `data_region_result` to + // `recover_physical_region_with_results`. This function handles errors for each + // catchup physical region request, allowing the process to continue with the + // remaining regions even if some requests fail. + let response = self + .recover_physical_region_with_results( + metadata_region_result, + data_region_result, + physical_region_id, + physical_region_options, + // Note: We intentionally don’t close the region if recovery fails. + // Closing it here might confuse the region server since it links RegionIds to Engines. + // If recovery didn’t succeed, the region should stay open. + false, + ) + .await + .map_err(BoxedError::new); + responses.push((physical_region_id, response)); + } + + Ok(responses) } } diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs index c506c0e2b4..925241ba08 100644 --- a/src/metric-engine/src/engine/create.rs +++ b/src/metric-engine/src/engine/create.rs @@ -528,7 +528,7 @@ impl MetricEngineInner { // set data region options set_data_region_options( &mut data_region_request.options, - self.config.experimental_sparse_primary_key_encoding, + self.config.sparse_primary_key_encoding, ); data_region_request @@ -619,6 +619,7 @@ pub(crate) fn region_options_for_metadata_region( mod test { use common_meta::ddl::test_util::assert_column_name_and_id; use common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; + use common_query::prelude::{greptime_timestamp, greptime_value}; use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY}; use store_api::region_request::BatchRegionDdlRequest; @@ -827,9 +828,9 @@ mod test { let physical_region_id2 = RegionId::new(1024, 1); let logical_region_id1 = RegionId::new(1025, 0); let logical_region_id2 = RegionId::new(1025, 1); - env.create_physical_region(physical_region_id1, "/test_dir1") + env.create_physical_region(physical_region_id1, "/test_dir1", vec![]) .await; - env.create_physical_region(physical_region_id2, "/test_dir2") + env.create_physical_region(physical_region_id2, "/test_dir2", vec![]) .await; let region_create_request1 = @@ -856,8 +857,8 @@ mod test { assert_column_name_and_id( &column_metadatas, &[ - ("greptime_timestamp", 0), - ("greptime_value", 1), + (greptime_timestamp(), 0), + (greptime_value(), 1), ("__table_id", ReservedColumnId::table_id()), ("__tsid", ReservedColumnId::tsid()), ("job", 2), diff --git a/src/metric-engine/src/engine/flush.rs b/src/metric-engine/src/engine/flush.rs index 23899cbb05..cdc11db852 100644 --- a/src/metric-engine/src/engine/flush.rs +++ b/src/metric-engine/src/engine/flush.rs @@ -76,7 +76,7 @@ mod tests { ]; for (phy_region_id, logi_region_ids) in &phy_to_logi { - env.create_physical_region(*phy_region_id, &TestEnv::default_table_dir()) + env.create_physical_region(*phy_region_id, &TestEnv::default_table_dir(), vec![]) .await; for logi_region_id in logi_region_ids { env.create_logical_region(*phy_region_id, *logi_region_id) @@ -119,6 +119,7 
@@ mod tests { .index_file_path .map(|path| path.replace(&e.file_id, "")); e.file_id = "".to_string(); + e.index_file_id = e.index_file_id.map(|_| "".to_string()); format!("\n{:?}", e) }) .sorted() @@ -127,12 +128,12 @@ mod tests { assert_eq!( debug_format, r#" -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3505, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3173, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3489, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"# +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_file_id: Some(""), level: 0, 
file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_file_id: Some(""), level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_file_id: Some(""), level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_file_id: None, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"# ); // list from storage let storage_entries = mito diff --git a/src/metric-engine/src/engine/open.rs b/src/metric-engine/src/engine/open.rs index 895bb9ed14..44302a56bb 100644 --- a/src/metric-engine/src/engine/open.rs +++ b/src/metric-engine/src/engine/open.rs @@ -47,6 +47,7 @@ impl MetricEngineInner { for (region_id, request) in requests { if !request.is_physical_table() { + warn!("Skipping non-physical 
table open request: {region_id}"); continue; } let physical_region_options = PhysicalRegionOptions::try_from(&request.options)?; @@ -72,17 +73,19 @@ impl MetricEngineInner { let metadata_region_id = utils::to_metadata_region_id(physical_region_id); let data_region_id = utils::to_data_region_id(physical_region_id); let metadata_region_result = results.remove(&metadata_region_id); - let data_region_result = results.remove(&data_region_id); + let data_region_result: Option> = + results.remove(&data_region_id); // Pass the optional `metadata_region_result` and `data_region_result` to - // `open_physical_region_with_results`. This function handles errors for each + // `recover_physical_region_with_results`. This function handles errors for each // open physical region request, allowing the process to continue with the // remaining regions even if some requests fail. let response = self - .open_physical_region_with_results( + .recover_physical_region_with_results( metadata_region_result, data_region_result, physical_region_id, physical_region_options, + true, ) .await .map_err(BoxedError::new); @@ -107,12 +110,13 @@ impl MetricEngineInner { } } - async fn open_physical_region_with_results( + pub(crate) async fn recover_physical_region_with_results( &self, metadata_region_result: Option>, data_region_result: Option>, physical_region_id: RegionId, physical_region_options: PhysicalRegionOptions, + close_region_on_failure: bool, ) -> Result { let metadata_region_id = utils::to_metadata_region_id(physical_region_id); let data_region_id = utils::to_data_region_id(physical_region_id); @@ -136,8 +140,10 @@ impl MetricEngineInner { .recover_states(physical_region_id, physical_region_options) .await { - self.close_physical_region_on_recovery_failure(physical_region_id) - .await; + if close_region_on_failure { + self.close_physical_region_on_recovery_failure(physical_region_id) + .await; + } return Err(err); } Ok(data_region_response) @@ -221,7 +227,7 @@ impl MetricEngineInner { let mut data_region_options = request.options; set_data_region_options( &mut data_region_options, - self.config.experimental_sparse_primary_key_encoding, + self.config.sparse_primary_key_encoding, ); let open_data_region_request = RegionOpenRequest { table_dir: request.table_dir.clone(), diff --git a/src/metric-engine/src/engine/options.rs b/src/metric-engine/src/engine/options.rs index e8ff117c2e..838e5b4e45 100644 --- a/src/metric-engine/src/engine/options.rs +++ b/src/metric-engine/src/engine/options.rs @@ -17,12 +17,12 @@ use std::collections::HashMap; use store_api::metric_engine_consts::{ + MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING, METRIC_ENGINE_INDEX_SKIPPING_INDEX_FALSE_POSITIVE_RATE_OPTION, METRIC_ENGINE_INDEX_SKIPPING_INDEX_FALSE_POSITIVE_RATE_OPTION_DEFAULT, METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION, METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION_DEFAULT, METRIC_ENGINE_INDEX_TYPE_OPTION, }; -use store_api::mito_engine_options::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING; use crate::error::{Error, ParseRegionOptionsSnafu, Result}; diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs index 21aeec605d..0d4693ee42 100644 --- a/src/metric-engine/src/engine/put.rs +++ b/src/metric-engine/src/engine/put.rs @@ -16,13 +16,15 @@ use api::v1::{Rows, WriteHint}; use common_telemetry::{error, info}; use snafu::{OptionExt, ensure}; use store_api::codec::PrimaryKeyEncoding; -use store_api::region_request::{AffectedRows, RegionPutRequest}; +use store_api::region_request::{ + 
AffectedRows, RegionDeleteRequest, RegionPutRequest, RegionRequest, +}; use store_api::storage::{RegionId, TableId}; use crate::engine::MetricEngineInner; use crate::error::{ ColumnNotFoundSnafu, ForbiddenPhysicalAlterSnafu, LogicalRegionNotFoundSnafu, - PhysicalRegionNotFoundSnafu, Result, + PhysicalRegionNotFoundSnafu, Result, UnsupportedRegionRequestSnafu, }; use crate::metrics::{FORBIDDEN_OPERATION_COUNT, MITO_OPERATION_ELAPSED}; use crate::row_modifier::RowsIter; @@ -50,6 +52,27 @@ impl MetricEngineInner { } } + /// Dispatch region delete request + pub async fn delete_region( + &self, + region_id: RegionId, + request: RegionDeleteRequest, + ) -> Result { + if self.is_physical_region(region_id) { + info!( + "Metric region received delete request {request:?} on physical region {region_id:?}" + ); + FORBIDDEN_OPERATION_COUNT.inc(); + + UnsupportedRegionRequestSnafu { + request: RegionRequest::Delete(request), + } + .fail() + } else { + self.delete_logical_region(region_id, request).await + } + } + async fn put_logical_region( &self, logical_region_id: RegionId, @@ -59,30 +82,13 @@ impl MetricEngineInner { .with_label_values(&["put"]) .start_timer(); - let (physical_region_id, data_region_id, primary_key_encoding) = { - let state = self.state.read().unwrap(); - let physical_region_id = *state - .logical_regions() - .get(&logical_region_id) - .with_context(|| LogicalRegionNotFoundSnafu { - region_id: logical_region_id, - })?; - let data_region_id = to_data_region_id(physical_region_id); + let (physical_region_id, data_region_id, primary_key_encoding) = + self.find_data_region_meta(logical_region_id)?; - let primary_key_encoding = state.get_primary_key_encoding(data_region_id).context( - PhysicalRegionNotFoundSnafu { - region_id: data_region_id, - }, - )?; - - (physical_region_id, data_region_id, primary_key_encoding) - }; - - self.verify_put_request(logical_region_id, physical_region_id, &request) + self.verify_rows(logical_region_id, physical_region_id, &request.rows) .await?; // write to data region - // TODO: retrieve table name self.modify_rows( physical_region_id, @@ -95,19 +101,74 @@ impl MetricEngineInner { primary_key_encoding: api::v1::PrimaryKeyEncoding::Sparse.into(), }); } - self.data_region.write_data(data_region_id, request).await + self.data_region + .write_data(data_region_id, RegionRequest::Put(request)) + .await } - /// Verifies a put request for a logical region against its corresponding metadata region. 
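// The new `delete_region` above follows the same dispatch shape as put: a
// mutation addressed directly at a physical region is rejected, while a
// logical region is forwarded to its data region. A condensed sketch of that
// guard, with plain enums standing in for the engine's request and error types
// (the `< 1024` predicate is only an illustrative stand-in):
#[derive(Debug)]
enum Request {
    Put(String),
    Delete(String),
}

#[derive(Debug)]
enum EngineError {
    Unsupported(Request),
}

fn is_physical_region(region_id: u64) -> bool {
    // Stand-in predicate; the real engine consults its in-memory state.
    region_id < 1024
}

fn dispatch_delete(region_id: u64, payload: String) -> Result<usize, EngineError> {
    if is_physical_region(region_id) {
        // Physical regions only accept writes routed through logical regions.
        Err(EngineError::Unsupported(Request::Delete(payload)))
    } else {
        // Forward to the data region; the affected row count is returned.
        Ok(payload.len())
    }
}

fn main() {
    assert!(dispatch_delete(1, "k=v".to_string()).is_err());
    assert!(dispatch_delete(2048, "k=v".to_string()).is_ok());
}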
+ async fn delete_logical_region( + &self, + logical_region_id: RegionId, + mut request: RegionDeleteRequest, + ) -> Result { + let _timer = MITO_OPERATION_ELAPSED + .with_label_values(&["delete"]) + .start_timer(); + + let (physical_region_id, data_region_id, primary_key_encoding) = + self.find_data_region_meta(logical_region_id)?; + + self.verify_rows(logical_region_id, physical_region_id, &request.rows) + .await?; + + // write to data region + // TODO: retrieve table name + self.modify_rows( + physical_region_id, + logical_region_id.table_id(), + &mut request.rows, + primary_key_encoding, + )?; + if primary_key_encoding == PrimaryKeyEncoding::Sparse { + request.hint = Some(WriteHint { + primary_key_encoding: api::v1::PrimaryKeyEncoding::Sparse.into(), + }); + } + self.data_region + .write_data(data_region_id, RegionRequest::Delete(request)) + .await + } + + fn find_data_region_meta( + &self, + logical_region_id: RegionId, + ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> { + let state = self.state.read().unwrap(); + let physical_region_id = *state + .logical_regions() + .get(&logical_region_id) + .with_context(|| LogicalRegionNotFoundSnafu { + region_id: logical_region_id, + })?; + let data_region_id = to_data_region_id(physical_region_id); + let primary_key_encoding = state.get_primary_key_encoding(data_region_id).context( + PhysicalRegionNotFoundSnafu { + region_id: data_region_id, + }, + )?; + Ok((physical_region_id, data_region_id, primary_key_encoding)) + } + + /// Verifies a request for a logical region against its corresponding metadata region. /// /// Includes: /// - Check if the logical region exists /// - Check if the columns exist - async fn verify_put_request( + async fn verify_rows( &self, logical_region_id: RegionId, physical_region_id: RegionId, - request: &RegionPutRequest, + rows: &Rows, ) -> Result<()> { // Check if the region exists let data_region_id = to_data_region_id(physical_region_id); @@ -128,7 +189,7 @@ impl MetricEngineInner { region_id: data_region_id, })? .physical_columns(); - for col in &request.rows.schema { + for col in &rows.schema { ensure!( physical_columns.contains_key(&col.column_name), ColumnNotFoundSnafu { diff --git a/src/metric-engine/src/engine/staging.rs b/src/metric-engine/src/engine/staging.rs new file mode 100644 index 0000000000..9db500957c --- /dev/null +++ b/src/metric-engine/src/engine/staging.rs @@ -0,0 +1,54 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_base::AffectedRows; +use snafu::ResultExt; +use store_api::region_engine::RegionEngine; +use store_api::region_request::{EnterStagingRequest, RegionRequest}; +use store_api::storage::RegionId; + +use crate::engine::MetricEngine; +use crate::error::{MitoEnterStagingOperationSnafu, Result}; +use crate::utils; + +impl MetricEngine { + /// Handles the enter staging request for the given region. 
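// The put/delete refactor above factors the shared lookup into one helper:
// resolve the logical region to its physical region under a read lock, derive
// the data region id, and fetch the primary-key encoding. A simplified sketch
// with std types only; the real state also tracks options, column metadata and
// more, and returns a Result rather than an Option.
use std::collections::HashMap;
use std::sync::RwLock;

#[derive(Clone, Copy, Debug, PartialEq)]
enum PrimaryKeyEncoding {
    Dense,
    Sparse,
}

struct State {
    logical_to_physical: HashMap<u64, u64>,
    encodings: HashMap<u64, PrimaryKeyEncoding>,
}

fn find_data_region_meta(
    state: &RwLock<State>,
    logical: u64,
) -> Option<(u64, u64, PrimaryKeyEncoding)> {
    let state = state.read().unwrap();
    let physical = *state.logical_to_physical.get(&logical)?;
    let data_region = physical; // stand-in for the real data-region derivation
    let encoding = *state.encodings.get(&data_region)?;
    Some((physical, data_region, encoding))
}

fn main() {
    let state = RwLock::new(State {
        logical_to_physical: HashMap::from([(2048, 7)]),
        encodings: HashMap::from([(7, PrimaryKeyEncoding::Sparse)]),
    });
    assert_eq!(
        find_data_region_meta(&state, 2048),
        Some((7, 7, PrimaryKeyEncoding::Sparse))
    );
}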
+ pub(crate) async fn handle_enter_staging_request( + &self, + region_id: RegionId, + request: RegionRequest, + ) -> Result { + let metadata_region_id = utils::to_metadata_region_id(region_id); + let data_region_id = utils::to_data_region_id(region_id); + + // For metadata region, it doesn't care about the partition expr, so we can just pass an empty string. + self.inner + .mito + .handle_request( + metadata_region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: String::new(), + }), + ) + .await + .context(MitoEnterStagingOperationSnafu)?; + + self.inner + .mito + .handle_request(data_region_id, request) + .await + .context(MitoEnterStagingOperationSnafu) + .map(|response| response.affected_rows) + } +} diff --git a/src/metric-engine/src/engine/sync.rs b/src/metric-engine/src/engine/sync.rs index b62b138dab..4a2741c12b 100644 --- a/src/metric-engine/src/engine/sync.rs +++ b/src/metric-engine/src/engine/sync.rs @@ -45,7 +45,7 @@ impl MetricEngineInner { .metadata_flushed_entry_id() .unwrap_or_default(); let metadata_region_manifest = - RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id); + RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id, 0); let metadata_synced = self .mito .sync_region(metadata_region_id, metadata_region_manifest) @@ -57,7 +57,7 @@ impl MetricEngineInner { let data_manifest_version = manifest_info.data_manifest_version(); let data_flushed_entry_id = manifest_info.data_flushed_entry_id(); let data_region_manifest = - RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id); + RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id, 0); let data_synced = self .mito @@ -110,6 +110,7 @@ mod tests { use std::collections::HashMap; use api::v1::SemanticType; + use common_query::prelude::greptime_timestamp; use common_telemetry::info; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -243,7 +244,7 @@ mod tests { .unwrap(); assert_eq!(semantic_type, SemanticType::Tag); let timestamp_index = metadata_region - .column_semantic_type(physical_region_id, logical_region_id, "greptime_timestamp") + .column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp()) .await .unwrap() .unwrap(); diff --git a/src/metric-engine/src/error.rs b/src/metric-engine/src/error.rs index 91881b5624..3d00b737c3 100644 --- a/src/metric-engine/src/error.rs +++ b/src/metric-engine/src/error.rs @@ -50,6 +50,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to batch catchup mito region"))] + BatchCatchupMitoRegion { + source: BoxedError, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("No open region result for region {}", region_id))] NoOpenRegionResult { region_id: RegionId, @@ -142,20 +149,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Mito delete operation fails"))] - MitoDeleteOperation { - source: BoxedError, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Mito catchup operation fails"))] - MitoCatchupOperation { - source: BoxedError, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Mito sync operation fails"))] MitoSyncOperation { source: BoxedError, @@ -163,6 +156,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Mito enter staging operation fails"))] + MitoEnterStagingOperation { + source: BoxedError, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to collect record batch stream"))] CollectRecordBatchStream 
{ source: common_recordbatch::error::Error, @@ -249,6 +249,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Unsupported remap manifests request for region {}", region_id))] + UnsupportedRemapManifestsRequest { + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Unsupported alter kind: {}", kind))] UnsupportedAlterKind { kind: String, @@ -331,7 +338,8 @@ impl ErrorExt for Error { | AddingFieldColumn { .. } | ParseRegionOptions { .. } | UnexpectedRequest { .. } - | UnsupportedAlterKind { .. } => StatusCode::InvalidArguments, + | UnsupportedAlterKind { .. } + | UnsupportedRemapManifestsRequest { .. } => StatusCode::InvalidArguments, ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => { StatusCode::Unsupported @@ -357,11 +365,11 @@ impl ErrorExt for Error { | CloseMitoRegion { source, .. } | MitoReadOperation { source, .. } | MitoWriteOperation { source, .. } - | MitoCatchupOperation { source, .. } | MitoFlushOperation { source, .. } - | MitoDeleteOperation { source, .. } | MitoSyncOperation { source, .. } - | BatchOpenMitoRegion { source, .. } => source.status_code(), + | MitoEnterStagingOperation { source, .. } + | BatchOpenMitoRegion { source, .. } + | BatchCatchupMitoRegion { source, .. } => source.status_code(), EncodePrimaryKey { source, .. } => source.status_code(), diff --git a/src/metric-engine/src/metadata_region.rs b/src/metric-engine/src/metadata_region.rs index cbd8e83bd3..c34b44e4a7 100644 --- a/src/metric-engine/src/metadata_region.rs +++ b/src/metric-engine/src/metadata_region.rs @@ -317,45 +317,20 @@ pub fn decode_batch_stream( /// Decode a record batch to a list of key and value. fn decode_record_batch_to_key_and_value(batch: RecordBatch) -> Vec<(String, String)> { - let key_col = batch.column(0); - let val_col = batch.column(1); - - (0..batch.num_rows()) - .flat_map(move |row_index| { - let key = key_col - .get_ref(row_index) - .try_into_string() - .unwrap() - .map(|s| s.to_string()); - - key.map(|k| { - ( - k, - val_col - .get_ref(row_index) - .try_into_string() - .unwrap() - .map(|s| s.to_string()) - .unwrap_or_default(), - ) - }) + let keys = batch.iter_column_as_string(0); + let values = batch.iter_column_as_string(1); + keys.zip(values) + .filter_map(|(k, v)| match (k, v) { + (Some(k), Some(v)) => Some((k, v)), + (Some(k), None) => Some((k, "".to_string())), + (None, _) => None, }) - .collect() + .collect::>() } /// Decode a record batch to a list of key. fn decode_record_batch_to_key(batch: RecordBatch) -> Vec { - let key_col = batch.column(0); - - (0..batch.num_rows()) - .flat_map(move |row_index| { - key_col - .get_ref(row_index) - .try_into_string() - .unwrap() - .map(|s| s.to_string()) - }) - .collect() + batch.iter_column_as_string(0).flatten().collect::>() } // simulate to `KvBackend` @@ -536,7 +511,7 @@ impl MetadataRegion { .collect(); let rows = Rows { schema: cols, rows }; - RegionDeleteRequest { rows } + RegionDeleteRequest { rows, hint: None } } /// Add logical regions to the metadata region. @@ -590,6 +565,8 @@ impl MetadataRegion { /// Retrieves the value associated with the given key in the specified region. /// Returns `Ok(None)` if the key is not found. 
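// The metadata-region decode refactor above replaces manual per-row indexing
// with two column iterators that are zipped and filtered: a null key drops the
// row, a null value defaults to an empty string. A standalone sketch of that
// zip/filter_map shape, with Vec<Option<String>> standing in for the record
// batch's `iter_column_as_string` columns:
fn decode_keys_and_values(
    keys: Vec<Option<String>>,
    values: Vec<Option<String>>,
) -> Vec<(String, String)> {
    keys.into_iter()
        .zip(values)
        .filter_map(|(k, v)| match (k, v) {
            (Some(k), Some(v)) => Some((k, v)),
            (Some(k), None) => Some((k, String::new())), // missing value -> ""
            (None, _) => None,                           // missing key -> skip row
        })
        .collect()
}

fn main() {
    let out = decode_keys_and_values(
        vec![Some("a".to_string()), None, Some("c".to_string())],
        vec![Some("1".to_string()), Some("2".to_string()), None],
    );
    assert_eq!(out.len(), 2);
    assert_eq!(out[0], ("a".to_string(), "1".to_string()));
    assert_eq!(out[1].1, "");
}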
pub async fn get(&self, region_id: RegionId, key: &str) -> Result> { + use datatypes::arrow::array::{Array, AsArray}; + let filter_expr = datafusion::prelude::col(METADATA_SCHEMA_KEY_COLUMN_NAME) .eq(datafusion::prelude::lit(key)); @@ -611,12 +588,9 @@ impl MetadataRegion { return Ok(None); }; - let val = first_batch - .column(0) - .get_ref(0) - .try_into_string() - .unwrap() - .map(|s| s.to_string()); + let column = first_batch.column(0); + let column = column.as_string::(); + let val = column.is_valid(0).then(|| column.value(0).to_string()); Ok(val) } diff --git a/src/metric-engine/src/test_util.rs b/src/metric-engine/src/test_util.rs index d594541d84..d81240d47f 100644 --- a/src/metric-engine/src/test_util.rs +++ b/src/metric-engine/src/test_util.rs @@ -17,6 +17,7 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema as PbColumnSchema, Row, SemanticType, Value}; use common_meta::ddl::utils::parse_column_metadatas; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_telemetry::debug; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -75,6 +76,17 @@ impl TestEnv { } } + /// Returns a new env with specific `prefix` and `mito_env` for test. + pub async fn with_mito_env(mut mito_env: MitoTestEnv) -> Self { + let mito = mito_env.create_engine(MitoConfig::default()).await; + let metric = MetricEngine::try_new(mito.clone(), EngineConfig::default()).unwrap(); + Self { + mito_env, + mito, + metric, + } + } + pub fn data_home(&self) -> String { let env_root = self.mito_env.data_home().to_string_lossy().to_string(); join_dir(&env_root, "data") @@ -124,7 +136,12 @@ impl TestEnv { } /// Create regions in [MetricEngine] with specific `physical_region_id`. - pub async fn create_physical_region(&self, physical_region_id: RegionId, table_dir: &str) { + pub async fn create_physical_region( + &self, + physical_region_id: RegionId, + table_dir: &str, + options: Vec<(String, String)>, + ) { let region_create_request = RegionCreateRequest { engine: METRIC_ENGINE_NAME.to_string(), column_metadatas: vec![ @@ -132,7 +149,7 @@ impl TestEnv { column_id: 0, semantic_type: SemanticType::Timestamp, column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), false, ), @@ -141,7 +158,7 @@ impl TestEnv { column_id: 1, semantic_type: SemanticType::Field, column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -150,6 +167,7 @@ impl TestEnv { primary_key: vec![], options: [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())] .into_iter() + .chain(options.into_iter()) .collect(), table_dir: table_dir.to_string(), path_type: PathType::Bare, // Use Bare path type for engine regions @@ -204,8 +222,8 @@ impl TestEnv { assert_eq!( column_names, vec![ - "greptime_timestamp", - "greptime_value", + greptime_timestamp(), + greptime_value(), "__table_id", "__tsid", "job", @@ -230,7 +248,7 @@ impl TestEnv { /// under [`default_logical_region_id`]. 
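// The `get` rewrite above reads the first row of a string column straight from
// the arrow array and treats a NULL slot as "not found". A standalone sketch of
// that access pattern against the `arrow` crate (the engine reaches the same
// types through its `datatypes::arrow` re-export); the i32 offset type matches
// a plain StringArray.
use arrow::array::{Array, AsArray, StringArray};

fn first_string_value(column: &dyn Array) -> Option<String> {
    let column = column.as_string::<i32>();
    // `is_valid` guards against a NULL in row 0 before materializing a String.
    column.is_valid(0).then(|| column.value(0).to_string())
}

fn main() {
    let present = StringArray::from(vec![Some("hello")]);
    let missing = StringArray::from(vec![None::<&str>]);
    assert_eq!(first_string_value(&present), Some("hello".to_string()));
    assert_eq!(first_string_value(&missing), None);
}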
pub async fn init_metric_region(&self) { let physical_region_id = self.default_physical_region_id(); - self.create_physical_region(physical_region_id, &Self::default_table_dir()) + self.create_physical_region(physical_region_id, &Self::default_table_dir(), vec![]) .await; let logical_region_id = self.default_logical_region_id(); self.create_logical_region(physical_region_id, logical_region_id) @@ -300,7 +318,7 @@ pub fn create_logical_region_request( column_id: 0, semantic_type: SemanticType::Timestamp, column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), false, ), @@ -309,7 +327,7 @@ pub fn create_logical_region_request( column_id: 1, semantic_type: SemanticType::Field, column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -372,14 +390,14 @@ pub fn alter_logical_region_request(tags: &[&str]) -> RegionAlterRequest { pub fn row_schema_with_tags(tags: &[&str]) -> Vec { let mut schema = vec![ PbColumnSchema { - column_name: "greptime_timestamp".to_string(), + column_name: greptime_timestamp().to_string(), datatype: ColumnDataType::TimestampMillisecond as i32, semantic_type: SemanticType::Timestamp as _, datatype_extension: None, options: None, }, PbColumnSchema { - column_name: "greptime_value".to_string(), + column_name: greptime_value().to_string(), datatype: ColumnDataType::Float64 as i32, semantic_type: SemanticType::Field as _, datatype_extension: None, @@ -423,6 +441,22 @@ pub fn build_rows(num_tags: usize, num_rows: usize) -> Vec { rows } +#[macro_export] +/// Skip the test if the environment variable `GT_KAFKA_ENDPOINTS` is not set. +/// +/// The format of the environment variable is: +/// ```text +/// GT_KAFKA_ENDPOINTS=localhost:9092,localhost:9093 +/// ``` +macro_rules! maybe_skip_kafka_log_store_integration_test { + () => { + if std::env::var("GT_KAFKA_ENDPOINTS").is_err() { + common_telemetry::warn!("The kafka endpoints is empty, skipping the test"); + return; + } + }; +} + #[cfg(test)] mod test { use object_store::ObjectStore; diff --git a/src/mito-codec/Cargo.toml b/src/mito-codec/Cargo.toml index 99a46e8ac9..81808f2714 100644 --- a/src/mito-codec/Cargo.toml +++ b/src/mito-codec/Cargo.toml @@ -15,6 +15,7 @@ common-base.workspace = true common-decimal.workspace = true common-error.workspace = true common-macro.workspace = true +common-query.workspace = true common-recordbatch.workspace = true common-telemetry.workspace = true common-time.workspace = true diff --git a/src/mito-codec/src/key_values.rs b/src/mito-codec/src/key_values.rs index 8f594d6ff8..5afacc3718 100644 --- a/src/mito-codec/src/key_values.rs +++ b/src/mito-codec/src/key_values.rs @@ -278,14 +278,41 @@ impl SparseReadRowHelper { primary_key_encoding: PrimaryKeyEncoding, ) -> SparseReadRowHelper { if primary_key_encoding == PrimaryKeyEncoding::Sparse { - // We can skip build the indices for sparse primary key encoding. - // The order of the columns is encoded primary key, timestamp, field columns. - let indices = rows + // Optimized case: when schema has exactly 3 columns (primary key, timestamp, and one field), + // we can directly use their indices in order without building an explicit mapping. + // The column order is: encoded primary key, timestamp, and field. 
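// The sparse-row helper code that follows falls back to a name lookup when the
// request carries more than the 3-column fast path: the first two columns
// (encoded primary key and timestamp) map positionally, and each field column
// from the region metadata is resolved by name, yielding None when the request
// does not carry it. A small standalone sketch of that index-building step,
// using plain &str column names instead of the real schema types:
use std::collections::HashMap;

fn build_indices(request_columns: &[&str], field_columns: &[&str]) -> Vec<Option<usize>> {
    let name_to_index: HashMap<&str, usize> = request_columns
        .iter()
        .enumerate()
        .map(|(index, name)| (*name, index))
        .collect();

    // Primary key and timestamp are always the first two request columns.
    let mut indices = vec![Some(0), Some(1)];
    // Field columns are resolved by name; absent ones become None.
    indices.extend(field_columns.iter().map(|name| name_to_index.get(name).copied()));
    indices
}

fn main() {
    let indices = build_indices(
        &["__primary_key", "ts", "cpu", "memory"],
        &["cpu", "memory", "disk"],
    );
    assert_eq!(indices, vec![Some(0), Some(1), Some(2), Some(3), None]);
}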
+ if rows.schema.len() == 3 { + let indices = rows + .schema + .iter() + .enumerate() + .map(|(index, _)| Some(index)) + .collect(); + return SparseReadRowHelper { + indices, + num_primary_key_column: 1, + }; + }; + + let mut indices = Vec::with_capacity(rows.schema.len()); + let name_to_index: HashMap<_, _> = rows .schema .iter() .enumerate() - .map(|(index, _)| Some(index)) + .map(|(index, col)| (&col.column_name, index)) .collect(); + indices.extend( + rows.schema[0..2] + .iter() + .enumerate() + .map(|(index, _)| Some(index)), + ); + // Iterate columns and find field columns. + for column in metadata.field_columns() { + // Get index in request for each field column. + let index = name_to_index.get(&column.column_schema.name); + indices.push(index.copied()); + } return SparseReadRowHelper { indices, num_primary_key_column: 1, diff --git a/src/mito-codec/src/primary_key_filter.rs b/src/mito-codec/src/primary_key_filter.rs index e4d1ce5056..c71fafc974 100644 --- a/src/mito-codec/src/primary_key_filter.rs +++ b/src/mito-codec/src/primary_key_filter.rs @@ -154,6 +154,7 @@ mod tests { use std::sync::Arc; use api::v1::SemanticType; + use common_query::prelude::{greptime_timestamp, greptime_value}; use datafusion_common::Column; use datafusion_expr::{BinaryExpr, Expr, Literal, Operator}; use datatypes::prelude::ConcreteDataType; @@ -193,7 +194,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -202,7 +203,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_nanosecond_datatype(), false, ), diff --git a/src/mito-codec/src/row_converter/sparse.rs b/src/mito-codec/src/row_converter/sparse.rs index edc26db8f0..731de5c0b0 100644 --- a/src/mito-codec/src/row_converter/sparse.rs +++ b/src/mito-codec/src/row_converter/sparse.rs @@ -83,6 +83,11 @@ impl SparseValues { pub fn insert(&mut self, column_id: ColumnId, value: Value) { self.values.insert(column_id, value); } + + /// Returns an iterator over all stored column id/value pairs. + pub fn iter(&self) -> impl Iterator { + self.values.iter() + } } /// The column id of the tsid. 
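// The new `SparseValues::iter` above simply exposes the entries of the
// underlying map. A minimal sketch of the same accessor on a std HashMap-backed
// container; the real type stores a `Value` keyed by column id rather than the
// u32/i64 pair used here.
use std::collections::HashMap;

struct SparseValues {
    values: HashMap<u32, i64>,
}

impl SparseValues {
    /// Returns an iterator over all stored column id/value pairs.
    fn iter(&self) -> impl Iterator<Item = (&u32, &i64)> + '_ {
        self.values.iter()
    }
}

fn main() {
    let sparse = SparseValues {
        values: HashMap::from([(1, 10), (2, 20)]),
    };
    let total: i64 = sparse.iter().map(|(_, v)| *v).sum();
    assert_eq!(total, 30);
}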
@@ -385,6 +390,7 @@ mod tests { use std::sync::Arc; use api::v1::SemanticType; + use common_query::prelude::{greptime_timestamp, greptime_value}; use common_time::Timestamp; use common_time::timestamp::TimeUnit; use datatypes::schema::ColumnSchema; @@ -461,7 +467,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), false, ), @@ -470,7 +476,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_nanosecond_datatype(), false, ), diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 4cc1efb8bc..a3686251bb 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -55,7 +55,7 @@ lazy_static = "1.4" log-store = { workspace = true } mito-codec.workspace = true moka = { workspace = true, features = ["sync", "future"] } -object-store.workspace = true +object-store = { workspace = true, features = ["testing"] } parquet = { workspace = true, features = ["async"] } paste.workspace = true pin-project.workspace = true @@ -65,7 +65,7 @@ partition.workspace = true puffin.workspace = true rand.workspace = true rayon = "1.10" -regex = "1.5" +regex.workspace = true rskafka = { workspace = true, optional = true } rstest = { workspace = true, optional = true } rstest_reuse = { workspace = true, optional = true } diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs index c46a6aeedc..5e3f3ff2c0 100644 --- a/src/mito2/benches/memtable_bench.rs +++ b/src/mito2/benches/memtable_bench.rs @@ -477,6 +477,8 @@ fn flat_merge_iterator_bench(c: &mut Criterion) { bulk_part.batch.clone(), context.clone(), None, // No sequence filter + 1024, // 1024 hosts per part + None, // No mem_scan_metrics ); iters.push(Box::new(iter) as _); } @@ -534,8 +536,13 @@ fn bulk_part_record_batch_iter_filter(c: &mut Criterion) { ); // Create and iterate over BulkPartRecordBatchIter with filter - let iter = - BulkPartRecordBatchIter::new(record_batch_with_filter.clone(), context, None); + let iter = BulkPartRecordBatchIter::new( + record_batch_with_filter.clone(), + context, + None, // No sequence filter + 4096, // 4096 hosts + None, // No mem_scan_metrics + ); // Consume all batches for batch_result in iter { @@ -559,7 +566,13 @@ fn bulk_part_record_batch_iter_filter(c: &mut Criterion) { ); // Create and iterate over BulkPartRecordBatchIter - let iter = BulkPartRecordBatchIter::new(record_batch_no_filter.clone(), context, None); + let iter = BulkPartRecordBatchIter::new( + record_batch_no_filter.clone(), + context, + None, // No sequence filter + 4096, // 4096 hosts + None, // No mem_scan_metrics + ); // Consume all batches for batch_result in iter { diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs index 9aa55893a1..02b6538aa9 100644 --- a/src/mito2/benches/simple_bulk_memtable.rs +++ b/src/mito2/benches/simple_bulk_memtable.rs @@ -20,12 +20,11 @@ use criterion::{Criterion, black_box, criterion_group, criterion_main}; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable; -use mito2::memtable::{KeyValues, Memtable, MemtableRanges}; +use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions}; use mito2::read; use mito2::read::Source; use mito2::read::dedup::DedupReader; use mito2::read::merge::MergeReaderBuilder; -use 
mito2::read::scan_region::PredicateGroup; use mito2::region::options::MergeMode; use mito2::test_util::column_metadata_to_column_schema; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; @@ -126,9 +125,7 @@ fn create_memtable_with_rows(num_batches: usize) -> SimpleBulkMemtable { } async fn flush(mem: &SimpleBulkMemtable) { - let MemtableRanges { ranges, .. } = mem - .ranges(None, PredicateGroup::default(), None, true) - .unwrap(); + let MemtableRanges { ranges, .. } = mem.ranges(None, RangesOptions::for_flush()).unwrap(); let mut source = if ranges.len() == 1 { let only_range = ranges.into_values().next().unwrap(); diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index e5401209ca..b6891d7410 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -72,7 +72,7 @@ pub struct Metrics { } impl Metrics { - pub(crate) fn new(write_type: WriteType) -> Self { + pub fn new(write_type: WriteType) -> Self { Self { write_type, iter_source: Default::default(), @@ -213,7 +213,11 @@ impl AccessLayer { } /// Deletes a SST file (and its index file if it has one) with given file id. - pub(crate) async fn delete_sst(&self, region_file_id: &RegionFileId) -> Result<()> { + pub(crate) async fn delete_sst( + &self, + region_file_id: &RegionFileId, + index_file_id: &RegionFileId, + ) -> Result<()> { let path = location::sst_file_path(&self.table_dir, *region_file_id, self.path_type); self.object_store .delete(&path) @@ -222,7 +226,7 @@ impl AccessLayer { file_id: region_file_id.file_id(), })?; - let path = location::index_file_path(&self.table_dir, *region_file_id, self.path_type); + let path = location::index_file_path(&self.table_dir, *index_file_id, self.path_type); self.object_store .delete(&path) .await @@ -255,12 +259,12 @@ impl AccessLayer { &self, request: SstWriteRequest, write_opts: &WriteOptions, - write_type: WriteType, - ) -> Result<(SstInfoArray, Metrics)> { + metrics: &mut Metrics, + ) -> Result { let region_id = request.metadata.region_id; let cache_manager = request.cache_manager.clone(); - let (sst_info, metrics) = if let Some(write_cache) = cache_manager.write_cache() { + let sst_info = if let Some(write_cache) = cache_manager.write_cache() { // Write to the write cache. write_cache .write_and_upload_sst( @@ -273,7 +277,7 @@ impl AccessLayer { remote_store: self.object_store.clone(), }, write_opts, - write_type, + metrics, ) .await? } else { @@ -303,11 +307,11 @@ impl AccessLayer { request.index_config, indexer_builder, path_provider, - Metrics::new(write_type), + metrics, ) .await .with_file_cleaner(cleaner); - let ssts = match request.source { + match request.source { Either::Left(source) => { writer .write_all(source, request.max_sequence, write_opts) @@ -316,9 +320,7 @@ impl AccessLayer { Either::Right(flat_source) => { writer.write_all_flat(flat_source, write_opts).await? } - }; - let metrics = writer.into_metrics(); - (ssts, metrics) + } }; // Put parquet metadata to cache manager. @@ -333,7 +335,7 @@ impl AccessLayer { } } - Ok((sst_info, metrics)) + Ok(sst_info) } /// Puts encoded SST bytes to the write cache (if enabled) and uploads it to the object store. diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index b371e39b78..b3a9bfb2df 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -53,6 +53,8 @@ const VECTOR_TYPE: &str = "vector"; const PAGE_TYPE: &str = "page"; /// Metrics type key for files on the local store. 
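// The `write_sst` signature change above threads a single `&mut Metrics`
// accumulator through the write path instead of having each writer build and
// return its own metrics. A stripped-down sketch of that pattern with a toy
// Metrics struct (the real one tracks many more counters):
#[derive(Debug, Default)]
struct Metrics {
    bytes_written: u64,
    files_written: u32,
}

fn write_one_file(payload: &[u8], metrics: &mut Metrics) -> usize {
    // The writer only records into the shared accumulator; the caller owns it.
    metrics.bytes_written += payload.len() as u64;
    metrics.files_written += 1;
    payload.len()
}

fn main() {
    let mut metrics = Metrics::default();
    write_one_file(b"sst-data", &mut metrics);
    write_one_file(b"more-sst-data", &mut metrics);
    assert_eq!(metrics.files_written, 2);
    println!("{metrics:?}");
}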
const FILE_TYPE: &str = "file"; +/// Metrics type key for index files (puffin) on the local store. +const INDEX_TYPE: &str = "index"; /// Metrics type key for selector result cache. const SELECTOR_RESULT_TYPE: &str = "selector_result"; diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs index 3fed4b9916..e9c67aaa45 100644 --- a/src/mito2/src/cache/file_cache.rs +++ b/src/mito2/src/cache/file_cache.rs @@ -21,8 +21,8 @@ use std::time::{Duration, Instant}; use bytes::Bytes; use common_base::readable_size::ReadableSize; -use common_telemetry::{error, info, warn}; -use futures::{FutureExt, TryStreamExt}; +use common_telemetry::{debug, error, info, warn}; +use futures::{AsyncWriteExt, FutureExt, TryStreamExt}; use moka::future::Cache; use moka::notification::RemovalCause; use moka::policy::EvictionPolicy; @@ -31,10 +31,16 @@ use object_store::{ErrorKind, ObjectStore, Reader}; use parquet::file::metadata::ParquetMetaData; use snafu::ResultExt; use store_api::storage::{FileId, RegionId}; +use tokio::sync::mpsc::UnboundedReceiver; -use crate::cache::FILE_TYPE; -use crate::error::{OpenDalSnafu, Result}; -use crate::metrics::{CACHE_BYTES, CACHE_HIT, CACHE_MISS}; +use crate::access_layer::TempFileCleaner; +use crate::cache::{FILE_TYPE, INDEX_TYPE}; +use crate::error::{self, OpenDalSnafu, Result}; +use crate::metrics::{ + CACHE_BYTES, CACHE_HIT, CACHE_MISS, WRITE_CACHE_DOWNLOAD_BYTES_TOTAL, + WRITE_CACHE_DOWNLOAD_ELAPSED, +}; +use crate::region::opener::RegionLoadCacheTask; use crate::sst::parquet::helper::fetch_byte_ranges; use crate::sst::parquet::metadata::MetadataLoader; @@ -43,16 +49,24 @@ use crate::sst::parquet::metadata::MetadataLoader; /// This must contain three layers, corresponding to [`build_prometheus_metrics_layer`](object_store::layers::build_prometheus_metrics_layer). const FILE_DIR: &str = "cache/object/write/"; +/// Default percentage for index (puffin) cache (20% of total capacity). +pub(crate) const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20; + +/// Minimum capacity for each cache (512MB). +const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024; + /// A file cache manages files on local store and evict files based /// on size. #[derive(Debug)] pub(crate) struct FileCache { /// Local store to cache files. local_store: ObjectStore, - /// Index to track cached files. - /// - /// File id is enough to identity a file uniquely. - memory_index: Cache, + /// Index to track cached Parquet files. + parquet_index: Cache, + /// Index to track cached Puffin files. + puffin_index: Cache, + /// Capacity of the puffin (index) cache in bytes. 
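// The file-cache split introduced below derives two capacities from one
// budget: an index (puffin) share taken as a validated percentage, the
// remainder for parquet, and a 512MB floor applied to both sides. A standalone
// sketch of that arithmetic, mirroring the constants shown in the patch:
const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024;
const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20;

fn split_capacity(total: u64, index_percent: Option<u8>) -> (u64, u64) {
    // Out-of-range percentages fall back to the default share.
    let percent = index_percent
        .filter(|&p| p > 0 && p < 100)
        .unwrap_or(DEFAULT_INDEX_CACHE_PERCENT);
    let puffin = (total as f64 * percent as f64 / 100.0) as u64;
    let parquet = total - puffin;
    // Both caches keep at least the minimum capacity.
    (parquet.max(MIN_CACHE_CAPACITY), puffin.max(MIN_CACHE_CAPACITY))
}

fn main() {
    let (parquet, puffin) = split_capacity(10 * 1024 * 1024 * 1024, None);
    assert_eq!(puffin, 2 * 1024 * 1024 * 1024); // 20% of 10GiB
    assert_eq!(parquet, 8 * 1024 * 1024 * 1024);
    // A tiny budget and an invalid percentage still yield the floor for each side.
    let (p, q) = split_capacity(64 * 1024 * 1024, Some(150));
    assert_eq!((p, q), (MIN_CACHE_CAPACITY, MIN_CACHE_CAPACITY));
}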
+ puffin_capacity: u64, } pub(crate) type FileCacheRef = Arc; @@ -63,15 +77,57 @@ impl FileCache { local_store: ObjectStore, capacity: ReadableSize, ttl: Option, + index_cache_percent: Option, ) -> FileCache { - let cache_store = local_store.clone(); + // Validate and use the provided percent or default + let index_percent = index_cache_percent + .filter(|&percent| percent > 0 && percent < 100) + .unwrap_or(DEFAULT_INDEX_CACHE_PERCENT); + let total_capacity = capacity.as_bytes(); + + // Convert percent to ratio and calculate capacity for each cache + let index_ratio = index_percent as f64 / 100.0; + let puffin_capacity = (total_capacity as f64 * index_ratio) as u64; + let parquet_capacity = total_capacity - puffin_capacity; + + // Ensure both capacities are at least 512MB + let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY); + let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY); + + info!( + "Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}", + index_percent, + ReadableSize(total_capacity), + ReadableSize(parquet_capacity), + ReadableSize(puffin_capacity) + ); + + let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file"); + let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index"); + + FileCache { + local_store, + parquet_index, + puffin_index, + puffin_capacity, + } + } + + /// Builds a cache for a specific file type. + fn build_cache( + local_store: ObjectStore, + capacity: u64, + ttl: Option, + label: &'static str, + ) -> Cache { + let cache_store = local_store; let mut builder = Cache::builder() .eviction_policy(EvictionPolicy::lru()) .weigher(|_key, value: &IndexValue| -> u32 { // We only measure space on local store. value.file_size }) - .max_capacity(capacity.as_bytes()) + .max_capacity(capacity) .async_eviction_listener(move |key, value, cause| { let store = cache_store.clone(); // Stores files under FILE_DIR. @@ -80,14 +136,14 @@ impl FileCache { if let RemovalCause::Replaced = cause { // The cache is replaced by another file. This is unexpected, we don't remove the same // file but updates the metrics as the file is already replaced by users. - CACHE_BYTES.with_label_values(&[FILE_TYPE]).sub(value.file_size.into()); + CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into()); warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id); return; } match store.delete(&file_path).await { Ok(()) => { - CACHE_BYTES.with_label_values(&[FILE_TYPE]).sub(value.file_size.into()); + CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into()); } Err(e) => { warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id); @@ -99,10 +155,14 @@ impl FileCache { if let Some(ttl) = ttl { builder = builder.time_to_idle(ttl); } - let memory_index = builder.build(); - FileCache { - local_store, - memory_index, + builder.build() + } + + /// Returns the appropriate memory index for the given file type. + fn memory_index(&self, file_type: FileType) -> &Cache { + match file_type { + FileType::Parquet => &self.parquet_index, + FileType::Puffin => &self.puffin_index, } } @@ -111,16 +171,17 @@ impl FileCache { /// The `WriteCache` should ensure the file is in the correct path. 
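// `memory_index` below routes every lookup to the cache that matches the file
// type, so parquet data files and puffin index files are tracked and evicted
// independently. A rough sketch of that routing with std maps standing in for
// the two moka caches and a String key standing in for the real IndexKey:
use std::collections::HashMap;

#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum FileType {
    Parquet,
    Puffin,
}

struct FileCacheSketch {
    parquet_index: HashMap<String, u32>,
    puffin_index: HashMap<String, u32>,
}

impl FileCacheSketch {
    fn memory_index(&mut self, file_type: FileType) -> &mut HashMap<String, u32> {
        match file_type {
            FileType::Parquet => &mut self.parquet_index,
            FileType::Puffin => &mut self.puffin_index,
        }
    }

    fn put(&mut self, file_type: FileType, file_id: String, file_size: u32) {
        self.memory_index(file_type).insert(file_id, file_size);
    }
}

fn main() {
    let mut cache = FileCacheSketch {
        parquet_index: HashMap::new(),
        puffin_index: HashMap::new(),
    };
    cache.put(FileType::Parquet, "a.parquet".to_string(), 4096);
    cache.put(FileType::Puffin, "a.puffin".to_string(), 235);
    assert_eq!(cache.parquet_index.len(), 1);
    assert_eq!(cache.puffin_index.len(), 1);
}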
pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) { CACHE_BYTES - .with_label_values(&[FILE_TYPE]) + .with_label_values(&[key.file_type.metric_label()]) .add(value.file_size.into()); - self.memory_index.insert(key, value).await; + let index = self.memory_index(key.file_type); + index.insert(key, value).await; // Since files are large items, we run the pending tasks immediately. - self.memory_index.run_pending_tasks().await; + index.run_pending_tasks().await; } pub(crate) async fn get(&self, key: IndexKey) -> Option { - self.memory_index.get(&key).await + self.memory_index(key.file_type).get(&key).await } /// Reads a file from the cache. @@ -128,15 +189,20 @@ impl FileCache { pub(crate) async fn reader(&self, key: IndexKey) -> Option { // We must use `get()` to update the estimator of the cache. // See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key - if self.memory_index.get(&key).await.is_none() { - CACHE_MISS.with_label_values(&[FILE_TYPE]).inc(); + let index = self.memory_index(key.file_type); + if index.get(&key).await.is_none() { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); return None; } let file_path = self.cache_file_path(key); match self.get_reader(&file_path).await { Ok(Some(reader)) => { - CACHE_HIT.with_label_values(&[FILE_TYPE]).inc(); + CACHE_HIT + .with_label_values(&[key.file_type.metric_label()]) + .inc(); return Some(reader); } Err(e) => { @@ -148,8 +214,10 @@ impl FileCache { } // We removes the file from the index. - self.memory_index.remove(&key).await; - CACHE_MISS.with_label_values(&[FILE_TYPE]).inc(); + index.remove(&key).await; + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); None } @@ -159,8 +227,11 @@ impl FileCache { key: IndexKey, ranges: &[Range], ) -> Option> { - if self.memory_index.get(&key).await.is_none() { - CACHE_MISS.with_label_values(&[FILE_TYPE]).inc(); + let index = self.memory_index(key.file_type); + if index.get(&key).await.is_none() { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); return None; } @@ -170,7 +241,9 @@ impl FileCache { let bytes_result = fetch_byte_ranges(&file_path, self.local_store.clone(), ranges).await; match bytes_result { Ok(bytes) => { - CACHE_HIT.with_label_values(&[FILE_TYPE]).inc(); + CACHE_HIT + .with_label_values(&[key.file_type.metric_label()]) + .inc(); Some(bytes) } Err(e) => { @@ -179,8 +252,10 @@ impl FileCache { } // We removes the file from the index. - self.memory_index.remove(&key).await; - CACHE_MISS.with_label_values(&[FILE_TYPE]).inc(); + index.remove(&key).await; + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); None } } @@ -191,7 +266,7 @@ impl FileCache { /// in the memory index if upload is failed. pub(crate) async fn remove(&self, key: IndexKey) { let file_path = self.cache_file_path(key); - self.memory_index.remove(&key).await; + self.memory_index(key.file_type).remove(&key).await; // Always delete the file from the local store. if let Err(e) = self.local_store.delete(&file_path).await { warn!(e; "Failed to delete a cached file {}", file_path); @@ -208,6 +283,7 @@ impl FileCache { // Use i64 for total_size to reduce the risk of overflow. // It is possible that the total size of the cache is larger than i32::MAX. let (mut total_size, mut total_keys) = (0i64, 0); + let (mut parquet_size, mut puffin_size) = (0i64, 0i64); while let Some(entry) = lister.try_next().await.context(OpenDalSnafu)? 
{ let meta = entry.metadata(); if !meta.is_file() { @@ -223,36 +299,76 @@ impl FileCache { .await .context(OpenDalSnafu)?; let file_size = meta.content_length() as u32; - self.memory_index - .insert(key, IndexValue { file_size }) - .await; - total_size += i64::from(file_size); + let index = self.memory_index(key.file_type); + index.insert(key, IndexValue { file_size }).await; + let size = i64::from(file_size); + total_size += size; total_keys += 1; + + // Track sizes separately for each file type + match key.file_type { + FileType::Parquet => parquet_size += size, + FileType::Puffin => puffin_size += size, + } } // The metrics is a signed int gauge so we can updates it finally. - CACHE_BYTES.with_label_values(&[FILE_TYPE]).add(total_size); + CACHE_BYTES + .with_label_values(&[FILE_TYPE]) + .add(parquet_size); + CACHE_BYTES + .with_label_values(&[INDEX_TYPE]) + .add(puffin_size); // Run all pending tasks of the moka cache so that the cache size is updated // and the eviction policy is applied. - self.memory_index.run_pending_tasks().await; + self.parquet_index.run_pending_tasks().await; + self.puffin_index.run_pending_tasks().await; + let parquet_weight = self.parquet_index.weighted_size(); + let parquet_count = self.parquet_index.entry_count(); + let puffin_weight = self.puffin_index.weighted_size(); + let puffin_count = self.puffin_index.entry_count(); info!( - "Recovered file cache, num_keys: {}, num_bytes: {}, total weight: {}, cost: {:?}", + "Recovered file cache, num_keys: {}, num_bytes: {}, parquet(count: {}, weight: {}), puffin(count: {}, weight: {}), cost: {:?}", total_keys, total_size, - self.memory_index.weighted_size(), + parquet_count, + parquet_weight, + puffin_count, + puffin_weight, now.elapsed() ); Ok(()) } /// Recovers the index from local store. - pub(crate) async fn recover(self: &Arc, sync: bool) { + /// + /// If `task_receiver` is provided, spawns a background task after recovery + /// to process `RegionLoadCacheTask` messages for loading files into the cache. + pub(crate) async fn recover( + self: &Arc, + sync: bool, + task_receiver: Option>, + ) { let moved_self = self.clone(); let handle = tokio::spawn(async move { if let Err(err) = moved_self.recover_inner().await { error!(err; "Failed to recover file cache.") } + + // Spawns background task to process region load cache tasks after recovery. + // So it won't block the recovery when `sync` is true. + if let Some(mut receiver) = task_receiver { + let cache_ref = moved_self.clone(); + info!("Spawning background task for processing region load cache tasks"); + tokio::spawn(async move { + while let Some(task) = receiver.recv().await { + let file_cache = cache_ref.clone(); + task.fill_cache(file_cache).await; + } + info!("Background task for processing region load cache tasks stopped"); + }); + } }); if sync { @@ -274,7 +390,7 @@ impl FileCache { /// If the file is not in the cache or fail to load metadata, return None. 
pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option { // Check if file cache contains the key - if let Some(index_value) = self.memory_index.get(&key).await { + if let Some(index_value) = self.parquet_index.get(&key).await { // Load metadata from file cache let local_store = self.local_store(); let file_path = self.cache_file_path(key); @@ -283,7 +399,9 @@ impl FileCache { match metadata_loader.load().await { Ok(metadata) => { - CACHE_HIT.with_label_values(&[FILE_TYPE]).inc(); + CACHE_HIT + .with_label_values(&[key.file_type.metric_label()]) + .inc(); Some(metadata) } Err(e) => { @@ -294,13 +412,17 @@ impl FileCache { ); } // We removes the file from the index. - self.memory_index.remove(&key).await; - CACHE_MISS.with_label_values(&[FILE_TYPE]).inc(); + self.parquet_index.remove(&key).await; + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); None } } } else { - CACHE_MISS.with_label_values(&[FILE_TYPE]).inc(); + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); None } } @@ -314,9 +436,106 @@ impl FileCache { } /// Checks if the key is in the file cache. - #[cfg(test)] pub(crate) fn contains_key(&self, key: &IndexKey) -> bool { - self.memory_index.contains_key(key) + self.memory_index(key.file_type).contains_key(key) + } + + /// Returns the capacity of the puffin (index) cache in bytes. + pub(crate) fn puffin_cache_capacity(&self) -> u64 { + self.puffin_capacity + } + + /// Returns the current weighted size (used bytes) of the puffin (index) cache. + pub(crate) fn puffin_cache_size(&self) -> u64 { + self.puffin_index.weighted_size() + } + + /// Downloads a file in `remote_path` from the remote object store to the local cache + /// (specified by `index_key`). + pub(crate) async fn download( + &self, + index_key: IndexKey, + remote_path: &str, + remote_store: &ObjectStore, + file_size: u64, + ) -> Result<()> { + if let Err(e) = self + .download_without_cleaning(index_key, remote_path, remote_store, file_size) + .await + { + let filename = index_key.to_string(); + TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await; + + return Err(e); + } + Ok(()) + } + + async fn download_without_cleaning( + &self, + index_key: IndexKey, + remote_path: &str, + remote_store: &ObjectStore, + file_size: u64, + ) -> Result<()> { + const DOWNLOAD_READER_CONCURRENCY: usize = 8; + const DOWNLOAD_READER_CHUNK_SIZE: ReadableSize = ReadableSize::mb(8); + + let file_type = index_key.file_type; + let timer = WRITE_CACHE_DOWNLOAD_ELAPSED + .with_label_values(&[match file_type { + FileType::Parquet => "download_parquet", + FileType::Puffin => "download_puffin", + }]) + .start_timer(); + + let reader = remote_store + .reader_with(remote_path) + .concurrent(DOWNLOAD_READER_CONCURRENCY) + .chunk(DOWNLOAD_READER_CHUNK_SIZE.as_bytes() as usize) + .await + .context(error::OpenDalSnafu)? + .into_futures_async_read(0..file_size) + .await + .context(error::OpenDalSnafu)?; + + let cache_path = self.cache_file_path(index_key); + let mut writer = self + .local_store + .writer(&cache_path) + .await + .context(error::OpenDalSnafu)? 
+ .into_futures_async_write(); + + let region_id = index_key.region_id; + let file_id = index_key.file_id; + let bytes_written = + futures::io::copy(reader, &mut writer) + .await + .context(error::DownloadSnafu { + region_id, + file_id, + file_type, + })?; + writer.close().await.context(error::DownloadSnafu { + region_id, + file_id, + file_type, + })?; + + WRITE_CACHE_DOWNLOAD_BYTES_TOTAL.inc_by(bytes_written); + + let elapsed = timer.stop_and_record(); + debug!( + "Successfully download file '{}' to local '{}', file size: {}, region: {}, cost: {:?}s", + remote_path, cache_path, bytes_written, region_id, elapsed, + ); + + let index_value = IndexValue { + file_size: bytes_written as _, + }; + self.put(index_key, index_value).await; + Ok(()) } } @@ -377,6 +596,14 @@ impl FileType { FileType::Puffin => "puffin", } } + + /// Returns the metric label for this file type. + fn metric_label(&self) -> &'static str { + match self { + FileType::Parquet => FILE_TYPE, + FileType::Puffin => INDEX_TYPE, + } + } } /// An entity that describes the file in the file cache. @@ -429,6 +656,7 @@ mod tests { local_store.clone(), ReadableSize::mb(10), Some(Duration::from_millis(10)), + None, ); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); @@ -455,7 +683,7 @@ mod tests { let exist = cache.reader(key).await; assert!(exist.is_some()); tokio::time::sleep(Duration::from_millis(15)).await; - cache.memory_index.run_pending_tasks().await; + cache.parquet_index.run_pending_tasks().await; let non = cache.reader(key).await; assert!(non.is_none()); } @@ -465,7 +693,7 @@ mod tests { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None); + let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); let key = IndexKey::new(region_id, file_id, FileType::Parquet); @@ -493,19 +721,19 @@ mod tests { assert_eq!("hello", String::from_utf8(buf).unwrap()); // Get weighted size. - cache.memory_index.run_pending_tasks().await; - assert_eq!(5, cache.memory_index.weighted_size()); + cache.parquet_index.run_pending_tasks().await; + assert_eq!(5, cache.parquet_index.weighted_size()); // Remove the file. cache.remove(key).await; assert!(cache.reader(key).await.is_none()); // Ensure all pending tasks of the moka cache is done before assertion. - cache.memory_index.run_pending_tasks().await; + cache.parquet_index.run_pending_tasks().await; // The file also not exists. assert!(!local_store.exists(&file_path).await.unwrap()); - assert_eq!(0, cache.memory_index.weighted_size()); + assert_eq!(0, cache.parquet_index.weighted_size()); } #[tokio::test] @@ -513,7 +741,7 @@ mod tests { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None); + let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); let key = IndexKey::new(region_id, file_id, FileType::Parquet); @@ -538,14 +766,14 @@ mod tests { // Reader is none. assert!(cache.reader(key).await.is_none()); // Key is removed. 
- assert!(!cache.memory_index.contains_key(&key)); + assert!(!cache.parquet_index.contains_key(&key)); } #[tokio::test] async fn test_file_cache_recover() { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None); + let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); let region_id = RegionId::new(2000, 0); let file_type = FileType::Parquet; @@ -575,6 +803,7 @@ mod tests { local_store.clone(), ReadableSize::mb(10), None, + None, )); // No entry before recovery. assert!( @@ -583,11 +812,11 @@ mod tests { .await .is_none() ); - cache.recover(true).await; + cache.recover(true, None).await; // Check size. - cache.memory_index.run_pending_tasks().await; - assert_eq!(total_size, cache.memory_index.weighted_size() as usize); + cache.parquet_index.run_pending_tasks().await; + assert_eq!(total_size, cache.parquet_index.weighted_size() as usize); for (i, file_id) in file_ids.iter().enumerate() { let key = IndexKey::new(region_id, *file_id, file_type); @@ -601,7 +830,7 @@ mod tests { async fn test_file_cache_read_ranges() { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let file_cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None); + let file_cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); let key = IndexKey::new(region_id, file_id, FileType::Parquet); diff --git a/src/mito2/src/cache/index.rs b/src/mito2/src/cache/index.rs index cf24772994..350ef34b2a 100644 --- a/src/mito2/src/cache/index.rs +++ b/src/mito2/src/cache/index.rs @@ -216,6 +216,8 @@ where } fn put_page(&self, key: K, page_key: PageKey, value: Bytes) { + // Clones the value to ensure it doesn't reference a larger buffer. + let value = Bytes::from(value.to_vec()); CACHE_BYTES .with_label_values(&[INDEX_CONTENT_TYPE]) .add((self.weight_of_content)(&(key, page_key), &value).into()); diff --git a/src/mito2/src/cache/index/bloom_filter_index.rs b/src/mito2/src/cache/index/bloom_filter_index.rs index 9e8d864d7d..b4e7804b93 100644 --- a/src/mito2/src/cache/index/bloom_filter_index.rs +++ b/src/mito2/src/cache/index/bloom_filter_index.rs @@ -15,7 +15,7 @@ use std::ops::Range; use std::sync::Arc; -use api::v1::index::BloomFilterMeta; +use api::v1::index::{BloomFilterLoc, BloomFilterMeta}; use async_trait::async_trait; use bytes::Bytes; use index::bloom_filter::error::Result; @@ -60,11 +60,17 @@ impl BloomFilterIndexCache { /// Calculates weight for bloom filter index metadata. fn bloom_filter_index_metadata_weight( k: &(FileId, ColumnId, Tag), - _: &Arc, + meta: &Arc, ) -> u32 { - (k.0.as_bytes().len() + let base = k.0.as_bytes().len() + std::mem::size_of::() - + std::mem::size_of::()) as u32 + + std::mem::size_of::() + + std::mem::size_of::(); + + let vec_estimated = meta.segment_loc_indices.len() * std::mem::size_of::() + + meta.bloom_filter_locs.len() * std::mem::size_of::(); + + (base + vec_estimated) as u32 } /// Calculates weight for bloom filter index content. 
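// [Editor's illustrative sketch -- not part of the patch.] Why the put_page change in
// src/mito2/src/cache/index.rs above copies the bytes before caching: a `Bytes` produced
// by slicing shares the parent allocation, so caching the slice directly can pin a much
// larger buffer in memory. Function names here are hypothetical.
use bytes::Bytes;

fn detach(page: &Bytes) -> Bytes {
    // Copies only the page's bytes into a fresh, right-sized buffer.
    Bytes::from(page.to_vec())
}

fn example() {
    let whole_blob = Bytes::from(vec![0u8; 8 * 1024 * 1024]); // e.g. one large read
    let page = whole_blob.slice(0..4096); // still references the 8 MiB buffer
    let cached = detach(&page); // owns just 4 KiB
    assert_eq!(page, cached);
}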
@@ -171,6 +177,45 @@ mod test { const FUZZ_REPEAT_TIMES: usize = 100; + #[test] + fn bloom_filter_metadata_weight_counts_vec_contents() { + let file_id = FileId::parse_str("00000000-0000-0000-0000-000000000001").unwrap(); + let column_id: ColumnId = 42; + let tag = Tag::Skipping; + + let meta = BloomFilterMeta { + rows_per_segment: 128, + segment_count: 2, + row_count: 256, + bloom_filter_size: 1024, + segment_loc_indices: vec![0, 64, 128, 192], + bloom_filter_locs: vec![ + BloomFilterLoc { + offset: 0, + size: 512, + element_count: 1000, + }, + BloomFilterLoc { + offset: 512, + size: 512, + element_count: 1000, + }, + ], + }; + + let weight = + bloom_filter_index_metadata_weight(&(file_id, column_id, tag), &Arc::new(meta.clone())); + + let base = file_id.as_bytes().len() + + std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::(); + let expected_dynamic = meta.segment_loc_indices.len() * std::mem::size_of::() + + meta.bloom_filter_locs.len() * std::mem::size_of::(); + + assert_eq!(weight as usize, base + expected_dynamic); + } + #[test] fn fuzz_index_calculation() { let mut rng = rand::rng(); diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index d2b7e34997..b54e3e6f73 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -23,6 +23,7 @@ use futures::AsyncWriteExt; use object_store::ObjectStore; use snafu::ResultExt; use store_api::storage::RegionId; +use tokio::sync::mpsc::{UnboundedSender, unbounded_channel}; use crate::access_layer::{ FilePathProvider, Metrics, RegionFilePathFactory, SstInfoArray, SstWriteRequest, @@ -30,9 +31,8 @@ use crate::access_layer::{ }; use crate::cache::file_cache::{FileCache, FileCacheRef, FileType, IndexKey, IndexValue}; use crate::error::{self, Result}; -use crate::metrics::{ - UPLOAD_BYTES_TOTAL, WRITE_CACHE_DOWNLOAD_BYTES_TOTAL, WRITE_CACHE_DOWNLOAD_ELAPSED, -}; +use crate::metrics::UPLOAD_BYTES_TOTAL; +use crate::region::opener::RegionLoadCacheTask; use crate::sst::file::RegionFileId; use crate::sst::index::IndexerBuilderImpl; use crate::sst::index::intermediate::IntermediateManager; @@ -51,6 +51,8 @@ pub struct WriteCache { puffin_manager_factory: PuffinManagerFactory, /// Intermediate manager for index. intermediate_manager: IntermediateManager, + /// Sender for region load cache tasks. 
+ task_sender: UnboundedSender, } pub type WriteCacheRef = Arc; @@ -62,16 +64,25 @@ impl WriteCache { local_store: ObjectStore, cache_capacity: ReadableSize, ttl: Option, + index_cache_percent: Option, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, ) -> Result { - let file_cache = Arc::new(FileCache::new(local_store, cache_capacity, ttl)); - file_cache.recover(false).await; + let (task_sender, task_receiver) = unbounded_channel(); + + let file_cache = Arc::new(FileCache::new( + local_store, + cache_capacity, + ttl, + index_cache_percent, + )); + file_cache.recover(false, Some(task_receiver)).await; Ok(Self { file_cache, puffin_manager_factory, intermediate_manager, + task_sender, }) } @@ -80,6 +91,7 @@ impl WriteCache { cache_dir: &str, cache_capacity: ReadableSize, ttl: Option, + index_cache_percent: Option, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, ) -> Result { @@ -90,6 +102,7 @@ impl WriteCache { local_store, cache_capacity, ttl, + index_cache_percent, puffin_manager_factory, intermediate_manager, ) @@ -169,8 +182,8 @@ impl WriteCache { write_request: SstWriteRequest, upload_request: SstUploadRequest, write_opts: &WriteOptions, - write_type: WriteType, - ) -> Result<(SstInfoArray, Metrics)> { + metrics: &mut Metrics, + ) -> Result { let region_id = write_request.metadata.region_id; let store = self.file_cache.local_store(); @@ -197,7 +210,7 @@ impl WriteCache { write_request.index_config, indexer, path_provider.clone(), - Metrics::new(write_type), + metrics, ) .await .with_file_cleaner(cleaner); @@ -210,11 +223,10 @@ impl WriteCache { } either::Right(flat_source) => writer.write_all_flat(flat_source, write_opts).await?, }; - let mut metrics = writer.into_metrics(); // Upload sst file to remote object store. if sst_info.is_empty() { - return Ok((sst_info, metrics)); + return Ok(sst_info); } let mut upload_tracker = UploadTracker::new(region_id); @@ -256,7 +268,7 @@ impl WriteCache { return Err(err); } - Ok((sst_info, metrics)) + Ok(sst_info) } /// Removes a file from the cache by `index_key`. @@ -273,85 +285,9 @@ impl WriteCache { remote_store: &ObjectStore, file_size: u64, ) -> Result<()> { - if let Err(e) = self - .download_without_cleaning(index_key, remote_path, remote_store, file_size) + self.file_cache + .download(index_key, remote_path, remote_store, file_size) .await - { - let filename = index_key.to_string(); - TempFileCleaner::clean_atomic_dir_files(&self.file_cache.local_store(), &[&filename]) - .await; - - return Err(e); - } - Ok(()) - } - - async fn download_without_cleaning( - &self, - index_key: IndexKey, - remote_path: &str, - remote_store: &ObjectStore, - file_size: u64, - ) -> Result<()> { - const DOWNLOAD_READER_CONCURRENCY: usize = 8; - const DOWNLOAD_READER_CHUNK_SIZE: ReadableSize = ReadableSize::mb(8); - - let file_type = index_key.file_type; - let timer = WRITE_CACHE_DOWNLOAD_ELAPSED - .with_label_values(&[match file_type { - FileType::Parquet => "download_parquet", - FileType::Puffin => "download_puffin", - }]) - .start_timer(); - - let reader = remote_store - .reader_with(remote_path) - .concurrent(DOWNLOAD_READER_CONCURRENCY) - .chunk(DOWNLOAD_READER_CHUNK_SIZE.as_bytes() as usize) - .await - .context(error::OpenDalSnafu)? 
- .into_futures_async_read(0..file_size) - .await - .context(error::OpenDalSnafu)?; - - let cache_path = self.file_cache.cache_file_path(index_key); - let mut writer = self - .file_cache - .local_store() - .writer(&cache_path) - .await - .context(error::OpenDalSnafu)? - .into_futures_async_write(); - - let region_id = index_key.region_id; - let file_id = index_key.file_id; - let bytes_written = - futures::io::copy(reader, &mut writer) - .await - .context(error::DownloadSnafu { - region_id, - file_id, - file_type, - })?; - writer.close().await.context(error::DownloadSnafu { - region_id, - file_id, - file_type, - })?; - - WRITE_CACHE_DOWNLOAD_BYTES_TOTAL.inc_by(bytes_written); - - let elapsed = timer.stop_and_record(); - debug!( - "Successfully download file '{}' to local '{}', file size: {}, region: {}, cost: {:?}s", - remote_path, cache_path, bytes_written, region_id, elapsed, - ); - - let index_value = IndexValue { - file_size: bytes_written as _, - }; - self.file_cache.put(index_key, index_value).await; - Ok(()) } /// Uploads a Parquet file or a Puffin file to the remote object store. @@ -425,6 +361,13 @@ impl WriteCache { Ok(()) } + + /// Sends a region load cache task to the background processing queue. + /// + /// If the receiver has been dropped, the error is ignored. + pub(crate) fn load_region_cache(&self, task: RegionLoadCacheTask) { + let _ = self.task_sender.send(task); + } } /// Request to write and upload a SST. @@ -559,8 +502,9 @@ mod tests { }; // Write to cache and upload sst to mock remote store - let (mut sst_infos, _) = write_cache - .write_and_upload_sst(write_request, upload_request, &write_opts, WriteType::Flush) + let mut metrics = Metrics::new(WriteType::Flush); + let mut sst_infos = write_cache + .write_and_upload_sst(write_request, upload_request, &write_opts, &mut metrics) .await .unwrap(); let sst_info = sst_infos.remove(0); @@ -655,8 +599,9 @@ mod tests { remote_store: mock_store.clone(), }; - let (mut sst_infos, _) = write_cache - .write_and_upload_sst(write_request, upload_request, &write_opts, WriteType::Flush) + let mut metrics = Metrics::new(WriteType::Flush); + let mut sst_infos = write_cache + .write_and_upload_sst(write_request, upload_request, &write_opts, &mut metrics) .await .unwrap(); let sst_info = sst_infos.remove(0); @@ -735,8 +680,9 @@ mod tests { remote_store: mock_store.clone(), }; + let mut metrics = Metrics::new(WriteType::Flush); write_cache - .write_and_upload_sst(write_request, upload_request, &write_opts, WriteType::Flush) + .write_and_upload_sst(write_request, upload_request, &write_opts, &mut metrics) .await .unwrap_err(); let atomic_write_dir = write_cache_dir.path().join(ATOMIC_WRITE_DIR); diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index d83ed7ab7d..e6492722f3 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -305,6 +305,7 @@ impl CompactionScheduler { &options, &request.current_version.options.compaction, request.current_version.options.append_mode, + Some(self.engine_config.max_background_compactions), ); let region_id = request.region_id(); let CompactionRequest { @@ -1110,9 +1111,8 @@ mod tests { checkpoint_distance: 10, remove_file_options: Default::default(), }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(); diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index ba267f4a48..e63ae7f3f6 100644 --- a/src/mito2/src/compaction/compactor.rs +++ 
b/src/mito2/src/compaction/compactor.rs @@ -30,20 +30,20 @@ use store_api::metadata::RegionMetadataRef; use store_api::region_request::PathType; use store_api::storage::RegionId; -use crate::access_layer::{AccessLayer, AccessLayerRef, OperationType, SstWriteRequest, WriteType}; +use crate::access_layer::{ + AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType, +}; use crate::cache::{CacheManager, CacheManagerRef}; use crate::compaction::picker::{PickerOutput, new_picker}; -use crate::compaction::{CompactionSstReaderBuilder, find_ttl}; +use crate::compaction::{CompactionOutput, CompactionSstReaderBuilder, find_ttl}; use crate::config::MitoConfig; use crate::error::{ EmptyRegionDirSnafu, InvalidPartitionExprSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Result, }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; -use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; -use crate::manifest::storage::manifest_compress_type; +use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::metrics; use crate::read::{FlatSource, Source}; -use crate::region::opener::new_manifest_dir; use crate::region::options::RegionOptions; use crate::region::version::VersionRef; use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState}; @@ -160,31 +160,16 @@ pub async fn open_compaction_region( }; let manifest_manager = { - let region_manifest_options = RegionManifestOptions { - manifest_dir: new_manifest_dir(®ion_dir_from_table_dir( - &req.table_dir, - req.region_id, - req.path_type, - )), - object_store: object_store.clone(), - compress_type: manifest_compress_type(mito_config.compress_manifest), - checkpoint_distance: mito_config.manifest_checkpoint_distance, - remove_file_options: RemoveFileOptions { - keep_count: mito_config.experimental_manifest_keep_removed_file_count, - keep_ttl: mito_config.experimental_manifest_keep_removed_file_ttl, - }, - }; + let region_dir = region_dir_from_table_dir(&req.table_dir, req.region_id, req.path_type); + let region_manifest_options = + RegionManifestOptions::new(mito_config, ®ion_dir, object_store); - RegionManifestManager::open( - region_manifest_options, - Default::default(), - Default::default(), - ) - .await? - .context(EmptyRegionDirSnafu { - region_id: req.region_id, - region_dir: ®ion_dir_from_table_dir(&req.table_dir, req.region_id, req.path_type), - })? + RegionManifestManager::open(region_manifest_options, &Default::default()) + .await? + .with_context(|| EmptyRegionDirSnafu { + region_id: req.region_id, + region_dir: region_dir_from_table_dir(&req.table_dir, req.region_id, req.path_type), + })? }; let manifest = manifest_manager.manifest(); @@ -311,6 +296,127 @@ pub trait Compactor: Send + Sync + 'static { /// DefaultCompactor is the default implementation of Compactor. pub struct DefaultCompactor; +impl DefaultCompactor { + /// Merge a single compaction output into SST files. 
+ async fn merge_single_output( + compaction_region: CompactionRegion, + output: CompactionOutput, + write_opts: WriteOptions, + ) -> Result> { + let region_id = compaction_region.region_id; + let storage = compaction_region.region_options.storage.clone(); + let index_options = compaction_region + .current_version + .options + .index_options + .clone(); + let append_mode = compaction_region.current_version.options.append_mode; + let merge_mode = compaction_region.current_version.options.merge_mode(); + let flat_format = compaction_region + .region_options + .sst_format + .map(|format| format == FormatType::Flat) + .unwrap_or( + compaction_region + .engine_config + .default_experimental_flat_format, + ); + + let index_config = compaction_region.engine_config.index.clone(); + let inverted_index_config = compaction_region.engine_config.inverted_index.clone(); + let fulltext_index_config = compaction_region.engine_config.fulltext_index.clone(); + let bloom_filter_index_config = compaction_region.engine_config.bloom_filter_index.clone(); + + let input_file_names = output + .inputs + .iter() + .map(|f| f.file_id().to_string()) + .join(","); + let max_sequence = output + .inputs + .iter() + .map(|f| f.meta_ref().sequence) + .max() + .flatten(); + let builder = CompactionSstReaderBuilder { + metadata: compaction_region.region_metadata.clone(), + sst_layer: compaction_region.access_layer.clone(), + cache: compaction_region.cache_manager.clone(), + inputs: &output.inputs, + append_mode, + filter_deleted: output.filter_deleted, + time_range: output.output_time_range, + merge_mode, + }; + let source = if flat_format { + let reader = builder.build_flat_sst_reader().await?; + Either::Right(FlatSource::Stream(reader)) + } else { + let reader = builder.build_sst_reader().await?; + Either::Left(Source::Reader(reader)) + }; + let mut metrics = Metrics::new(WriteType::Compaction); + let region_metadata = compaction_region.region_metadata.clone(); + let sst_infos = compaction_region + .access_layer + .write_sst( + SstWriteRequest { + op_type: OperationType::Compact, + metadata: region_metadata.clone(), + source, + cache_manager: compaction_region.cache_manager.clone(), + storage, + max_sequence: max_sequence.map(NonZero::get), + index_options, + index_config, + inverted_index_config, + fulltext_index_config, + bloom_filter_index_config, + }, + &write_opts, + &mut metrics, + ) + .await?; + // Convert partition expression once outside the map + let partition_expr = match ®ion_metadata.partition_expr { + None => None, + Some(json_str) if json_str.is_empty() => None, + Some(json_str) => PartitionExpr::from_json_str(json_str).with_context(|_| { + InvalidPartitionExprSnafu { + expr: json_str.clone(), + } + })?, + }; + + let output_files = sst_infos + .into_iter() + .map(|sst_info| FileMeta { + region_id, + file_id: sst_info.file_id, + time_range: sst_info.time_range, + level: output.output_level, + file_size: sst_info.file_size, + available_indexes: sst_info.index_metadata.build_available_indexes(), + indexes: sst_info.index_metadata.build_indexes(), + index_file_size: sst_info.index_metadata.file_size, + index_file_id: None, + num_rows: sst_info.num_rows as u64, + num_row_groups: sst_info.num_row_groups, + sequence: max_sequence, + partition_expr: partition_expr.clone(), + num_series: sst_info.num_series, + }) + .collect::>(); + let output_file_names = output_files.iter().map(|f| f.file_id.to_string()).join(","); + info!( + "Region {} compaction inputs: [{}], outputs: [{}], flat_format: {}, metrics: {:?}", + 
region_id, input_file_names, output_file_names, flat_format, metrics + ); + metrics.observe(); + Ok(output_files) + } +} + #[async_trait::async_trait] impl Compactor for DefaultCompactor { async fn merge_ssts( @@ -322,128 +428,22 @@ impl Compactor for DefaultCompactor { let mut compacted_inputs = Vec::with_capacity(picker_output.outputs.iter().map(|o| o.inputs.len()).sum()); let internal_parallelism = compaction_region.max_parallelism.max(1); + let compaction_time_window = picker_output.time_window_size; for output in picker_output.outputs.drain(..) { - compacted_inputs.extend(output.inputs.iter().map(|f| f.meta_ref().clone())); + let inputs_to_remove: Vec<_> = + output.inputs.iter().map(|f| f.meta_ref().clone()).collect(); + compacted_inputs.extend(inputs_to_remove.iter().cloned()); let write_opts = WriteOptions { write_buffer_size: compaction_region.engine_config.sst_write_buffer_size, max_file_size: picker_output.max_file_size, ..Default::default() }; - - let region_metadata = compaction_region.region_metadata.clone(); - let sst_layer = compaction_region.access_layer.clone(); - let region_id = compaction_region.region_id; - let cache_manager = compaction_region.cache_manager.clone(); - let storage = compaction_region.region_options.storage.clone(); - let index_options = compaction_region - .current_version - .options - .index_options - .clone(); - let append_mode = compaction_region.current_version.options.append_mode; - let merge_mode = compaction_region.current_version.options.merge_mode(); - let flat_format = compaction_region - .region_options - .sst_format - .map(|format| format == FormatType::Flat) - .unwrap_or( - compaction_region - .engine_config - .default_experimental_flat_format, - ); - let index_config = compaction_region.engine_config.index.clone(); - let inverted_index_config = compaction_region.engine_config.inverted_index.clone(); - let fulltext_index_config = compaction_region.engine_config.fulltext_index.clone(); - let bloom_filter_index_config = - compaction_region.engine_config.bloom_filter_index.clone(); - let max_sequence = output - .inputs - .iter() - .map(|f| f.meta_ref().sequence) - .max() - .flatten(); - let region_metadata_for_filemeta = region_metadata.clone(); - futs.push(async move { - let input_file_names = output - .inputs - .iter() - .map(|f| f.file_id().to_string()) - .join(","); - let builder = CompactionSstReaderBuilder { - metadata: region_metadata.clone(), - sst_layer: sst_layer.clone(), - cache: cache_manager.clone(), - inputs: &output.inputs, - append_mode, - filter_deleted: output.filter_deleted, - time_range: output.output_time_range, - merge_mode, - }; - let source = if flat_format { - let reader = builder.build_flat_sst_reader().await?; - either::Right(FlatSource::Stream(reader)) - } else { - let reader = builder.build_sst_reader().await?; - either::Left(Source::Reader(reader)) - }; - let (sst_infos, metrics) = sst_layer - .write_sst( - SstWriteRequest { - op_type: OperationType::Compact, - metadata: region_metadata, - source, - cache_manager, - storage, - max_sequence: max_sequence.map(NonZero::get), - index_options, - index_config, - inverted_index_config, - fulltext_index_config, - bloom_filter_index_config, - }, - &write_opts, - WriteType::Compaction, - ) - .await?; - // Convert partition expression once outside the map - let partition_expr = match ®ion_metadata_for_filemeta.partition_expr { - None => None, - Some(json_str) if json_str.is_empty() => None, - Some(json_str) => { - PartitionExpr::from_json_str(json_str).with_context(|_| { - 
InvalidPartitionExprSnafu { - expr: json_str.clone(), - } - })? - } - }; - - let output_files = sst_infos - .into_iter() - .map(|sst_info| FileMeta { - region_id, - file_id: sst_info.file_id, - time_range: sst_info.time_range, - level: output.output_level, - file_size: sst_info.file_size, - available_indexes: sst_info.index_metadata.build_available_indexes(), - index_file_size: sst_info.index_metadata.file_size, - num_rows: sst_info.num_rows as u64, - num_row_groups: sst_info.num_row_groups, - sequence: max_sequence, - partition_expr: partition_expr.clone(), - }) - .collect::>(); - let output_file_names = - output_files.iter().map(|f| f.file_id.to_string()).join(","); - info!( - "Region {} compaction inputs: [{}], outputs: [{}], flat_format: {}, metrics: {:?}", - region_id, input_file_names, output_file_names, flat_format, metrics - ); - metrics.observe(); - Ok(output_files) - }); + futs.push(Self::merge_single_output( + compaction_region.clone(), + output, + write_opts, + )); } let mut output_files = Vec::with_capacity(futs.len()); while !futs.is_empty() { @@ -461,6 +461,8 @@ impl Compactor for DefaultCompactor { output_files.extend(metas.into_iter().flatten()); } + // In case of remote compaction, we still allow the region edit after merge to + // clean expired ssts. let mut inputs: Vec<_> = compacted_inputs.into_iter().collect(); inputs.extend( picker_output @@ -472,7 +474,7 @@ impl Compactor for DefaultCompactor { Ok(MergeOutput { files_to_add: output_files, files_to_remove: inputs, - compaction_time_window: Some(picker_output.time_window_size), + compaction_time_window: Some(compaction_time_window), }) } @@ -499,7 +501,7 @@ impl Compactor for DefaultCompactor { // TODO: We might leak files if we fail to update manifest. We can add a cleanup task to remove them later. compaction_region .manifest_ctx - .update_manifest(RegionLeaderState::Writable, action_list) + .update_manifest(RegionLeaderState::Writable, action_list, false) .await?; Ok(edit) @@ -517,6 +519,7 @@ impl Compactor for DefaultCompactor { &compact_request_options, &compaction_region.region_options.compaction, compaction_region.region_options.append_mode, + None, ) .pick(compaction_region); diff --git a/src/mito2/src/compaction/picker.rs b/src/mito2/src/compaction/picker.rs index 16540e1e02..7c5cccfb8c 100644 --- a/src/mito2/src/compaction/picker.rs +++ b/src/mito2/src/compaction/picker.rs @@ -125,6 +125,7 @@ pub fn new_picker( compact_request_options: &compact_request::Options, compaction_options: &CompactionOptions, append_mode: bool, + max_background_tasks: Option, ) -> Arc { if let compact_request::Options::StrictWindow(window) = compact_request_options { let window = if window.window_seconds == 0 { @@ -140,6 +141,7 @@ pub fn new_picker( time_window_seconds: twcs_opts.time_window_seconds(), max_output_file_size: twcs_opts.max_output_file_size.map(|r| r.as_bytes()), append_mode, + max_background_tasks, }) as Arc<_>, } } diff --git a/src/mito2/src/compaction/run.rs b/src/mito2/src/compaction/run.rs index e691709948..a7e5ca490c 100644 --- a/src/mito2/src/compaction/run.rs +++ b/src/mito2/src/compaction/run.rs @@ -163,6 +163,10 @@ impl FileGroup { self.files.push(file); } + pub(crate) fn num_files(&self) -> usize { + self.files.len() + } + #[cfg(test)] pub(crate) fn files(&self) -> &[FileHandle] { &self.files[..] 
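// [Editor's illustrative sketch -- not part of the patch.] The num_files() accessor added
// above feeds the DEFAULT_MAX_INPUT_FILE_NUM cap enforced by the twcs.rs hunk later in
// this diff: file groups are sorted by size and taken greedily until the next group would
// push the total past the limit. A minimal standalone version over (size_bytes, num_files)
// pairs (hypothetical types, not the real FileGroup):
fn limit_input_files(mut groups: Vec<(u64, usize)>, max_files: usize) -> Vec<(u64, usize)> {
    groups.sort_unstable_by_key(|&(size, _)| size);
    let mut picked = 0usize;
    groups
        .into_iter()
        .take_while(|&(_, n)| {
            if picked + n <= max_files {
                picked += n;
                true
            } else {
                false
            }
        })
        .collect()
}
// limit_input_files(vec![(10, 20), (5, 20), (1, 10)], 32) keeps the two smallest groups
// (10 + 20 = 30 files) and drops the rest.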
@@ -175,10 +179,6 @@ impl FileGroup { pub(crate) fn into_files(self) -> impl Iterator { self.files.into_iter() } - - pub(crate) fn is_all_level_0(&self) -> bool { - self.files.iter().all(|f| f.level() == 0) - } } impl Ranged for FileGroup { diff --git a/src/mito2/src/compaction/task.rs b/src/mito2/src/compaction/task.rs index e193665e7a..c952f4ba97 100644 --- a/src/mito2/src/compaction/task.rs +++ b/src/mito2/src/compaction/task.rs @@ -16,19 +16,22 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use std::time::Instant; -use common_telemetry::{error, info}; +use common_telemetry::{error, info, warn}; +use itertools::Itertools; use snafu::ResultExt; use tokio::sync::mpsc; use crate::compaction::compactor::{CompactionRegion, Compactor}; use crate::compaction::picker::{CompactionTask, PickerOutput}; use crate::error::CompactRegionSnafu; -use crate::manifest::action::RegionEdit; +use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED}; +use crate::region::RegionRoleState; use crate::request::{ - BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, WorkerRequest, - WorkerRequestWithTime, + BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, RegionEditResult, + WorkerRequest, WorkerRequestWithTime, }; +use crate::sst::file::FileMeta; use crate::worker::WorkerListener; use crate::{error, metrics}; @@ -78,9 +81,103 @@ impl CompactionTaskImpl { .for_each(|o| o.inputs.iter().for_each(|f| f.set_compacting(compacting))); } - async fn handle_compaction(&mut self) -> error::Result { + /// Remove expired ssts files, update manifest immediately + /// and apply the edit to region version. + /// + /// This function logs errors but does not stop the compaction process if removal fails. + async fn remove_expired( + &self, + compaction_region: &CompactionRegion, + expired_files: Vec, + ) { + let region_id = compaction_region.region_id; + let expired_files_str = expired_files.iter().map(|f| f.file_id).join(","); + let (expire_delete_sender, expire_delete_listener) = tokio::sync::oneshot::channel(); + // Update manifest to remove expired SSTs + let edit = RegionEdit { + files_to_add: Vec::new(), + files_to_remove: expired_files, + timestamp_ms: Some(chrono::Utc::now().timestamp_millis()), + compaction_time_window: None, + flushed_entry_id: None, + flushed_sequence: None, + committed_sequence: None, + }; + + // 1. Update manifest + let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())); + let RegionRoleState::Leader(current_region_state) = + compaction_region.manifest_ctx.current_state() + else { + warn!( + "Region {} not in leader state, skip removing expired files", + region_id + ); + return; + }; + if let Err(e) = compaction_region + .manifest_ctx + .update_manifest(current_region_state, action_list, false) + .await + { + warn!( + e; + "Failed to update manifest for expired files removal, region: {region_id}, files: [{expired_files_str}]. Compaction will continue." + ); + return; + } + + // 2. Notify region worker loop to remove expired files from region version. 
+ self.send_to_worker(WorkerRequest::Background { + region_id, + notify: BackgroundNotify::RegionEdit(RegionEditResult { + region_id, + sender: expire_delete_sender, + edit, + result: Ok(()), + update_region_state: false, + }), + }) + .await; + + if let Err(e) = expire_delete_listener + .await + .context(error::RecvSnafu) + .flatten() + { + warn!( + e; + "Failed to remove expired files from region version, region: {region_id}, files: [{expired_files_str}]. Compaction will continue." + ); + return; + } + + info!( + "Successfully removed expired files, region: {region_id}, files: [{expired_files_str}]" + ); + } + + async fn handle_expiration_and_compaction(&mut self) -> error::Result { self.mark_files_compacting(true); + // 1. In case of local compaction, we can delete expired ssts in advance. + if !self.picker_output.expired_ssts.is_empty() { + let remove_timer = COMPACTION_STAGE_ELAPSED + .with_label_values(&["remove_expired"]) + .start_timer(); + let expired_ssts = self + .picker_output + .expired_ssts + .drain(..) + .map(|f| f.meta_ref().clone()) + .collect(); + // remove_expired logs errors but doesn't stop compaction + self.remove_expired(&self.compaction_region, expired_ssts) + .await; + remove_timer.observe_duration(); + } + + // 2. Merge inputs let merge_timer = COMPACTION_STAGE_ELAPSED .with_label_values(&["merge"]) .start_timer(); @@ -152,7 +249,7 @@ impl CompactionTaskImpl { #[async_trait::async_trait] impl CompactionTask for CompactionTaskImpl { async fn run(&mut self) { - let notify = match self.handle_compaction().await { + let notify = match self.handle_expiration_and_compaction().await { Ok(edit) => BackgroundNotify::CompactionFinished(CompactionFinished { region_id: self.compaction_region.region_id, senders: std::mem::take(&mut self.waiters), @@ -178,3 +275,66 @@ impl CompactionTask for CompactionTaskImpl { .await; } } + +#[cfg(test)] +mod tests { + use store_api::storage::FileId; + + use crate::compaction::picker::PickerOutput; + use crate::compaction::test_util::new_file_handle; + + #[test] + fn test_picker_output_with_expired_ssts() { + // Test that PickerOutput correctly includes expired_ssts + // This verifies that expired SSTs are properly identified and included + // in the picker output, which is then handled by handle_expiration_and_compaction + + let file_ids = (0..3).map(|_| FileId::random()).collect::>(); + let expired_ssts = vec![ + new_file_handle(file_ids[0], 0, 999, 0), + new_file_handle(file_ids[1], 1000, 1999, 0), + ]; + + let picker_output = PickerOutput { + outputs: vec![], + expired_ssts: expired_ssts.clone(), + time_window_size: 3600, + max_file_size: None, + }; + + // Verify expired_ssts are included + assert_eq!(picker_output.expired_ssts.len(), 2); + assert_eq!( + picker_output.expired_ssts[0].file_id(), + expired_ssts[0].file_id() + ); + assert_eq!( + picker_output.expired_ssts[1].file_id(), + expired_ssts[1].file_id() + ); + } + + #[test] + fn test_picker_output_without_expired_ssts() { + // Test that PickerOutput works correctly when there are no expired SSTs + let picker_output = PickerOutput { + outputs: vec![], + expired_ssts: vec![], + time_window_size: 3600, + max_file_size: None, + }; + + // Verify empty expired_ssts + assert!(picker_output.expired_ssts.is_empty()); + } + + // Note: Testing remove_expired() directly requires extensive mocking of: + // - manifest_ctx (ManifestContext) + // - request_sender (mpsc::Sender) + // - WorkerRequest handling + // + // The behavior is tested indirectly through integration tests: + // - 
remove_expired() logs errors but doesn't stop compaction + // - handle_expiration_and_compaction() continues even if remove_expired() encounters errors + // - The function is designed to be non-blocking for compaction +} diff --git a/src/mito2/src/compaction/test_util.rs b/src/mito2/src/compaction/test_util.rs index b785d36bcb..90960b9841 100644 --- a/src/mito2/src/compaction/test_util.rs +++ b/src/mito2/src/compaction/test_util.rs @@ -75,9 +75,12 @@ pub fn new_file_handle_with_size_and_sequence( level, file_size, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: NonZeroU64::new(sequence), partition_expr: None, }, diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index 5196eff6b1..9012457f75 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -18,7 +18,7 @@ use std::fmt::Debug; use std::num::NonZeroU64; use common_base::readable_size::ReadableSize; -use common_telemetry::info; +use common_telemetry::{debug, info}; use common_time::Timestamp; use common_time::timestamp::TimeUnit; use common_time::timestamp_millis::BucketAligned; @@ -36,6 +36,9 @@ use crate::sst::version::LevelMeta; const LEVEL_COMPACTED: Level = 1; +/// Default value for max compaction input file num. +const DEFAULT_MAX_INPUT_FILE_NUM: usize = 32; + /// `TwcsPicker` picks files of which the max timestamp are in the same time window as compaction /// candidates. #[derive(Debug)] @@ -48,6 +51,8 @@ pub struct TwcsPicker { pub max_output_file_size: Option, /// Whether the target region is in append mode. pub append_mode: bool, + /// Max background compaction tasks. + pub max_background_tasks: Option, } impl TwcsPicker { @@ -71,7 +76,7 @@ impl TwcsPicker { { let (kept_files, ignored_files) = files_to_merge .into_iter() - .partition(|fg| fg.size() <= max_size as usize && fg.is_all_level_0()); + .partition(|fg| fg.size() <= max_size as usize); files_to_merge = kept_files; info!( "Skipped {} large files in append mode for region {}, window {}, max_size: {}", @@ -88,10 +93,10 @@ impl TwcsPicker { // because after compaction there will be no overlapping files. let filter_deleted = !files.overlapping && found_runs <= 2 && !self.append_mode; if found_runs == 0 { - return output; + continue; } - let inputs = if found_runs > 1 { + let mut inputs = if found_runs > 1 { reduce_runs(sorted_runs) } else { let run = sorted_runs.last().unwrap(); @@ -102,7 +107,32 @@ impl TwcsPicker { merge_seq_files(run.items(), self.max_output_file_size) }; - if !inputs.is_empty() { + // Limits the number of input files. + let total_input_files: usize = inputs.iter().map(|fg| fg.num_files()).sum(); + if total_input_files > DEFAULT_MAX_INPUT_FILE_NUM { + // Sorts file groups by size first. + inputs.sort_unstable_by_key(|fg| fg.size()); + let mut num_picked_files = 0; + inputs = inputs + .into_iter() + .take_while(|fg| { + let current_group_file_num = fg.num_files(); + if current_group_file_num + num_picked_files <= DEFAULT_MAX_INPUT_FILE_NUM { + num_picked_files += current_group_file_num; + true + } else { + false + } + }) + .collect::>(); + info!( + "Compaction for region {} enforces max input file num limit: {}, current total: {}, input: {:?}", + region_id, DEFAULT_MAX_INPUT_FILE_NUM, total_input_files, inputs + ); + } + + if inputs.len() > 1 { + // If we have more than one group to compact. 
log_pick_result( region_id, *window, @@ -119,6 +149,16 @@ impl TwcsPicker { filter_deleted, output_time_range: None, // we do not enforce output time range in twcs compactions. }); + + if let Some(max_background_tasks) = self.max_background_tasks + && output.len() >= max_background_tasks + { + debug!( + "Region ({:?}) compaction task size larger than max background tasks({}), remaining tasks discarded", + region_id, max_background_tasks + ); + break; + } } } output @@ -680,6 +720,7 @@ mod tests { time_window_seconds: None, max_output_file_size: None, append_mode: false, + max_background_tasks: None, } .build_output(RegionId::from_u64(0), &mut windows, active_window); @@ -831,5 +872,265 @@ mod tests { } } + #[test] + fn test_build_output_multiple_windows_with_zero_runs() { + let file_ids = (0..6).map(|_| FileId::random()).collect::>(); + + let files = [ + // Window 0: Contains 3 files but not forming any runs (not enough files in sequence to reach trigger_file_num) + new_file_handle_with_sequence(file_ids[0], 0, 999, 0, 1), + new_file_handle_with_sequence(file_ids[1], 0, 999, 0, 2), + new_file_handle_with_sequence(file_ids[2], 0, 999, 0, 3), + // Window 3: Contains files that will form 2 runs + new_file_handle_with_sequence(file_ids[3], 3000, 3999, 0, 4), + new_file_handle_with_sequence(file_ids[4], 3000, 3999, 0, 5), + new_file_handle_with_sequence(file_ids[5], 3000, 3999, 0, 6), + ]; + + let mut windows = assign_to_windows(files.iter(), 3); + + // Create picker with trigger_file_num of 4 so single files won't form runs in first window + let picker = TwcsPicker { + trigger_file_num: 4, // High enough to prevent runs in first window + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: None, + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + assert!( + !output.is_empty(), + "Should have output from windows with runs, even when one window has 0 runs" + ); + + let all_output_files: Vec<_> = output + .iter() + .flat_map(|o| o.inputs.iter()) + .map(|f| f.file_id().file_id()) + .collect(); + + assert!( + all_output_files.contains(&file_ids[3]) + || all_output_files.contains(&file_ids[4]) + || all_output_files.contains(&file_ids[5]), + "Output should contain files from the window with runs" + ); + } + + #[test] + fn test_build_output_single_window_zero_runs() { + let file_ids = (0..2).map(|_| FileId::random()).collect::>(); + + let large_file_1 = new_file_handle_with_size_and_sequence(file_ids[0], 0, 999, 0, 1, 2000); // 2000 bytes + let large_file_2 = new_file_handle_with_size_and_sequence(file_ids[1], 0, 999, 0, 2, 2500); // 2500 bytes + + let files = [large_file_1, large_file_2]; + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 2, + time_window_seconds: Some(3), + max_output_file_size: Some(1000), + append_mode: true, + max_background_tasks: None, + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(456), &mut windows, active_window); + + // Should return empty output (no compaction needed) + assert!( + output.is_empty(), + "Should return empty output when no runs are found after filtering" + ); + } + + #[test] + fn test_max_background_tasks_truncation() { + let file_ids = (0..10).map(|_| FileId::random()).collect::>(); + let max_background_tasks = 3; + + // Create files across multiple 
windows that will generate multiple compaction outputs + let files = [ + // Window 0: 4 files that will form a run + new_file_handle_with_sequence(file_ids[0], 0, 999, 0, 1), + new_file_handle_with_sequence(file_ids[1], 0, 999, 0, 2), + new_file_handle_with_sequence(file_ids[2], 0, 999, 0, 3), + new_file_handle_with_sequence(file_ids[3], 0, 999, 0, 4), + // Window 3: 4 files that will form another run + new_file_handle_with_sequence(file_ids[4], 3000, 3999, 0, 5), + new_file_handle_with_sequence(file_ids[5], 3000, 3999, 0, 6), + new_file_handle_with_sequence(file_ids[6], 3000, 3999, 0, 7), + new_file_handle_with_sequence(file_ids[7], 3000, 3999, 0, 8), + // Window 6: 4 files that will form another run + new_file_handle_with_sequence(file_ids[8], 6000, 6999, 0, 9), + new_file_handle_with_sequence(file_ids[9], 6000, 6999, 0, 10), + ]; + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: Some(max_background_tasks), + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + // Should have at most max_background_tasks outputs + assert!( + output.len() <= max_background_tasks, + "Output should be truncated to max_background_tasks: expected <= {}, got {}", + max_background_tasks, + output.len() + ); + + // Without max_background_tasks, should have more outputs + let picker_no_limit = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: None, + }; + + let mut windows_no_limit = assign_to_windows(files.iter(), 3); + let output_no_limit = picker_no_limit.build_output( + RegionId::from_u64(123), + &mut windows_no_limit, + active_window, + ); + + // Without limit, should have more outputs (if there are enough windows) + if output_no_limit.len() > max_background_tasks { + assert!( + output_no_limit.len() > output.len(), + "Without limit should have more outputs than with limit" + ); + } + } + + #[test] + fn test_max_background_tasks_no_truncation_when_under_limit() { + let file_ids = (0..4).map(|_| FileId::random()).collect::>(); + let max_background_tasks = 10; // Larger than expected outputs + + // Create files in one window that will generate one compaction output + let files = [ + new_file_handle_with_sequence(file_ids[0], 0, 999, 0, 1), + new_file_handle_with_sequence(file_ids[1], 0, 999, 0, 2), + new_file_handle_with_sequence(file_ids[2], 0, 999, 0, 3), + new_file_handle_with_sequence(file_ids[3], 0, 999, 0, 4), + ]; + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: Some(max_background_tasks), + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + // Should have all outputs since we're under the limit + assert!( + output.len() <= max_background_tasks, + "Output should be within limit" + ); + // Should have at least one output + assert!(!output.is_empty(), "Should have at least one output"); + } + + #[test] + fn test_pick_multiple_runs() { + common_telemetry::init_default_ut_logging(); + + let num_files = 8; + let file_ids = (0..num_files).map(|_| 
FileId::random()).collect::>(); + + // Create files with different sequences so they form multiple runs + let files: Vec<_> = file_ids + .iter() + .enumerate() + .map(|(idx, file_id)| { + new_file_handle_with_size_and_sequence( + *file_id, + 0, + 999, + 0, + (idx + 1) as u64, + 1024 * 1024, + ) + }) + .collect(); + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: None, + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + assert_eq!(1, output.len()); + assert_eq!(output[0].inputs.len(), 2); + } + + #[test] + fn test_limit_max_input_files() { + common_telemetry::init_default_ut_logging(); + + let num_files = 50; + let file_ids = (0..num_files).map(|_| FileId::random()).collect::>(); + + // Create files with different sequences so they form 2 runs + let files: Vec<_> = file_ids + .iter() + .enumerate() + .map(|(idx, file_id)| { + new_file_handle_with_size_and_sequence( + *file_id, + (idx / 2 * 10) as i64, + (idx / 2 * 10 + 5) as i64, + 0, + (idx + 1) as u64, + 1024 * 1024, + ) + }) + .collect(); + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: None, + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + assert_eq!(1, output.len()); + assert_eq!(output[0].inputs.len(), 32); + } + // TODO(hl): TTL tester that checks if get_expired_ssts function works as expected. } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index edf0709960..53cc745fe5 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -18,13 +18,16 @@ use std::cmp; use std::path::Path; use std::time::Duration; +use common_base::memory_limit::MemoryLimit; use common_base::readable_size::ReadableSize; use common_stat::{get_total_cpu_cores, get_total_memory_readable}; use common_telemetry::warn; use serde::{Deserialize, Serialize}; use serde_with::serde_as; +use crate::cache::file_cache::DEFAULT_INDEX_CACHE_PERCENT; use crate::error::Result; +use crate::gc::GcConfig; use crate::memtable::MemtableConfig; use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; @@ -117,6 +120,12 @@ pub struct MitoConfig { /// TTL for write cache. #[serde(with = "humantime_serde")] pub write_cache_ttl: Option, + /// Preload index (puffin) files into cache on region open (default: true). + pub preload_index_cache: bool, + /// Percentage of write cache capacity allocated for index (puffin) files (default: 20). + /// The remaining capacity is used for data (parquet) files. + /// Must be between 0 and 100 (exclusive). + pub index_cache_percent: u8, // Other configs: /// Buffer size for SST writing. @@ -127,6 +136,9 @@ pub struct MitoConfig { pub max_concurrent_scan_files: usize, /// Whether to allow stale entries read during replay. pub allow_stale_entries: bool, + /// Memory limit for table scans across all queries. Setting it to 0 disables the limit. + /// Supports absolute size (e.g., "2GB") or percentage (e.g., "50%"). + pub scan_memory_limit: MemoryLimit, /// Index configs. 
pub index: IndexConfig, @@ -148,6 +160,8 @@ pub struct MitoConfig { /// Whether to enable experimental flat format as the default format. /// When enabled, forces using BulkMemtable and BulkMemtableBuilder. pub default_experimental_flat_format: bool, + + pub gc: GcConfig, } impl Default for MitoConfig { @@ -175,10 +189,13 @@ impl Default for MitoConfig { write_cache_path: String::new(), write_cache_size: ReadableSize::gb(5), write_cache_ttl: None, + preload_index_cache: true, + index_cache_percent: DEFAULT_INDEX_CACHE_PERCENT, sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, allow_stale_entries: false, + scan_memory_limit: MemoryLimit::default(), index: IndexConfig::default(), inverted_index: InvertedIndexConfig::default(), fulltext_index: FulltextIndexConfig::default(), @@ -186,6 +203,7 @@ impl Default for MitoConfig { memtable: MemtableConfig::default(), min_compaction_interval: Duration::from_secs(0), default_experimental_flat_format: false, + gc: GcConfig::default(), }; // Adjust buffer and cache size according to system memory if we can. @@ -262,6 +280,15 @@ impl MitoConfig { self.write_cache_path = data_home.to_string(); } + // Validate index_cache_percent is within valid range (0, 100) + if self.index_cache_percent == 0 || self.index_cache_percent >= 100 { + warn!( + "Invalid index_cache_percent {}, resetting to default {}", + self.index_cache_percent, DEFAULT_INDEX_CACHE_PERCENT + ); + self.index_cache_percent = DEFAULT_INDEX_CACHE_PERCENT; + } + self.index.sanitize(data_home, &self.inverted_index)?; Ok(()) diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index 3fb3a8abd8..76c9a37616 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -21,6 +21,8 @@ mod append_mode_test; #[cfg(test)] mod basic_test; #[cfg(test)] +mod batch_catchup_test; +#[cfg(test)] mod batch_open_test; #[cfg(test)] mod bump_committed_sequence_test; @@ -29,7 +31,7 @@ mod catchup_test; #[cfg(test)] mod close_test; #[cfg(test)] -mod compaction_test; +pub(crate) mod compaction_test; #[cfg(test)] mod create_test; #[cfg(test)] @@ -69,6 +71,9 @@ mod sync_test; #[cfg(test)] mod truncate_test; +#[cfg(test)] +mod remap_manifests_test; + mod puffin_index; use std::any::Any; @@ -81,7 +86,8 @@ use async_trait::async_trait; use common_base::Plugins; use common_error::ext::BoxedError; use common_meta::key::SchemaMetadataManagerRef; -use common_recordbatch::SendableRecordBatchStream; +use common_recordbatch::{MemoryPermit, QueryMemoryTracker, SendableRecordBatchStream}; +use common_stat::get_total_memory_bytes; use common_telemetry::{info, tracing, warn}; use common_wal::options::{WAL_OPTIONS_KEY, WalOptions}; use futures::future::{join_all, try_join_all}; @@ -91,18 +97,21 @@ use snafu::{OptionExt, ResultExt, ensure}; use store_api::ManifestVersion; use store_api::codec::PrimaryKeyEncoding; use store_api::logstore::LogStore; -use store_api::logstore::provider::Provider; +use store_api::logstore::provider::{KafkaProvider, Provider}; use store_api::metadata::{ColumnMetadata, RegionMetadataRef}; use store_api::metric_engine_consts::{ MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY, }; use store_api::region_engine::{ BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, - RegionStatistic, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, + RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, 
SetRegionRoleStateResponse, + SettableRegionRoleState, SyncManifestResponse, +}; +use store_api::region_request::{ + AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest, }; -use store_api::region_request::{AffectedRows, RegionOpenRequest, RegionRequest}; use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry}; -use store_api::storage::{FileId, RegionId, ScanRequest, SequenceNumber}; +use store_api::storage::{FileId, FileRefsManifest, RegionId, ScanRequest, SequenceNumber}; use tokio::sync::{Semaphore, oneshot}; use crate::access_layer::RegionFilePathFactory; @@ -111,13 +120,16 @@ use crate::config::MitoConfig; use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin}; use crate::error::{ InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result, - SerdeJsonSnafu, SerializeColumnMetadataSnafu, + SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu, }; #[cfg(feature = "enterprise")] use crate::extension::BoxedExtensionRangeProviderFactory; +use crate::gc::GcLimiterRef; use crate::manifest::action::RegionEdit; use crate::memtable::MemtableStats; -use crate::metrics::HANDLE_REQUEST_ELAPSED; +use crate::metrics::{ + HANDLE_REQUEST_ELAPSED, SCAN_MEMORY_USAGE_BYTES, SCAN_REQUESTS_REJECTED_TOTAL, +}; use crate::read::scan_region::{ScanRegion, Scanner}; use crate::read::stream::ScanBatchStream; use crate::region::MitoRegionRef; @@ -142,6 +154,7 @@ pub struct MitoEngineBuilder<'a, S: LogStore> { file_ref_manager: FileReferenceManagerRef, partition_expr_fetcher: PartitionExprFetcherRef, plugins: Plugins, + max_concurrent_queries: usize, #[cfg(feature = "enterprise")] extension_range_provider_factory: Option, } @@ -157,6 +170,7 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { file_ref_manager: FileReferenceManagerRef, partition_expr_fetcher: PartitionExprFetcherRef, plugins: Plugins, + max_concurrent_queries: usize, ) -> Self { Self { data_home, @@ -167,6 +181,7 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { file_ref_manager, plugins, partition_expr_fetcher, + max_concurrent_queries, #[cfg(feature = "enterprise")] extension_range_provider_factory: None, } @@ -199,10 +214,22 @@ impl<'a, S: LogStore> MitoEngineBuilder<'a, S> { ) .await?; let wal_raw_entry_reader = Arc::new(LogStoreRawEntryReader::new(self.log_store)); + let total_memory = get_total_memory_bytes().max(0) as u64; + let scan_memory_limit = config.scan_memory_limit.resolve(total_memory) as usize; + let scan_memory_tracker = + QueryMemoryTracker::new(scan_memory_limit, self.max_concurrent_queries) + .with_on_update(|usage| { + SCAN_MEMORY_USAGE_BYTES.set(usage as i64); + }) + .with_on_reject(|| { + SCAN_REQUESTS_REJECTED_TOTAL.inc(); + }); + let inner = EngineInner { workers, config, wal_raw_entry_reader, + scan_memory_tracker, #[cfg(feature = "enterprise")] extension_range_provider_factory: None, }; @@ -245,6 +272,7 @@ impl MitoEngine { file_ref_manager, partition_expr_fetcher, plugins, + 0, // Default: no limit on concurrent queries ); builder.try_build().await } @@ -261,6 +289,38 @@ impl MitoEngine { self.inner.workers.file_ref_manager() } + pub fn gc_limiter(&self) -> GcLimiterRef { + self.inner.workers.gc_limiter() + } + + /// Get all tmp ref files for given region ids, excluding files that's already in manifest. 
+ pub async fn get_snapshot_of_file_refs( + &self, + file_handle_regions: impl IntoIterator, + manifest_regions: HashMap>, + ) -> Result { + let file_ref_mgr = self.file_ref_manager(); + + let file_handle_regions = file_handle_regions.into_iter().collect::>(); + // Convert region IDs to MitoRegionRef objects, ignore regions that do not exist on current datanode + // as regions on other datanodes are not managed by this engine. + let query_regions: Vec = file_handle_regions + .into_iter() + .filter_map(|region_id| self.find_region(region_id)) + .collect(); + + let related_regions: Vec<(MitoRegionRef, Vec)> = manifest_regions + .into_iter() + .filter_map(|(related_region, queries)| { + self.find_region(related_region).map(|r| (r, queries)) + }) + .collect(); + + file_ref_mgr + .get_snapshot_of_file_refs(query_regions, related_regions) + .await + } + /// Returns true if the specific region exists. pub fn is_region_exists(&self, region_id: RegionId) -> bool { self.inner.workers.is_region_exists(region_id) @@ -271,6 +331,11 @@ impl MitoEngine { self.inner.workers.is_region_opening(region_id) } + /// Returns true if the specific region is catching up. + pub fn is_region_catching_up(&self, region_id: RegionId) -> bool { + self.inner.workers.is_region_catching_up(region_id) + } + /// Returns the region disk/memory statistic. pub fn get_region_statistic(&self, region_id: RegionId) -> Option { self.find_region(region_id) @@ -313,7 +378,11 @@ impl MitoEngine { } /// Returns a scanner to scan for `request`. - async fn scanner(&self, region_id: RegionId, request: ScanRequest) -> Result { + pub(crate) async fn scanner( + &self, + region_id: RegionId, + request: ScanRequest, + ) -> Result { self.scan_region(region_id, request)?.scanner().await } @@ -357,7 +426,7 @@ impl MitoEngine { self.find_region(id) } - pub(crate) fn find_region(&self, region_id: RegionId) -> Option { + pub fn find_region(&self, region_id: RegionId) -> Option { self.inner.workers.get_region(region_id) } @@ -472,19 +541,21 @@ impl MitoEngine { return Vec::new(); }; - let file_id = match FileId::parse_str(&entry.file_id) { + let Some(index_file_id) = entry.index_file_id.as_ref() else { + return Vec::new(); + }; + let file_id = match FileId::parse_str(index_file_id) { Ok(file_id) => file_id, Err(err) => { warn!( err; "Failed to parse puffin index file id, table_dir: {}, file_id: {}", entry.table_dir, - entry.file_id + index_file_id ); return Vec::new(); } }; - let region_file_id = RegionFileId::new(entry.region_id, file_id); let context = IndexEntryContext { table_dir: &entry.table_dir, @@ -494,7 +565,7 @@ impl MitoEngine { region_number: entry.region_number, region_group: entry.region_group, region_sequence: entry.region_sequence, - file_id: &entry.file_id, + file_id: index_file_id, index_file_size: entry.index_file_size, node_id, }; @@ -575,6 +646,8 @@ struct EngineInner { config: Arc, /// The Wal raw entry reader. wal_raw_entry_reader: Arc, + /// Memory tracker for table scans. 
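// Illustrative sketch only: a minimal stand-in for the scan memory tracking wired up above
// (`QueryMemoryTracker::new(..).with_on_update(..).with_on_reject(..)`). The real type lives
// in `common_recordbatch`; `TrackerSketch` is a hypothetical simplification that only models
// the byte accounting and the two callbacks that drive the usage/rejection metrics, not the
// concurrent-query limit or permit handling.
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};

struct TrackerSketch {
    limit: usize,
    used: AtomicUsize,
    on_update: Box<dyn Fn(usize) + Send + Sync>,
    on_reject: Box<dyn Fn() + Send + Sync>,
}

impl TrackerSketch {
    fn new(limit: usize) -> Self {
        Self {
            limit,
            used: AtomicUsize::new(0),
            on_update: Box::new(|_| {}),
            on_reject: Box::new(|| {}),
        }
    }

    fn with_on_update(mut self, f: impl Fn(usize) + Send + Sync + 'static) -> Self {
        self.on_update = Box::new(f);
        self
    }

    fn with_on_reject(mut self, f: impl Fn() + Send + Sync + 'static) -> Self {
        self.on_reject = Box::new(f);
        self
    }

    /// Tries to reserve `bytes` for a scan; a limit of 0 disables the check.
    fn try_reserve(&self, bytes: usize) -> bool {
        let new_usage = self.used.fetch_add(bytes, Ordering::Relaxed) + bytes;
        if self.limit != 0 && new_usage > self.limit {
            self.used.fetch_sub(bytes, Ordering::Relaxed);
            (self.on_reject)();
            return false;
        }
        (self.on_update)(new_usage);
        true
    }
}

fn main() {
    let tracker = Arc::new(
        TrackerSketch::new(1024)
            .with_on_update(|usage| println!("scan memory usage: {usage} bytes"))
            .with_on_reject(|| println!("scan rejected: memory limit exceeded")),
    );
    assert!(tracker.try_reserve(512));
    assert!(!tracker.try_reserve(4096));
}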
+ scan_memory_tracker: QueryMemoryTracker, #[cfg(feature = "enterprise")] extension_range_provider_factory: Option, } @@ -742,6 +815,122 @@ impl EngineInner { Ok(responses) } + async fn catchup_topic_regions( + &self, + provider: Provider, + region_requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result)>> { + let now = Instant::now(); + let region_ids = region_requests + .iter() + .map(|(region_id, _)| *region_id) + .collect::>(); + let (distributor, entry_receivers) = build_wal_entry_distributor_and_receivers( + provider.clone(), + self.wal_raw_entry_reader.clone(), + ®ion_ids, + DEFAULT_ENTRY_RECEIVER_BUFFER_SIZE, + ); + + let mut responses = Vec::with_capacity(region_requests.len()); + for ((region_id, request), entry_receiver) in + region_requests.into_iter().zip(entry_receivers) + { + let (request, receiver) = + WorkerRequest::new_catchup_region_request(region_id, request, Some(entry_receiver)); + self.workers.submit_to_worker(region_id, request).await?; + responses.push(async move { receiver.await.context(RecvSnafu)? }); + } + + // Wait for entries distribution. + let distribution = + common_runtime::spawn_global(async move { distributor.distribute().await }); + // Wait for worker returns. + let responses = join_all(responses).await; + distribution.await.context(JoinSnafu)??; + + let num_failure = responses.iter().filter(|r| r.is_err()).count(); + info!( + "Caught up {} regions for topic '{}', failures: {}, elapsed: {:?}", + region_ids.len() - num_failure, + // Safety: provider is kafka provider. + provider.as_kafka_provider().unwrap(), + num_failure, + now.elapsed(), + ); + + Ok(region_ids.into_iter().zip(responses).collect()) + } + + async fn handle_batch_catchup_requests( + &self, + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result)>> { + let mut responses = Vec::with_capacity(requests.len()); + let mut topic_regions: HashMap, Vec<_>> = HashMap::new(); + let mut remaining_region_requests = vec![]; + + for (region_id, request) in requests { + match self.workers.get_region(region_id) { + Some(region) => match region.provider.as_kafka_provider() { + Some(provider) => { + topic_regions + .entry(provider.clone()) + .or_default() + .push((region_id, request)); + } + None => { + remaining_region_requests.push((region_id, request)); + } + }, + None => responses.push((region_id, RegionNotFoundSnafu { region_id }.fail())), + } + } + + let semaphore = Arc::new(Semaphore::new(parallelism)); + + if !topic_regions.is_empty() { + let mut tasks = Vec::with_capacity(topic_regions.len()); + for (provider, region_requests) in topic_regions { + let semaphore_moved = semaphore.clone(); + tasks.push(async move { + // Safety: semaphore must exist + let _permit = semaphore_moved.acquire().await.unwrap(); + self.catchup_topic_regions(Provider::Kafka(provider), region_requests) + .await + }) + } + + let r = try_join_all(tasks).await?; + responses.extend(r.into_iter().flatten()); + } + + if !remaining_region_requests.is_empty() { + let mut tasks = Vec::with_capacity(remaining_region_requests.len()); + let mut region_ids = Vec::with_capacity(remaining_region_requests.len()); + for (region_id, request) in remaining_region_requests { + let semaphore_moved = semaphore.clone(); + region_ids.push(region_id); + tasks.push(async move { + // Safety: semaphore must exist + let _permit = semaphore_moved.acquire().await.unwrap(); + let (request, receiver) = + WorkerRequest::new_catchup_region_request(region_id, request, None); + + self.workers.submit_to_worker(region_id, 
request).await?; + + receiver.await.context(RecvSnafu)? + }) + } + + let results = join_all(tasks).await; + responses.extend(region_ids.into_iter().zip(results)); + } + + Ok(responses) + } + /// Handles [RegionRequest] and return its executed result. async fn handle_request( &self, @@ -783,8 +972,7 @@ impl EngineInner { .with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled()) .with_ignore_fulltext_index(self.config.fulltext_index.apply_on_query.disabled()) .with_ignore_bloom_filter(self.config.bloom_filter_index.apply_on_query.disabled()) - .with_start_time(query_start) - .with_flat_format(self.config.default_experimental_flat_format); + .with_start_time(query_start); #[cfg(feature = "enterprise")] let scan_region = self.maybe_fill_extension_range_provider(scan_region, region); @@ -843,6 +1031,28 @@ impl EngineInner { receiver.await.context(RecvSnafu)? } + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result { + let region_id = request.region_id; + let (request, receiver) = WorkerRequest::try_from_remap_manifests_request(request)?; + self.workers.submit_to_worker(region_id, request).await?; + let manifests = receiver.await.context(RecvSnafu)??; + + let new_manifests = manifests + .into_iter() + .map(|(region_id, manifest)| { + Ok(( + region_id, + serde_json::to_string(&manifest) + .context(SerializeManifestSnafu { region_id })?, + )) + }) + .collect::>>()?; + Ok(RemapManifestsResponse { new_manifests }) + } + fn role(&self, region_id: RegionId) -> Option { self.workers.get_region(region_id).map(|region| { if region.is_follower() { @@ -884,6 +1094,29 @@ impl RegionEngine for MitoEngine { .map_err(BoxedError::new) } + #[tracing::instrument(skip_all)] + async fn handle_batch_catchup_requests( + &self, + parallelism: usize, + requests: Vec<(RegionId, RegionCatchupRequest)>, + ) -> Result { + self.inner + .handle_batch_catchup_requests(parallelism, requests) + .await + .map(|responses| { + responses + .into_iter() + .map(|(region_id, response)| { + ( + region_id, + response.map(RegionResponse::new).map_err(BoxedError::new), + ) + }) + .collect::>() + }) + .map_err(BoxedError::new) + } + #[tracing::instrument(skip_all)] async fn handle_request( &self, @@ -927,6 +1160,10 @@ impl RegionEngine for MitoEngine { .map_err(BoxedError::new) } + fn register_query_memory_permit(&self) -> Option> { + Some(Arc::new(self.inner.scan_memory_tracker.register_permit())) + } + async fn get_committed_sequence( &self, region_id: RegionId, @@ -992,6 +1229,16 @@ impl RegionEngine for MitoEngine { Ok(SyncManifestResponse::Mito { synced }) } + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result { + self.inner + .remap_manifests(request) + .await + .map_err(BoxedError::new) + } + fn role(&self, region_id: RegionId) -> Option { self.inner.role(region_id) } @@ -1071,6 +1318,15 @@ impl MitoEngine { let config = Arc::new(config); let wal_raw_entry_reader = Arc::new(LogStoreRawEntryReader::new(log_store.clone())); + let total_memory = get_total_memory_bytes().max(0) as u64; + let scan_memory_limit = config.scan_memory_limit.resolve(total_memory) as usize; + let scan_memory_tracker = QueryMemoryTracker::new(scan_memory_limit, 0) + .with_on_update(|usage| { + SCAN_MEMORY_USAGE_BYTES.set(usage as i64); + }) + .with_on_reject(|| { + SCAN_REQUESTS_REJECTED_TOTAL.inc(); + }); Ok(MitoEngine { inner: Arc::new(EngineInner { workers: WorkerGroup::start_for_test( @@ -1087,6 +1343,7 @@ impl MitoEngine { .await?, config, wal_raw_entry_reader, + 
scan_memory_tracker, #[cfg(feature = "enterprise")] extension_range_provider_factory: None, }), diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index 63e7c029ae..2aa26ba204 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -38,6 +38,7 @@ use crate::config::MitoConfig; use crate::engine::MitoEngine; use crate::engine::listener::{AlterFlushListener, NotifyRegionChangeResultListener}; use crate::error; +use crate::sst::FormatType; use crate::test_util::{ CreateRequestBuilder, TestEnv, build_rows, build_rows_for_key, flush_region, put_rows, rows_schema, @@ -198,7 +199,7 @@ async fn test_alter_region_with_format(flat_format: bool) { assert_eq!(manifests.len(), 1); let (return_region_id, manifest) = manifests.remove(0); assert_eq!(return_region_id, region_id); - assert_eq!(manifest, RegionManifestInfo::mito(2, 1)); + assert_eq!(manifest, RegionManifestInfo::mito(2, 1, 0)); let column_metadatas = parse_column_metadatas(&response.extensions, TABLE_COLUMN_METADATA_EXTENSION_KEY).unwrap(); assert_column_metadatas( @@ -900,15 +901,15 @@ async fn test_alter_region_ttl_options_with_format(flat_format: bool) { check_ttl(&engine, &Duration::from_secs(500)); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_write_stall_on_altering() { + common_telemetry::init_default_ut_logging(); + test_write_stall_on_altering_with_format(false).await; test_write_stall_on_altering_with_format(true).await; } async fn test_write_stall_on_altering_with_format(flat_format: bool) { - common_telemetry::init_default_ut_logging(); - let mut env = TestEnv::new().await; let listener = Arc::new(NotifyRegionChangeResultListener::default()); let engine = env @@ -951,6 +952,8 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { .await .unwrap(); }); + // Make sure the loop is handling the alter request. + tokio::time::sleep(Duration::from_millis(100)).await; let column_schemas_cloned = column_schemas.clone(); let engine_cloned = engine.clone(); @@ -961,6 +964,8 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { }; put_rows(&engine_cloned, region_id, rows).await; }); + // Make sure the loop is handling the put request. 
+ tokio::time::sleep(Duration::from_millis(100)).await; listener.wake_notify(); alter_job.await.unwrap(); @@ -980,3 +985,247 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { let batches = RecordBatches::try_collect(stream).await.unwrap(); assert_eq!(expected, batches.pretty_print().unwrap()); } + +#[tokio::test] +async fn test_alter_region_sst_format_with_flush() { + common_telemetry::init_default_ut_logging(); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: false, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + let table_dir = request.table_dir.clone(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // Inserts some data before alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + // Flushes to create SST files with primary_key format + flush_region(&engine, region_id, None).await; + + let expected_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_data, batches.pretty_print().unwrap()); + + // Alters sst_format from primary_key to flat + let alter_format_request = RegionAlterRequest { + kind: AlterKind::SetRegionOptions { + options: vec![SetRegionOption::Format("flat".to_string())], + }, + }; + engine + .handle_request(region_id, RegionRequest::Alter(alter_format_request)) + .await + .unwrap(); + + // Inserts more data after alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(3, 6), + }; + put_rows(&engine, region_id, rows).await; + + // Flushes to create SST files with flat format + flush_region(&engine, region_id, None).await; + + let expected_all_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | +| 5 | 5.0 | 1970-01-01T00:00:05 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); + + // Reopens region to verify format persists + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_experimental_flat_format: false, + ..Default::default() + }, + ) + .await; + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir, + path_type: PathType::Bare, + options: HashMap::default(), + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + 
.unwrap(); + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); +} + +#[tokio::test] +async fn test_alter_region_sst_format_without_flush() { + common_telemetry::init_default_ut_logging(); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: false, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + let table_dir = request.table_dir.clone(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let check_format = |engine: &MitoEngine, expected: Option| { + let current_format = engine + .get_region(region_id) + .unwrap() + .version() + .options + .sst_format; + assert_eq!(current_format, expected); + }; + check_format(&engine, Some(FormatType::PrimaryKey)); + + // Inserts some data before alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + // Alters sst_format from primary_key to flat + let alter_format_request = RegionAlterRequest { + kind: AlterKind::SetRegionOptions { + options: vec![SetRegionOption::Format("flat".to_string())], + }, + }; + engine + .handle_request(region_id, RegionRequest::Alter(alter_format_request)) + .await + .unwrap(); + + check_format(&engine, Some(FormatType::Flat)); + + // Inserts more data after alter + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(3, 6), + }; + put_rows(&engine, region_id, rows).await; + + let expected_all_data = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | +| 3 | 3.0 | 1970-01-01T00:00:03 | +| 4 | 4.0 | 1970-01-01T00:00:04 | +| 5 | 5.0 | 1970-01-01T00:00:05 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); + + // Reopens region to verify format persists + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_experimental_flat_format: false, + ..Default::default() + }, + ) + .await; + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir, + path_type: PathType::Bare, + options: HashMap::default(), + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + + check_format(&engine, Some(FormatType::Flat)); + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected_all_data, batches.pretty_print().unwrap()); +} diff --git a/src/mito2/src/engine/append_mode_test.rs b/src/mito2/src/engine/append_mode_test.rs index ccdcbb3372..85d4f24fe3 100644 --- 
a/src/mito2/src/engine/append_mode_test.rs +++ b/src/mito2/src/engine/append_mode_test.rs @@ -220,3 +220,126 @@ async fn test_append_mode_compaction_with_format(flat_format: bool) { let batches = RecordBatches::try_collect(stream).await.unwrap(); assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); } + +#[tokio::test] +async fn test_put_single_range() { + test_put_single_range_with_format(false).await; + test_put_single_range_with_format(true).await; +} + +async fn test_put_single_range_with_format(flat_format: bool) { + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + let region_id = RegionId::new(1, 1); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new() + .insert_option("compaction.type", "twcs") + .insert_option("append_mode", "true") + .build(); + let table_dir = request.table_dir.clone(); + let region_opts = request.options.clone(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // a, field 1, 2 + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows_for_key("a", 1, 3, 1), + }; + put_rows(&engine, region_id, rows).await; + // a, field 0, 1 + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows_for_key("a", 0, 2, 0), + }; + put_rows(&engine, region_id, rows).await; + // b, field 0, 1 + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows_for_key("b", 0, 2, 0), + }; + put_rows(&engine, region_id, rows).await; + // a, field 2, 3 + let rows = Rows { + schema: column_schemas, + rows: build_rows_for_key("a", 2, 4, 2), + }; + put_rows(&engine, region_id, rows).await; + + let expected = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| a | 0.0 | 1970-01-01T00:00:00 | +| a | 1.0 | 1970-01-01T00:00:01 | +| a | 1.0 | 1970-01-01T00:00:01 | +| a | 2.0 | 1970-01-01T00:00:02 | +| a | 2.0 | 1970-01-01T00:00:02 | +| a | 3.0 | 1970-01-01T00:00:03 | +| b | 0.0 | 1970-01-01T00:00:00 | +| b | 1.0 | 1970-01-01T00:00:01 | ++-------+---------+---------------------+"; + // Scans in parallel. + let mut scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_eq!(0, scanner.num_files()); + assert_eq!(1, scanner.num_memtables()); + scanner.set_target_partitions(2); + let stream = scanner.scan().await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); + + // Flushes and scans. + flush_region(&engine, region_id, None).await; + let mut scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_eq!(1, scanner.num_files()); + assert_eq!(0, scanner.num_memtables()); + scanner.set_target_partitions(2); + let stream = scanner.scan().await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); + + // Reopens engine. + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }, + ) + .await; + // Reopens the region. 
+ reopen_region(&engine, region_id, table_dir, false, region_opts).await; + let stream = engine + .scan_to_stream(region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, sort_batches_and_print(&batches, &["tag_0", "ts"])); +} diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index 39f2366659..f17726abef 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -675,6 +675,7 @@ async fn test_region_usage_with_format(flat_format: bool) { let region_stat = region.region_statistic(); assert!(region_stat.wal_size > 0); + assert_eq!(region_stat.num_rows, 10); // delete some rows let rows = Rows { @@ -685,6 +686,7 @@ async fn test_region_usage_with_format(flat_format: bool) { let region_stat = region.region_statistic(); assert!(region_stat.wal_size > 0); + assert_eq!(region_stat.num_rows, 13); // flush region flush_region(&engine, region_id, None).await; @@ -859,9 +861,9 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) { #[tokio::test] async fn test_list_ssts() { test_list_ssts_with_format(false, r#" -ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2531, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2531, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2531, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#" +ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2513, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2513, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: 
Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2513, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#" StorageSstEntry { file_path: "test/11_0000000001/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000001/index/.puffin", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000002/.parquet", file_size: None, last_modified_ms: None, node_id: None } @@ -869,9 +871,9 @@ StorageSstEntry { file_path: "test/11_0000000002/index/.puffin", file_s StorageSstEntry { file_path: "test/22_0000000042/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/22_0000000042/index/.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await; test_list_ssts_with_format(true, r#" -ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2855, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2855, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2855, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#" +ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, 
file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#" StorageSstEntry { file_path: "test/11_0000000001/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000001/index/.puffin", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000002/.parquet", file_size: None, last_modified_ms: None, node_id: None } @@ -943,6 +945,7 @@ async fn test_list_ssts_with_format( .index_file_path .map(|p| p.replace(&e.file_id, "")); e.file_id = "".to_string(); + e.index_file_id = e.index_file_id.map(|_| "".to_string()); format!("\n{:?}", e) }) .sorted() diff --git a/src/mito2/src/engine/batch_catchup_test.rs b/src/mito2/src/engine/batch_catchup_test.rs new file mode 100644 index 0000000000..d8c744a733 --- /dev/null +++ b/src/mito2/src/engine/batch_catchup_test.rs @@ -0,0 +1,250 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use api::v1::Rows; +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_recordbatch::RecordBatches; +use common_wal::options::{KafkaWalOptions, WAL_OPTIONS_KEY, WalOptions}; +use rstest::rstest; +use rstest_reuse::apply; +use store_api::region_engine::RegionEngine; +use store_api::region_request::{PathType, RegionCatchupRequest, RegionOpenRequest, RegionRequest}; +use store_api::storage::{RegionId, ScanRequest}; + +use crate::config::MitoConfig; +use crate::engine::MitoEngine; +use crate::test_util::{ + CreateRequestBuilder, LogStoreFactory, TestEnv, build_rows, flush_region, + kafka_log_store_factory, prepare_test_for_kafka_log_store, put_rows, rows_schema, + single_kafka_log_store_factory, +}; + +#[apply(single_kafka_log_store_factory)] +async fn test_batch_catchup(factory: Option) { + test_batch_catchup_with_format(factory.clone(), false).await; + test_batch_catchup_with_format(factory, true).await; +} + +async fn test_batch_catchup_with_format(factory: Option, flat_format: bool) { + common_telemetry::init_default_ut_logging(); + let Some(factory) = factory else { + return; + }; + let mut env = TestEnv::with_prefix("catchup-batch-regions") + .await + .with_log_store_factory(factory.clone()); + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + // Prepares 3 topics for 8 regions + let num_topic = 3; + let mut topics = vec![]; + for _ in 0..num_topic { + let topic = prepare_test_for_kafka_log_store(&factory).await.unwrap(); + topics.push(topic); + } + + let num_regions = 8u32; + let table_dir_fn = |region_id| format!("test/{region_id}"); + let mut region_schema = HashMap::new(); + + let get_topic_idx = |id| (id - 1) % num_topic; + + // Creates 8 regions and puts data into them + for id in 1..=num_regions { + let engine = engine.clone(); + let topic_idx = get_topic_idx(id); + let topic = topics[topic_idx as usize].clone(); + let region_id = RegionId::new(1, id); + let request = CreateRequestBuilder::new() + .table_dir(&table_dir_fn(region_id)) + .kafka_topic(Some(topic)) + .build(); + let column_schemas = rows_schema(&request); + region_schema.insert(region_id, column_schemas); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + } + + // Puts data into regions + let rows = 30; + for i in 0..rows { + for region_number in 1..=num_regions { + let region_id = RegionId::new(1, region_number); + let rows = Rows { + schema: region_schema[®ion_id].clone(), + rows: build_rows( + (region_number as usize) * 120 + i as usize, + (region_number as usize) * 120 + i as usize + 1, + ), + }; + put_rows(&engine, region_id, rows).await; + if i % region_number == 0 { + flush_region(&engine, region_id, None).await; + } + } + } + + let assert_result = |engine: MitoEngine| async move { + for i in 1..=num_regions { + let region_id = RegionId::new(1, i); + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + let mut expected = String::new(); + expected.push_str( + "+-------+---------+---------------------+\n| tag_0 | field_0 | ts |\n+-------+---------+---------------------+\n", + ); + for row in 0..rows { + expected.push_str(&format!( + "| {} | {}.0 | 1970-01-01T00:{:02}:{:02} |\n", + i * 120 + row, + i * 120 + row, + 2 * i, + row + )); + } + 
expected.push_str("+-------+---------+---------------------+"); + assert_eq!(expected, batches.pretty_print().unwrap()); + } + }; + assert_result(engine.clone()).await; + + // Reopen engine. + let engine = env + .reopen_engine( + engine, + MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }, + ) + .await; + + let requests = (1..=num_regions) + .map(|id| { + let region_id = RegionId::new(1, id); + let topic_idx = get_topic_idx(id); + let topic = topics[topic_idx as usize].clone(); + let mut options = HashMap::new(); + options.insert( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&WalOptions::Kafka(KafkaWalOptions { topic })).unwrap(), + ); + ( + region_id, + RegionOpenRequest { + engine: String::new(), + table_dir: table_dir_fn(region_id), + options, + skip_wal_replay: true, + path_type: PathType::Bare, + checkpoint: None, + }, + ) + }) + .collect::>(); + + let parallelism = 2; + let results = engine + .handle_batch_open_requests(parallelism, requests) + .await + .unwrap(); + for (_, result) in results { + assert!(result.is_ok()); + } + + let requests = (1..=num_regions) + .map(|id| { + let region_id = RegionId::new(1, id); + ( + region_id, + RegionCatchupRequest { + set_writable: true, + entry_id: None, + metadata_entry_id: None, + location_id: None, + checkpoint: None, + }, + ) + }) + .collect::>(); + + let results = engine + .handle_batch_catchup_requests(parallelism, requests) + .await + .unwrap(); + for (_, result) in results { + assert!(result.is_ok()); + } + assert_result(engine.clone()).await; +} + +#[apply(single_kafka_log_store_factory)] +async fn test_batch_catchup_err(factory: Option) { + test_batch_catchup_err_with_format(factory.clone(), false).await; + test_batch_catchup_err_with_format(factory, true).await; +} + +async fn test_batch_catchup_err_with_format(factory: Option, flat_format: bool) { + common_telemetry::init_default_ut_logging(); + let Some(factory) = factory else { + return; + }; + let mut env = TestEnv::with_prefix("catchup-regions-err") + .await + .with_log_store_factory(factory.clone()); + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + let num_regions = 3u32; + let requests = (1..num_regions) + .map(|id| { + let region_id = RegionId::new(1, id); + ( + region_id, + RegionCatchupRequest { + set_writable: true, + entry_id: None, + metadata_entry_id: None, + location_id: None, + checkpoint: None, + }, + ) + }) + .collect::>(); + + let results = engine + .handle_batch_catchup_requests(4, requests) + .await + .unwrap(); + for (_, result) in results { + assert_eq!( + result.unwrap_err().status_code(), + StatusCode::RegionNotFound + ); + } +} diff --git a/src/mito2/src/engine/catchup_test.rs b/src/mito2/src/engine/catchup_test.rs index f0ee6e13f5..0c7d058e4d 100644 --- a/src/mito2/src/engine/catchup_test.rs +++ b/src/mito2/src/engine/catchup_test.rs @@ -135,6 +135,7 @@ async fn test_catchup_with_last_entry_id(factory: Option) { let region = follower_engine.get_region(region_id).unwrap(); assert!(!region.is_writable()); assert!(resp.is_ok()); + assert!(!follower_engine.is_region_catching_up(region_id)); // Scans let request = ScanRequest::default(); @@ -256,7 +257,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option().unwrap(); - + assert!(!follower_engine.is_region_catching_up(region_id)); assert_matches!(err, Error::Unexpected { .. }); // It should ignore requests to writable regions. 
@@ -719,3 +720,33 @@ async fn test_catchup_not_exist_with_format(flat_format: bool) { .unwrap_err(); assert_matches!(err.status_code(), StatusCode::RegionNotFound); } + +#[tokio::test] +async fn test_catchup_region_busy() { + common_telemetry::init_default_ut_logging(); + let mut env = TestEnv::new().await; + let engine = env.create_engine(MitoConfig::default()).await; + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + engine + .set_region_role(region_id, RegionRole::Follower) + .unwrap(); + let worker = engine.inner.workers.worker(region_id); + let catchup_regions = worker.catchup_regions(); + catchup_regions.insert_region(region_id); + let err = engine + .handle_request( + region_id, + RegionRequest::Catchup(RegionCatchupRequest { + set_writable: true, + ..Default::default() + }), + ) + .await + .unwrap_err(); + assert_matches!(err.status_code(), StatusCode::RegionBusy); +} diff --git a/src/mito2/src/engine/compaction_test.rs b/src/mito2/src/engine/compaction_test.rs index a385d64fb5..09fc4e2935 100644 --- a/src/mito2/src/engine/compaction_test.rs +++ b/src/mito2/src/engine/compaction_test.rs @@ -19,8 +19,8 @@ use std::time::Duration; use api::v1::{ColumnSchema, Rows}; use common_recordbatch::{RecordBatches, SendableRecordBatchStream}; -use datatypes::prelude::ScalarVector; -use datatypes::vectors::TimestampMillisecondVector; +use datatypes::arrow::array::AsArray; +use datatypes::arrow::datatypes::TimestampMillisecondType; use store_api::region_engine::{RegionEngine, RegionRole}; use store_api::region_request::AlterKind::SetRegionOptions; use store_api::region_request::{ @@ -100,7 +100,7 @@ pub(crate) async fn delete_and_flush( let result = engine .handle_request( region_id, - RegionRequest::Delete(RegionDeleteRequest { rows }), + RegionRequest::Delete(RegionDeleteRequest { rows, hint: None }), ) .await .unwrap(); @@ -125,10 +125,8 @@ async fn collect_stream_ts(stream: SendableRecordBatchStream) -> Vec { let ts_col = batch .column_by_name("ts") .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - res.extend(ts_col.iter_data().map(|t| t.unwrap().0.value())); + .as_primitive::(); + res.extend((0..ts_col.len()).map(|i| ts_col.value(i))); } res } diff --git a/src/mito2/src/engine/create_test.rs b/src/mito2/src/engine/create_test.rs index 7ba7aab225..e5980d9442 100644 --- a/src/mito2/src/engine/create_test.rs +++ b/src/mito2/src/engine/create_test.rs @@ -23,7 +23,7 @@ use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; use crate::region::options::MemtableOptions; use crate::test_util::{ - CreateRequestBuilder, TestEnv, build_rows, put_rows, reopen_region, rows_schema, + CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, reopen_region, rows_schema, }; #[tokio::test] @@ -380,3 +380,77 @@ async fn create_with_partition_expr_persists_manifest_with_format(flat_format: b let manifest = region.manifest_ctx.manifest().await; assert_eq!(manifest.metadata.partition_expr.as_deref(), Some(expr_json)); } + +#[tokio::test] +async fn test_engine_create_with_format() { + common_telemetry::init_default_ut_logging(); + + test_engine_create_with_format_one_case("primary_key", false).await; + test_engine_create_with_format_one_case("primary_key", true).await; + test_engine_create_with_format_one_case("flat", false).await; + test_engine_create_with_format_one_case("flat", true).await; +} + +async fn 
test_engine_create_with_format_one_case(create_format: &str, default_flat_format: bool) { + common_telemetry::info!( + "Test engine create with format, create_format: {}, default_flat_format: {}", + create_format, + default_flat_format + ); + + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: default_flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new() + .insert_option("sst_format", create_format) + .build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + let expected = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, batches.pretty_print().unwrap()); + + flush_region(&engine, region_id, None).await; + + let request = ScanRequest::default(); + let stream = engine.scan_to_stream(region_id, request).await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, batches.pretty_print().unwrap()); +} diff --git a/src/mito2/src/engine/index_build_test.rs b/src/mito2/src/engine/index_build_test.rs index 6fe27929e5..7c5ec19c7f 100644 --- a/src/mito2/src/engine/index_build_test.rs +++ b/src/mito2/src/engine/index_build_test.rs @@ -19,7 +19,9 @@ use std::sync::Arc; use api::v1::Rows; use store_api::region_engine::RegionEngine; -use store_api::region_request::{AlterKind, RegionAlterRequest, RegionRequest, SetIndexOption}; +use store_api::region_request::{ + AlterKind, RegionAlterRequest, RegionBuildIndexRequest, RegionRequest, SetIndexOption, +}; use store_api::storage::{RegionId, ScanRequest}; use crate::config::{IndexBuildMode, MitoConfig, Mode}; @@ -32,11 +34,6 @@ use crate::test_util::{ CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, reopen_region, rows_schema, }; -// wait listener receives enough success count. 
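// Illustrative sketch only: extracting raw i64 timestamps from an Arrow array via
// `AsArray::as_primitive`, the same pattern `collect_stream_ts` switched to above (replacing
// the downcast through `TimestampMillisecondVector`). This assumes the `arrow` crate is
// re-exported as `datatypes::arrow`, as in the imports above.
use std::sync::Arc;

use datatypes::arrow::array::{Array, ArrayRef, AsArray, TimestampMillisecondArray};
use datatypes::arrow::datatypes::TimestampMillisecondType;

fn collect_ts(column: &ArrayRef) -> Vec<i64> {
    // `as_primitive` gives typed access to the underlying millisecond values.
    let ts_col = column.as_primitive::<TimestampMillisecondType>();
    (0..ts_col.len()).map(|i| ts_col.value(i)).collect()
}

fn main() {
    let column: ArrayRef = Arc::new(TimestampMillisecondArray::from(vec![0i64, 1000, 2000]));
    assert_eq!(collect_ts(&column), vec![0, 1000, 2000]);
}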
-async fn wait_finish(listener: &IndexBuildListener, times: usize) { - listener.wait_finish(times).await; -} - fn async_build_mode_config(is_create_on_flush: bool) -> MitoConfig { let mut config = MitoConfig::default(); config.index.build_mode = IndexBuildMode::Async; @@ -76,15 +73,13 @@ async fn num_of_index_files(engine: &MitoEngine, scanner: &Scanner, region_id: R index_files_count } -#[allow(dead_code)] fn assert_listener_counts( listener: &IndexBuildListener, expected_begin_count: usize, - expected_success_count: usize, ) { assert_eq!(listener.begin_count(), expected_begin_count); - assert_eq!(listener.success_count(), expected_success_count); + assert_eq!(listener.finish_count(), expected_success_count); } #[tokio::test] @@ -131,9 +126,9 @@ async fn test_index_build_type_flush() { .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_memtables() == 1); - assert!(scanner.num_files() == 0); - assert!(num_of_index_files(&engine, &scanner, region_id).await == 0); + assert_eq!(scanner.num_memtables(), 1); + assert_eq!(scanner.num_files(), 0); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 0); flush_region(&engine, region_id, None).await; @@ -142,9 +137,9 @@ async fn test_index_build_type_flush() { .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_memtables() == 0); - assert!(scanner.num_files() == 1); - assert!(num_of_index_files(&engine, &scanner, region_id).await == 0); + assert_eq!(scanner.num_memtables(), 0); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 0); let rows = Rows { schema: column_schemas.clone(), @@ -155,12 +150,12 @@ async fn test_index_build_type_flush() { flush_region(&engine, region_id, None).await; // After 2 index build task are finished, 2 index files should exist. - wait_finish(&listener, 2).await; + listener.wait_finish(2).await; let scanner = engine .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(num_of_index_files(&engine, &scanner, region_id).await == 2); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 2); } #[tokio::test] @@ -204,35 +199,39 @@ async fn test_index_build_type_compact() { put_and_flush(&engine, region_id, &column_schemas, 15..25).await; put_and_flush(&engine, region_id, &column_schemas, 40..50).await; + // all index build tasks begin means flush tasks are all finished. + listener.wait_begin(4).await; // Before compaction is triggered, files should be 4, and not all index files are built. let scanner = engine .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_files() == 4); + assert_eq!(scanner.num_files(), 4); assert!(num_of_index_files(&engine, &scanner, region_id).await < 4); // Note: Compaction have been implicitly triggered by the flush operations above. // This explicit compaction call serves to make the process deterministic for the test. compact(&engine, region_id).await; + listener.wait_begin(5).await; // 4 flush + 1 compaction begin // Before compaction is triggered, files should be 2, and not all index files are built. - listener.clear_success_count(); let scanner = engine .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_files() == 2); - assert!(num_of_index_files(&engine, &scanner, region_id).await < 2); + assert_eq!(scanner.num_files(), 2); + // Compaction is an async task, so it may be finished at this moment. 
+ assert!(num_of_index_files(&engine, &scanner, region_id).await <= 2); // Wait a while to make sure index build tasks are finished. - wait_finish(&listener, 2).await; + listener.wait_stop(5).await; // 4 flush + 1 compaction = some abort + some finish let scanner = engine .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_files() == 2); - assert!(num_of_index_files(&engine, &scanner, region_id).await == 2); + assert_eq!(scanner.num_files(), 2); + // Index files should be built. + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 2); } #[tokio::test] @@ -277,8 +276,8 @@ async fn test_index_build_type_schema_change() { .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_files() == 1); - assert!(num_of_index_files(&engine, &scanner, region_id).await == 0); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 0); // Set Index and make sure index file is built without flush or compaction. let set_index_request = RegionAlterRequest { @@ -292,11 +291,179 @@ async fn test_index_build_type_schema_change() { .handle_request(region_id, RegionRequest::Alter(set_index_request)) .await .unwrap(); - wait_finish(&listener, 1).await; + listener.wait_finish(1).await; let scanner = engine .scanner(region_id, ScanRequest::default()) .await .unwrap(); - assert!(scanner.num_files() == 1); - assert!(num_of_index_files(&engine, &scanner, region_id).await == 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); +} + +#[tokio::test] +async fn test_index_build_type_manual_basic() { + let mut env = TestEnv::with_prefix("test_index_build_type_manual_").await; + let listener = Arc::new(IndexBuildListener::default()); + let engine = env + .create_engine_with( + async_build_mode_config(false), // Disable index file creation on flush. + None, + Some(listener.clone()), + None, + ) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + // Create a region with index. + let request = CreateRequestBuilder::new().build_with_index(); + let table_dir = request.table_dir.clone(); + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // Flush and make sure there is no index file (because create_on_flush is disabled). + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + // Index build task is triggered on flush, but not finished. + assert_listener_counts(&listener, 1, 0); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 0); + + // Trigger manual index build task and make sure index file is built without flush or compaction. 
+ let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + listener.wait_finish(1).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_listener_counts(&listener, 2, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + // Test idempotency: Second manual index build request on the same file. + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + // Should still be 2 begin and 1 finish - no new task should be created for already indexed file. + assert_listener_counts(&listener, 2, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + // Test idempotency again: Third manual index build request to further verify. + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_listener_counts(&listener, 2, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); +} + +#[tokio::test] +async fn test_index_build_type_manual_consistency() { + let mut env = TestEnv::with_prefix("test_index_build_type_manual_consistency_").await; + let listener = Arc::new(IndexBuildListener::default()); + let engine = env + .create_engine_with( + async_build_mode_config(true), + None, + Some(listener.clone()), + None, + ) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + // Create a region with index. + let create_request = CreateRequestBuilder::new().build_with_index(); + let table_dir = create_request.table_dir.clone(); + let column_schemas = rows_schema(&create_request); + engine + .handle_request(region_id, RegionRequest::Create(create_request.clone())) + .await + .unwrap(); + assert_listener_counts(&listener, 0, 0); + + // Flush and make sure index file exists. + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + listener.wait_finish(1).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_listener_counts(&listener, 1, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + // Check index build task for consistent file will be skipped. + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + // Reopen the region to ensure the task wasn't skipped due to insufficient time. + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + // Because the file is consistent, no new index build task is triggered. 
+ assert_listener_counts(&listener, 1, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + let mut altered_metadata = create_request.column_metadatas.clone(); + // Set index for field_0. + altered_metadata[1].column_schema.set_inverted_index(true); + let sync_columns_request = RegionAlterRequest { + kind: AlterKind::SyncColumns { + column_metadatas: altered_metadata, + }, + }; + // Use SyncColumns to avoid triggering SchemaChange index build. + engine + .handle_request(region_id, RegionRequest::Alter(sync_columns_request)) + .await + .unwrap(); + reopen_region(&engine, region_id, table_dir, true, HashMap::new()).await; + // SyncColumns won't trigger index build. + assert_listener_counts(&listener, 1, 1); + + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + listener.wait_finish(2).await; // previous 1 + new 1 + // Because the file is inconsistent, new index build task is triggered. + assert_listener_counts(&listener, 2, 2); } diff --git a/src/mito2/src/engine/listener.rs b/src/mito2/src/engine/listener.rs index 317c3cdfd0..277c9a4050 100644 --- a/src/mito2/src/engine/listener.rs +++ b/src/mito2/src/engine/listener.rs @@ -74,11 +74,17 @@ pub trait EventListener: Send + Sync { /// Notifies the listener that region starts to send a region change result to worker. async fn on_notify_region_change_result_begin(&self, _region_id: RegionId) {} + /// Notifies the listener that region starts to send a enter staging result to worker. + async fn on_enter_staging_result_begin(&self, _region_id: RegionId) {} + /// Notifies the listener that the index build task is executed successfully. - async fn on_index_build_success(&self, _region_file_id: RegionFileId) {} + async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {} /// Notifies the listener that the index build task is started. async fn on_index_build_begin(&self, _region_file_id: RegionFileId) {} + + /// Notifies the listener that the index build task is aborted. + async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {} } pub type EventListenerRef = Arc; @@ -304,50 +310,111 @@ impl EventListener for NotifyRegionChangeResultListener { region_id ); self.notify.notified().await; + info!( + "Continue to sending region change result for region {}", + region_id + ); + } +} + +#[derive(Default)] +pub struct NotifyEnterStagingResultListener { + notify: Notify, +} + +impl NotifyEnterStagingResultListener { + /// Continue to sending enter staging result. + pub fn wake_notify(&self) { + self.notify.notify_one(); + } +} + +#[async_trait] +impl EventListener for NotifyEnterStagingResultListener { + async fn on_enter_staging_result_begin(&self, region_id: RegionId) { + info!( + "Wait on notify to start notify enter staging result for region {}", + region_id + ); + self.notify.notified().await; + info!( + "Continue to sending enter staging result for region {}", + region_id + ); } } #[derive(Default)] pub struct IndexBuildListener { - notify: Notify, - success_count: AtomicUsize, - start_count: AtomicUsize, + begin_count: AtomicUsize, + begin_notify: Notify, + finish_count: AtomicUsize, + finish_notify: Notify, + abort_count: AtomicUsize, + abort_notify: Notify, + // stop means finished or aborted + stop_notify: Notify, } impl IndexBuildListener { /// Wait until index build is done for `times` times. 
pub async fn wait_finish(&self, times: usize) { - while self.success_count.load(Ordering::Relaxed) < times { - self.notify.notified().await; + while self.finish_count.load(Ordering::Relaxed) < times { + self.finish_notify.notified().await; + } + } + + /// Wait until index build is stopped for `times` times. + pub async fn wait_stop(&self, times: usize) { + while self.finish_count.load(Ordering::Relaxed) + self.abort_count.load(Ordering::Relaxed) + < times + { + self.stop_notify.notified().await; + } + } + + /// Wait until index build is begun for `times` times. + pub async fn wait_begin(&self, times: usize) { + while self.begin_count.load(Ordering::Relaxed) < times { + self.begin_notify.notified().await; } } /// Clears the success count. - pub fn clear_success_count(&self) { - self.success_count.store(0, Ordering::Relaxed); + pub fn clear_finish_count(&self) { + self.finish_count.store(0, Ordering::Relaxed); } /// Returns the success count. - pub fn success_count(&self) -> usize { - self.success_count.load(Ordering::Relaxed) + pub fn finish_count(&self) -> usize { + self.finish_count.load(Ordering::Relaxed) } /// Returns the start count. pub fn begin_count(&self) -> usize { - self.start_count.load(Ordering::Relaxed) + self.begin_count.load(Ordering::Relaxed) } } #[async_trait] impl EventListener for IndexBuildListener { - async fn on_index_build_success(&self, region_file_id: RegionFileId) { + async fn on_index_build_finish(&self, region_file_id: RegionFileId) { info!("Region {} index build successfully", region_file_id); - self.success_count.fetch_add(1, Ordering::Relaxed); - self.notify.notify_one(); + self.finish_count.fetch_add(1, Ordering::Relaxed); + self.finish_notify.notify_one(); + self.stop_notify.notify_one(); } async fn on_index_build_begin(&self, region_file_id: RegionFileId) { info!("Region {} index build begin", region_file_id); - self.start_count.fetch_add(1, Ordering::Relaxed); + self.begin_count.fetch_add(1, Ordering::Relaxed); + self.begin_notify.notify_one(); + } + + async fn on_index_build_abort(&self, region_file_id: RegionFileId) { + info!("Region {} index build aborted", region_file_id); + self.abort_count.fetch_add(1, Ordering::Relaxed); + self.abort_notify.notify_one(); + self.stop_notify.notify_one(); } } diff --git a/src/mito2/src/engine/prune_test.rs b/src/mito2/src/engine/prune_test.rs index b260024043..beb5e2644a 100644 --- a/src/mito2/src/engine/prune_test.rs +++ b/src/mito2/src/engine/prune_test.rs @@ -22,8 +22,10 @@ use store_api::region_request::RegionRequest; use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; +use crate::test_util::batch_util::sort_batches_and_print; use crate::test_util::{ - CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, rows_schema, + CreateRequestBuilder, TestEnv, build_delete_rows, build_rows, delete_rows, delete_rows_schema, + flush_region, put_rows, rows_schema, }; async fn check_prune_row_groups(exprs: Vec, expected: &str, flat_format: bool) { @@ -377,3 +379,99 @@ async fn test_mem_range_prune_with_format(flat_format: bool) { +-------+---------+---------------------+"; assert_eq!(expected, batches.pretty_print().unwrap()); } + +#[tokio::test] +async fn test_scan_filter_field_after_delete() { + test_scan_filter_field_after_delete_with_format(false).await; + test_scan_filter_field_after_delete_with_format(true).await; +} + +async fn test_scan_filter_field_after_delete_with_format(flat_format: bool) { + common_telemetry::init_default_ut_logging(); + + let mut env = 
TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + let request = CreateRequestBuilder::new() + .insert_option("compaction.type", "twcs") + .build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + // put 1, 2, 3, 4 and flush + put_rows( + &engine, + region_id, + Rows { + schema: column_schemas, + rows: build_rows(1, 5), + }, + ) + .await; + flush_region(&engine, region_id, None).await; + + // delete 2, 3 + let delete_schemas = delete_rows_schema(&request); + delete_rows( + &engine, + region_id, + Rows { + schema: delete_schemas, + rows: build_delete_rows(2, 4), + }, + ) + .await; + + // Scans and filter fields, the field should be deleted. + let request = ScanRequest { + filters: vec![col("field_0").eq(lit(3.0f64))], + ..Default::default() + }; + let stream = engine + .scan_to_stream(region_id, request.clone()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + let expected = "\ ++-------+---------+----+ +| tag_0 | field_0 | ts | ++-------+---------+----+ ++-------+---------+----+"; + assert_eq!( + expected, + sort_batches_and_print(&batches, &["tag_0", "field_0", "ts"]) + ); + + // flush delete op + flush_region(&engine, region_id, None).await; + let stream = engine + .scan_to_stream(region_id, request.clone()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!( + expected, + sort_batches_and_print(&batches, &["tag_0", "field_0", "ts"]) + ); +} diff --git a/src/mito2/src/engine/remap_manifests_test.rs b/src/mito2/src/engine/remap_manifests_test.rs new file mode 100644 index 0000000000..bd38e87e2a --- /dev/null +++ b/src/mito2/src/engine/remap_manifests_test.rs @@ -0,0 +1,239 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
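// [Editor's note] Illustrative sketch, not part of this patch. The tests below drive
// `RegionEngine::remap_manifests`, which splits the manifest of a staging source region into
// new manifests keyed by target region id, each carrying its own partition expression. The
// helper below only restates the flow used by `test_remap_manifests_success` further down; it
// assumes the imports and the `range_expr` helper defined in this file, and the assumed engine
// type path `crate::engine::MitoEngine`.
#[allow(dead_code)]
async fn split_into_two_regions(
    engine: &crate::engine::MitoEngine,
    source: RegionId,
    left: RegionId,
    right: RegionId,
) -> std::collections::HashMap<RegionId, RegionManifest> {
    let result = engine
        .remap_manifests(RemapManifestsRequest {
            region_id: source,
            // Manifests of these regions are read as input.
            input_regions: vec![source],
            // Each input region maps to the regions that replace it.
            region_mapping: [(source, vec![left, right])].into_iter().collect(),
            // Every new region carries its own JSON-encoded partition expression.
            new_partition_exprs: [
                (left, range_expr("tag_0", 0, 50).as_json_str().unwrap()),
                (right, range_expr("tag_0", 50, 100).as_json_str().unwrap()),
            ]
            .into_iter()
            .collect(),
        })
        .await
        .expect("the source region must be a staging leader");
    // `new_manifests` holds the serialized `RegionManifest` JSON for each new region.
    result
        .new_manifests
        .iter()
        .map(|(id, json)| (*id, serde_json::from_str::<RegionManifest>(json).unwrap()))
        .collect()
}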
+ +use std::assert_matches::assert_matches; + +use api::v1::Rows; +use datatypes::value::Value; +use partition::expr::{PartitionExpr, col}; +use store_api::region_engine::{RegionEngine, RemapManifestsRequest, SettableRegionRoleState}; +use store_api::region_request::{RegionFlushRequest, RegionRequest}; +use store_api::storage::RegionId; + +use crate::config::MitoConfig; +use crate::error::Error; +use crate::manifest::action::RegionManifest; +use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema}; + +#[tokio::test] +async fn test_remap_manifests_invalid_partition_expr() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_invalid_partition_expr_with_format(false).await; + test_remap_manifests_invalid_partition_expr_with_format(true).await; +} + +async fn test_remap_manifests_invalid_partition_expr_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("invalid-partition-expr").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let err = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id], + region_mapping: [(region_id, vec![region_id])].into_iter().collect(), + new_partition_exprs: [(region_id, "invalid expr".to_string())] + .into_iter() + .collect(), + }) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::InvalidPartitionExpr { .. } + ) +} + +#[tokio::test] +async fn test_remap_manifests_invalid_region_state() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_invalid_region_state_with_format(false).await; + test_remap_manifests_invalid_region_state_with_format(true).await; +} + +fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { + col(col_name) + .gt_eq(Value::Int64(start)) + .and(col(col_name).lt(Value::Int64(end))) +} + +async fn test_remap_manifests_invalid_region_state_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("invalid-region-state").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let err = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id], + region_mapping: [(region_id, vec![region_id])].into_iter().collect(), + new_partition_exprs: [(region_id, range_expr("x", 0, 100).as_json_str().unwrap())] + .into_iter() + .collect(), + }) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::RegionState { .. 
} + ) +} + +#[tokio::test] +async fn test_remap_manifests_invalid_input_regions() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_invalid_input_regions_with_format(false).await; + test_remap_manifests_invalid_input_regions_with_format(true).await; +} + +async fn test_remap_manifests_invalid_input_regions_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("invalid-input-regions").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + engine + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .await + .unwrap(); + let err = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id, RegionId::new(2, 1)], + region_mapping: [(region_id, vec![region_id])].into_iter().collect(), + new_partition_exprs: [(region_id, range_expr("x", 0, 100).as_json_str().unwrap())] + .into_iter() + .collect(), + }) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::InvalidRequest { .. } + ) +} + +#[tokio::test] +async fn test_remap_manifests_success() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_success_with_format(false).await; + test_remap_manifests_success_with_format(true).await; +} + +async fn test_remap_manifests_success_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("engine-stop").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new() + .partition_expr_json(Some(range_expr("tag_0", 0, 100).as_json_str().unwrap())) + .build(); + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let new_region_id_1 = RegionId::new(1, 2); + let new_region_id_2 = RegionId::new(1, 3); + + // Generate some data + for i in 0..3 { + let rows_data = Rows { + schema: column_schemas.clone(), + rows: build_rows(i * 10, (i + 1) * 10), + }; + put_rows(&engine, region_id, rows_data).await; + engine + .handle_request( + region_id, + RegionRequest::Flush(RegionFlushRequest { + row_group_size: None, + }), + ) + .await + .unwrap(); + } + + engine + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .await + .unwrap(); + + let result = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id], + region_mapping: [(region_id, vec![new_region_id_1, new_region_id_2])] + .into_iter() + .collect(), + new_partition_exprs: [ + ( + new_region_id_1, + range_expr("tag_0", 0, 50).as_json_str().unwrap(), + ), + ( + new_region_id_2, + range_expr("tag_0", 50, 100).as_json_str().unwrap(), + ), + ] + .into_iter() + .collect(), + }) + .await + .unwrap(); + assert_eq!(result.new_manifests.len(), 2); + let new_manifest_1 = + serde_json::from_str::(&result.new_manifests[&new_region_id_1]).unwrap(); + let new_manifest_2 = + serde_json::from_str::(&result.new_manifests[&new_region_id_2]).unwrap(); + assert_eq!(new_manifest_1.files.len(), 3); + assert_eq!(new_manifest_2.files.len(), 3); +} diff --git a/src/mito2/src/engine/staging_test.rs 
b/src/mito2/src/engine/staging_test.rs index 6d802a5d9d..91816a4f9f 100644 --- a/src/mito2/src/engine/staging_test.rs +++ b/src/mito2/src/engine/staging_test.rs @@ -14,17 +14,30 @@ //! Integration tests for staging state functionality. +use std::assert_matches::assert_matches; use std::fs; +use std::sync::Arc; +use std::time::Duration; use api::v1::Rows; +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; use common_recordbatch::RecordBatches; +use object_store::Buffer; +use object_store::layers::mock::{ + Entry, Error as MockError, ErrorKind, List, Lister, Metadata, MockLayerBuilder, + Result as MockResult, Write, Writer, +}; use store_api::region_engine::{RegionEngine, SettableRegionRoleState}; use store_api::region_request::{ - RegionAlterRequest, RegionFlushRequest, RegionRequest, RegionTruncateRequest, + EnterStagingRequest, RegionAlterRequest, RegionFlushRequest, RegionRequest, + RegionTruncateRequest, }; use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; +use crate::engine::listener::NotifyEnterStagingResultListener; +use crate::error::Error; use crate::region::{RegionLeaderState, RegionRoleState}; use crate::request::WorkerRequest; use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema}; @@ -214,6 +227,8 @@ async fn test_staging_state_validation_patterns() { ); } +const PARTITION_EXPR: &str = "partition_expr"; + #[tokio::test] async fn test_staging_manifest_directory() { test_staging_manifest_directory_with_format(false).await; @@ -221,6 +236,7 @@ async fn test_staging_manifest_directory() { } async fn test_staging_manifest_directory_with_format(flat_format: bool) { + common_telemetry::init_default_ut_logging(); let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { @@ -255,9 +271,57 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) { // Now test staging mode manifest creation // Set region to staging mode using the engine API engine - .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) .await .unwrap(); + let region = engine.get_region(region_id).unwrap(); + let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone(); + assert_eq!(staging_partition_expr.unwrap(), PARTITION_EXPR); + { + let manager = region.manifest_ctx.manifest_manager.read().await; + assert_eq!( + manager + .staging_manifest() + .unwrap() + .metadata + .partition_expr + .as_deref() + .unwrap(), + PARTITION_EXPR + ); + assert!(manager.manifest().metadata.partition_expr.is_none()); + } + + // Should be ok to enter staging mode again with the same partition expr + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) + .await + .unwrap(); + + // Should throw error if try to enter staging mode again with a different partition expr + let err = engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: "".to_string(), + }), + ) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::StagingPartitionExprMismatch { .. 
} + ); // Put some data and flush in staging mode let rows_data = Rows { @@ -312,6 +376,7 @@ async fn test_staging_exit_success_with_manifests() { } async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) { + common_telemetry::init_default_ut_logging(); let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { @@ -330,16 +395,28 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) .await .unwrap(); + // Add some data and flush in staging mode to generate staging manifests + let rows_data = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows_data).await; + // Enter staging mode engine - .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) .await .unwrap(); // Add some data and flush in staging mode to generate staging manifests let rows_data = Rows { schema: column_schemas.clone(), - rows: build_rows(0, 5), + rows: build_rows(3, 8), }; put_rows(&engine, region_id, rows_data).await; @@ -357,7 +434,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) // Add more data and flush again to generate multiple staging manifests let rows_data2 = Rows { schema: column_schemas.clone(), - rows: build_rows(5, 10), + rows: build_rows(8, 10), }; put_rows(&engine, region_id, rows_data2).await; @@ -382,8 +459,11 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) .unwrap(); assert_eq!( staging_files_before.len(), - 2, - "Staging manifest directory should contain two files before exit" + // Two files for flush operation + // One file for entering staging mode + 3, + "Staging manifest directory should contain 3 files before exit, got: {:?}", + staging_files_before ); // Count normal manifest files before exit @@ -394,8 +474,11 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) .unwrap(); let normal_count_before = normal_files_before.len(); assert_eq!( - normal_count_before, 1, - "Normal manifest directory should initially contain one file" + // One file for table creation + // One file for flush operation + normal_count_before, + 2, + "Normal manifest directory should initially contain 2 files" ); // Try read data before exiting staging, SST files should be invisible @@ -403,8 +486,8 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let scanner = engine.scanner(region_id, request).await.unwrap(); assert_eq!( scanner.num_files(), - 0, - "No SST files should be scanned before exit" + 1, + "1 SST files should be scanned before exit" ); assert_eq!( scanner.num_memtables(), @@ -415,14 +498,20 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let batches = RecordBatches::try_collect(stream).await.unwrap(); let total_rows: usize = batches.iter().map(|rb| rb.num_rows()).sum(); assert_eq!( - total_rows, 0, - "No data should be readable before exit staging mode" + total_rows, 3, + "3 rows should be readable before exit staging mode" ); // Inspect SSTs from manifest let sst_entries = engine.all_ssts_from_manifest().await; - assert_eq!(sst_entries.len(), 2); - assert!(sst_entries.iter().all(|e| !e.visible)); + assert_eq!( + sst_entries.len(), + 3, + "sst entries should be 3, got: {:?}", + sst_entries + ); + assert_eq!(sst_entries.iter().filter(|e| 
e.visible).count(), 1); + assert_eq!(sst_entries.iter().filter(|e| !e.visible).count(), 2); // Exit staging mode successfully engine @@ -470,7 +559,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let scanner = engine.scanner(region_id, request).await.unwrap(); assert_eq!( scanner.num_files(), - 2, + 3, "SST files should be scanned after exit" ); @@ -482,6 +571,209 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) // Inspect SSTs from manifest let sst_entries = engine.all_ssts_from_manifest().await; - assert_eq!(sst_entries.len(), 2); + assert_eq!(sst_entries.len(), 3); assert!(sst_entries.iter().all(|e| e.visible)); } + +#[tokio::test(flavor = "multi_thread")] +async fn test_write_stall_on_enter_staging() { + test_write_stall_on_enter_staging_with_format(false).await; + test_write_stall_on_enter_staging_with_format(true).await; +} + +async fn test_write_stall_on_enter_staging_with_format(flat_format: bool) { + let mut env = TestEnv::new().await; + let listener = Arc::new(NotifyEnterStagingResultListener::default()); + let engine = env + .create_engine_with( + MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }, + None, + Some(listener.clone()), + None, + ) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let engine_cloned = engine.clone(); + let alter_job = tokio::spawn(async move { + engine_cloned + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) + .await + .unwrap(); + }); + // Make sure the loop is handling the alter request. + tokio::time::sleep(Duration::from_millis(100)).await; + + let column_schemas_cloned = column_schemas.clone(); + let engine_cloned = engine.clone(); + let put_job = tokio::spawn(async move { + let rows = Rows { + schema: column_schemas_cloned, + rows: build_rows(0, 3), + }; + put_rows(&engine_cloned, region_id, rows).await; + }); + // Make sure the loop is handling the put request. 
+ tokio::time::sleep(Duration::from_millis(100)).await; + + listener.wake_notify(); + alter_job.await.unwrap(); + put_job.await.unwrap(); + + let expected = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let scanner = engine.scanner(region_id, request).await.unwrap(); + let stream = scanner.scan().await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, batches.pretty_print().unwrap()); +} + +#[tokio::test] +async fn test_enter_staging_clean_staging_manifest_error() { + common_telemetry::init_default_ut_logging(); + test_enter_staging_clean_staging_manifest_error_with_format(false).await; + test_enter_staging_clean_staging_manifest_error_with_format(true).await; +} + +struct MockLister { + path: String, + inner: Lister, +} + +impl List for MockLister { + async fn next(&mut self) -> MockResult> { + if self.path.contains("staging") { + return Err(MockError::new(ErrorKind::Unexpected, "mock error")); + } + self.inner.next().await + } +} + +struct MockWriter { + path: String, + inner: Writer, +} + +impl Write for MockWriter { + async fn write(&mut self, bs: Buffer) -> MockResult<()> { + self.inner.write(bs).await + } + + async fn close(&mut self) -> MockResult { + if self.path.contains("staging") { + return Err(MockError::new(ErrorKind::Unexpected, "mock error")); + } + self.inner.close().await + } + + async fn abort(&mut self) -> MockResult<()> { + self.inner.abort().await + } +} + +async fn test_enter_staging_error(env: &mut TestEnv, flat_format: bool) { + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + let region_id = RegionId::new(1024, 0); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let err = engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) + .await + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::StorageUnavailable); + let region = engine.get_region(region_id).unwrap(); + assert!( + region + .manifest_ctx + .manifest_manager + .read() + .await + .staging_manifest() + .is_none() + ); + let state = region.state(); + assert_eq!(state, RegionRoleState::Leader(RegionLeaderState::Writable)); +} + +async fn test_enter_staging_clean_staging_manifest_error_with_format(flat_format: bool) { + let mock_layer = MockLayerBuilder::default() + .lister_factory(Arc::new(|path, _args, lister| { + Box::new(MockLister { + path: path.to_string(), + inner: lister, + }) + })) + .build() + .unwrap(); + let mut env = TestEnv::new().await.with_mock_layer(mock_layer); + test_enter_staging_error(&mut env, flat_format).await; +} + +#[tokio::test] +async fn test_enter_staging_save_staging_manifest_error() { + common_telemetry::init_default_ut_logging(); + test_enter_staging_save_staging_manifest_error_with_format(false).await; + test_enter_staging_save_staging_manifest_error_with_format(true).await; +} + +async fn test_enter_staging_save_staging_manifest_error_with_format(flat_format: bool) { + let mock_layer = MockLayerBuilder::default() + .writer_factory(Arc::new(|path, _args, lister| { + Box::new(MockWriter { + 
path: path.to_string(), + inner: lister, + }) + })) + .build() + .unwrap(); + let mut env = TestEnv::new().await.with_mock_layer(mock_layer); + test_enter_staging_error(&mut env, flat_format).await; +} diff --git a/src/mito2/src/engine/sync_test.rs b/src/mito2/src/engine/sync_test.rs index 5d6d5802f2..6b98d4ba0f 100644 --- a/src/mito2/src/engine/sync_test.rs +++ b/src/mito2/src/engine/sync_test.rs @@ -151,7 +151,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { scan_check(&follower_engine, region_id, expected, 0, 0).await; // Returns error since the max manifest is 1 - let manifest_info = RegionManifestInfo::mito(2, 0); + let manifest_info = RegionManifestInfo::mito(2, 0, 0); let err = follower_engine .sync_region(region_id, manifest_info) .await @@ -159,7 +159,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { let err = err.as_any().downcast_ref::().unwrap(); assert_matches!(err, Error::InstallManifestTo { .. }); - let manifest_info = RegionManifestInfo::mito(1, 0); + let manifest_info = RegionManifestInfo::mito(1, 0, 0); follower_engine .sync_region(region_id, manifest_info) .await @@ -264,7 +264,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) { scan_check(&follower_engine, region_id, expected, 0, 0).await; // Sync the region from the leader engine to the follower engine - let manifest_info = RegionManifestInfo::mito(2, 0); + let manifest_info = RegionManifestInfo::mito(2, 0, 0); follower_engine .sync_region(region_id, manifest_info) .await diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index ad6d7c7caa..d357c68774 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -104,6 +104,15 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to serialize manifest, region_id: {}", region_id))] + SerializeManifest { + region_id: RegionId, + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Invalid scan index, start: {}, end: {}", start, end))] InvalidScanIndex { start: ManifestVersion, @@ -232,6 +241,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Manifest missing for region {}", region_id))] + MissingManifest { + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("File consistency check failed for file {}: {}", file_id, reason))] InconsistentFile { file_id: FileId, @@ -254,6 +270,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to fetch manifests"))] + FetchManifests { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Partition expression missing for region {}", region_id))] MissingPartitionExpr { region_id: RegionId, @@ -1121,6 +1144,24 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("GC job permit exhausted"))] + TooManyGcJobs { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Staging partition expr mismatch, manifest: {:?}, request: {}", + manifest_expr, + request_expr + ))] + StagingPartitionExprMismatch { + manifest_expr: Option, + request_expr: String, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -1166,7 +1207,9 @@ impl ErrorExt for Error { | FilesLost { .. } | InstallManifestTo { .. } | Unexpected { .. } - | SerializeColumnMetadata { .. } => StatusCode::Unexpected, + | SerializeColumnMetadata { .. } + | SerializeManifest { .. } + | StagingPartitionExprMismatch { .. 
} => StatusCode::Unexpected, RegionNotFound { .. } => StatusCode::RegionNotFound, ObjectStoreNotFound { .. } @@ -1184,6 +1227,7 @@ impl ErrorExt for Error { | DurationOutOfRange { .. } | MissingOldManifest { .. } | MissingNewManifest { .. } + | MissingManifest { .. } | NoOldManifests { .. } | MissingPartitionExpr { .. } | SerializePartitionExpr { .. } => StatusCode::InvalidArguments, @@ -1205,6 +1249,8 @@ impl ErrorExt for Error { | Metadata { .. } | MitoManifestInfo { .. } => StatusCode::Internal, + FetchManifests { source, .. } => source.status_code(), + OpenRegion { source, .. } => source.status_code(), WriteParquet { .. } => StatusCode::StorageUnavailable, @@ -1291,7 +1337,7 @@ impl ErrorExt for Error { InconsistentTimestampLength { .. } => StatusCode::InvalidArguments, - TooManyFilesToRead { .. } => StatusCode::RateLimited, + TooManyFilesToRead { .. } | TooManyGcJobs { .. } => StatusCode::RateLimited, } } diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index eb5e605ce1..50bbf59941 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -40,7 +40,9 @@ use crate::error::{ RegionDroppedSnafu, RegionTruncatedSnafu, Result, }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; -use crate::memtable::{BoxedRecordBatchIterator, EncodedRange, IterBuilder, MemtableRanges}; +use crate::memtable::{ + BoxedRecordBatchIterator, EncodedRange, IterBuilder, MemtableRanges, RangesOptions, +}; use crate::metrics::{ FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_FAILURE_TOTAL, FLUSH_REQUESTS_TOTAL, INFLIGHT_FLUSH_COUNT, @@ -49,7 +51,6 @@ use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeIterator; use crate::read::merge::MergeReaderBuilder; -use crate::read::scan_region::PredicateGroup; use crate::read::{FlatSource, Source}; use crate::region::options::{IndexOptions, MergeMode, RegionOptions}; use crate::region::version::{VersionControlData, VersionControlRef, VersionRef}; @@ -207,7 +208,7 @@ impl WriteBufferManager for WriteBufferManagerImpl { } /// Reason of a flush task. -#[derive(Debug, IntoStaticStr)] +#[derive(Debug, IntoStaticStr, Clone, Copy, PartialEq, Eq)] pub enum FlushReason { /// Other reasons. Others, @@ -221,6 +222,8 @@ pub enum FlushReason { Periodically, /// Flush memtable during downgrading state. Downgrading, + /// Enter staging mode. + EnterStaging, } impl FlushReason { @@ -252,6 +255,8 @@ pub(crate) struct RegionFlushTask { pub(crate) index_options: IndexOptions, /// Semaphore to control flush concurrency. pub(crate) flush_semaphore: Arc, + /// Whether the region is in staging mode. + pub(crate) is_staging: bool, } impl RegionFlushTask { @@ -315,6 +320,7 @@ impl RegionFlushTask { _timer: timer, edit, memtables_to_remove, + is_staging: self.is_staging, }; WorkerRequest::Background { region_id: self.region_id, @@ -397,7 +403,10 @@ impl RegionFlushTask { flushed_sequence: Some(version_data.committed_sequence), committed_sequence: None, }; - info!("Applying {edit:?} to region {}", self.region_id); + info!( + "Applying {edit:?} to region {}, is_staging: {}", + self.region_id, self.is_staging + ); let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())); @@ -416,11 +425,12 @@ impl RegionFlushTask { // add a cleanup job to remove them later. 
let version = self .manifest_ctx - .update_manifest(expected_state, action_list) + .update_manifest(expected_state, action_list, self.is_staging) .await?; info!( - "Successfully update manifest version to {version}, region: {}, reason: {}", + "Successfully update manifest version to {version}, region: {}, is_staging: {}, reason: {}", self.region_id, + self.is_staging, self.reason.as_str() ); @@ -459,7 +469,7 @@ impl RegionFlushTask { flush_metrics.compact_memtable += compact_cost; // Sets `for_flush` flag to true. - let mem_ranges = mem.ranges(None, PredicateGroup::default(), None, true)?; + let mem_ranges = mem.ranges(None, RangesOptions::for_flush())?; let num_mem_ranges = mem_ranges.ranges.len(); let num_mem_rows = mem_ranges.stats.num_rows(); let memtable_id = mem.id(); @@ -525,21 +535,19 @@ impl RegionFlushTask { let source = Either::Left(source); let write_request = self.new_write_request(version, max_sequence, source); - let (ssts_written, metrics) = self + let mut metrics = Metrics::new(WriteType::Flush); + let ssts_written = self .access_layer - .write_sst(write_request, &write_opts, WriteType::Flush) + .write_sst(write_request, &write_opts, &mut metrics) .await?; if ssts_written.is_empty() { // No data written. continue; } - common_telemetry::debug!( + debug!( "Region {} flush one memtable, num_mem_ranges: {}, num_rows: {}, metrics: {:?}", - self.region_id, - num_mem_ranges, - num_mem_rows, - metrics + self.region_id, num_mem_ranges, num_mem_rows, metrics ); flush_metrics = flush_metrics.merge(metrics); @@ -591,9 +599,11 @@ impl RegionFlushTask { let semaphore = self.flush_semaphore.clone(); let task = common_runtime::spawn_global(async move { let _permit = semaphore.acquire().await.unwrap(); - access_layer - .write_sst(write_request, &write_opts, WriteType::Flush) - .await + let mut metrics = Metrics::new(WriteType::Flush); + let ssts = access_layer + .write_sst(write_request, &write_opts, &mut metrics) + .await?; + Ok((ssts, metrics)) }); tasks.push(task); } @@ -636,11 +646,14 @@ impl RegionFlushTask { level: 0, file_size: sst_info.file_size, available_indexes: sst_info.index_metadata.build_available_indexes(), + indexes: sst_info.index_metadata.build_indexes(), index_file_size: sst_info.index_metadata.file_size, + index_file_id: None, num_rows: sst_info.num_rows as u64, num_row_groups: sst_info.num_row_groups, sequence: NonZeroU64::new(max_sequence), partition_expr, + num_series: sst_info.num_series, } } @@ -740,7 +753,6 @@ struct FlatSources { encoded: SmallVec<[EncodedRange; 4]>, } -// TODO(yingwen): Flushes into multiple files in parallel. /// Returns the max sequence and [FlatSource] for the given memtable. fn memtable_flat_sources( schema: SchemaRef, @@ -762,6 +774,9 @@ fn memtable_flat_sources( flat_sources.encoded.push(encoded); } else { let iter = only_range.build_record_batch_iter(None)?; + // Dedup according to append mode and merge mode. + // Even single range may have duplicate rows. + let iter = maybe_dedup_one(options, field_column_start, iter); flat_sources.sources.push(FlatSource::Iter(iter)); }; } else { @@ -829,6 +844,28 @@ fn merge_and_dedup( Ok(maybe_dedup) } +fn maybe_dedup_one( + options: &RegionOptions, + field_column_start: usize, + input_iter: BoxedRecordBatchIterator, +) -> BoxedRecordBatchIterator { + if options.append_mode { + // No dedup in append mode + input_iter + } else { + // Dedup according to merge mode. 
+ match options.merge_mode() { + MergeMode::LastRow => { + Box::new(FlatDedupIterator::new(input_iter, FlatLastRow::new(false))) + } + MergeMode::LastNonNull => Box::new(FlatDedupIterator::new( + input_iter, + FlatLastNonNull::new(field_column_start, false), + )), + } + } +} + /// Manages background flushes of a worker. pub(crate) struct FlushScheduler { /// Tracks regions need to flush. @@ -1162,11 +1199,16 @@ impl FlushStatus { #[cfg(test)] mod tests { + use mito_codec::row_converter::build_primary_key_codec; use tokio::sync::oneshot; use super::*; use crate::cache::CacheManager; + use crate::memtable::bulk::part::BulkPartConverter; use crate::memtable::time_series::TimeSeriesMemtableBuilder; + use crate::memtable::{Memtable, RangesOptions}; + use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; + use crate::test_util::memtable_util::{build_key_values_with_ts_seq_values, metadata_for_test}; use crate::test_util::scheduler_util::{SchedulerEnv, VecScheduler}; use crate::test_util::version_util::{VersionControlBuilder, write_rows_to_version}; @@ -1259,6 +1301,7 @@ mod tests { .await, index_options: IndexOptions::default(), flush_semaphore: Arc::new(Semaphore::new(2)), + is_staging: false, }; task.push_sender(OptionOutputTx::from(output_tx)); scheduler @@ -1301,6 +1344,7 @@ mod tests { manifest_ctx: manifest_ctx.clone(), index_options: IndexOptions::default(), flush_semaphore: Arc::new(Semaphore::new(2)), + is_staging: false, }) .collect(); // Schedule first task. @@ -1350,4 +1394,113 @@ mod tests { assert_eq!(output, 0); } } + + // Verifies single-range flat flush path respects append_mode (no dedup) vs dedup when disabled. + #[test] + fn test_memtable_flat_sources_single_range_append_mode_behavior() { + // Build test metadata and flat schema + let metadata = metadata_for_test(); + let schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + // Prepare a bulk part containing duplicate rows for the same PK and timestamp + // Two rows with identical keys and timestamps (ts = 1000), different field values + let capacity = 16; + let pk_codec = build_primary_key_codec(&metadata); + let mut converter = + BulkPartConverter::new(&metadata, schema.clone(), capacity, pk_codec, true); + let kvs = build_key_values_with_ts_seq_values( + &metadata, + "dup_key".to_string(), + 1, + vec![1000i64, 1000i64].into_iter(), + vec![Some(1.0f64), Some(2.0f64)].into_iter(), + 1, + ); + converter.append_key_values(&kvs).unwrap(); + let part = converter.convert().unwrap(); + + // Helper to build MemtableRanges with a single range from one bulk part. + // We use BulkMemtable directly because it produces record batch iterators. 
+ let build_ranges = |append_mode: bool| -> MemtableRanges { + let memtable = crate::memtable::bulk::BulkMemtable::new( + 1, + metadata.clone(), + None, + None, + append_mode, + MergeMode::LastRow, + ); + memtable.write_bulk(part.clone()).unwrap(); + memtable.ranges(None, RangesOptions::for_flush()).unwrap() + }; + + // Case 1: append_mode = false => dedup happens, total rows should be 1 + { + let mem_ranges = build_ranges(false); + assert_eq!(1, mem_ranges.ranges.len()); + + let options = RegionOptions { + append_mode: false, + merge_mode: Some(MergeMode::LastRow), + ..Default::default() + }; + + let flat_sources = memtable_flat_sources( + schema.clone(), + mem_ranges, + &options, + metadata.primary_key.len(), + ) + .unwrap(); + assert!(flat_sources.encoded.is_empty()); + assert_eq!(1, flat_sources.sources.len()); + + // Consume the iterator and count rows + let mut total_rows = 0usize; + for source in flat_sources.sources { + match source { + crate::read::FlatSource::Iter(iter) => { + for rb in iter { + total_rows += rb.unwrap().num_rows(); + } + } + crate::read::FlatSource::Stream(_) => unreachable!(), + } + } + assert_eq!(1, total_rows, "dedup should keep a single row"); + } + + // Case 2: append_mode = true => no dedup, total rows should be 2 + { + let mem_ranges = build_ranges(true); + assert_eq!(1, mem_ranges.ranges.len()); + + let options = RegionOptions { + append_mode: true, + ..Default::default() + }; + + let flat_sources = + memtable_flat_sources(schema, mem_ranges, &options, metadata.primary_key.len()) + .unwrap(); + assert!(flat_sources.encoded.is_empty()); + assert_eq!(1, flat_sources.sources.len()); + + let mut total_rows = 0usize; + for source in flat_sources.sources { + match source { + crate::read::FlatSource::Iter(iter) => { + for rb in iter { + total_rows += rb.unwrap().num_rows(); + } + } + crate::read::FlatSource::Stream(_) => unreachable!(), + } + } + assert_eq!(2, total_rows, "append_mode should preserve duplicates"); + } + } } diff --git a/src/mito2/src/gc.rs b/src/mito2/src/gc.rs index e4d384d0f9..c61b569cef 100644 --- a/src/mito2/src/gc.rs +++ b/src/mito2/src/gc.rs @@ -21,49 +21,93 @@ //! `unknown files`: files that are not recorded in the manifest, usually due to saved checkpoint which remove actions before the checkpoint. //! 
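// [Editor's note] Illustrative sketch, not part of this patch. The module docs above and the
// `GcConfig` / `LocalGcWorker` code below boil the deletion rule down to one predicate: a
// listed file may only be deleted if it is not in use, and it has either outlived the
// configured lingering time since its known expel time, or, for "unknown files" with no
// recorded expel time, outlived `unknown_file_lingering_time` since its last modification.
// The free function and its parameters are hypothetical and only restate that logic.
#[allow(dead_code)]
fn may_delete_file(
    in_use: bool,                                      // referenced by the current manifest or a tmp ref
    expel_time: Option<chrono::DateTime<chrono::Utc>>, // from the manifest's removed_files, if known
    last_modified: chrono::DateTime<chrono::Utc>,      // fallback when the expel time is unknown
    lingering_time: Option<std::time::Duration>,       // `None` means delete unused files immediately
    unknown_file_lingering_time: std::time::Duration,
    now: chrono::DateTime<chrono::Utc>,
) -> bool {
    if in_use {
        return false;
    }
    match expel_time {
        // Known expel time: wait out the configured lingering window, if any.
        Some(expelled_at) => match lingering_time {
            Some(linger) => now - expelled_at > chrono::Duration::from_std(linger).unwrap(),
            None => true,
        },
        // Unknown file: use its last modified time and the (longer) unknown-file window.
        None => {
            now - last_modified > chrono::Duration::from_std(unknown_file_lingering_time).unwrap()
        }
    }
}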
-use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::Arc; use std::time::Duration; -use common_telemetry::{error, info, warn}; +use common_meta::datanode::GcStat; +use common_telemetry::{debug, error, info, warn}; use common_time::Timestamp; use object_store::{Entry, Lister}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt as _, ensure}; -use store_api::storage::{FileId, RegionId}; +use snafu::ResultExt as _; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; +use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use tokio_stream::StreamExt; use crate::access_layer::AccessLayerRef; use crate::cache::CacheManagerRef; use crate::config::MitoConfig; use crate::error::{ - DurationOutOfRangeSnafu, EmptyRegionDirSnafu, JoinSnafu, OpenDalSnafu, RegionNotFoundSnafu, - Result, UnexpectedSnafu, + DurationOutOfRangeSnafu, JoinSnafu, OpenDalSnafu, Result, TooManyGcJobsSnafu, UnexpectedSnafu, }; -use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; -use crate::manifest::storage::manifest_compress_type; -use crate::metrics::GC_FILE_CNT; -use crate::region::opener::new_manifest_dir; +use crate::manifest::action::RegionManifest; +use crate::metrics::GC_DELETE_FILE_CNT; +use crate::region::{MitoRegionRef, RegionRoleState}; use crate::sst::file::delete_files; -use crate::sst::file_ref::TableFileRefsManifest; -use crate::sst::location::{self, region_dir_from_table_dir}; +use crate::sst::location::{self}; -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct GcReport { - /// deleted files per region - pub deleted_files: HashMap>, - /// Regions that need retry in next gc round, usually because their tmp ref files are outdated - pub need_retry_regions: HashSet, +#[cfg(test)] +mod worker_test; + +/// Limit the amount of concurrent GC jobs on the datanode +pub struct GcLimiter { + pub gc_job_limit: Arc, + gc_concurrency: usize, } -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct FileGcOption { +pub type GcLimiterRef = Arc; + +impl GcLimiter { + pub fn new(gc_concurrency: usize) -> Self { + Self { + gc_job_limit: Arc::new(tokio::sync::Semaphore::new(gc_concurrency)), + gc_concurrency, + } + } + + pub fn running_gc_tasks(&self) -> u32 { + (self.gc_concurrency - self.gc_job_limit.available_permits()) as u32 + } + + pub fn gc_concurrency(&self) -> u32 { + self.gc_concurrency as u32 + } + + pub fn gc_stat(&self) -> GcStat { + GcStat::new(self.running_gc_tasks(), self.gc_concurrency()) + } + + /// Try to acquire a permit for a GC job. + /// + /// If no permit is available, returns an `TooManyGcJobs` error. + pub fn permit(&self) -> Result { + self.gc_job_limit + .clone() + .try_acquire_owned() + .map_err(|e| match e { + TryAcquireError::Closed => UnexpectedSnafu { + reason: format!("Failed to acquire gc permit: {e}"), + } + .build(), + TryAcquireError::NoPermits => TooManyGcJobsSnafu {}.build(), + }) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default)] +pub struct GcConfig { + /// Whether GC is enabled. + pub enable: bool, /// Lingering time before deleting files. /// Should be long enough to allow long running queries to finish. + /// If set to None, then unused files will be deleted immediately. /// /// TODO(discord9): long running queries should actively write tmp manifest files /// to prevent deletion of files they are using. 
#[serde(with = "humantime_serde")]
- pub lingering_time: Duration,
+ pub lingering_time: Option<Duration>,
/// Lingering time before deleting unknown files(files with undetermine expel time).
/// expel time is the time when the file is considered as removed, as in removed from the manifest.
/// This should only occur rarely, as manifest keep tracks in `removed_files` field
@@ -73,16 +117,22 @@ pub struct FileGcOption {
/// Maximum concurrent list operations per GC job.
/// This is used to limit the number of concurrent listing operations and speed up listing.
pub max_concurrent_lister_per_gc_job: usize,
+ /// Maximum concurrent GC jobs.
+ /// This is used to limit the number of concurrent GC jobs running on the datanode
+ /// to prevent too many concurrent GC jobs from overwhelming the datanode.
+ pub max_concurrent_gc_job: usize,
}
-impl Default for FileGcOption {
+impl Default for GcConfig {
fn default() -> Self {
Self {
- // expect long running queries to be finished within a reasonable time
- lingering_time: Duration::from_secs(60 * 5),
- // 6 hours, for unknown expel time, which is when this file get removed from manifest, it should rarely happen, can keep it longer
- unknown_file_lingering_time: Duration::from_secs(60 * 60 * 6),
+ enable: false,
+ // Expect long running queries to finish (or at least be able to notify that they are using a deleted file) within a reasonable time.
+ lingering_time: Some(Duration::from_secs(60)),
+ // 1 hour, for files with an unknown expel time (the time the file was removed from the manifest); this should rarely happen, so such files can be kept longer.
+ unknown_file_lingering_time: Duration::from_secs(60 * 60),
max_concurrent_lister_per_gc_job: 32,
+ max_concurrent_gc_job: 4,
}
}
}
@@ -90,15 +140,24 @@ impl Default for FileGcOption {
pub struct LocalGcWorker {
pub(crate) access_layer: AccessLayerRef,
pub(crate) cache_manager: Option<CacheManagerRef>,
- pub(crate) manifest_mgrs: HashMap<RegionId, RegionManifestManager>,
+ pub(crate) regions: BTreeMap<RegionId, MitoRegionRef>,
/// Lingering time before deleting files.
- pub(crate) opt: FileGcOption,
- pub(crate) manifest_open_config: ManifestOpenConfig,
+ pub(crate) opt: GcConfig,
/// Tmp ref files manifest, used to determine which files are still in use by ongoing queries.
///
/// Also contains manifest versions of regions when the tmp ref files are generated.
/// Used to determine whether the tmp ref files are outdated.
- pub(crate) file_ref_manifest: TableFileRefsManifest,
+ pub(crate) file_ref_manifest: FileRefsManifest,
+ _permit: OwnedSemaphorePermit,
+ /// Whether to perform full file listing during GC.
+ /// When set to false, GC will only delete files that are tracked in the manifest's removed_files,
+ /// which can significantly improve performance by avoiding expensive list operations.
+ /// When set to true, GC will perform a full listing to find and delete orphan files
+ /// (files not tracked in the manifest).
+ ///
+ /// Set to false for regular GC operations to optimize performance.
+ /// Set to true periodically or when you need to clean up orphan files.
+ pub full_file_listing: bool,
}
pub struct ManifestOpenConfig {
@@ -125,83 +184,37 @@ impl LocalGcWorker {
/// Create a new LocalGcWorker, with `regions_to_gc` regions to GC.
/// The regions are specified by their `RegionId` and should all belong to the same table.
///
/// + #[allow(clippy::too_many_arguments)] pub async fn try_new( access_layer: AccessLayerRef, cache_manager: Option, - regions_to_gc: BTreeSet, - opt: FileGcOption, - manifest_open_config: ManifestOpenConfig, - file_ref_manifest: TableFileRefsManifest, + regions_to_gc: BTreeMap, + opt: GcConfig, + file_ref_manifest: FileRefsManifest, + limiter: &GcLimiterRef, + full_file_listing: bool, ) -> Result { - let table_id = regions_to_gc - .first() - .context(UnexpectedSnafu { - reason: "Expect at least one region, found none", - })? - .table_id(); - let mut zelf = Self { + let permit = limiter.permit()?; + + Ok(Self { access_layer, cache_manager, - manifest_mgrs: HashMap::new(), + regions: regions_to_gc, opt, - manifest_open_config, file_ref_manifest, - }; - - // dedup just in case - for region_id in regions_to_gc { - ensure!( - region_id.table_id() == table_id, - UnexpectedSnafu { - reason: format!( - "All regions should belong to the same table, found region {} and table {}", - region_id, table_id - ), - } - ); - let mgr = zelf.open_mgr_for(region_id).await?; - zelf.manifest_mgrs.insert(region_id, mgr); - } - - Ok(zelf) + _permit: permit, + full_file_listing, + }) } /// Get tmp ref files for all current regions - /// - /// Outdated regions are added to `outdated_regions` set - pub async fn read_tmp_ref_files( - &self, - outdated_regions: &mut HashSet, - ) -> Result>> { - for (region_id, region_mgr) in &self.manifest_mgrs { - let current_version = region_mgr.manifest().manifest_version; - if ¤t_version - > self - .file_ref_manifest - .manifest_version - .get(region_id) - .with_context(|| UnexpectedSnafu { - reason: format!( - "Region {} not found in tmp ref manifest version map", - region_id - ), - })? - { - outdated_regions.insert(*region_id); - } - } - // TODO(discord9): verify manifest version before reading tmp ref files - + pub async fn read_tmp_ref_files(&self) -> Result>> { let mut tmp_ref_files = HashMap::new(); - for file_ref in &self.file_ref_manifest.file_refs { - if outdated_regions.contains(&file_ref.region_id) { - // skip outdated regions - continue; - } + for (region_id, file_refs) in &self.file_ref_manifest.file_refs { tmp_ref_files - .entry(file_ref.region_id) + .entry(*region_id) .or_insert_with(HashSet::new) - .insert(file_ref.file_id); + .extend(file_refs.clone()); } Ok(tmp_ref_files) @@ -216,26 +229,38 @@ impl LocalGcWorker { info!("LocalGcWorker started"); let now = std::time::Instant::now(); - let mut outdated_regions = HashSet::new(); let mut deleted_files = HashMap::new(); - let tmp_ref_files = self.read_tmp_ref_files(&mut outdated_regions).await?; - for region_id in self.manifest_mgrs.keys() { - info!("Doing gc for region {}", region_id); + let tmp_ref_files = self.read_tmp_ref_files().await?; + for (region_id, region) in &self.regions { + let per_region_time = std::time::Instant::now(); + if region.manifest_ctx.current_state() == RegionRoleState::Follower { + return UnexpectedSnafu { + reason: format!( + "Region {} is in Follower state, should not run GC on follower regions", + region_id + ), + } + .fail(); + } let tmp_ref_files = tmp_ref_files .get(region_id) .cloned() .unwrap_or_else(HashSet::new); - let files = self.do_region_gc(*region_id, &tmp_ref_files).await?; + let files = self.do_region_gc(region.clone(), &tmp_ref_files).await?; deleted_files.insert(*region_id, files); - info!("Gc for region {} finished", region_id); + debug!( + "GC for region {} took {} secs.", + region_id, + per_region_time.elapsed().as_secs_f32() + ); } info!( "LocalGcWorker finished after 
{} secs.", - now.elapsed().as_secs() + now.elapsed().as_secs_f32() ); let report = GcReport { deleted_files, - need_retry_regions: outdated_regions.into_iter().collect(), + need_retry_regions: HashSet::new(), }; Ok(report) } @@ -244,7 +269,7 @@ impl LocalGcWorker { impl LocalGcWorker { /// concurrency of listing files per region. /// This is used to limit the number of concurrent listing operations and speed up listing - pub const CONCURRENCY_LIST_PER_FILES: usize = 512; + pub const CONCURRENCY_LIST_PER_FILES: usize = 1024; /// Perform GC for the region. /// 1. Get all the removed files in delta manifest files and their expel times @@ -256,70 +281,80 @@ impl LocalGcWorker { /// to avoid deleting files that are still needed. pub async fn do_region_gc( &self, - region_id: RegionId, + region: MitoRegionRef, tmp_ref_files: &HashSet, ) -> Result> { - info!("Doing gc for region {}", region_id); - let manifest = self - .manifest_mgrs - .get(®ion_id) - .context(RegionNotFoundSnafu { region_id })? - .manifest(); + let region_id = region.region_id(); + + debug!("Doing gc for region {}", region_id); + let manifest = region.manifest_ctx.manifest().await; let region_id = manifest.metadata.region_id; let current_files = &manifest.files; - let recently_removed_files = self.get_removed_files_expel_times(region_id).await?; + let recently_removed_files = self.get_removed_files_expel_times(&manifest).await?; if recently_removed_files.is_empty() { // no files to remove, skip - info!("No recently removed files to gc for region {}", region_id); + debug!("No recently removed files to gc for region {}", region_id); } - info!( - "Found {} recently removed files sets for region {}", - recently_removed_files.len(), - region_id - ); + let removed_file_cnt = recently_removed_files + .values() + .map(|s| s.len()) + .sum::(); let concurrency = (current_files.len() / Self::CONCURRENCY_LIST_PER_FILES) .max(1) .min(self.opt.max_concurrent_lister_per_gc_job); - let in_used = current_files + let in_used: HashSet = current_files .keys() .cloned() .chain(tmp_ref_files.clone().into_iter()) .collect(); - let true_tmp_ref_files = tmp_ref_files - .iter() - .filter(|f| !current_files.contains_key(f)) - .collect::>(); - - info!("True tmp ref files: {:?}", true_tmp_ref_files); - let unused_files = self - .list_to_be_deleted_files(region_id, in_used, recently_removed_files, concurrency) + .list_to_be_deleted_files(region_id, &in_used, recently_removed_files, concurrency) .await?; - let unused_len = unused_files.len(); + let unused_file_cnt = unused_files.len(); - info!( - "Found {} unused files to delete for region {}", - unused_len, region_id + debug!( + "gc: for region {region_id}: In manifest files: {}, Tmp ref file cnt: {}, In-used files: {}, recently removed files: {}, Unused files to delete: {} ", + current_files.len(), + tmp_ref_files.len(), + in_used.len(), + removed_file_cnt, + unused_files.len() ); - self.delete_files(region_id, &unused_files).await?; + // TODO(discord9): for now, ignore async index file as it's design is not stable, need to be improved once + // index file design is stable + let file_pairs: Vec<(FileId, FileId)> = unused_files + .iter() + .map(|file_id| (*file_id, *file_id)) + .collect(); - info!( + debug!( + "Found {} unused index files to delete for region {}", + file_pairs.len(), + region_id + ); + + self.delete_files(region_id, &file_pairs).await?; + + debug!( "Successfully deleted {} unused files for region {}", - unused_len, region_id + unused_file_cnt, region_id ); + // TODO(discord9): update region 
manifest about deleted files + self.update_manifest_removed_files(®ion, unused_files.clone()) + .await?; Ok(unused_files) } - async fn delete_files(&self, region_id: RegionId, file_ids: &[FileId]) -> Result<()> { + async fn delete_files(&self, region_id: RegionId, file_ids: &[(FileId, FileId)]) -> Result<()> { delete_files( region_id, file_ids, @@ -329,40 +364,33 @@ impl LocalGcWorker { ) .await?; - GC_FILE_CNT.add(file_ids.len() as i64); + // FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now + GC_DELETE_FILE_CNT.add(file_ids.len() as i64); Ok(()) } - /// Get the manifest manager for the region. - async fn open_mgr_for(&self, region_id: RegionId) -> Result { - let table_dir = self.access_layer.table_dir(); - let path_type = self.access_layer.path_type(); - let mito_config = &self.manifest_open_config; + /// Update region manifest for clear the actually deleted files + async fn update_manifest_removed_files( + &self, + region: &MitoRegionRef, + deleted_files: Vec, + ) -> Result<()> { + let deleted_file_cnt = deleted_files.len(); + debug!( + "Trying to update manifest for {deleted_file_cnt} removed files for region {}", + region.region_id() + ); - let region_manifest_options = RegionManifestOptions { - manifest_dir: new_manifest_dir(®ion_dir_from_table_dir( - table_dir, region_id, path_type, - )), - object_store: self.access_layer.object_store().clone(), - compress_type: manifest_compress_type(mito_config.compress_manifest), - checkpoint_distance: mito_config.manifest_checkpoint_distance, - remove_file_options: RemoveFileOptions { - keep_count: mito_config.experimental_manifest_keep_removed_file_count, - keep_ttl: mito_config.experimental_manifest_keep_removed_file_ttl, - }, - }; + let mut manager = region.manifest_ctx.manifest_manager.write().await; + let cnt = deleted_files.len(); + manager.clear_deleted_files(deleted_files); + debug!( + "Updated region_id={} region manifest to clear {cnt} deleted files", + region.region_id(), + ); - RegionManifestManager::open( - region_manifest_options, - Default::default(), - Default::default(), - ) - .await? - .context(EmptyRegionDirSnafu { - region_id, - region_dir: ®ion_dir_from_table_dir(table_dir, region_id, path_type), - }) + Ok(()) } /// Get all the removed files in delta manifest files and their expel times. @@ -371,14 +399,8 @@ impl LocalGcWorker { /// pub async fn get_removed_files_expel_times( &self, - region_id: RegionId, + region_manifest: &Arc, ) -> Result>> { - let region_manifest = self - .manifest_mgrs - .get(®ion_id) - .context(RegionNotFoundSnafu { region_id })? 
- .manifest(); - let mut ret = BTreeMap::new(); for files in ®ion_manifest.removed_files.removed_files { let expel_time = Timestamp::new_millisecond(files.removed_at); @@ -491,7 +513,7 @@ impl LocalGcWorker { entries: Vec, in_use_filenames: &HashSet<&FileId>, may_linger_filenames: &HashSet<&FileId>, - all_files_appear_in_delta_manifests: &HashSet<&FileId>, + eligible_for_removal: &HashSet<&FileId>, unknown_file_may_linger_until: chrono::DateTime, ) -> (Vec, HashSet) { let mut all_unused_files_ready_for_delete = vec![]; @@ -515,7 +537,7 @@ impl LocalGcWorker { let should_delete = !in_use_filenames.contains(&file_id) && !may_linger_filenames.contains(&file_id) && { - if !all_files_appear_in_delta_manifests.contains(&file_id) { + if !eligible_for_removal.contains(&file_id) { // if the file's expel time is unknown(because not appear in delta manifest), we keep it for a while // using it's last modified time // notice unknown files use a different lingering time @@ -541,20 +563,31 @@ impl LocalGcWorker { /// Concurrently list unused files in the region dir /// because there may be a lot of files in the region dir /// and listing them may take a long time. + /// + /// When `full_file_listing` is false, this method will only delete files tracked in + /// `recently_removed_files` without performing expensive list operations, which significantly + /// improves performance. When `full_file_listing` is true, it performs a full listing to + /// find and delete orphan files. pub async fn list_to_be_deleted_files( &self, region_id: RegionId, - in_used: HashSet, + in_used: &HashSet, recently_removed_files: BTreeMap>, concurrency: usize, ) -> Result> { + let start = tokio::time::Instant::now(); let now = chrono::Utc::now(); - let may_linger_until = now - - chrono::Duration::from_std(self.opt.lingering_time).with_context(|_| { - DurationOutOfRangeSnafu { - input: self.opt.lingering_time, - } - })?; + let may_linger_until = self + .opt + .lingering_time + .map(|lingering_time| { + chrono::Duration::from_std(lingering_time) + .with_context(|_| DurationOutOfRangeSnafu { + input: lingering_time, + }) + .map(|t| now - t) + }) + .transpose()?; let unknown_file_may_linger_until = now - chrono::Duration::from_std(self.opt.unknown_file_lingering_time).with_context( @@ -564,12 +597,18 @@ impl LocalGcWorker { )?; // files that may linger, which means they are not in use but may still be kept for a while - let threshold = Timestamp::new_millisecond(may_linger_until.timestamp_millis()); + let threshold = + may_linger_until.map(|until| Timestamp::new_millisecond(until.timestamp_millis())); let mut recently_removed_files = recently_removed_files; - let may_linger_files = recently_removed_files.split_off(&threshold); + let may_linger_files = match threshold { + Some(threshold) => recently_removed_files.split_off(&threshold), + None => BTreeMap::new(), + }; + debug!("may_linger_files: {:?}", may_linger_files); + let may_linger_filenames = may_linger_files.values().flatten().collect::>(); - let all_files_appear_in_delta_manifests = recently_removed_files + let eligible_for_removal = recently_removed_files .values() .flatten() .collect::>(); @@ -577,23 +616,56 @@ impl LocalGcWorker { // in use filenames, include sst and index files let in_use_filenames = in_used.iter().collect::>(); + // When full_file_listing is false, skip expensive list operations and only delete + // files that are tracked in recently_removed_files + if !self.full_file_listing { + // Only delete files that: + // 1. 
Are in recently_removed_files (tracked in manifest) + // 2. Are not in use + // 3. Have passed the lingering time + let files_to_delete: Vec = eligible_for_removal + .iter() + .filter(|file_id| !in_use_filenames.contains(*file_id)) + .map(|&f| *f) + .collect(); + + info!( + "gc: fast mode (no full listing) cost {} secs for region {}, found {} files to delete from manifest", + start.elapsed().as_secs_f64(), + region_id, + files_to_delete.len() + ); + + return Ok(files_to_delete); + } + + // Full file listing mode: perform expensive list operations to find orphan files // Step 1: Create partitioned listers for concurrent processing let listers = self.partition_region_files(region_id, concurrency).await?; + let lister_cnt = listers.len(); // Step 2: Concurrently list all files in the region directory let all_entries = self.list_region_files_concurrent(listers).await?; + let cnt = all_entries.len(); + // Step 3: Filter files to determine which ones can be deleted let (all_unused_files_ready_for_delete, all_in_exist_linger_files) = self .filter_deletable_files( all_entries, &in_use_filenames, &may_linger_filenames, - &all_files_appear_in_delta_manifests, + &eligible_for_removal, unknown_file_may_linger_until, ); - info!("All in exist linger files: {:?}", all_in_exist_linger_files); + info!( + "gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}, found {} unused files to delete", + start.elapsed().as_secs_f64(), + region_id, + all_unused_files_ready_for_delete.len() + ); + debug!("All in exist linger files: {:?}", all_in_exist_linger_files); Ok(all_unused_files_ready_for_delete) } diff --git a/src/mito2/src/gc/worker_test.rs b/src/mito2/src/gc/worker_test.rs new file mode 100644 index 0000000000..6e3f5288c0 --- /dev/null +++ b/src/mito2/src/gc/worker_test.rs @@ -0,0 +1,401 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
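// Illustrative sketch (std-only, hypothetical helper; not part of this patch): the fast path
// above reduces to a set difference — only files already recorded in the manifest's
// removed-files entries and not referenced by the current manifest or by temporary readers
// are deleted, and no object-store listing is issued. `u64` stands in for FileId here.
use std::collections::HashSet;

fn fast_mode_candidates(eligible_for_removal: &HashSet<u64>, in_use: &HashSet<u64>) -> Vec<u64> {
    // Mirrors the filter in the fast branch: keep ids that are eligible and not in use.
    eligible_for_removal.difference(in_use).copied().collect()
}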
+ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::Arc; + +use api::v1::Rows; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionEngine as _; +use store_api::region_request::{RegionCompactRequest, RegionRequest}; +use store_api::storage::{FileRefsManifest, RegionId}; + +use crate::config::MitoConfig; +use crate::engine::MitoEngine; +use crate::engine::compaction_test::{delete_and_flush, put_and_flush}; +use crate::gc::{GcConfig, LocalGcWorker}; +use crate::region::MitoRegionRef; +use crate::test_util::{ + CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, rows_schema, +}; + +async fn create_gc_worker( + mito_engine: &MitoEngine, + regions: BTreeMap, + file_ref_manifest: &FileRefsManifest, + full_file_listing: bool, +) -> LocalGcWorker { + let access_layer = regions.first_key_value().unwrap().1.access_layer.clone(); + let cache_manager = mito_engine.cache_manager(); + + LocalGcWorker::try_new( + access_layer, + Some(cache_manager), + regions, + mito_engine.mito_config().gc.clone(), + file_ref_manifest.clone(), + &mito_engine.gc_limiter(), + full_file_listing, + ) + .await + .unwrap() +} + +/// Test insert/flush then truncate can allow gc worker to delete files +#[tokio::test] +async fn test_gc_worker_basic_truncate() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + flush_region(&engine, region_id, None).await; + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + + let to_be_deleted_file_id = *manifest.files.iter().next().unwrap().0; + + assert_eq!(manifest.files.len(), 1); + + engine + .handle_request( + region.region_id, + RegionRequest::Truncate(store_api::region_request::RegionTruncateRequest::All), + ) + .await + .unwrap(); + + let manifest = region.manifest_ctx.manifest().await; + assert!( + manifest.removed_files.removed_files[0] + .file_ids + .contains(&to_be_deleted_file_id) + && manifest.removed_files.removed_files[0].file_ids.len() == 1 + && manifest.files.is_empty(), + "Manifest after truncate: {:?}", + manifest + ); + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: Default::default(), + manifest_version: [(region_id, version)].into(), + }; + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + assert_eq!( + report.deleted_files.get(®ion_id).unwrap(), + 
&vec![to_be_deleted_file_id], + ); + assert!(report.need_retry_regions.is_empty()); + + let manifest = region.manifest_ctx.manifest().await; + assert!(manifest.removed_files.removed_files.is_empty() && manifest.files.is_empty()); +} + +/// Truncate with file refs should not delete files +#[tokio::test] +async fn test_gc_worker_truncate_with_ref() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + flush_region(&engine, region_id, None).await; + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + + assert_eq!(manifest.files.len(), 1); + + let to_be_deleted_file_id = *manifest.files.iter().next().unwrap().0; + + engine + .handle_request( + region.region_id, + RegionRequest::Truncate(store_api::region_request::RegionTruncateRequest::All), + ) + .await + .unwrap(); + + let manifest = region.manifest_ctx.manifest().await; + assert!( + manifest.removed_files.removed_files[0] + .file_ids + .contains(&to_be_deleted_file_id) + && manifest.removed_files.removed_files[0].file_ids.len() == 1 + && manifest.files.is_empty(), + "Manifest after truncate: {:?}", + manifest + ); + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: [(region_id, HashSet::from([to_be_deleted_file_id]))].into(), + manifest_version: [(region_id, version)].into(), + }; + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + assert!(report.deleted_files.get(®ion_id).unwrap().is_empty()); + assert!(report.need_retry_regions.is_empty()); + + let manifest = region.manifest_ctx.manifest().await; + assert!( + manifest.removed_files.removed_files[0].file_ids.len() == 1 && manifest.files.is_empty(), + "Manifest: {:?}", + manifest + ); +} + +/// Test insert/flush then compact can allow gc worker to delete files +#[tokio::test] +async fn test_gc_worker_basic_compact() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = 
RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + put_and_flush(&engine, region_id, &column_schemas, 0..10).await; + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + put_and_flush(&engine, region_id, &column_schemas, 20..30).await; + delete_and_flush(&engine, region_id, &column_schemas, 15..30).await; + put_and_flush(&engine, region_id, &column_schemas, 15..25).await; + + let result = engine + .handle_request( + region_id, + RegionRequest::Compact(RegionCompactRequest::default()), + ) + .await + .unwrap(); + assert_eq!(result.affected_rows, 0); + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + assert_eq!(manifest.removed_files.removed_files[0].file_ids.len(), 3); + + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: Default::default(), + manifest_version: [(region_id, version)].into(), + }; + + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + + assert_eq!(report.deleted_files.get(®ion_id).unwrap().len(), 3,); + assert!(report.need_retry_regions.is_empty()); +} + +/// Compact with file refs should not delete files +#[tokio::test] +async fn test_gc_worker_compact_with_ref() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + put_and_flush(&engine, region_id, &column_schemas, 0..10).await; + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + put_and_flush(&engine, region_id, &column_schemas, 20..30).await; + delete_and_flush(&engine, region_id, &column_schemas, 15..30).await; + put_and_flush(&engine, region_id, &column_schemas, 15..25).await; + + let result = engine + .handle_request( + region_id, + RegionRequest::Compact(RegionCompactRequest::default()), + ) + .await + .unwrap(); + assert_eq!(result.affected_rows, 0); + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + assert_eq!(manifest.removed_files.removed_files[0].file_ids.len(), 3); + + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: 
HashMap::from([( + region_id, + manifest.removed_files.removed_files[0] + .file_ids + .iter() + .cloned() + .collect(), + )]), + manifest_version: [(region_id, version)].into(), + }; + + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + + assert_eq!(report.deleted_files.get(®ion_id).unwrap().len(), 0); + assert!(report.need_retry_regions.is_empty()); +} diff --git a/src/mito2/src/lib.rs b/src/mito2/src/lib.rs index 45ce635148..a15711b34a 100644 --- a/src/mito2/src/lib.rs +++ b/src/mito2/src/lib.rs @@ -47,7 +47,7 @@ pub mod schedule; pub mod sst; mod time_provider; pub mod wal; -mod worker; +pub mod worker; #[cfg_attr(doc, aquamarine::aquamarine)] /// # Mito developer document diff --git a/src/mito2/src/manifest/action.rs b/src/mito2/src/manifest/action.rs index af09e6c861..dedb228e25 100644 --- a/src/mito2/src/manifest/action.rs +++ b/src/mito2/src/manifest/action.rs @@ -25,10 +25,9 @@ use store_api::metadata::RegionMetadataRef; use store_api::storage::{FileId, RegionId, SequenceNumber}; use strum::Display; -use crate::error::{ - DurationOutOfRangeSnafu, RegionMetadataNotFoundSnafu, Result, SerdeJsonSnafu, Utf8Snafu, -}; +use crate::error::{RegionMetadataNotFoundSnafu, Result, SerdeJsonSnafu, Utf8Snafu}; use crate::manifest::manager::RemoveFileOptions; +use crate::region::ManifestStats; use crate::sst::FormatType; use crate::sst::file::FileMeta; use crate::wal::EntryId; @@ -236,13 +235,13 @@ impl RegionManifestBuilder { self.flushed_entry_id = truncated_entry_id; self.flushed_sequence = truncated_sequence; self.truncated_entry_id = Some(truncated_entry_id); - self.files.clear(); self.removed_files.add_removed_files( self.files.values().map(|meta| meta.file_id).collect(), truncate .timestamp_ms .unwrap_or_else(|| Utc::now().timestamp_millis()), ); + self.files.clear(); } TruncateKind::Partial { files_to_remove } => { self.removed_files.add_removed_files( @@ -294,6 +293,29 @@ pub struct RemovedFilesRecord { pub removed_files: Vec, } +impl RemovedFilesRecord { + /// Clear the actually deleted files from the list of removed files + pub fn clear_deleted_files(&mut self, deleted_files: Vec) { + let deleted_file_set: HashSet<_> = HashSet::from_iter(deleted_files); + for files in self.removed_files.iter_mut() { + files.file_ids.retain(|fid| !deleted_file_set.contains(fid)); + } + + self.removed_files.retain(|fs| !fs.file_ids.is_empty()); + } + + pub fn update_file_removed_cnt_to_stats(&self, stats: &ManifestStats) { + let cnt = self + .removed_files + .iter() + .map(|r| r.file_ids.len() as u64) + .sum(); + stats + .file_removed_cnt + .store(cnt, std::sync::atomic::Ordering::Relaxed); + } +} + #[derive(Serialize, Deserialize, Clone, Debug, Default, PartialEq, Eq)] pub struct RemovedFiles { /// The timestamp is the time when @@ -306,6 +328,9 @@ pub struct RemovedFiles { impl RemovedFilesRecord { /// Add a record of removed files with the current timestamp. pub fn add_removed_files(&mut self, file_ids: HashSet, at: i64) { + if file_ids.is_empty() { + return; + } self.removed_files.push(RemovedFiles { removed_at: at, file_ids, @@ -313,35 +338,13 @@ impl RemovedFilesRecord { } pub fn evict_old_removed_files(&mut self, opt: &RemoveFileOptions) -> Result<()> { - let total_removed_files: usize = self.removed_files.iter().map(|s| s.file_ids.len()).sum(); - if opt.keep_count > 0 && total_removed_files <= opt.keep_count { + if !opt.enable_gc { + // If GC is not enabled, always keep removed files empty. 
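// Illustrative walk-through of the bookkeeping added above (assumed to live inside the mito2
// crate; `record` starts out empty; not part of this patch): adding an empty set is a no-op,
// and a removed-files entry whose every file id has been physically deleted is dropped entirely.
fn removed_files_bookkeeping_sketch(record: &mut RemovedFilesRecord) {
    use std::collections::HashSet;
    use store_api::storage::FileId;

    let (a, b, c) = (FileId::random(), FileId::random(), FileId::random());
    record.add_removed_files(HashSet::from([a, b]), 1_000);
    record.add_removed_files(HashSet::from([c]), 2_000);
    record.add_removed_files(HashSet::new(), 3_000); // ignored: empty sets are skipped
    record.clear_deleted_files(vec![a, b]);
    // The 1_000 entry became empty and was dropped; only the entry holding `c` remains.
    assert_eq!(record.removed_files.len(), 1);
}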
+ self.removed_files.clear(); return Ok(()); } - let mut cur_file_cnt = total_removed_files; - - let can_evict_until = chrono::Utc::now() - - chrono::Duration::from_std(opt.keep_ttl).context(DurationOutOfRangeSnafu { - input: opt.keep_ttl, - })?; - - self.removed_files.sort_unstable_by_key(|f| f.removed_at); - let updated = std::mem::take(&mut self.removed_files) - .into_iter() - .filter_map(|f| { - if f.removed_at < can_evict_until.timestamp_millis() - && (opt.keep_count == 0 || cur_file_cnt >= opt.keep_count) - { - // can evict all files - // TODO(discord9): maybe only evict to below keep_count? Maybe not, or the update might be too frequent. - cur_file_cnt -= f.file_ids.len(); - None - } else { - Some(f) - } - }) - .collect(); - self.removed_files = updated; + // if GC is enabled, rely on gc worker to delete files, and evict removed files based on options. Ok(()) } diff --git a/src/mito2/src/manifest/checkpointer.rs b/src/mito2/src/manifest/checkpointer.rs index 3f3164ad93..1da03dda21 100644 --- a/src/mito2/src/manifest/checkpointer.rs +++ b/src/mito2/src/manifest/checkpointer.rs @@ -25,7 +25,6 @@ use crate::manifest::action::{RegionCheckpoint, RegionManifest}; use crate::manifest::manager::RegionManifestOptions; use crate::manifest::storage::ManifestObjectStore; use crate::metrics::MANIFEST_OP_ELAPSED; -use crate::region::{RegionLeaderState, RegionRoleState}; /// [`Checkpointer`] is responsible for doing checkpoint for a region, in an asynchronous way. #[derive(Debug)] @@ -129,26 +128,15 @@ impl Checkpointer { manifest.removed_files.evict_old_removed_files(opt)?; + // TODO(discord9): consider also check object store to clear removed files that are already deleted? How costly it is? + Ok(manifest) } /// Check if it's needed to do checkpoint for the region by the checkpoint distance. /// If needed, and there's no currently running checkpoint task, it will start a new checkpoint /// task running in the background. 
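// Minimal behavior sketch of the rewritten evict_old_removed_files (illustrative, not patch
// code): with GC disabled the pending removed-files records are dropped as soon as the
// manifest is persisted; with GC enabled they are kept and only trimmed by the GC worker via
// clear_deleted_files after the files are actually deleted.
fn evict_behavior_sketch(record: &mut RemovedFilesRecord) -> Result<()> {
    record.evict_old_removed_files(&RemoveFileOptions { enable_gc: false })?;
    assert!(record.removed_files.is_empty()); // cleared immediately when GC is off
    record.evict_old_removed_files(&RemoveFileOptions { enable_gc: true })?; // no-op: GC owns cleanup
    Ok(())
}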
- pub(crate) fn maybe_do_checkpoint( - &self, - manifest: &RegionManifest, - region_state: RegionRoleState, - ) { - // Skip checkpoint if region is in staging state - if region_state == RegionRoleState::Leader(RegionLeaderState::Staging) { - info!( - "Skipping checkpoint for region {} in staging mode, manifest version: {}", - manifest.metadata.region_id, manifest.manifest_version - ); - return; - } - + pub(crate) fn maybe_do_checkpoint(&self, manifest: &RegionManifest) { if self.manifest_options.checkpoint_distance == 0 { return; } diff --git a/src/mito2/src/manifest/manager.rs b/src/mito2/src/manifest/manager.rs index b65d9c840d..81e69d1539 100644 --- a/src/mito2/src/manifest/manager.rs +++ b/src/mito2/src/manifest/manager.rs @@ -21,8 +21,10 @@ use futures::TryStreamExt; use object_store::ObjectStore; use snafu::{OptionExt, ResultExt, ensure}; use store_api::metadata::RegionMetadataRef; +use store_api::storage::FileId; use store_api::{MAX_VERSION, MIN_VERSION, ManifestVersion}; +use crate::config::MitoConfig; use crate::error::{ self, InstallManifestToSnafu, NoCheckpointSnafu, NoManifestsSnafu, RegionStoppedSnafu, Result, }; @@ -32,10 +34,11 @@ use crate::manifest::action::{ }; use crate::manifest::checkpointer::Checkpointer; use crate::manifest::storage::{ - ManifestObjectStore, file_version, is_checkpoint_file, is_delta_file, + ManifestObjectStore, file_version, is_checkpoint_file, is_delta_file, manifest_compress_type, + manifest_dir, }; use crate::metrics::MANIFEST_OP_ELAPSED; -use crate::region::{RegionLeaderState, RegionRoleState}; +use crate::region::{ManifestStats, RegionLeaderState, RegionRoleState}; use crate::sst::FormatType; /// Options for [RegionManifestManager]. @@ -51,27 +54,31 @@ pub struct RegionManifestOptions { pub remove_file_options: RemoveFileOptions, } -/// Options for updating `removed_files` field in [RegionManifest]. -#[derive(Debug, Clone)] -pub struct RemoveFileOptions { - /// Number of removed files to keep in manifest's `removed_files` field before also - /// remove them from `removed_files`. Only remove files when both `keep_count` and `keep_duration` is reached. - pub keep_count: usize, - /// Duration to keep removed files in manifest's `removed_files` field before also - /// remove them from `removed_files`. Only remove files when both `keep_count` and `keep_duration` is reached. - pub keep_ttl: std::time::Duration, -} - -#[cfg(any(test, feature = "test"))] -impl Default for RemoveFileOptions { - fn default() -> Self { - Self { - keep_count: 256, - keep_ttl: std::time::Duration::from_secs(3600), +impl RegionManifestOptions { + /// Creates a new [RegionManifestOptions] with the given region directory, object store, and configuration. + pub fn new(config: &MitoConfig, region_dir: &str, object_store: &ObjectStore) -> Self { + RegionManifestOptions { + manifest_dir: manifest_dir(region_dir), + object_store: object_store.clone(), + // We don't allow users to set the compression algorithm as we use it as a file suffix. + // Currently, the manifest storage doesn't have good support for changing compression algorithms. + compress_type: manifest_compress_type(config.compress_manifest), + checkpoint_distance: config.manifest_checkpoint_distance, + remove_file_options: RemoveFileOptions { + enable_gc: config.gc.enable, + }, } } } +/// Options for updating `removed_files` field in [RegionManifest]. +#[derive(Debug, Clone)] +#[cfg_attr(any(test, feature = "test"), derive(Default))] +pub struct RemoveFileOptions { + /// Whether GC is enabled. 
If not, the removed files should always be empty when persisting manifest. + pub enable_gc: bool, +} + // rewrite note: // trait Checkpoint -> struct RegionCheckpoint // trait MetaAction -> struct RegionMetaActionList @@ -144,6 +151,11 @@ pub struct RegionManifestManager { last_version: Arc, checkpointer: Checkpointer, manifest: Arc, + // Staging manifest is used to store the manifest of the staging region before it becomes available. + // It is initially inherited from the previous manifest(i.e., `self.manifest`). + // When the staging manifest becomes available, it will be used to construct the new manifest. + staging_manifest: Option>, + stats: ManifestStats, stopped: bool, } @@ -153,17 +165,17 @@ impl RegionManifestManager { metadata: RegionMetadataRef, flushed_entry_id: u64, options: RegionManifestOptions, - total_manifest_size: Arc, - manifest_version: Arc, sst_format: FormatType, + stats: &ManifestStats, ) -> Result { // construct storage let mut store = ManifestObjectStore::new( &options.manifest_dir, options.object_store.clone(), options.compress_type, - total_manifest_size, + stats.total_manifest_size.clone(), ); + let manifest_version = stats.manifest_version.clone(); info!( "Creating region manifest in {} with metadata {:?}, flushed_entry_id: {}", @@ -213,11 +225,16 @@ impl RegionManifestManager { let checkpointer = Checkpointer::new(region_id, options, store.clone(), MIN_VERSION); manifest_version.store(version, Ordering::Relaxed); + manifest + .removed_files + .update_file_removed_cnt_to_stats(stats); Ok(Self { store, last_version: manifest_version, checkpointer, manifest: Arc::new(manifest), + staging_manifest: None, + stats: stats.clone(), stopped: false, }) } @@ -227,8 +244,7 @@ impl RegionManifestManager { /// Returns `Ok(None)` if no such manifest. pub async fn open( options: RegionManifestOptions, - total_manifest_size: Arc, - manifest_version: Arc, + stats: &ManifestStats, ) -> Result> { let _t = MANIFEST_OP_ELAPSED .with_label_values(&["open"]) @@ -239,8 +255,9 @@ impl RegionManifestManager { &options.manifest_dir, options.object_store.clone(), options.compress_type, - total_manifest_size, + stats.total_manifest_size.clone(), ); + let manifest_version = stats.manifest_version.clone(); // recover from storage // construct manifest builder @@ -314,11 +331,17 @@ impl RegionManifestManager { last_checkpoint_version, ); manifest_version.store(version, Ordering::Relaxed); + manifest + .removed_files + .update_file_removed_cnt_to_stats(stats); Ok(Some(Self { store, last_version: manifest_version, checkpointer, manifest: Arc::new(manifest), + // TODO(weny): open the staging manifest if exists. 
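// Hypothetical caller sketch for the constructor above (not from this patch): opening a region
// manifest now derives RemoveFileOptions { enable_gc } from MitoConfig via
// RegionManifestOptions::new instead of assembling the options by hand; the surrounding
// function and its parameters are assumptions for illustration only.
async fn open_manifest_sketch(
    config: &MitoConfig,
    region_dir: &str,
    object_store: &ObjectStore,
    stats: &ManifestStats,
) -> Result<Option<RegionManifestManager>> {
    let options = RegionManifestOptions::new(config, region_dir, object_store);
    // Returns Ok(None) when the region directory holds no manifest yet.
    RegionManifestManager::open(options, stats).await
}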
+ staging_manifest: None, + stats: stats.clone(), stopped: false, })) } @@ -442,6 +465,9 @@ impl RegionManifestManager { ); let version = self.last_version(); + new_manifest + .removed_files + .update_file_removed_cnt_to_stats(&self.stats); self.manifest = Arc::new(new_manifest); let last_version = self.set_version(self.manifest.manifest_version); info!( @@ -469,6 +495,9 @@ impl RegionManifestManager { let builder = RegionManifestBuilder::with_checkpoint(checkpoint.checkpoint); let manifest = builder.try_build()?; let last_version = self.set_version(manifest.manifest_version); + manifest + .removed_files + .update_file_removed_cnt_to_stats(&self.stats); self.manifest = Arc::new(manifest); info!( "Installed region manifest from checkpoint: {}, region: {}", @@ -482,7 +511,7 @@ impl RegionManifestManager { pub async fn update( &mut self, action_list: RegionMetaActionList, - region_state: RegionRoleState, + is_staging: bool, ) -> Result { let _t = MANIFEST_OP_ELAPSED .with_label_values(&["update"]) @@ -496,13 +525,19 @@ impl RegionManifestManager { ); let version = self.increase_version(); - let is_staging = region_state == RegionRoleState::Leader(RegionLeaderState::Staging); self.store .save(version, &action_list.encode()?, is_staging) .await?; + // For a staging region, the manifest is initially inherited from the previous manifest(i.e., `self.manifest`). + // When the staging manifest becomes available, it will be used to construct the new manifest. let mut manifest_builder = - RegionManifestBuilder::with_checkpoint(Some(self.manifest.as_ref().clone())); + if is_staging && let Some(staging_manifest) = self.staging_manifest.as_ref() { + RegionManifestBuilder::with_checkpoint(Some(staging_manifest.as_ref().clone())) + } else { + RegionManifestBuilder::with_checkpoint(Some(self.manifest.as_ref().clone())) + }; + for action in action_list.actions { match action { RegionMetaAction::Change(action) => { @@ -522,23 +557,52 @@ impl RegionManifestManager { } } } - let new_manifest = manifest_builder.try_build()?; - let updated_manifest = self - .checkpointer - .update_manifest_removed_files(new_manifest)?; - self.manifest = Arc::new(updated_manifest); - self.checkpointer - .maybe_do_checkpoint(self.manifest.as_ref(), region_state); + if is_staging { + let new_manifest = manifest_builder.try_build()?; + self.staging_manifest = Some(Arc::new(new_manifest)); + + info!( + "Skipping checkpoint for region {} in staging mode, manifest version: {}", + self.manifest.metadata.region_id, self.manifest.manifest_version + ); + } else { + let new_manifest = manifest_builder.try_build()?; + new_manifest + .removed_files + .update_file_removed_cnt_to_stats(&self.stats); + let updated_manifest = self + .checkpointer + .update_manifest_removed_files(new_manifest)?; + self.manifest = Arc::new(updated_manifest); + self.checkpointer + .maybe_do_checkpoint(self.manifest.as_ref()); + } Ok(version) } + /// Clear deleted files from manifest's `removed_files` field without update version. Notice if datanode exit before checkpoint then new manifest by open region may still contain these deleted files, which is acceptable for gc process. + pub fn clear_deleted_files(&mut self, deleted_files: Vec) { + let mut manifest = (*self.manifest()).clone(); + manifest.removed_files.clear_deleted_files(deleted_files); + self.set_manifest(Arc::new(manifest)); + } + + pub(crate) fn set_manifest(&mut self, manifest: Arc) { + self.manifest = manifest; + } + /// Retrieves the current [RegionManifest]. 
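// Sketch of the staging flow introduced above (crate-internal types from this patch; the
// surrounding function is illustrative only): while a region is staging, updates accumulate
// in `staging_manifest` and checkpoints are skipped; non-staging updates keep building on the
// committed manifest as before.
async fn staging_update_sketch(
    manager: &mut RegionManifestManager,
    staged: RegionMetaActionList,
    committed: RegionMetaActionList,
) -> Result<()> {
    let _v = manager.update(staged, /* is_staging */ true).await?;
    assert!(manager.staging_manifest().is_some()); // pending state is visible separately
    let _v = manager.update(committed, /* is_staging */ false).await?; // may trigger a checkpoint
    Ok(())
}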
pub fn manifest(&self) -> Arc { self.manifest.clone() } + /// Retrieves the current [RegionManifest]. + pub fn staging_manifest(&self) -> Option> { + self.staging_manifest.clone() + } + /// Returns total manifest size. pub fn manifest_usage(&self) -> u64 { self.store.total_manifest_size() @@ -675,6 +739,22 @@ impl RegionManifestManager { Ok(Some(RegionMetaActionList::new(merged_actions))) } + + /// Unsets the staging manifest. + pub(crate) fn unset_staging_manifest(&mut self) { + self.staging_manifest = None; + } + + /// Clear all staging manifests. + pub(crate) async fn clear_staging_manifest_and_dir(&mut self) -> Result<()> { + self.staging_manifest = None; + self.store.clear_staging_manifests().await?; + info!( + "Cleared all staging manifests for region {}", + self.manifest.metadata.region_id + ); + Ok(()) + } } #[cfg(test)] @@ -801,13 +881,7 @@ mod test { sst_format: FormatType::PrimaryKey, })); - let current_version = manager - .update( - action_list, - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + let current_version = manager.update(action_list, false).await.unwrap(); assert_eq!(current_version, 1); manager.validate_manifest(&new_metadata, 1); @@ -870,13 +944,7 @@ mod test { sst_format: FormatType::PrimaryKey, })); - let current_version = manager - .update( - action_list, - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + let current_version = manager.update(action_list, false).await.unwrap(); assert_eq!(current_version, 1); manager.validate_manifest(&new_metadata, 1); @@ -897,7 +965,7 @@ mod test { flushed_sequence: None, committed_sequence: None, })]), - RegionRoleState::Leader(RegionLeaderState::Writable), + false, ) .await .unwrap(); @@ -923,6 +991,6 @@ mod test { // get manifest size again let manifest_size = manager.manifest_usage(); - assert_eq!(manifest_size, 1764); + assert_eq!(manifest_size, 1378); } } diff --git a/src/mito2/src/manifest/storage.rs b/src/mito2/src/manifest/storage.rs index 48a30af16e..78ae188224 100644 --- a/src/mito2/src/manifest/storage.rs +++ b/src/mito2/src/manifest/storage.rs @@ -24,6 +24,7 @@ use crc32fast::Hasher; use futures::TryStreamExt; use futures::future::try_join_all; use lazy_static::lazy_static; +use object_store::util::join_dir; use object_store::{Entry, ErrorKind, Lister, ObjectStore, util}; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -49,6 +50,11 @@ const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed; const FETCH_MANIFEST_PARALLELISM: usize = 16; +/// Returns the directory to the manifest files. +pub fn manifest_dir(region_dir: &str) -> String { + join_dir(region_dir, "manifest") +} + /// Returns the [CompressionType] according to whether to compress manifest files. 
pub const fn manifest_compress_type(compress: bool) -> CompressionType { if compress { diff --git a/src/mito2/src/manifest/tests/checkpoint.rs b/src/mito2/src/manifest/tests/checkpoint.rs index e10d3aad46..da063fe242 100644 --- a/src/mito2/src/manifest/tests/checkpoint.rs +++ b/src/mito2/src/manifest/tests/checkpoint.rs @@ -27,7 +27,6 @@ use crate::manifest::action::{ use crate::manifest::manager::RegionManifestManager; use crate::manifest::storage::CheckpointMetadata; use crate::manifest::tests::utils::basic_region_metadata; -use crate::region::{RegionLeaderState, RegionRoleState}; use crate::sst::file::FileMeta; use crate::test_util::TestEnv; @@ -87,13 +86,7 @@ async fn manager_without_checkpoint() { // apply 10 actions for _ in 0..10 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); } // no checkpoint @@ -138,13 +131,7 @@ async fn manager_with_checkpoint_distance_1() { // apply 10 actions for _ in 0..10 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); while manager.checkpointer().is_doing_checkpoint() { tokio::time::sleep(Duration::from_millis(10)).await; @@ -205,13 +192,7 @@ async fn test_corrupted_data_causing_checksum_error() { // Apply actions for _ in 0..10 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); } // Wait for the checkpoint to finish. @@ -264,11 +245,14 @@ async fn checkpoint_with_different_compression_types() { level: 0, file_size: 1024000, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, }; let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit { files_to_add: vec![file_meta], @@ -299,10 +283,7 @@ async fn generate_checkpoint_with_compression_types( let (_env, mut manager) = build_manager(1, compress_type).await; for action in actions { - manager - .update(action, RegionRoleState::Leader(RegionLeaderState::Writable)) - .await - .unwrap(); + manager.update(action, false).await.unwrap(); while manager.checkpointer().is_doing_checkpoint() { tokio::time::sleep(Duration::from_millis(10)).await; @@ -329,11 +310,14 @@ fn generate_action_lists(num: usize) -> (Vec, Vec) level: 0, file_size: 1024000, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, }; let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit { files_to_add: vec![file_meta], @@ -355,10 +339,7 @@ async fn manifest_install_manifest_to() { let (env, mut manager) = build_manager(0, CompressionType::Uncompressed).await; let (files, actions) = generate_action_lists(10); for action in actions { - manager - .update(action, RegionRoleState::Leader(RegionLeaderState::Writable)) - .await - .unwrap(); + manager.update(action, false).await.unwrap(); } // Nothing to install @@ -396,10 +377,7 @@ async fn manifest_install_manifest_to_with_checkpoint() { let (env, mut manager) = build_manager(3, CompressionType::Uncompressed).await; let (files, actions) = generate_action_lists(10); for action in actions { - manager - 
.update(action, RegionRoleState::Leader(RegionLeaderState::Writable)) - .await - .unwrap(); + manager.update(action, false).await.unwrap(); while manager.checkpointer().is_doing_checkpoint() { tokio::time::sleep(Duration::from_millis(10)).await; @@ -471,13 +449,7 @@ async fn test_checkpoint_bypass_in_staging_mode() { // Apply actions in staging mode - checkpoint should be bypassed for _ in 0..15 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Staging), - ) - .await - .unwrap(); + manager.update(nop_action(), true).await.unwrap(); } assert!(!manager.checkpointer().is_doing_checkpoint()); @@ -492,13 +464,7 @@ async fn test_checkpoint_bypass_in_staging_mode() { ); // Now switch to normal mode and apply one more action - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); // Wait for potential checkpoint while manager.checkpointer().is_doing_checkpoint() { diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index b4461e8b06..ea3875ac7a 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -44,6 +44,7 @@ use crate::region::options::{MemtableOptions, MergeMode, RegionOptions}; use crate::sst::FormatType; use crate::sst::file::FileTimeRange; use crate::sst::parquet::SstInfo; +use crate::sst::parquet::file_range::PreFilterMode; mod builder; pub mod bulk; @@ -73,6 +74,63 @@ pub enum MemtableConfig { TimeSeries, } +/// Options for querying ranges from a memtable. +#[derive(Clone)] +pub struct RangesOptions { + /// Whether the ranges are being queried for flush. + pub for_flush: bool, + /// Mode to pre-filter columns in ranges. + pub pre_filter_mode: PreFilterMode, + /// Predicate to filter the data. + pub predicate: PredicateGroup, + /// Sequence range to filter the data. + pub sequence: Option, +} + +impl Default for RangesOptions { + fn default() -> Self { + Self { + for_flush: false, + pre_filter_mode: PreFilterMode::All, + predicate: PredicateGroup::default(), + sequence: None, + } + } +} + +impl RangesOptions { + /// Creates a new [RangesOptions] for flushing. + pub fn for_flush() -> Self { + Self { + for_flush: true, + pre_filter_mode: PreFilterMode::All, + predicate: PredicateGroup::default(), + sequence: None, + } + } + + /// Sets the pre-filter mode. + #[must_use] + pub fn with_pre_filter_mode(mut self, pre_filter_mode: PreFilterMode) -> Self { + self.pre_filter_mode = pre_filter_mode; + self + } + + /// Sets the predicate. + #[must_use] + pub fn with_predicate(mut self, predicate: PredicateGroup) -> Self { + self.predicate = predicate; + self + } + + /// Sets the sequence range. + #[must_use] + pub fn with_sequence(mut self, sequence: Option) -> Self { + self.sequence = sequence; + self + } +} + #[derive(Debug, Default, Clone)] pub struct MemtableStats { /// The estimated bytes allocated by this memtable from heap. @@ -191,14 +249,11 @@ pub trait Memtable: Send + Sync + fmt::Debug { /// Returns the ranges in the memtable. /// - /// The `for_flush` flag is true if the flush job calls this method for flush. /// The returned map contains the range id and the range after applying the predicate. fn ranges( &self, projection: Option<&[ColumnId]>, - predicate: PredicateGroup, - sequence: Option, - for_flush: bool, + options: RangesOptions, ) -> Result; /// Returns true if the memtable is empty. @@ -367,6 +422,7 @@ impl MemtableBuilderProvider { ); } + // The format is not flat. 
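// Usage sketch for the RangesOptions builder above (crate-internal types from this patch; the
// helper itself is hypothetical): callers now bundle the predicate, sequence filter, and
// pre-filter mode into one value instead of passing separate arguments to Memtable::ranges.
fn scan_ranges_sketch(
    memtable: &MemtableRef,
    predicate: PredicateGroup,
    sequence: Option<SequenceRange>,
) -> Result<MemtableRanges> {
    let options = RangesOptions::default()
        .with_pre_filter_mode(PreFilterMode::All)
        .with_predicate(predicate)
        .with_sequence(sequence);
    memtable.ranges(None, options)
}
// Flush keeps a dedicated constructor so its intent stays explicit: RangesOptions::for_flush().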
match &options.memtable { Some(MemtableOptions::TimeSeries) => Arc::new(TimeSeriesMemtableBuilder::new( self.write_buffer_manager.clone(), @@ -385,22 +441,15 @@ impl MemtableBuilderProvider { self.write_buffer_manager.clone(), )) } - None => self.default_memtable_builder(dedup, merge_mode), + None => self.default_primary_key_memtable_builder(dedup, merge_mode), } } - fn default_memtable_builder(&self, dedup: bool, merge_mode: MergeMode) -> MemtableBuilderRef { - if self.config.default_experimental_flat_format { - return Arc::new( - BulkMemtableBuilder::new( - self.write_buffer_manager.clone(), - !dedup, // append_mode: true if not dedup, false if dedup - merge_mode, - ) - .with_compact_dispatcher(self.compact_dispatcher.clone()), - ); - } - + fn default_primary_key_memtable_builder( + &self, + dedup: bool, + merge_mode: MergeMode, + ) -> MemtableBuilderRef { match &self.config.memtable { MemtableConfig::PartitionTree(config) => { let mut config = config.clone(); diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index d67e9f1424..2c26410ca6 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -36,13 +36,15 @@ use tokio::sync::Semaphore; use crate::error::{Result, UnsupportedOperationSnafu}; use crate::flush::WriteBufferManagerRef; use crate::memtable::bulk::context::BulkIterContext; -use crate::memtable::bulk::part::{BulkPart, BulkPartEncodeMetrics, BulkPartEncoder}; +use crate::memtable::bulk::part::{ + BulkPart, BulkPartEncodeMetrics, BulkPartEncoder, UnorderedPart, +}; use crate::memtable::bulk::part_reader::BulkPartRecordBatchIter; use crate::memtable::stats::WriteMetrics; use crate::memtable::{ AllocTracker, BoxedBatchIterator, BoxedRecordBatchIterator, EncodedBulkPart, EncodedRange, IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, - MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, PredicateGroup, + MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions, }; use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeIterator; @@ -54,6 +56,8 @@ use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; /// All parts in a bulk memtable. #[derive(Default)] struct BulkParts { + /// Unordered small parts (< 1024 rows). + unordered_part: UnorderedPart, /// Raw parts. parts: Vec, /// Parts encoded as parquets. @@ -61,14 +65,15 @@ struct BulkParts { } impl BulkParts { - /// Total number of parts (raw + encoded). + /// Total number of parts (raw + encoded + unordered). fn num_parts(&self) -> usize { - self.parts.len() + self.encoded_parts.len() + let unordered_count = if self.unordered_part.is_empty() { 0 } else { 1 }; + self.parts.len() + self.encoded_parts.len() + unordered_count } /// Returns true if there is no part. fn is_empty(&self) -> bool { - self.parts.is_empty() && self.encoded_parts.is_empty() + self.unordered_part.is_empty() && self.parts.is_empty() && self.encoded_parts.is_empty() } /// Returns true if the bulk parts should be merged. @@ -89,6 +94,11 @@ impl BulkParts { unmerged_count >= 8 } + /// Returns true if the unordered_part should be compacted into a BulkPart. + fn should_compact_unordered_part(&self) -> bool { + self.unordered_part.should_compact() + } + /// Collects unmerged parts and marks them as being merged. /// Returns the collected parts to merge. 
fn collect_bulk_parts_to_merge(&mut self) -> Vec { @@ -243,7 +253,6 @@ pub struct BulkMemtable { max_sequence: AtomicU64, num_rows: AtomicUsize, /// Cached flat SST arrow schema for memtable compaction. - #[allow(dead_code)] flat_arrow_schema: SchemaRef, /// Compactor for merging bulk parts compactor: Arc>, @@ -298,11 +307,29 @@ impl Memtable for BulkMemtable { { let mut bulk_parts = self.parts.write().unwrap(); - bulk_parts.parts.push(BulkPartWrapper { - part: fragment, - file_id: FileId::random(), - merging: false, - }); + + // Routes small parts to unordered_part based on threshold + if bulk_parts.unordered_part.should_accept(fragment.num_rows()) { + bulk_parts.unordered_part.push(fragment); + + // Compacts unordered_part if threshold is reached + if bulk_parts.should_compact_unordered_part() + && let Some(bulk_part) = bulk_parts.unordered_part.to_bulk_part()? + { + bulk_parts.parts.push(BulkPartWrapper { + part: bulk_part, + file_id: FileId::random(), + merging: false, + }); + bulk_parts.unordered_part.clear(); + } + } else { + bulk_parts.parts.push(BulkPartWrapper { + part: fragment, + file_id: FileId::random(), + merging: false, + }); + } // Since this operation should be fast, we do it in parts lock scope. // This ensure the statistics in `ranges()` are correct. What's more, @@ -331,25 +358,47 @@ impl Memtable for BulkMemtable { fn ranges( &self, projection: Option<&[ColumnId]>, - predicate: PredicateGroup, - sequence: Option, - for_flush: bool, + options: RangesOptions, ) -> Result { + let predicate = options.predicate; + let sequence = options.sequence; let mut ranges = BTreeMap::new(); let mut range_id = 0; // TODO(yingwen): Filter ranges by sequence. - let context = Arc::new(BulkIterContext::new( + let context = Arc::new(BulkIterContext::new_with_pre_filter_mode( self.metadata.clone(), projection, predicate.predicate().cloned(), - for_flush, + options.for_flush, + options.pre_filter_mode, )?); // Adds ranges for regular parts and encoded parts { let bulk_parts = self.parts.read().unwrap(); + // Adds range for unordered part if not empty + if !bulk_parts.unordered_part.is_empty() + && let Some(unordered_bulk_part) = bulk_parts.unordered_part.to_bulk_part()? + { + let num_rows = unordered_bulk_part.num_rows(); + let range = MemtableRange::new( + Arc::new(MemtableRangeContext::new( + self.id, + Box::new(BulkRangeIterBuilder { + part: unordered_bulk_part, + context: context.clone(), + sequence, + }), + predicate.clone(), + )), + num_rows, + ); + ranges.insert(range_id, range); + range_id += 1; + } + // Adds ranges for regular parts for part_wrapper in bulk_parts.parts.iter() { // Skips empty parts @@ -544,6 +593,26 @@ impl BulkMemtable { } } + /// Sets the unordered part threshold (for testing). + #[cfg(test)] + pub fn set_unordered_part_threshold(&self, threshold: usize) { + self.parts + .write() + .unwrap() + .unordered_part + .set_threshold(threshold); + } + + /// Sets the unordered part compact threshold (for testing). + #[cfg(test)] + pub fn set_unordered_part_compact_threshold(&self, compact_threshold: usize) { + self.parts + .write() + .unwrap() + .unordered_part + .set_compact_threshold(compact_threshold); + } + /// Updates memtable stats. /// /// Please update this inside the write lock scope. 
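// Minimal std-only model (hypothetical names, not patch code) of the routing rule write_bulk
// applies above: fragments smaller than `threshold` rows are buffered in the unordered part
// and merged into one regular part once the buffer reaches `compact_threshold` rows; larger
// fragments become regular parts directly, and a threshold of 0 disables the buffer.
struct SmallPartBuffer {
    buffered_rows: usize,
    threshold: usize,
    compact_threshold: usize,
}

impl SmallPartBuffer {
    /// Whether a fragment of this size should go into the unordered buffer.
    fn accepts(&self, fragment_rows: usize) -> bool {
        self.threshold > 0 && fragment_rows < self.threshold
    }

    /// Buffers a small fragment; returns true when the buffered rows should be compacted
    /// into a single sorted part and the buffer cleared.
    fn push(&mut self, fragment_rows: usize) -> bool {
        self.buffered_rows += fragment_rows;
        self.buffered_rows >= self.compact_threshold
    }
}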
@@ -619,12 +688,15 @@ impl IterBuilder for BulkRangeIterBuilder { fn build_record_batch( &self, - _metrics: Option, + metrics: Option, ) -> Result { + let series_count = self.part.estimated_series_count(); let iter = BulkPartRecordBatchIter::new( self.part.batch.clone(), self.context.clone(), self.sequence, + series_count, + metrics, ); Ok(Box::new(iter)) @@ -637,7 +709,6 @@ impl IterBuilder for BulkRangeIterBuilder { /// Iterator builder for encoded bulk range struct EncodedBulkRangeIterBuilder { - #[allow(dead_code)] file_id: FileId, part: EncodedBulkPart, context: Arc, @@ -658,9 +729,12 @@ impl IterBuilder for EncodedBulkRangeIterBuilder { fn build_record_batch( &self, - _metrics: Option, + metrics: Option, ) -> Result { - if let Some(iter) = self.part.read(self.context.clone(), self.sequence)? { + if let Some(iter) = self + .part + .read(self.context.clone(), self.sequence, metrics)? + { Ok(iter) } else { // Return an empty iterator if no data to read @@ -679,7 +753,6 @@ impl IterBuilder for EncodedBulkRangeIterBuilder { struct BulkPartWrapper { part: BulkPart, /// The unique file id for this part in memtable. - #[allow(dead_code)] file_id: FileId, /// Whether this part is currently being merged. merging: bool, @@ -688,7 +761,6 @@ struct BulkPartWrapper { struct EncodedPartWrapper { part: EncodedBulkPart, /// The unique file id for this part in memtable. - #[allow(dead_code)] file_id: FileId, /// Whether this part is currently being merged. merging: bool, @@ -746,12 +818,17 @@ impl PartToMerge { ) -> Result> { match self { PartToMerge::Bulk { part, .. } => { + let series_count = part.estimated_series_count(); let iter = BulkPartRecordBatchIter::new( - part.batch, context, None, // No sequence filter for merging + part.batch, + context, + None, // No sequence filter for merging + series_count, + None, // No metrics for merging ); Ok(Some(Box::new(iter) as BoxedRecordBatchIterator)) } - PartToMerge::Encoded { part, .. } => part.read(context, None), + PartToMerge::Encoded { part, .. 
} => part.read(context, None, None), } } } @@ -1156,6 +1233,8 @@ mod tests { let metadata = metadata_for_test(); let memtable = BulkMemtable::new(999, metadata.clone(), None, None, false, MergeMode::LastRow); + // Disable unordered_part for this test + memtable.set_unordered_part_threshold(0); let test_data = [ ( @@ -1192,7 +1271,12 @@ mod tests { assert_eq!(3000, max_ts.value()); let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); - let ranges = memtable.ranges(None, predicate_group, None, false).unwrap(); + let ranges = memtable + .ranges( + None, + RangesOptions::default().with_predicate(predicate_group), + ) + .unwrap(); assert_eq!(3, ranges.ranges.len()); assert_eq!(5, ranges.stats.num_rows); @@ -1234,7 +1318,10 @@ mod tests { let projection = vec![4u32]; let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); let ranges = memtable - .ranges(Some(&projection), predicate_group, None, false) + .ranges( + Some(&projection), + RangesOptions::default().with_predicate(predicate_group), + ) .unwrap(); assert_eq!(1, ranges.ranges.len()); @@ -1325,6 +1412,8 @@ mod tests { let metadata = metadata_for_test(); let memtable = BulkMemtable::new(777, metadata.clone(), None, None, false, MergeMode::LastRow); + // Disable unordered_part for this test + memtable.set_unordered_part_threshold(0); let parts_data = vec![ ( @@ -1350,7 +1439,12 @@ mod tests { } let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); - let ranges = memtable.ranges(None, predicate_group, None, false).unwrap(); + let ranges = memtable + .ranges( + None, + RangesOptions::default().with_predicate(predicate_group), + ) + .unwrap(); assert_eq!(3, ranges.ranges.len()); assert_eq!(5, ranges.stats.num_rows); @@ -1383,7 +1477,12 @@ mod tests { let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); let sequence_filter = Some(SequenceRange::LtEq { max: 400 }); // Filters out rows with sequence > 400 let ranges = memtable - .ranges(None, predicate_group, sequence_filter, false) + .ranges( + None, + RangesOptions::default() + .with_predicate(predicate_group) + .with_sequence(sequence_filter), + ) .unwrap(); assert_eq!(1, ranges.ranges.len()); @@ -1398,6 +1497,8 @@ mod tests { let metadata = metadata_for_test(); let memtable = BulkMemtable::new(999, metadata.clone(), None, None, false, MergeMode::LastRow); + // Disable unordered_part for this test + memtable.set_unordered_part_threshold(0); // Adds enough bulk parts to trigger encoding for i in 0..10 { @@ -1415,7 +1516,12 @@ mod tests { memtable.compact(false).unwrap(); let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); - let ranges = memtable.ranges(None, predicate_group, None, false).unwrap(); + let ranges = memtable + .ranges( + None, + RangesOptions::default().with_predicate(predicate_group), + ) + .unwrap(); // Should have ranges for both bulk parts and encoded parts assert_eq!(3, ranges.ranges.len()); @@ -1435,4 +1541,229 @@ mod tests { assert_eq!(total_rows, range.num_rows()); } } + + #[test] + fn test_bulk_memtable_unordered_part() { + let metadata = metadata_for_test(); + let memtable = BulkMemtable::new( + 1001, + metadata.clone(), + None, + None, + false, + MergeMode::LastRow, + ); + + // Set smaller thresholds for testing with smaller inputs + // Accept parts with < 5 rows into unordered_part + memtable.set_unordered_part_threshold(5); + // Compact when total rows >= 10 + memtable.set_unordered_part_compact_threshold(10); + + // Write 3 small parts (each has 2 rows), should be collected in unordered_part + for 
i in 0..3 { + let part = create_bulk_part_with_converter( + &format!("key_{}", i), + i, + vec![1000 + i as i64 * 100, 1100 + i as i64 * 100], + vec![Some(i as f64 * 10.0), Some(i as f64 * 10.0 + 1.0)], + 100 + i as u64, + ) + .unwrap(); + assert_eq!(2, part.num_rows()); + memtable.write_bulk(part).unwrap(); + } + + // Total rows = 6, not yet reaching compact threshold + let stats = memtable.stats(); + assert_eq!(6, stats.num_rows); + + // Write 2 more small parts (each has 2 rows) + // This should trigger compaction when total >= 10 + for i in 3..5 { + let part = create_bulk_part_with_converter( + &format!("key_{}", i), + i, + vec![1000 + i as i64 * 100, 1100 + i as i64 * 100], + vec![Some(i as f64 * 10.0), Some(i as f64 * 10.0 + 1.0)], + 100 + i as u64, + ) + .unwrap(); + memtable.write_bulk(part).unwrap(); + } + + // Total rows = 10, should have compacted unordered_part into a regular part + let stats = memtable.stats(); + assert_eq!(10, stats.num_rows); + + // Verify we can read all data correctly + let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); + let ranges = memtable + .ranges( + None, + RangesOptions::default().with_predicate(predicate_group), + ) + .unwrap(); + + // Should have at least 1 range (the compacted part) + assert!(!ranges.ranges.is_empty()); + assert_eq!(10, ranges.stats.num_rows); + + // Read all data and verify + let mut total_rows_read = 0; + for (_range_id, range) in ranges.ranges.iter() { + assert!(range.is_record_batch()); + let record_batch_iter = range.build_record_batch_iter(None).unwrap(); + + for batch_result in record_batch_iter { + let batch = batch_result.unwrap(); + total_rows_read += batch.num_rows(); + } + } + assert_eq!(10, total_rows_read); + } + + #[test] + fn test_bulk_memtable_unordered_part_mixed_sizes() { + let metadata = metadata_for_test(); + let memtable = BulkMemtable::new( + 1002, + metadata.clone(), + None, + None, + false, + MergeMode::LastRow, + ); + + // Set threshold to 4 rows - parts with < 4 rows go to unordered_part + memtable.set_unordered_part_threshold(4); + memtable.set_unordered_part_compact_threshold(8); + + // Write small parts (3 rows each) - should go to unordered_part + for i in 0..2 { + let part = create_bulk_part_with_converter( + &format!("small_{}", i), + i, + vec![1000 + i as i64, 2000 + i as i64, 3000 + i as i64], + vec![Some(i as f64), Some(i as f64 + 1.0), Some(i as f64 + 2.0)], + 10 + i as u64, + ) + .unwrap(); + assert_eq!(3, part.num_rows()); + memtable.write_bulk(part).unwrap(); + } + + // Write a large part (5 rows) - should go directly to regular parts + let large_part = create_bulk_part_with_converter( + "large_key", + 100, + vec![5000, 6000, 7000, 8000, 9000], + vec![ + Some(100.0), + Some(101.0), + Some(102.0), + Some(103.0), + Some(104.0), + ], + 50, + ) + .unwrap(); + assert_eq!(5, large_part.num_rows()); + memtable.write_bulk(large_part).unwrap(); + + // Write another small part (2 rows) - should trigger compaction of unordered_part + let part = create_bulk_part_with_converter( + "small_2", + 2, + vec![4000, 4100], + vec![Some(20.0), Some(21.0)], + 30, + ) + .unwrap(); + memtable.write_bulk(part).unwrap(); + + let stats = memtable.stats(); + assert_eq!(13, stats.num_rows); // 3 + 3 + 5 + 2 = 13 + + // Verify all data can be read + let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); + let ranges = memtable + .ranges( + None, + RangesOptions::default().with_predicate(predicate_group), + ) + .unwrap(); + + assert_eq!(13, ranges.stats.num_rows); + + let mut total_rows_read 
= 0; + for (_range_id, range) in ranges.ranges.iter() { + let record_batch_iter = range.build_record_batch_iter(None).unwrap(); + for batch_result in record_batch_iter { + let batch = batch_result.unwrap(); + total_rows_read += batch.num_rows(); + } + } + assert_eq!(13, total_rows_read); + } + + #[test] + fn test_bulk_memtable_unordered_part_with_ranges() { + let metadata = metadata_for_test(); + let memtable = BulkMemtable::new( + 1003, + metadata.clone(), + None, + None, + false, + MergeMode::LastRow, + ); + + // Set small thresholds + memtable.set_unordered_part_threshold(3); + memtable.set_unordered_part_compact_threshold(100); // High threshold to prevent auto-compaction + + // Write several small parts that stay in unordered_part + for i in 0..3 { + let part = create_bulk_part_with_converter( + &format!("key_{}", i), + i, + vec![1000 + i as i64 * 100], + vec![Some(i as f64 * 10.0)], + 100 + i as u64, + ) + .unwrap(); + assert_eq!(1, part.num_rows()); + memtable.write_bulk(part).unwrap(); + } + + let stats = memtable.stats(); + assert_eq!(3, stats.num_rows); + + // Test that ranges() can correctly read from unordered_part + let predicate_group = PredicateGroup::new(&metadata, &[]).unwrap(); + let ranges = memtable + .ranges( + None, + RangesOptions::default().with_predicate(predicate_group), + ) + .unwrap(); + + // Should have 1 range for the unordered_part + assert_eq!(1, ranges.ranges.len()); + assert_eq!(3, ranges.stats.num_rows); + + // Verify data is sorted correctly in the range + let range = ranges.ranges.get(&0).unwrap(); + let record_batch_iter = range.build_record_batch_iter(None).unwrap(); + + let mut total_rows = 0; + for batch_result in record_batch_iter { + let batch = batch_result.unwrap(); + total_rows += batch.num_rows(); + // Verify data is properly sorted by primary key + assert!(batch.num_rows() > 0); + } + assert_eq!(3, total_rows); + } } diff --git a/src/mito2/src/memtable/bulk/context.rs b/src/mito2/src/memtable/bulk/context.rs index 55d064e3b9..7688a6e1d9 100644 --- a/src/mito2/src/memtable/bulk/context.rs +++ b/src/mito2/src/memtable/bulk/context.rs @@ -24,7 +24,7 @@ use store_api::storage::ColumnId; use table::predicate::Predicate; use crate::error::Result; -use crate::sst::parquet::file_range::RangeBase; +use crate::sst::parquet::file_range::{PreFilterMode, RangeBase}; use crate::sst::parquet::flat_format::FlatReadFormat; use crate::sst::parquet::format::ReadFormat; use crate::sst::parquet::reader::SimpleFilterContext; @@ -43,6 +43,22 @@ impl BulkIterContext { projection: Option<&[ColumnId]>, predicate: Option, skip_auto_convert: bool, + ) -> Result { + Self::new_with_pre_filter_mode( + region_metadata, + projection, + predicate, + skip_auto_convert, + PreFilterMode::All, + ) + } + + pub fn new_with_pre_filter_mode( + region_metadata: RegionMetadataRef, + projection: Option<&[ColumnId]>, + predicate: Option, + skip_auto_convert: bool, + pre_filter_mode: PreFilterMode, ) -> Result { let codec = build_primary_key_codec(®ion_metadata); @@ -73,17 +89,23 @@ impl BulkIterContext { codec, // we don't need to compat batch since all batch in memtable have the same schema. compat_batch: None, + pre_filter_mode, }, predicate, }) } /// Prunes row groups by stats. 
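+    ///
+    /// `skip_fields` is forwarded to `RowGroupPruningStats` so that statistics of
+    /// field columns can be ignored while pruning (see `PreFilterMode`).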
- pub(crate) fn row_groups_to_read(&self, file_meta: &Arc) -> VecDeque { + pub(crate) fn row_groups_to_read( + &self, + file_meta: &Arc, + skip_fields: bool, + ) -> VecDeque { let region_meta = self.base.read_format.metadata(); let row_groups = file_meta.row_groups(); // expected_metadata is set to None since we always expect region metadata of memtable is up-to-date. - let stats = RowGroupPruningStats::new(row_groups, &self.base.read_format, None); + let stats = + RowGroupPruningStats::new(row_groups, &self.base.read_format, None, skip_fields); if let Some(predicate) = self.predicate.as_ref() { predicate .prune_with_stats(&stats, region_meta.schema.arrow_schema()) @@ -104,4 +126,14 @@ impl BulkIterContext { pub(crate) fn read_format(&self) -> &ReadFormat { &self.base.read_format } + + /// Returns the pre-filter mode. + pub(crate) fn pre_filter_mode(&self) -> PreFilterMode { + self.base.pre_filter_mode + } + + /// Returns the region id. + pub(crate) fn region_id(&self) -> store_api::storage::RegionId { + self.base.read_format.metadata().region_id + } } diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs index 4eb2655755..aa2278cb78 100644 --- a/src/mito2/src/memtable/bulk/part.rs +++ b/src/mito2/src/memtable/bulk/part.rs @@ -14,11 +14,11 @@ //! Bulk part encoder/decoder. -use std::collections::VecDeque; +use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; use std::time::{Duration, Instant}; -use api::helper::{ColumnDataTypeWrapper, value_to_grpc_value}; +use api::helper::{ColumnDataTypeWrapper, to_grpc_value}; use api::v1::bulk_wal_entry::Body; use api::v1::{ArrowIpc, BulkWalEntry, Mutation, OpType, bulk_wal_entry}; use bytes::Bytes; @@ -34,7 +34,9 @@ use datatypes::arrow::array::{ UInt64Array, UInt64Builder, }; use datatypes::arrow::compute::{SortColumn, SortOptions, TakeOptions}; -use datatypes::arrow::datatypes::{SchemaRef, UInt32Type}; +use datatypes::arrow::datatypes::{ + DataType as ArrowDataType, Field, Schema, SchemaRef, UInt32Type, +}; use datatypes::arrow_array::BinaryArray; use datatypes::data_type::DataType; use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector}; @@ -51,25 +53,27 @@ use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::WriterProperties; use snafu::{OptionExt, ResultExt, Snafu}; use store_api::codec::PrimaryKeyEncoding; -use store_api::metadata::{RegionMetadata, RegionMetadataRef}; +use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME; -use store_api::storage::{FileId, SequenceNumber, SequenceRange}; +use store_api::storage::{FileId, RegionId, SequenceNumber, SequenceRange}; use table::predicate::Predicate; use crate::error::{ - self, ColumnNotFoundSnafu, ComputeArrowSnafu, DataTypeMismatchSnafu, EncodeMemtableSnafu, - EncodeSnafu, InvalidMetadataSnafu, NewRecordBatchSnafu, Result, + self, ColumnNotFoundSnafu, ComputeArrowSnafu, ConvertColumnDataTypeSnafu, CreateDefaultSnafu, + DataTypeMismatchSnafu, EncodeMemtableSnafu, EncodeSnafu, InvalidMetadataSnafu, + InvalidRequestSnafu, NewRecordBatchSnafu, Result, UnexpectedSnafu, }; -use crate::memtable::BoxedRecordBatchIterator; use crate::memtable::bulk::context::BulkIterContextRef; use crate::memtable::bulk::part_reader::EncodedBulkPartIter; use crate::memtable::time_series::{ValueBuilder, Values}; +use crate::memtable::{BoxedRecordBatchIterator, MemScanMetrics}; use crate::sst::index::IndexOutput; +use 
crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete}; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::{PrimaryKeyArray, PrimaryKeyArrayBuilder, ReadFormat}; use crate::sst::parquet::helper::parse_parquet_metadata; use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo}; -use crate::sst::to_sst_arrow_schema; +use crate::sst::{SeriesEstimator, to_sst_arrow_schema}; const INIT_DICT_VALUE_CAPACITY: usize = 8; @@ -166,6 +170,86 @@ impl BulkPart { } } + /// Fills missing columns in the BulkPart batch with default values. + /// + /// This function checks if the batch schema matches the region metadata schema, + /// and if there are missing columns, it fills them with default values (or null + /// for nullable columns). + /// + /// # Arguments + /// + /// * `region_metadata` - The region metadata containing the expected schema + pub fn fill_missing_columns(&mut self, region_metadata: &RegionMetadata) -> Result<()> { + // Builds a map of existing columns in the batch + let batch_schema = self.batch.schema(); + let batch_columns: HashSet<_> = batch_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + + // Finds columns that need to be filled + let mut columns_to_fill = Vec::new(); + for column_meta in ®ion_metadata.column_metadatas { + // TODO(yingwen): Returns error if it is impure default after we support filling + // bulk insert request in the frontend + if !batch_columns.contains(column_meta.column_schema.name.as_str()) { + columns_to_fill.push(column_meta); + } + } + + if columns_to_fill.is_empty() { + return Ok(()); + } + + let num_rows = self.batch.num_rows(); + + let mut new_columns = Vec::new(); + let mut new_fields = Vec::new(); + + // First, adds all existing columns + new_fields.extend(batch_schema.fields().iter().cloned()); + new_columns.extend_from_slice(self.batch.columns()); + + let region_id = region_metadata.region_id; + // Then adds the missing columns with default values + for column_meta in columns_to_fill { + let default_vector = column_meta + .column_schema + .create_default_vector(num_rows) + .context(CreateDefaultSnafu { + region_id, + column: &column_meta.column_schema.name, + })? + .with_context(|| InvalidRequestSnafu { + region_id, + reason: format!( + "column {} does not have default value", + column_meta.column_schema.name + ), + })?; + let arrow_array = default_vector.to_arrow_array(); + column_meta.column_schema.data_type.as_arrow_type(); + + new_fields.push(Arc::new(Field::new( + column_meta.column_schema.name.clone(), + column_meta.column_schema.data_type.as_arrow_type(), + column_meta.column_schema.is_nullable(), + ))); + new_columns.push(arrow_array); + } + + // Create a new schema and batch with the filled columns + let new_schema = Arc::new(Schema::new(new_fields)); + let new_batch = + RecordBatch::try_new(new_schema, new_columns).context(NewRecordBatchSnafu)?; + + // Update the batch + self.batch = new_batch; + + Ok(()) + } + /// Converts [BulkPart] to [Mutation] for fallback `write_bulk` implementation. pub(crate) fn to_mutation(&self, region_metadata: &RegionMetadataRef) -> Result { let vectors = region_metadata @@ -184,7 +268,7 @@ impl BulkPart { let values = (0..self.batch.num_columns()) .map(|col_idx| { if let Some(v) = &vectors[col_idx] { - value_to_grpc_value(v.get(row_idx)) + to_grpc_value(v.get(row_idx)) } else { api::v1::Value { value_data: None } } @@ -231,6 +315,154 @@ impl BulkPart { } } +/// A collection of small unordered bulk parts. 
+/// Used to batch small parts together before merging them into a sorted part. +pub struct UnorderedPart { + /// Small bulk parts that haven't been sorted yet. + parts: Vec, + /// Total number of rows across all parts. + total_rows: usize, + /// Minimum timestamp across all parts. + min_timestamp: i64, + /// Maximum timestamp across all parts. + max_timestamp: i64, + /// Maximum sequence number across all parts. + max_sequence: u64, + /// Row count threshold for accepting parts (default: 1024). + threshold: usize, + /// Row count threshold for compacting (default: 4096). + compact_threshold: usize, +} + +impl Default for UnorderedPart { + fn default() -> Self { + Self::new() + } +} + +impl UnorderedPart { + /// Creates a new empty UnorderedPart. + pub fn new() -> Self { + Self { + parts: Vec::new(), + total_rows: 0, + min_timestamp: i64::MAX, + max_timestamp: i64::MIN, + max_sequence: 0, + threshold: 1024, + compact_threshold: 4096, + } + } + + /// Sets the threshold for accepting parts into unordered_part. + pub fn set_threshold(&mut self, threshold: usize) { + self.threshold = threshold; + } + + /// Sets the threshold for compacting unordered_part. + pub fn set_compact_threshold(&mut self, compact_threshold: usize) { + self.compact_threshold = compact_threshold; + } + + /// Returns the threshold for accepting parts. + pub fn threshold(&self) -> usize { + self.threshold + } + + /// Returns the compact threshold. + pub fn compact_threshold(&self) -> usize { + self.compact_threshold + } + + /// Returns true if this part should accept the given row count. + pub fn should_accept(&self, num_rows: usize) -> bool { + num_rows < self.threshold + } + + /// Returns true if this part should be compacted. + pub fn should_compact(&self) -> bool { + self.total_rows >= self.compact_threshold + } + + /// Adds a BulkPart to this unordered collection. + pub fn push(&mut self, part: BulkPart) { + self.total_rows += part.num_rows(); + self.min_timestamp = self.min_timestamp.min(part.min_timestamp); + self.max_timestamp = self.max_timestamp.max(part.max_timestamp); + self.max_sequence = self.max_sequence.max(part.sequence); + self.parts.push(part); + } + + /// Returns the total number of rows across all parts. + pub fn num_rows(&self) -> usize { + self.total_rows + } + + /// Returns true if there are no parts. + pub fn is_empty(&self) -> bool { + self.parts.is_empty() + } + + /// Returns the number of parts in this collection. + pub fn num_parts(&self) -> usize { + self.parts.len() + } + + /// Concatenates and sorts all parts into a single RecordBatch. + /// Returns None if the collection is empty. + pub fn concat_and_sort(&self) -> Result> { + if self.parts.is_empty() { + return Ok(None); + } + + if self.parts.len() == 1 { + // If there's only one part, return its batch directly + return Ok(Some(self.parts[0].batch.clone())); + } + + // Get the schema from the first part + let schema = self.parts[0].batch.schema(); + + // Concatenate all record batches + let batches: Vec = self.parts.iter().map(|p| p.batch.clone()).collect(); + let concatenated = + arrow::compute::concat_batches(&schema, &batches).context(ComputeArrowSnafu)?; + + // Sort the concatenated batch + let sorted_batch = sort_primary_key_record_batch(&concatenated)?; + + Ok(Some(sorted_batch)) + } + + /// Converts all parts into a single sorted BulkPart. + /// Returns None if the collection is empty. + pub fn to_bulk_part(&self) -> Result> { + let Some(sorted_batch) = self.concat_and_sort()? 
else { + return Ok(None); + }; + + let timestamp_index = self.parts[0].timestamp_index; + + Ok(Some(BulkPart { + batch: sorted_batch, + max_timestamp: self.max_timestamp, + min_timestamp: self.min_timestamp, + sequence: self.max_sequence, + timestamp_index, + raw_data: None, + })) + } + + /// Clears all parts from this collection. + pub fn clear(&mut self) { + self.parts.clear(); + self.total_rows = 0; + self.min_timestamp = i64::MAX; + self.max_timestamp = i64::MIN; + self.max_sequence = 0; + } +} + /// More accurate estimation of the size of a record batch. pub(crate) fn record_batch_estimated_size(batch: &RecordBatch) -> usize { batch @@ -518,6 +750,196 @@ fn sort_primary_key_record_batch(batch: &RecordBatch) -> Result { datatypes::arrow::compute::take_record_batch(batch, &indices).context(ComputeArrowSnafu) } +/// Converts a `BulkPart` that is unordered and without encoded primary keys into a `BulkPart` +/// with the same format as produced by [BulkPartConverter]. +/// +/// This function takes a `BulkPart` where: +/// - For dense encoding: Primary key columns may be stored as individual columns +/// - For sparse encoding: The `__primary_key` column should already be present with encoded keys +/// - The batch may not be sorted +/// +/// And produces a `BulkPart` where: +/// - Primary key columns are optionally stored (depending on `store_primary_key_columns` and encoding) +/// - An encoded `__primary_key` dictionary column is present +/// - The batch is sorted by (primary_key, timestamp, sequence desc) +/// +/// # Arguments +/// +/// * `part` - The input `BulkPart` to convert +/// * `region_metadata` - Region metadata containing schema information +/// * `primary_key_codec` - Codec for encoding primary keys +/// * `schema` - Target schema for the output batch +/// * `store_primary_key_columns` - If true and encoding is not sparse, stores individual primary key columns +/// +/// # Returns +/// +/// Returns `None` if the input part has no rows, otherwise returns a new `BulkPart` with +/// encoded primary keys and sorted data. 
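+///
+/// # Example
+///
+/// An illustrative sketch (not part of this change) of converting an unsorted part
+/// before writing it to a bulk memtable; `metadata`, `part`, and `memtable` are
+/// assumed to already be in scope:
+///
+/// ```ignore
+/// let codec = build_primary_key_codec(&metadata);
+/// let schema = to_flat_sst_arrow_schema(
+///     &metadata,
+///     &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding),
+/// );
+/// if let Some(sorted) = convert_bulk_part(part, &metadata, codec, schema, true)? {
+///     // The batch is now sorted by (primary key, timestamp, sequence desc).
+///     memtable.write_bulk(sorted)?;
+/// }
+/// ```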
+pub fn convert_bulk_part( + part: BulkPart, + region_metadata: &RegionMetadataRef, + primary_key_codec: Arc, + schema: SchemaRef, + store_primary_key_columns: bool, +) -> Result> { + if part.num_rows() == 0 { + return Ok(None); + } + + let num_rows = part.num_rows(); + let is_sparse = region_metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse; + + // Builds a column name-to-index map for efficient lookups + let input_schema = part.batch.schema(); + let column_indices: HashMap<&str, usize> = input_schema + .fields() + .iter() + .enumerate() + .map(|(idx, field)| (field.name().as_str(), idx)) + .collect(); + + // Determines the structure of the input batch by looking up columns by name + let mut output_columns = Vec::new(); + + // Extracts primary key columns if we need to encode them (dense encoding) + let pk_array = if is_sparse { + // For sparse encoding, the input should already have the __primary_key column + // We need to find it in the input batch + None + } else { + // For dense encoding, extract and encode primary key columns by name + let pk_vectors: Result> = region_metadata + .primary_key_columns() + .map(|col_meta| { + let col_idx = column_indices + .get(col_meta.column_schema.name.as_str()) + .context(ColumnNotFoundSnafu { + column: &col_meta.column_schema.name, + })?; + let col = part.batch.column(*col_idx); + Helper::try_into_vector(col).context(error::ComputeVectorSnafu) + }) + .collect(); + let pk_vectors = pk_vectors?; + + let mut key_array_builder = PrimaryKeyArrayBuilder::new(); + let mut encode_buf = Vec::new(); + + for row_idx in 0..num_rows { + encode_buf.clear(); + + // Collects primary key values with column IDs for this row + let pk_values_with_ids: Vec<_> = region_metadata + .primary_key + .iter() + .zip(pk_vectors.iter()) + .map(|(col_id, vector)| (*col_id, vector.get_ref(row_idx))) + .collect(); + + // Encodes the primary key + primary_key_codec + .encode_value_refs(&pk_values_with_ids, &mut encode_buf) + .context(EncodeSnafu)?; + + key_array_builder + .append(&encode_buf) + .context(ComputeArrowSnafu)?; + } + + Some(key_array_builder.finish()) + }; + + // Adds primary key columns if storing them (only for dense encoding) + if store_primary_key_columns && !is_sparse { + for col_meta in region_metadata.primary_key_columns() { + let col_idx = column_indices + .get(col_meta.column_schema.name.as_str()) + .context(ColumnNotFoundSnafu { + column: &col_meta.column_schema.name, + })?; + let col = part.batch.column(*col_idx); + + // Converts to dictionary if needed for string types + let col = if col_meta.column_schema.data_type.is_string() { + let target_type = ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt32), + Box::new(ArrowDataType::Utf8), + ); + arrow::compute::cast(col, &target_type).context(ComputeArrowSnafu)? 
+ } else { + col.clone() + }; + output_columns.push(col); + } + } + + // Adds field columns + for col_meta in region_metadata.field_columns() { + let col_idx = column_indices + .get(col_meta.column_schema.name.as_str()) + .context(ColumnNotFoundSnafu { + column: &col_meta.column_schema.name, + })?; + output_columns.push(part.batch.column(*col_idx).clone()); + } + + // Adds timestamp column + let new_timestamp_index = output_columns.len(); + let ts_col_idx = column_indices + .get( + region_metadata + .time_index_column() + .column_schema + .name + .as_str(), + ) + .context(ColumnNotFoundSnafu { + column: ®ion_metadata.time_index_column().column_schema.name, + })?; + output_columns.push(part.batch.column(*ts_col_idx).clone()); + + // Adds encoded primary key dictionary column + let pk_dictionary = if let Some(pk_dict_array) = pk_array { + Arc::new(pk_dict_array) as ArrayRef + } else { + let pk_col_idx = + column_indices + .get(PRIMARY_KEY_COLUMN_NAME) + .context(ColumnNotFoundSnafu { + column: PRIMARY_KEY_COLUMN_NAME, + })?; + let col = part.batch.column(*pk_col_idx); + + // Casts to dictionary type if needed + let target_type = ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt32), + Box::new(ArrowDataType::Binary), + ); + arrow::compute::cast(col, &target_type).context(ComputeArrowSnafu)? + }; + output_columns.push(pk_dictionary); + + let sequence_array = UInt64Array::from(vec![part.sequence; num_rows]); + output_columns.push(Arc::new(sequence_array) as ArrayRef); + + let op_type_array = UInt8Array::from(vec![OpType::Put as u8; num_rows]); + output_columns.push(Arc::new(op_type_array) as ArrayRef); + + let batch = RecordBatch::try_new(schema, output_columns).context(NewRecordBatchSnafu)?; + + // Sorts the batch by (primary_key, timestamp, sequence desc) + let sorted_batch = sort_primary_key_record_batch(&batch)?; + + Ok(Some(BulkPart { + batch: sorted_batch, + max_timestamp: part.max_timestamp, + min_timestamp: part.min_timestamp, + sequence: part.sequence, + timestamp_index: new_timestamp_index, + raw_data: None, + })) +} + #[derive(Debug, Clone)] pub struct EncodedBulkPart { data: Bytes, @@ -563,6 +985,7 @@ impl EncodedBulkPart { num_row_groups: self.metadata.parquet_metadata.num_row_groups() as u64, file_metadata: Some(self.metadata.parquet_metadata.clone()), index_metadata: IndexOutput::default(), + num_series: self.metadata.num_series, } } @@ -570,9 +993,15 @@ impl EncodedBulkPart { &self, context: BulkIterContextRef, sequence: Option, + mem_scan_metrics: Option, ) -> Result> { + // Compute skip_fields for row group pruning using the same approach as compute_skip_fields in reader.rs. + let skip_fields_for_pruning = + Self::compute_skip_fields(context.pre_filter_mode(), &self.metadata.parquet_metadata); + // use predicate to find row groups to read. - let row_groups_to_read = context.row_groups_to_read(&self.metadata.parquet_metadata); + let row_groups_to_read = + context.row_groups_to_read(&self.metadata.parquet_metadata, skip_fields_for_pruning); if row_groups_to_read.is_empty() { // All row groups are filtered. @@ -580,14 +1009,28 @@ impl EncodedBulkPart { } let iter = EncodedBulkPartIter::try_new( + self, context, row_groups_to_read, - self.metadata.parquet_metadata.clone(), - self.data.clone(), sequence, + mem_scan_metrics, )?; Ok(Some(Box::new(iter) as BoxedRecordBatchIterator)) } + + /// Computes whether to skip field columns based on PreFilterMode. 
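+    ///
+    /// For `SkipFieldsOnDelete`, field filters are skipped only when a row group may
+    /// contain a delete op; if that check fails, the row group is conservatively
+    /// treated as containing deletes.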
+ fn compute_skip_fields(pre_filter_mode: PreFilterMode, parquet_meta: &ParquetMetaData) -> bool { + match pre_filter_mode { + PreFilterMode::All => false, + PreFilterMode::SkipFields => true, + PreFilterMode::SkipFieldsOnDelete => { + // Check if any row group contains delete op + (0..parquet_meta.num_row_groups()).any(|rg_idx| { + row_group_contains_delete(parquet_meta, rg_idx, "memtable").unwrap_or(true) + }) + } + } + } } #[derive(Debug, Clone)] @@ -602,6 +1045,8 @@ pub struct BulkPartMeta { pub parquet_metadata: Arc, /// Part region schema. pub region_metadata: RegionMetadataRef, + /// Number of series. + pub num_series: u64, } /// Metrics for encoding a part. @@ -669,6 +1114,7 @@ impl BulkPartEncoder { let mut writer = ArrowWriter::try_new(&mut buf, arrow_schema, self.writer_props.clone()) .context(EncodeMemtableSnafu)?; let mut total_rows = 0; + let mut series_estimator = SeriesEstimator::default(); // Process each batch from the iterator let mut iter_start = Instant::now(); @@ -679,6 +1125,7 @@ impl BulkPartEncoder { continue; } + series_estimator.update_flat(&batch); metrics.raw_size += record_batch_estimated_size(&batch); let write_start = Instant::now(); writer.write(&batch).context(EncodeMemtableSnafu)?; @@ -701,6 +1148,7 @@ impl BulkPartEncoder { let buf = Bytes::from(buf); let parquet_metadata = Arc::new(parse_parquet_metadata(file_metadata)?); + let num_series = series_estimator.finish(); Ok(Some(EncodedBulkPart { data: buf, @@ -710,6 +1158,7 @@ impl BulkPartEncoder { min_timestamp, parquet_metadata, region_metadata: self.metadata.clone(), + num_series, }, })) } @@ -742,6 +1191,7 @@ impl BulkPartEncoder { min_timestamp: part.min_timestamp, parquet_metadata, region_metadata: self.metadata.clone(), + num_series: part.estimated_series_count() as u64, }, })) } @@ -1012,11 +1462,14 @@ fn binary_array_to_dictionary(input: &BinaryArray) -> Result { mod tests { use std::collections::VecDeque; - use api::v1::{Row, WriteHint}; + use api::v1::{Row, SemanticType, WriteHint}; use datafusion_common::ScalarValue; use datatypes::arrow::array::Float64Array; use datatypes::prelude::{ConcreteDataType, ScalarVector, Value}; + use datatypes::schema::ColumnSchema; use datatypes::vectors::{Float64Vector, TimestampMillisecondVector}; + use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; + use store_api::storage::RegionId; use store_api::storage::consts::ReservedColumnId; use super::*; @@ -1387,6 +1840,7 @@ mod tests { .unwrap(), ), None, + None, ) .unwrap() .expect("expect at least one row group"); @@ -1446,7 +1900,7 @@ mod tests { .unwrap(), ); let mut reader = part - .read(context, None) + .read(context, None, None) .unwrap() .expect("expect at least one row group"); let mut total_rows_read = 0; @@ -1479,7 +1933,7 @@ mod tests { ) .unwrap(), ); - assert!(part.read(context, None).unwrap().is_none()); + assert!(part.read(context, None, None).unwrap().is_none()); check_prune_row_group(&part, None, 310); @@ -1988,4 +2442,379 @@ mod tests { ); } } + + #[test] + fn test_convert_bulk_part_empty() { + let metadata = metadata_for_test(); + let schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create empty batch + let empty_batch = RecordBatch::new_empty(schema.clone()); + let empty_part = BulkPart { + batch: empty_batch, + max_timestamp: 0, + min_timestamp: 0, + sequence: 0, + timestamp_index: 0, + raw_data: None, + }; + + let result = + 
convert_bulk_part(empty_part, &metadata, primary_key_codec, schema, true).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_convert_bulk_part_dense_with_pk_columns() { + let metadata = metadata_for_test(); + let primary_key_codec = build_primary_key_codec(&metadata); + + let k0_array = Arc::new(arrow::array::StringArray::from(vec![ + "key1", "key2", "key1", + ])); + let k1_array = Arc::new(arrow::array::UInt32Array::from(vec![1, 2, 1])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![100, 200, 300])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![1.0, 2.0, 3.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000, 1500])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("k0", ArrowDataType::Utf8, false), + Field::new("k1", ArrowDataType::UInt32, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = RecordBatch::try_new( + input_schema, + vec![k0_array, k1_array, v0_array, v1_array, ts_array], + ) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 2000, + min_timestamp: 1000, + sequence: 5, + timestamp_index: 4, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + let result = convert_bulk_part( + part, + &metadata, + primary_key_codec, + output_schema, + true, // store primary key columns + ) + .unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 3); + assert_eq!(converted.max_timestamp, 2000); + assert_eq!(converted.min_timestamp, 1000); + assert_eq!(converted.sequence, 5); + + let schema = converted.batch.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + field_names, + vec![ + "k0", + "k1", + "v0", + "v1", + "ts", + "__primary_key", + "__sequence", + "__op_type" + ] + ); + + let k0_col = converted.batch.column_by_name("k0").unwrap(); + assert!(matches!( + k0_col.data_type(), + ArrowDataType::Dictionary(_, _) + )); + + let pk_col = converted.batch.column_by_name("__primary_key").unwrap(); + let dict_array = pk_col + .as_any() + .downcast_ref::>() + .unwrap(); + let keys = dict_array.keys(); + + assert_eq!(keys.len(), 3); + } + + #[test] + fn test_convert_bulk_part_dense_without_pk_columns() { + let metadata = metadata_for_test(); + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create input batch with primary key columns (k0, k1) + let k0_array = Arc::new(arrow::array::StringArray::from(vec!["key1", "key2"])); + let k1_array = Arc::new(arrow::array::UInt32Array::from(vec![1, 2])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![100, 200])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![1.0, 2.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("k0", ArrowDataType::Utf8, false), + Field::new("k1", ArrowDataType::UInt32, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = RecordBatch::try_new( + input_schema, + vec![k0_array, k1_array, v0_array, 
v1_array, ts_array], + ) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 2000, + min_timestamp: 1000, + sequence: 3, + timestamp_index: 4, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions { + raw_pk_columns: false, + string_pk_use_dict: true, + }, + ); + + let result = convert_bulk_part( + part, + &metadata, + primary_key_codec, + output_schema, + false, // don't store primary key columns + ) + .unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 2); + assert_eq!(converted.max_timestamp, 2000); + assert_eq!(converted.min_timestamp, 1000); + assert_eq!(converted.sequence, 3); + + // Verify schema does NOT include individual primary key columns + let schema = converted.batch.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + field_names, + vec!["v0", "v1", "ts", "__primary_key", "__sequence", "__op_type"] + ); + + // Verify __primary_key column is present and is a dictionary + let pk_col = converted.batch.column_by_name("__primary_key").unwrap(); + assert!(matches!( + pk_col.data_type(), + ArrowDataType::Dictionary(_, _) + )); + } + + #[test] + fn test_convert_bulk_part_sparse_encoding() { + let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456)); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("k0", ConcreteDataType::string_datatype(), false), + semantic_type: SemanticType::Tag, + column_id: 0, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("k1", ConcreteDataType::string_datatype(), false), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("v0", ConcreteDataType::int64_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 3, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("v1", ConcreteDataType::float64_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 4, + }) + .primary_key(vec![0, 1]) + .primary_key_encoding(PrimaryKeyEncoding::Sparse); + let metadata = Arc::new(builder.build().unwrap()); + + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create input batch with __primary_key column (sparse encoding) + let pk_array = Arc::new(arrow::array::BinaryArray::from(vec![ + b"encoded_key_1".as_slice(), + b"encoded_key_2".as_slice(), + ])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![100, 200])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![1.0, 2.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = + RecordBatch::try_new(input_schema, vec![pk_array, v0_array, v1_array, ts_array]) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 2000, + min_timestamp: 1000, + sequence: 7, + timestamp_index: 3, + raw_data: None, + 
}; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + let result = convert_bulk_part( + part, + &metadata, + primary_key_codec, + output_schema, + true, // store_primary_key_columns (ignored for sparse) + ) + .unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 2); + assert_eq!(converted.max_timestamp, 2000); + assert_eq!(converted.min_timestamp, 1000); + assert_eq!(converted.sequence, 7); + + // Verify schema does NOT include individual primary key columns (sparse encoding) + let schema = converted.batch.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + field_names, + vec!["v0", "v1", "ts", "__primary_key", "__sequence", "__op_type"] + ); + + // Verify __primary_key is dictionary encoded + let pk_col = converted.batch.column_by_name("__primary_key").unwrap(); + assert!(matches!( + pk_col.data_type(), + ArrowDataType::Dictionary(_, _) + )); + } + + #[test] + fn test_convert_bulk_part_sorting_with_multiple_series() { + let metadata = metadata_for_test(); + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create unsorted batch with multiple series and timestamps + let k0_array = Arc::new(arrow::array::StringArray::from(vec![ + "series_b", "series_a", "series_b", "series_a", + ])); + let k1_array = Arc::new(arrow::array::UInt32Array::from(vec![2, 1, 2, 1])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![200, 100, 400, 300])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![2.0, 1.0, 4.0, 3.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![ + 2000, 1000, 4000, 3000, + ])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("k0", ArrowDataType::Utf8, false), + Field::new("k1", ArrowDataType::UInt32, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = RecordBatch::try_new( + input_schema, + vec![k0_array, k1_array, v0_array, v1_array, ts_array], + ) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 4000, + min_timestamp: 1000, + sequence: 10, + timestamp_index: 4, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + let result = + convert_bulk_part(part, &metadata, primary_key_codec, output_schema, true).unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 4); + + // Verify data is sorted by (primary_key, timestamp, sequence desc) + let ts_col = converted.batch.column(converted.timestamp_index); + let ts_array = ts_col + .as_any() + .downcast_ref::() + .unwrap(); + + // After sorting by (pk, ts), we should have: + // series_a,1: ts=1000, 3000 + // series_b,2: ts=2000, 4000 + let timestamps: Vec = ts_array.values().to_vec(); + assert_eq!(timestamps, vec![1000, 3000, 2000, 4000]); + } } diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs index 5578018a8d..d779f1ff04 100644 --- a/src/mito2/src/memtable/bulk/part_reader.rs +++ b/src/mito2/src/memtable/bulk/part_reader.rs @@ -13,20 +13,22 @@ // limitations under the License. 
use std::collections::VecDeque; -use std::sync::Arc; +use std::time::Instant; -use bytes::Bytes; use datatypes::arrow::array::BooleanArray; use datatypes::arrow::record_batch::RecordBatch; use parquet::arrow::ProjectionMask; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; -use parquet::file::metadata::ParquetMetaData; use snafu::ResultExt; use store_api::storage::SequenceRange; use crate::error::{self, ComputeArrowSnafu, DecodeArrowRowGroupSnafu}; use crate::memtable::bulk::context::{BulkIterContext, BulkIterContextRef}; +use crate::memtable::bulk::part::EncodedBulkPart; use crate::memtable::bulk::row_group_reader::MemtableRowGroupReaderBuilder; +use crate::memtable::{MemScanMetrics, MemScanMetricsData}; +use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED}; +use crate::sst::parquet::file_range::PreFilterMode; use crate::sst::parquet::flat_format::sequence_column_index; use crate::sst::parquet::reader::RowGroupReaderContext; @@ -38,19 +40,29 @@ pub struct EncodedBulkPartIter { builder: MemtableRowGroupReaderBuilder, /// Sequence number filter. sequence: Option, + /// Cached skip_fields for current row group. + current_skip_fields: bool, + /// Metrics for this iterator. + metrics: MemScanMetricsData, + /// Optional memory scan metrics to report to. + mem_scan_metrics: Option, } impl EncodedBulkPartIter { /// Creates a new [BulkPartIter]. pub(crate) fn try_new( + encoded_part: &EncodedBulkPart, context: BulkIterContextRef, mut row_groups_to_read: VecDeque, - parquet_meta: Arc, - data: Bytes, sequence: Option, + mem_scan_metrics: Option, ) -> error::Result { assert!(context.read_format().as_flat().is_some()); + let parquet_meta = encoded_part.metadata().parquet_metadata.clone(); + let data = encoded_part.data().clone(); + let series_count = encoded_part.metadata().num_series as usize; + let projection_mask = ProjectionMask::roots( parquet_meta.file_metadata().schema_descr(), context.read_format().projection_indices().iter().copied(), @@ -58,46 +70,90 @@ impl EncodedBulkPartIter { let builder = MemtableRowGroupReaderBuilder::try_new(&context, projection_mask, parquet_meta, data)?; - let init_reader = row_groups_to_read - .pop_front() - .map(|first_row_group| builder.build_row_group_reader(first_row_group, None)) - .transpose()?; + let (init_reader, current_skip_fields) = match row_groups_to_read.pop_front() { + Some(first_row_group) => { + let skip_fields = builder.compute_skip_fields(&context, first_row_group); + let reader = builder.build_row_group_reader(first_row_group, None)?; + (Some(reader), skip_fields) + } + None => (None, false), + }; + Ok(Self { context, row_groups_to_read, current_reader: init_reader, builder, sequence, + current_skip_fields, + metrics: MemScanMetricsData { + total_series: series_count, + ..Default::default() + }, + mem_scan_metrics, }) } + fn report_mem_scan_metrics(&mut self) { + if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() { + mem_scan_metrics.merge_inner(&self.metrics); + } + } + /// Fetches next non-empty record batch. pub(crate) fn next_record_batch(&mut self) -> error::Result> { + let start = Instant::now(); + let Some(current) = &mut self.current_reader else { // All row group exhausted. + self.metrics.scan_cost += start.elapsed(); return Ok(None); }; for batch in current { let batch = batch.context(DecodeArrowRowGroupSnafu)?; - if let Some(batch) = apply_combined_filters(&self.context, &self.sequence, batch)? 
{ + if let Some(batch) = apply_combined_filters( + &self.context, + &self.sequence, + batch, + self.current_skip_fields, + )? { + // Update metrics + self.metrics.num_batches += 1; + self.metrics.num_rows += batch.num_rows(); + self.metrics.scan_cost += start.elapsed(); return Ok(Some(batch)); } } // Previous row group exhausted, read next row group while let Some(next_row_group) = self.row_groups_to_read.pop_front() { + // Compute skip_fields for this row group + self.current_skip_fields = self + .builder + .compute_skip_fields(&self.context, next_row_group); + let next_reader = self.builder.build_row_group_reader(next_row_group, None)?; let current = self.current_reader.insert(next_reader); for batch in current { let batch = batch.context(DecodeArrowRowGroupSnafu)?; - if let Some(batch) = apply_combined_filters(&self.context, &self.sequence, batch)? { + if let Some(batch) = apply_combined_filters( + &self.context, + &self.sequence, + batch, + self.current_skip_fields, + )? { + // Update metrics + self.metrics.num_batches += 1; + self.metrics.num_rows += batch.num_rows(); + self.metrics.scan_cost += start.elapsed(); return Ok(Some(batch)); } } } + self.metrics.scan_cost += start.elapsed(); Ok(None) } } @@ -106,7 +162,37 @@ impl Iterator for EncodedBulkPartIter { type Item = error::Result; fn next(&mut self) -> Option { - self.next_record_batch().transpose() + let result = self.next_record_batch().transpose(); + + // Report metrics when iteration is complete + if result.is_none() { + self.report_mem_scan_metrics(); + } + + result + } +} + +impl Drop for EncodedBulkPartIter { + fn drop(&mut self) { + common_telemetry::debug!( + "EncodedBulkPartIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}", + self.context.region_id(), + self.metrics.total_series, + self.metrics.num_rows, + self.metrics.num_batches, + self.metrics.scan_cost + ); + + // Report MemScanMetrics if not already reported + self.report_mem_scan_metrics(); + + READ_ROWS_TOTAL + .with_label_values(&["bulk_memtable"]) + .inc_by(self.metrics.num_rows as u64); + READ_STAGE_ELAPSED + .with_label_values(&["scan_memtable"]) + .observe(self.metrics.scan_cost.as_secs_f64()); } } @@ -118,6 +204,10 @@ pub struct BulkPartRecordBatchIter { context: BulkIterContextRef, /// Sequence number filter. sequence: Option, + /// Metrics for this iterator. + metrics: MemScanMetricsData, + /// Optional memory scan metrics to report to. + mem_scan_metrics: Option, } impl BulkPartRecordBatchIter { @@ -126,6 +216,8 @@ impl BulkPartRecordBatchIter { record_batch: RecordBatch, context: BulkIterContextRef, sequence: Option, + series_count: usize, + mem_scan_metrics: Option, ) -> Self { assert!(context.read_format().as_flat().is_some()); @@ -133,6 +225,17 @@ impl BulkPartRecordBatchIter { record_batch: Some(record_batch), context, sequence, + metrics: MemScanMetricsData { + total_series: series_count, + ..Default::default() + }, + mem_scan_metrics, + } + } + + fn report_mem_scan_metrics(&mut self) { + if let Some(mem_scan_metrics) = self.mem_scan_metrics.take() { + mem_scan_metrics.merge_inner(&self.metrics); } } @@ -149,15 +252,29 @@ impl BulkPartRecordBatchIter { } fn process_batch(&mut self, record_batch: RecordBatch) -> error::Result> { + let start = Instant::now(); + // Apply projection first. let projected_batch = self.apply_projection(record_batch)?; // Apply combined filtering (both predicate and sequence filters) + // For BulkPartRecordBatchIter, we don't have row group information. 
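+        // Without row-group metadata we cannot check for delete ops here, so both
+        // SkipFields and SkipFieldsOnDelete conservatively skip field pre-filtering.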
+ let skip_fields = match self.context.pre_filter_mode() { + PreFilterMode::All => false, + PreFilterMode::SkipFields => true, + PreFilterMode::SkipFieldsOnDelete => true, + }; let Some(filtered_batch) = - apply_combined_filters(&self.context, &self.sequence, projected_batch)? + apply_combined_filters(&self.context, &self.sequence, projected_batch, skip_fields)? else { + self.metrics.scan_cost += start.elapsed(); return Ok(None); }; + // Update metrics + self.metrics.num_batches += 1; + self.metrics.num_rows += filtered_batch.num_rows(); + self.metrics.scan_cost += start.elapsed(); + Ok(Some(filtered_batch)) } } @@ -166,9 +283,43 @@ impl Iterator for BulkPartRecordBatchIter { type Item = error::Result; fn next(&mut self) -> Option { - let record_batch = self.record_batch.take()?; + let Some(record_batch) = self.record_batch.take() else { + // `take()` should be cheap, we report the metrics directly. + self.report_mem_scan_metrics(); + return None; + }; - self.process_batch(record_batch).transpose() + let result = self.process_batch(record_batch).transpose(); + + // Reports metrics when iteration is complete + if result.is_none() { + self.report_mem_scan_metrics(); + } + + result + } +} + +impl Drop for BulkPartRecordBatchIter { + fn drop(&mut self) { + common_telemetry::debug!( + "BulkPartRecordBatchIter region: {}, metrics: total_series={}, num_rows={}, num_batches={}, scan_cost={:?}", + self.context.region_id(), + self.metrics.total_series, + self.metrics.num_rows, + self.metrics.num_batches, + self.metrics.scan_cost + ); + + // Report MemScanMetrics if not already reported + self.report_mem_scan_metrics(); + + READ_ROWS_TOTAL + .with_label_values(&["bulk_memtable"]) + .inc_by(self.metrics.num_rows as u64); + READ_STAGE_ELAPSED + .with_label_values(&["scan_memtable"]) + .observe(self.metrics.scan_cost.as_secs_f64()); } } @@ -181,6 +332,7 @@ fn apply_combined_filters( context: &BulkIterContext, sequence: &Option, record_batch: RecordBatch, + skip_fields: bool, ) -> error::Result> { // Converts the format to the flat format first. let format = context.read_format().as_flat().unwrap(); @@ -191,7 +343,9 @@ fn apply_combined_filters( // First, apply predicate filters using the shared method. if !context.base.filters.is_empty() { - let predicate_mask = context.base.compute_filter_mask_flat(&record_batch)?; + let predicate_mask = context + .base + .compute_filter_mask_flat(&record_batch, skip_fields)?; // If predicate filters out the entire batch, return None early let Some(mask) = predicate_mask else { return Ok(None); @@ -347,7 +501,8 @@ mod tests { .unwrap(), ); // Iterates all rows. - let iter = BulkPartRecordBatchIter::new(record_batch.clone(), context.clone(), None); + let iter = + BulkPartRecordBatchIter::new(record_batch.clone(), context.clone(), None, 0, None); let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect(); assert_eq!(1, result.len()); assert_eq!(3, result[0].num_rows()); @@ -358,6 +513,8 @@ mod tests { record_batch.clone(), context, Some(SequenceRange::LtEq { max: 2 }), + 0, + None, ); let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect(); assert_eq!(1, result.len()); @@ -378,7 +535,8 @@ mod tests { .unwrap(), ); // Creates iter with projection and predicate. 
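+        // No series count estimate and no MemScanMetrics sink are needed here.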
- let iter = BulkPartRecordBatchIter::new(record_batch.clone(), context.clone(), None); + let iter = + BulkPartRecordBatchIter::new(record_batch.clone(), context.clone(), None, 0, None); let result: Vec<_> = iter.map(|rb| rb.unwrap()).collect(); assert_eq!(1, result.len()); assert_eq!(1, result[0].num_rows()); diff --git a/src/mito2/src/memtable/bulk/row_group_reader.rs b/src/mito2/src/memtable/bulk/row_group_reader.rs index 9918d81871..1e9e5dec4d 100644 --- a/src/mito2/src/memtable/bulk/row_group_reader.rs +++ b/src/mito2/src/memtable/bulk/row_group_reader.rs @@ -169,4 +169,23 @@ impl MemtableRowGroupReaderBuilder { ) .context(ReadDataPartSnafu) } + + /// Computes whether to skip field filters for a specific row group based on PreFilterMode. + pub(crate) fn compute_skip_fields( + &self, + context: &BulkIterContextRef, + row_group_idx: usize, + ) -> bool { + use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete}; + + match context.pre_filter_mode() { + PreFilterMode::All => false, + PreFilterMode::SkipFields => true, + PreFilterMode::SkipFieldsOnDelete => { + // Check if this specific row group contains delete op + row_group_contains_delete(&self.parquet_metadata, row_group_idx, "memtable") + .unwrap_or(true) + } + } + } } diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs index e404a5851e..8ddd687053 100644 --- a/src/mito2/src/memtable/partition_tree.rs +++ b/src/mito2/src/memtable/partition_tree.rs @@ -44,7 +44,7 @@ use crate::memtable::stats::WriteMetrics; use crate::memtable::{ AllocTracker, BoxedBatchIterator, IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext, MemtableRanges, MemtableRef, - MemtableStats, PredicateGroup, + MemtableStats, RangesOptions, }; use crate::region::options::MergeMode; @@ -190,10 +190,10 @@ impl Memtable for PartitionTreeMemtable { fn ranges( &self, projection: Option<&[ColumnId]>, - predicate: PredicateGroup, - sequence: Option, - _for_flush: bool, + options: RangesOptions, ) -> Result { + let predicate = options.predicate; + let sequence = options.sequence; let projection = projection.map(|ids| ids.to_vec()); let builder = Box::new(PartitionTreeIterBuilder { tree: self.tree.clone(), @@ -384,6 +384,7 @@ mod tests { use api::v1::helper::{field_column_schema, row, tag_column_schema, time_index_column_schema}; use api::v1::value::ValueData; use api::v1::{Mutation, OpType, Rows, SemanticType}; + use common_query::prelude::{greptime_timestamp, greptime_value}; use common_time::Timestamp; use datafusion_common::Column; use datafusion_expr::{BinaryExpr, Expr, Literal, Operator}; @@ -694,7 +695,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_timestamp", + greptime_timestamp(), ConcreteDataType::timestamp_millisecond_datatype(), false, ), @@ -703,7 +704,7 @@ mod tests { }) .push_column_metadata(ColumnMetadata { column_schema: ColumnSchema::new( - "greptime_value", + greptime_value(), ConcreteDataType::float64_datatype(), true, ), diff --git a/src/mito2/src/memtable/partition_tree/dict.rs b/src/mito2/src/memtable/partition_tree/dict.rs index 62adda62bb..77cc835ea0 100644 --- a/src/mito2/src/memtable/partition_tree/dict.rs +++ b/src/mito2/src/memtable/partition_tree/dict.rs @@ -103,7 +103,7 @@ impl KeyDictBuilder { self.key_bytes_in_index += full_primary_key.len() + sparse_key_len; // Adds key size of index to the metrics. 
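+        // Only the delta for this key is added; adding `key_bytes_in_index` would
+        // re-count bytes already reported for earlier keys.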
- MEMTABLE_DICT_BYTES.add(self.key_bytes_in_index as i64); + MEMTABLE_DICT_BYTES.add((full_primary_key.len() + sparse_key_len) as i64); pk_index } diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs index cd7d9bdf5c..4e0b9ac525 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable.rs @@ -27,7 +27,7 @@ use mito_codec::key_values::KeyValue; use rayon::prelude::*; use snafu::{OptionExt, ResultExt}; use store_api::metadata::RegionMetadataRef; -use store_api::storage::{ColumnId, SequenceRange}; +use store_api::storage::ColumnId; use crate::flush::WriteBufferManagerRef; use crate::memtable::bulk::part::BulkPart; @@ -35,12 +35,11 @@ use crate::memtable::stats::WriteMetrics; use crate::memtable::time_series::Series; use crate::memtable::{ AllocTracker, BoxedBatchIterator, IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableId, - MemtableRange, MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, + MemtableRange, MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions, }; use crate::metrics::MEMTABLE_ACTIVE_SERIES_COUNT; use crate::read::Batch; use crate::read::dedup::LastNonNullIter; -use crate::read::scan_region::PredicateGroup; use crate::region::options::MergeMode; use crate::{error, metrics}; @@ -223,7 +222,7 @@ impl Memtable for SimpleBulkMemtable { &self, projection: Option<&[ColumnId]>, _predicate: Option, - sequence: Option, + sequence: Option, ) -> error::Result { let iter = self.create_iter(projection, sequence)?.build(None)?; @@ -238,10 +237,10 @@ impl Memtable for SimpleBulkMemtable { fn ranges( &self, projection: Option<&[ColumnId]>, - predicate: PredicateGroup, - sequence: Option, - _for_flush: bool, + options: RangesOptions, ) -> error::Result { + let predicate = options.predicate; + let sequence = options.sequence; let start_time = Instant::now(); let projection = Arc::new(self.build_projection(projection)); let values = self.series.read().unwrap().read_to_values(); @@ -412,7 +411,7 @@ mod tests { use datatypes::value::Value; use datatypes::vectors::TimestampMillisecondVector; use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; - use store_api::storage::{RegionId, SequenceNumber}; + use store_api::storage::{RegionId, SequenceNumber, SequenceRange}; use super::*; use crate::read; @@ -617,9 +616,7 @@ mod tests { let kv = kvs.iter().next().unwrap(); memtable.write_one(kv).unwrap(); - let ranges = memtable - .ranges(None, PredicateGroup::default(), None, false) - .unwrap(); + let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); let mut source = vec![]; for r in ranges.ranges.values() { source.push(Source::Iter(r.build_iter().unwrap())); @@ -651,9 +648,7 @@ mod tests { memtable.write_one(kv).unwrap(); memtable.freeze().unwrap(); - let ranges = memtable - .ranges(None, PredicateGroup::default(), None, false) - .unwrap(); + let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); let mut source = vec![]; for r in ranges.ranges.values() { source.push(Source::Iter(r.build_iter().unwrap())); @@ -694,9 +689,7 @@ mod tests { memtable.write_one(kvs.iter().next().unwrap()).unwrap(); memtable.freeze().unwrap(); - let ranges = memtable - .ranges(None, PredicateGroup::default(), None, false) - .unwrap(); + let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); assert_eq!(ranges.ranges.len(), 1); let range = ranges.ranges.into_values().next().unwrap(); let mut reader = 
range.context.builder.build(None).unwrap(); @@ -910,9 +903,8 @@ mod tests { raw_data: None, }) .unwrap(); - let MemtableRanges { ranges, .. } = memtable - .ranges(None, PredicateGroup::default(), None, false) - .unwrap(); + let MemtableRanges { ranges, .. } = + memtable.ranges(None, RangesOptions::default()).unwrap(); let mut source = if ranges.len() == 1 { let only_range = ranges.into_values().next().unwrap(); Source::Iter(only_range.build_iter().unwrap()) diff --git a/src/mito2/src/memtable/time_partition.rs b/src/mito2/src/memtable/time_partition.rs index 6038d5cd20..6f11c813cb 100644 --- a/src/mito2/src/memtable/time_partition.rs +++ b/src/mito2/src/memtable/time_partition.rs @@ -261,7 +261,7 @@ impl TimePartitions { converter.append_key_values(kvs)?; let part = converter.convert()?; - return self.write_bulk(part); + return self.write_bulk_inner(part); } // Get all parts. @@ -291,7 +291,31 @@ impl TimePartitions { self.write_multi_parts(kvs, &parts) } + /// Writes a bulk part. pub fn write_bulk(&self, part: BulkPart) -> Result<()> { + // Convert the bulk part if bulk_schema is Some + let part = if let Some(bulk_schema) = &self.bulk_schema { + let converted = crate::memtable::bulk::part::convert_bulk_part( + part, + &self.metadata, + self.primary_key_codec.clone(), + bulk_schema.clone(), + // Always store primary keys for bulk mode. + true, + )?; + match converted { + Some(p) => p, + None => return Ok(()), + } + } else { + part + }; + + self.write_bulk_inner(part) + } + + /// Writes a bulk part without converting. + fn write_bulk_inner(&self, part: BulkPart) -> Result<()> { let time_type = self .metadata .time_index_column() @@ -353,12 +377,13 @@ impl TimePartitions { .builder .build(inner.alloc_memtable_id(), &self.metadata); debug!( - "Create time partition {:?} for region {}, duration: {:?}, memtable_id: {}, parts_total: {}", + "Create time partition {:?} for region {}, duration: {:?}, memtable_id: {}, parts_total: {}, metadata: {:?}", range, self.metadata.region_id, self.part_duration, memtable.id(), - inner.parts.len() + 1 + inner.parts.len() + 1, + self.metadata, ); let pos = inner.parts.len(); inner.parts.push(TimePartition { @@ -454,6 +479,11 @@ impl TimePartitions { self.part_duration } + /// Returns the memtable builder. + pub(crate) fn memtable_builder(&self) -> &MemtableBuilderRef { + &self.builder + } + /// Returns memory usage. pub(crate) fn memory_usage(&self) -> usize { let inner = self.inner.lock().unwrap(); @@ -488,12 +518,16 @@ impl TimePartitions { /// Creates a new empty partition list from this list and a `part_duration`. /// It falls back to the old partition duration if `part_duration` is `None`. 
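+    /// If `memtable_builder` is `Some`, the new list uses it instead of the current builder.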
- pub(crate) fn new_with_part_duration(&self, part_duration: Option) -> Self { + pub(crate) fn new_with_part_duration( + &self, + part_duration: Option, + memtable_builder: Option, + ) -> Self { debug_assert!(self.is_empty()); Self::new( self.metadata.clone(), - self.builder.clone(), + memtable_builder.unwrap_or_else(|| self.builder.clone()), self.next_memtable_id(), Some(part_duration.unwrap_or(self.part_duration)), ) @@ -941,17 +975,17 @@ mod tests { let builder = Arc::new(PartitionTreeMemtableBuilder::default()); let partitions = TimePartitions::new(metadata.clone(), builder.clone(), 0, None); - let new_parts = partitions.new_with_part_duration(Some(Duration::from_secs(5))); + let new_parts = partitions.new_with_part_duration(Some(Duration::from_secs(5)), None); assert_eq!(Duration::from_secs(5), new_parts.part_duration()); assert_eq!(0, new_parts.next_memtable_id()); // Won't update the duration if it's None. - let new_parts = new_parts.new_with_part_duration(None); + let new_parts = new_parts.new_with_part_duration(None, None); assert_eq!(Duration::from_secs(5), new_parts.part_duration()); // Don't need to create new memtables. assert_eq!(0, new_parts.next_memtable_id()); - let new_parts = new_parts.new_with_part_duration(Some(Duration::from_secs(10))); + let new_parts = new_parts.new_with_part_duration(Some(Duration::from_secs(10)), None); assert_eq!(Duration::from_secs(10), new_parts.part_duration()); // Don't need to create new memtables. assert_eq!(0, new_parts.next_memtable_id()); @@ -959,7 +993,7 @@ mod tests { let builder = Arc::new(PartitionTreeMemtableBuilder::default()); let partitions = TimePartitions::new(metadata.clone(), builder.clone(), 0, None); // Need to build a new memtable as duration is still None. - let new_parts = partitions.new_with_part_duration(None); + let new_parts = partitions.new_with_part_duration(None, None); assert_eq!(INITIAL_TIME_WINDOW, new_parts.part_duration()); assert_eq!(0, new_parts.next_memtable_id()); } diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index 60fe2f0bcd..7401dd96b9 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs @@ -53,7 +53,7 @@ use crate::memtable::stats::WriteMetrics; use crate::memtable::{ AllocTracker, BoxedBatchIterator, IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext, MemtableRanges, MemtableRef, - MemtableStats, PredicateGroup, + MemtableStats, RangesOptions, }; use crate::metrics::{ MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL, @@ -303,10 +303,10 @@ impl Memtable for TimeSeriesMemtable { fn ranges( &self, projection: Option<&[ColumnId]>, - predicate: PredicateGroup, - sequence: Option, - _for_flush: bool, + options: RangesOptions, ) -> Result { + let predicate = options.predicate; + let sequence = options.sequence; let projection = if let Some(projection) = projection { projection.iter().copied().collect() } else { @@ -922,7 +922,9 @@ impl ValueBuilder { ) }; mutable_vector.push_nulls(num_rows - 1); - let _ = mutable_vector.push(field_value); + mutable_vector + .push(field_value) + .unwrap_or_else(|e| panic!("unexpected field value: {e:?}")); self.fields[idx] = Some(mutable_vector); MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT.inc(); } diff --git a/src/mito2/src/memtable/version.rs b/src/mito2/src/memtable/version.rs index 537332d3b7..25c7535927 100644 --- a/src/mito2/src/memtable/version.rs +++ b/src/mito2/src/memtable/version.rs @@ 
-82,7 +82,7 @@ impl MemtableVersion { } // Update the time window. - let mutable = self.mutable.new_with_part_duration(time_window); + let mutable = self.mutable.new_with_part_duration(time_window, None); common_telemetry::debug!( "Freeze empty memtable, update partition duration from {:?} to {:?}", self.mutable.part_duration(), diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index 0f923f60a6..1e35ae1c06 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -36,6 +36,7 @@ pub const STAGING_TYPE: &str = "index_staging"; /// Recycle bin type label. pub const RECYCLE_TYPE: &str = "recycle_bin"; +// Write metrics. lazy_static! { /// Global write buffer size in bytes. pub static ref WRITE_BUFFER_BYTES: IntGauge = @@ -114,10 +115,10 @@ lazy_static! { &[TYPE_LABEL] ) .unwrap(); - // ------ End of write related metrics +} - - // Compaction metrics +// Compaction metrics. +lazy_static! { /// Timer of different stages in compaction. /// - pick /// - merge (in parallel) @@ -156,8 +157,10 @@ lazy_static! { "greptime_mito_inflight_compaction_count", "inflight compaction count", ).unwrap(); +} - // Query metrics. +// Query metrics. +lazy_static! { /// Timer of different stages in query. pub static ref READ_STAGE_ELAPSED: HistogramVec = register_histogram_vec!( "greptime_mito_read_stage_elapsed", @@ -207,9 +210,20 @@ lazy_static! { "Number of rows returned in a scan task", exponential_buckets(100.0, 10.0, 7).unwrap(), ).unwrap(); - // ------- End of query metrics. + /// Gauge for scan memory usage in bytes. + pub static ref SCAN_MEMORY_USAGE_BYTES: IntGauge = register_int_gauge!( + "greptime_mito_scan_memory_usage_bytes", + "current scan memory usage in bytes" + ).unwrap(); + /// Counter of rejected scan requests due to memory limit. + pub static ref SCAN_REQUESTS_REJECTED_TOTAL: IntCounter = register_int_counter!( + "greptime_mito_scan_requests_rejected_total", + "total number of scan requests rejected due to memory limit" + ).unwrap(); +} - // Cache related metrics. +// Cache metrics. +lazy_static! { /// Cache hit counter. pub static ref CACHE_HIT: IntCounterVec = register_int_counter_vec!( "greptime_mito_cache_hit", @@ -261,8 +275,10 @@ lazy_static! { "mito cache eviction", &[TYPE_LABEL, CACHE_EVICTION_CAUSE] ).unwrap(); - // ------- End of cache metrics. +} +// Index metrics. +lazy_static! { // Index metrics. /// Timer of index application. pub static ref INDEX_APPLY_ELAPSED: HistogramVec = register_histogram_vec!( @@ -359,8 +375,9 @@ lazy_static! { /// Counter of flush operations on intermediate files. pub static ref INDEX_INTERMEDIATE_FLUSH_OP_TOTAL: IntCounter = INDEX_IO_OP_TOTAL .with_label_values(&["flush", "intermediate"]); - // ------- End of index metrics. +} +lazy_static! { /// Partition tree memtable data buffer freeze metrics pub static ref PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED: HistogramVec = register_histogram_vec!( "greptime_partition_tree_buffer_freeze_stage_elapsed", @@ -405,7 +422,6 @@ lazy_static! { } -// Use another block to avoid reaching the recursion limit. lazy_static! { /// Counter for compaction input file size. pub static ref COMPACTION_INPUT_BYTES: Counter = register_counter!( @@ -437,7 +453,7 @@ lazy_static! { "mito stalled write request in each worker", &[WORKER_LABEL] ).unwrap(); - /// Number of ref files per table + /// Number of ref files pub static ref GC_REF_FILE_CNT: IntGauge = register_int_gauge!( "greptime_gc_ref_file_count", "gc ref file count", @@ -458,11 +474,23 @@ lazy_static! 
{ .unwrap(); /// Counter for the number of files deleted by the GC worker. - pub static ref GC_FILE_CNT: IntGauge = + pub static ref GC_DELETE_FILE_CNT: IntGauge = register_int_gauge!( - "greptime_mito_gc_file_count", + "greptime_mito_gc_delete_file_count", "mito gc deleted file count", ).unwrap(); + + /// Total number of files downloaded during cache fill on region open. + pub static ref CACHE_FILL_DOWNLOADED_FILES: IntCounter = register_int_counter!( + "mito_cache_fill_downloaded_files", + "mito cache fill downloaded files count", + ).unwrap(); + + /// Number of files pending download during cache fill on region open. + pub static ref CACHE_FILL_PENDING_FILES: IntGauge = register_int_gauge!( + "mito_cache_fill_pending_files", + "mito cache fill pending files count", + ).unwrap(); } /// Stager notifier to collect metrics. diff --git a/src/mito2/src/read/compat.rs b/src/mito2/src/read/compat.rs index 8bc24a4953..8a69b1856f 100644 --- a/src/mito2/src/read/compat.rs +++ b/src/mito2/src/read/compat.rs @@ -18,6 +18,7 @@ use std::collections::HashMap; use std::sync::Arc; use api::v1::SemanticType; +use common_recordbatch::recordbatch::align_json_array; use datatypes::arrow::array::{ Array, ArrayRef, BinaryArray, BinaryBuilder, DictionaryArray, UInt32Array, }; @@ -27,7 +28,7 @@ use datatypes::arrow::record_batch::RecordBatch; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::DataType; use datatypes::value::Value; -use datatypes::vectors::VectorRef; +use datatypes::vectors::{Helper, VectorRef}; use mito_codec::row_converter::{ CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec, build_primary_key_codec_with_fields, @@ -38,8 +39,9 @@ use store_api::metadata::{RegionMetadata, RegionMetadataRef}; use store_api::storage::ColumnId; use crate::error::{ - CompatReaderSnafu, ComputeArrowSnafu, CreateDefaultSnafu, DecodeSnafu, EncodeSnafu, - NewRecordBatchSnafu, Result, UnexpectedSnafu, UnsupportedOperationSnafu, + CastVectorSnafu, CompatReaderSnafu, ComputeArrowSnafu, ConvertVectorSnafu, CreateDefaultSnafu, + DecodeSnafu, EncodeSnafu, NewRecordBatchSnafu, RecordBatchSnafu, Result, UnexpectedSnafu, + UnsupportedOperationSnafu, }; use crate::read::flat_projection::{FlatProjectionMapper, flat_projected_columns}; use crate::read::projection::{PrimaryKeyProjectionMapper, ProjectionMapper}; @@ -150,7 +152,7 @@ impl PrimaryKeyCompatBatch { batch = compat_pk.compat(batch)?; } if let Some(compat_fields) = &self.compat_fields { - batch = compat_fields.compat(batch); + batch = compat_fields.compat(batch)?; } Ok(batch) @@ -351,11 +353,13 @@ impl FlatCompatBatch { let old_column = batch.column(*pos); if let Some(ty) = cast_type { - // Safety: We ensure type can be converted and the new batch should be valid. - // Tips: `safe` must be true in `CastOptions`, which will replace the specific value with null when it cannot be converted. - let casted = + let casted = if let Some(json_type) = ty.as_json() { + align_json_array(old_column, &json_type.as_arrow_type()) + .context(RecordBatchSnafu)? + } else { datatypes::arrow::compute::cast(old_column, &ty.as_arrow_type()) - .context(ComputeArrowSnafu)?; + .context(ComputeArrowSnafu)? + }; Ok(casted) } else { Ok(old_column.clone()) @@ -452,8 +456,7 @@ struct CompatFields { impl CompatFields { /// Make fields of the `batch` compatible. 
- #[must_use] - fn compat(&self, batch: Batch) -> Batch { + fn compat(&self, batch: Batch) -> Result { debug_assert_eq!(self.actual_fields.len(), batch.fields().len()); debug_assert!( self.actual_fields @@ -463,24 +466,32 @@ impl CompatFields { ); let len = batch.num_rows(); - let fields = self - .index_or_defaults + self.index_or_defaults .iter() .map(|index_or_default| match index_or_default { IndexOrDefault::Index { pos, cast_type } => { let old_column = &batch.fields()[*pos]; let data = if let Some(ty) = cast_type { - // Safety: We ensure type can be converted and the new batch should be valid. - // Tips: `safe` must be true in `CastOptions`, which will replace the specific value with null when it cannot be converted. - old_column.data.cast(ty).unwrap() + if let Some(json_type) = ty.as_json() { + let json_array = old_column.data.to_arrow_array(); + let json_array = + align_json_array(&json_array, &json_type.as_arrow_type()) + .context(RecordBatchSnafu)?; + Helper::try_into_vector(&json_array).context(ConvertVectorSnafu)? + } else { + old_column.data.cast(ty).with_context(|_| CastVectorSnafu { + from: old_column.data.data_type(), + to: ty.clone(), + })? + } } else { old_column.data.clone() }; - BatchColumn { + Ok(BatchColumn { column_id: old_column.column_id, data, - } + }) } IndexOrDefault::DefaultValue { column_id, @@ -488,16 +499,14 @@ impl CompatFields { semantic_type: _, } => { let data = default_vector.replicate(&[len]); - BatchColumn { + Ok(BatchColumn { column_id: *column_id, data, - } + }) } }) - .collect(); - - // Safety: We ensure all columns have the same length and the new batch should be valid. - batch.with_fields(fields).unwrap() + .collect::>>() + .and_then(|fields| batch.with_fields(fields)) } } diff --git a/src/mito2/src/read/flat_projection.rs b/src/mito2/src/read/flat_projection.rs index 23257ef649..11e7ae26bc 100644 --- a/src/mito2/src/read/flat_projection.rs +++ b/src/mito2/src/read/flat_projection.rs @@ -39,7 +39,6 @@ use crate::sst::{ /// /// This mapper support duplicate and unsorted projection indices. /// The output schema is determined by the projection indices. -#[allow(dead_code)] pub struct FlatProjectionMapper { /// Metadata of the region. metadata: RegionMetadataRef, diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index 413f787980..22cc9fb3ba 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -49,33 +49,40 @@ pub struct PruneReader { context: FileRangeContextRef, source: Source, metrics: ReaderMetrics, + /// Whether to skip field filters for this row group. + skip_fields: bool, } impl PruneReader { pub(crate) fn new_with_row_group_reader( ctx: FileRangeContextRef, reader: RowGroupReader, + skip_fields: bool, ) -> Self { Self { context: ctx, source: Source::RowGroup(reader), metrics: Default::default(), + skip_fields, } } pub(crate) fn new_with_last_row_reader( ctx: FileRangeContextRef, reader: RowGroupLastRowCachedReader, + skip_fields: bool, ) -> Self { Self { context: ctx, source: Source::LastRow(reader), metrics: Default::default(), + skip_fields, } } - pub(crate) fn reset_source(&mut self, source: Source) { + pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) { self.source = source; + self.skip_fields = skip_fields; } /// Merge metrics with the inner reader and return the merged metrics. @@ -117,7 +124,7 @@ impl PruneReader { } let num_rows_before_filter = batch.num_rows(); - let Some(batch_filtered) = self.context.precise_filter(batch)? 
else { + let Some(batch_filtered) = self.context.precise_filter(batch, self.skip_fields)? else { // the entire batch is filtered out self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter; return Ok(None); @@ -257,17 +264,21 @@ pub struct FlatPruneReader { context: FileRangeContextRef, source: FlatSource, metrics: ReaderMetrics, + /// Whether to skip field filters for this row group. + skip_fields: bool, } impl FlatPruneReader { pub(crate) fn new_with_row_group_reader( ctx: FileRangeContextRef, reader: FlatRowGroupReader, + skip_fields: bool, ) -> Self { Self { context: ctx, source: FlatSource::RowGroup(reader), metrics: Default::default(), + skip_fields, } } @@ -309,7 +320,10 @@ impl FlatPruneReader { } let num_rows_before_filter = record_batch.num_rows(); - let Some(filtered_batch) = self.context.precise_filter_flat(record_batch)? else { + let Some(filtered_batch) = self + .context + .precise_filter_flat(record_batch, self.skip_fields)? + else { // the entire batch is filtered out self.metrics.filter_metrics.rows_precise_filtered += num_rows_before_filter; return Ok(None); diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 536c48e248..babdd43b0b 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -48,7 +48,7 @@ use crate::config::{DEFAULT_MAX_CONCURRENT_SCAN_FILES, DEFAULT_SCAN_CHANNEL_SIZE use crate::error::{InvalidPartitionExprSnafu, Result}; #[cfg(feature = "enterprise")] use crate::extension::{BoxedExtensionRange, BoxedExtensionRangeProvider}; -use crate::memtable::MemtableRange; +use crate::memtable::{MemtableRange, RangesOptions}; use crate::metrics::READ_SST_COUNT; use crate::read::compat::{self, CompatBatch, FlatCompatBatch, PrimaryKeyCompatBatch}; use crate::read::projection::ProjectionMapper; @@ -60,6 +60,7 @@ use crate::read::unordered_scan::UnorderedScan; use crate::read::{Batch, BoxedRecordBatchStream, RecordBatch, Source}; use crate::region::options::MergeMode; use crate::region::version::VersionRef; +use crate::sst::FormatType; use crate::sst::file::FileHandle; use crate::sst::index::bloom_filter::applier::{ BloomFilterIndexApplierBuilder, BloomFilterIndexApplierRef, @@ -68,6 +69,7 @@ use crate::sst::index::fulltext_index::applier::FulltextIndexApplierRef; use crate::sst::index::fulltext_index::applier::builder::FulltextIndexApplierBuilder; use crate::sst::index::inverted_index::applier::InvertedIndexApplierRef; use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder; +use crate::sst::parquet::file_range::PreFilterMode; use crate::sst::parquet::reader::ReaderMetrics; /// Parallel scan channel size for flat format. @@ -220,8 +222,6 @@ pub(crate) struct ScanRegion { /// Whether to filter out the deleted rows. /// Usually true for normal read, and false for scan for compaction. filter_deleted: bool, - /// Whether to use flat format. - flat_format: bool, #[cfg(feature = "enterprise")] extension_range_provider: Option, } @@ -246,7 +246,6 @@ impl ScanRegion { ignore_bloom_filter: false, start_time: None, filter_deleted: true, - flat_format: false, #[cfg(feature = "enterprise")] extension_range_provider: None, } @@ -303,13 +302,6 @@ impl ScanRegion { self.filter_deleted = filter_deleted; } - /// Sets whether to use flat format. 
- #[must_use] - pub(crate) fn with_flat_format(mut self, flat_format: bool) -> Self { - self.flat_format = flat_format; - self - } - #[cfg(feature = "enterprise")] pub(crate) fn set_extension_range_provider( &mut self, @@ -384,18 +376,24 @@ impl ScanRegion { self.request.distribution == Some(TimeSeriesDistribution::PerSeries) } + /// Returns true if the region use flat format. + fn use_flat_format(&self) -> bool { + self.version.options.sst_format.unwrap_or_default() == FormatType::Flat + } + /// Creates a scan input. async fn scan_input(mut self) -> Result { let sst_min_sequence = self.request.sst_min_sequence.and_then(NonZeroU64::new); let time_range = self.build_time_range_predicate(); let predicate = PredicateGroup::new(&self.version.metadata, &self.request.filters)?; + let flat_format = self.use_flat_format(); // The mapper always computes projected column ids as the schema of SSTs may change. let mapper = match &self.request.projection { Some(p) => { - ProjectionMapper::new(&self.version.metadata, p.iter().copied(), self.flat_format)? + ProjectionMapper::new(&self.version.metadata, p.iter().copied(), flat_format)? } - None => ProjectionMapper::all(&self.version.metadata, self.flat_format)?, + None => ProjectionMapper::all(&self.version.metadata, flat_format)?, }; let ssts = &self.version.ssts; @@ -426,6 +424,10 @@ impl ScanRegion { let memtables = self.version.memtables.list_memtables(); // Skip empty memtables and memtables out of time range. let mut mem_range_builders = Vec::new(); + let filter_mode = pre_filter_mode( + self.version.options.append_mode, + self.version.options.merge_mode(), + ); for m in memtables { // check if memtable is empty by reading stats. @@ -439,12 +441,13 @@ impl ScanRegion { } let ranges_in_memtable = m.ranges( Some(mapper.column_ids()), - predicate.clone(), - SequenceRange::new( - self.request.memtable_min_sequence, - self.request.memtable_max_sequence, - ), - false, + RangesOptions::default() + .with_predicate(predicate.clone()) + .with_sequence(SequenceRange::new( + self.request.memtable_min_sequence, + self.request.memtable_max_sequence, + )) + .with_pre_filter_mode(filter_mode), )?; mem_range_builders.extend(ranges_in_memtable.ranges.into_values().map(|v| { // todo: we should add stats to MemtableRange @@ -457,24 +460,32 @@ impl ScanRegion { let region_id = self.region_id(); debug!( - "Scan region {}, request: {:?}, time range: {:?}, memtables: {}, ssts_to_read: {}, append_mode: {}", + "Scan region {}, request: {:?}, time range: {:?}, memtables: {}, ssts_to_read: {}, append_mode: {}, flat_format: {}", region_id, self.request, time_range, mem_range_builders.len(), files.len(), self.version.options.append_mode, + flat_format, ); - // Remove field filters for LastNonNull mode after logging the request. 
- self.maybe_remove_field_filters(); - - let inverted_index_applier = self.build_invereted_index_applier(); - let bloom_filter_applier = self.build_bloom_filter_applier(); - let fulltext_index_applier = self.build_fulltext_index_applier(); + let (non_field_filters, field_filters) = self.partition_by_field_filters(); + let inverted_index_appliers = [ + self.build_invereted_index_applier(&non_field_filters), + self.build_invereted_index_applier(&field_filters), + ]; + let bloom_filter_appliers = [ + self.build_bloom_filter_applier(&non_field_filters), + self.build_bloom_filter_applier(&field_filters), + ]; + let fulltext_index_appliers = [ + self.build_fulltext_index_applier(&non_field_filters), + self.build_fulltext_index_applier(&field_filters), + ]; let predicate = PredicateGroup::new(&self.version.metadata, &self.request.filters)?; - if self.flat_format { + if flat_format { // The batch is already large enough so we use a small channel size here. self.parallel_scan_channel_size = FLAT_SCAN_CHANNEL_SIZE; } @@ -485,9 +496,9 @@ impl ScanRegion { .with_memtables(mem_range_builders) .with_files(files) .with_cache(self.cache_strategy) - .with_inverted_index_applier(inverted_index_applier) - .with_bloom_filter_index_applier(bloom_filter_applier) - .with_fulltext_index_applier(fulltext_index_applier) + .with_inverted_index_appliers(inverted_index_appliers) + .with_bloom_filter_index_appliers(bloom_filter_appliers) + .with_fulltext_index_appliers(fulltext_index_appliers) .with_parallel_scan_channel_size(self.parallel_scan_channel_size) .with_max_concurrent_scan_files(self.max_concurrent_scan_files) .with_start_time(self.start_time) @@ -496,7 +507,7 @@ impl ScanRegion { .with_merge_mode(self.version.options.merge_mode()) .with_series_row_selector(self.request.series_row_selector) .with_distribution(self.request.distribution) - .with_flat_format(self.flat_format); + .with_flat_format(flat_format); #[cfg(feature = "enterprise")] let input = if let Some(provider) = self.extension_range_provider { @@ -527,40 +538,34 @@ impl ScanRegion { build_time_range_predicate(&time_index.column_schema.name, unit, &self.request.filters) } - /// Remove field filters if the merge mode is [MergeMode::LastNonNull]. - fn maybe_remove_field_filters(&mut self) { - if self.version.options.merge_mode() != MergeMode::LastNonNull { - return; - } - - // TODO(yingwen): We can ignore field filters only when there are multiple sources in the same time window. + /// Partitions filters into two groups: non-field filters and field filters. + /// Returns `(non_field_filters, field_filters)`. + fn partition_by_field_filters(&self) -> (Vec, Vec) { let field_columns = self .version .metadata .field_columns() .map(|col| &col.column_schema.name) .collect::>(); - // Columns in the expr. + let mut columns = HashSet::new(); - self.request.filters.retain(|expr| { + self.request.filters.iter().cloned().partition(|expr| { columns.clear(); // `expr_to_columns` won't return error. if expr_to_columns(expr, &mut columns).is_err() { - return false; + // If we can't extract columns, treat it as non-field filter + return true; } - for column in &columns { - if field_columns.contains(&column.name) { - // This expr uses the field column. - return false; - } - } - true - }); + // Return true for non-field filters (partition puts true cases in first vec) + !columns + .iter() + .any(|column| field_columns.contains(&column.name)) + }) } /// Use the latest schema to build the inverted index applier. 
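// (Illustrative note: each applier array introduced above is indexed by filter
// kind, [0] built from the non-field filters and [1] from the field filters
// returned by partition_by_field_filters(); this replaces the old behaviour of
// simply dropping field filters when merge mode is LastNonNull, keeping field
// predicates available to the index appliers as a separate group.)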
- fn build_invereted_index_applier(&self) -> Option { + fn build_invereted_index_applier(&self, filters: &[Expr]) -> Option { if self.ignore_inverted_index { return None; } @@ -588,7 +593,7 @@ impl ScanRegion { .with_file_cache(file_cache) .with_inverted_index_cache(inverted_index_cache) .with_puffin_metadata_cache(puffin_metadata_cache) - .build(&self.request.filters) + .build(filters) .inspect_err(|err| warn!(err; "Failed to build invereted index applier")) .ok() .flatten() @@ -596,7 +601,7 @@ impl ScanRegion { } /// Use the latest schema to build the bloom filter index applier. - fn build_bloom_filter_applier(&self) -> Option { + fn build_bloom_filter_applier(&self, filters: &[Expr]) -> Option { if self.ignore_bloom_filter { return None; } @@ -615,7 +620,7 @@ impl ScanRegion { .with_file_cache(file_cache) .with_bloom_filter_index_cache(bloom_filter_index_cache) .with_puffin_metadata_cache(puffin_metadata_cache) - .build(&self.request.filters) + .build(filters) .inspect_err(|err| warn!(err; "Failed to build bloom filter index applier")) .ok() .flatten() @@ -623,7 +628,7 @@ impl ScanRegion { } /// Use the latest schema to build the fulltext index applier. - fn build_fulltext_index_applier(&self) -> Option { + fn build_fulltext_index_applier(&self, filters: &[Expr]) -> Option { if self.ignore_fulltext_index { return None; } @@ -641,7 +646,7 @@ impl ScanRegion { .with_file_cache(file_cache) .with_puffin_metadata_cache(puffin_metadata_cache) .with_bloom_filter_cache(bloom_filter_index_cache) - .build(&self.request.filters) + .build(filters) .inspect_err(|err| warn!(err; "Failed to build fulltext index applier")) .ok() .flatten() @@ -685,9 +690,9 @@ pub struct ScanInput { /// Maximum number of SST files to scan concurrently. pub(crate) max_concurrent_scan_files: usize, /// Index appliers. - inverted_index_applier: Option, - bloom_filter_index_applier: Option, - fulltext_index_applier: Option, + inverted_index_appliers: [Option; 2], + bloom_filter_index_appliers: [Option; 2], + fulltext_index_appliers: [Option; 2], /// Start time of the query. pub(crate) query_start: Option, /// The region is using append mode. @@ -724,9 +729,9 @@ impl ScanInput { ignore_file_not_found: false, parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, - inverted_index_applier: None, - bloom_filter_index_applier: None, - fulltext_index_applier: None, + inverted_index_appliers: [None, None], + bloom_filter_index_appliers: [None, None], + fulltext_index_appliers: [None, None], query_start: None, append_mode: false, filter_deleted: true, @@ -803,33 +808,33 @@ impl ScanInput { self } - /// Sets invereted index applier. + /// Sets inverted index appliers. #[must_use] - pub(crate) fn with_inverted_index_applier( + pub(crate) fn with_inverted_index_appliers( mut self, - applier: Option, + appliers: [Option; 2], ) -> Self { - self.inverted_index_applier = applier; + self.inverted_index_appliers = appliers; self } - /// Sets bloom filter applier. + /// Sets bloom filter appliers. #[must_use] - pub(crate) fn with_bloom_filter_index_applier( + pub(crate) fn with_bloom_filter_index_appliers( mut self, - applier: Option, + appliers: [Option; 2], ) -> Self { - self.bloom_filter_index_applier = applier; + self.bloom_filter_index_appliers = appliers; self } - /// Sets fulltext index applier. + /// Sets fulltext index appliers. 
#[must_use] - pub(crate) fn with_fulltext_index_applier( + pub(crate) fn with_fulltext_index_appliers( mut self, - applier: Option, + appliers: [Option; 2], ) -> Self { - self.fulltext_index_applier = applier; + self.fulltext_index_appliers = appliers; self } @@ -952,18 +957,20 @@ impl ScanInput { reader_metrics: &mut ReaderMetrics, ) -> Result { let predicate = self.predicate_for_file(file); + let filter_mode = pre_filter_mode(self.append_mode, self.merge_mode); let res = self .access_layer .read_sst(file.clone()) .predicate(predicate) .projection(Some(self.mapper.column_ids().to_vec())) .cache(self.cache_strategy.clone()) - .inverted_index_applier(self.inverted_index_applier.clone()) - .bloom_filter_index_applier(self.bloom_filter_index_applier.clone()) - .fulltext_index_applier(self.fulltext_index_applier.clone()) + .inverted_index_appliers(self.inverted_index_appliers.clone()) + .bloom_filter_index_appliers(self.bloom_filter_index_appliers.clone()) + .fulltext_index_appliers(self.fulltext_index_appliers.clone()) .expected_metadata(Some(self.mapper.metadata().clone())) .flat_format(self.flat_format) .compaction(self.compaction) + .pre_filter_mode(filter_mode) .build_reader_input(reader_metrics) .await; let (mut file_range_ctx, selection) = match res { @@ -1106,9 +1113,8 @@ impl ScanInput { rows } - /// Returns table predicate of all exprs. - pub(crate) fn predicate(&self) -> Option<&Predicate> { - self.predicate.predicate() + pub(crate) fn predicate_group(&self) -> &PredicateGroup { + &self.predicate } /// Returns number of memtables to scan. @@ -1156,6 +1162,17 @@ impl ScanInput { } } +fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { + if append_mode { + return PreFilterMode::All; + } + + match merge_mode { + MergeMode::LastRow => PreFilterMode::SkipFieldsOnDelete, + MergeMode::LastNonNull => PreFilterMode::SkipFields, + } +} + /// Context shared by different streams from a scanner. /// It contains the input and ranges to scan. pub struct StreamContext { diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index fd7d4329ca..de8875c4f6 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -14,13 +14,17 @@ //! Utilities for scanners. +use std::collections::VecDeque; use std::fmt; +use std::pin::Pin; use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use async_stream::try_stream; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time}; use datatypes::arrow::record_batch::RecordBatch; +use datatypes::timestamp::timestamp_array_to_primitive; use futures::Stream; use prometheus::IntGauge; use smallvec::SmallVec; @@ -33,11 +37,13 @@ use crate::metrics::{ IN_PROGRESS_SCAN, PRECISE_FILTER_ROWS_TOTAL, READ_BATCHES_RETURN, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_RETURN, READ_STAGE_ELAPSED, }; -use crate::read::range::{RangeBuilderList, RowGroupIndex}; +use crate::read::range::{RangeBuilderList, RangeMeta, RowGroupIndex}; use crate::read::scan_region::StreamContext; use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source}; use crate::sst::file::FileTimeRange; +use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE; use crate::sst::parquet::file_range::FileRange; +use crate::sst::parquet::flat_format::time_index_column_index; use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics}; /// Verbose scan metrics for a partition. 
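// (Illustrative expectation for the pre_filter_mode() helper added in
// scan_region.rs above, mirroring its match arms:
//   append_mode = true                          -> PreFilterMode::All
//   append_mode = false, MergeMode::LastRow     -> PreFilterMode::SkipFieldsOnDelete
//   append_mode = false, MergeMode::LastNonNull -> PreFilterMode::SkipFields)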
@@ -426,6 +432,8 @@ struct PartitionMetricsInner { yield_cost: Time, /// Duration to convert [`Batch`]es. convert_cost: Time, + /// Aggregated compute time reported to DataFusion. + elapsed_compute: Time, } impl PartitionMetricsInner { @@ -526,6 +534,7 @@ impl PartitionMetrics { scan_cost: MetricBuilder::new(metrics_set).subset_time("scan_cost", partition), yield_cost: MetricBuilder::new(metrics_set).subset_time("yield_cost", partition), convert_cost: MetricBuilder::new(metrics_set).subset_time("convert_cost", partition), + elapsed_compute: MetricBuilder::new(metrics_set).elapsed_compute(partition), }; Self(Arc::new(inner)) } @@ -545,6 +554,13 @@ impl PartitionMetrics { metrics.num_file_ranges += num; } + fn record_elapsed_compute(&self, duration: Duration) { + if duration.is_zero() { + return; + } + self.0.elapsed_compute.add_duration(duration); + } + /// Merges `build_reader_cost`. pub(crate) fn inc_build_reader_cost(&self, cost: Duration) { self.0.build_reader_cost.add_duration(cost); @@ -555,6 +571,7 @@ impl PartitionMetrics { pub(crate) fn inc_convert_batch_cost(&self, cost: Duration) { self.0.convert_cost.add_duration(cost); + self.record_elapsed_compute(cost); } /// Reports memtable scan metrics. @@ -572,7 +589,9 @@ impl PartitionMetrics { .build_reader_cost .add_duration(metrics.build_reader_cost); self.0.scan_cost.add_duration(metrics.scan_cost); + self.record_elapsed_compute(metrics.scan_cost); self.0.yield_cost.add_duration(metrics.yield_cost); + self.record_elapsed_compute(metrics.yield_cost); let mut metrics_set = self.0.metrics.lock().unwrap(); metrics_set.merge_scanner_metrics(metrics); @@ -657,7 +676,6 @@ pub(crate) fn scan_mem_ranges( } /// Scans memtable ranges at `index` using flat format that returns RecordBatch. -#[allow(dead_code)] pub(crate) fn scan_flat_mem_ranges( stream_ctx: Arc, part_metrics: PartitionMetrics, @@ -685,6 +703,71 @@ pub(crate) fn scan_flat_mem_ranges( } } +/// Files with row count greater than this threshold can contribute to the estimation. +const SPLIT_ROW_THRESHOLD: u64 = DEFAULT_ROW_GROUP_SIZE as u64; +/// Number of series threshold for splitting batches. +const NUM_SERIES_THRESHOLD: u64 = 10240; +/// Minimum batch size after splitting. The batch size is less than 60 because a series may only have +/// 60 samples per hour. +const BATCH_SIZE_THRESHOLD: u64 = 50; + +/// Returns true if splitting flat record batches may improve merge performance. +pub(crate) fn should_split_flat_batches_for_merge( + stream_ctx: &Arc, + range_meta: &RangeMeta, +) -> bool { + // Number of files to split and scan. + let mut num_files_to_split = 0; + let mut num_mem_rows = 0; + let mut num_mem_series = 0; + // Checks each file range, returns early if any range is not splittable. + // For mem ranges, we collect the total number of rows and series because the number of rows in a + // mem range may be too small. + for index in &range_meta.row_group_indices { + if stream_ctx.is_mem_range_index(*index) { + let memtable = &stream_ctx.input.memtables[index.index]; + // Is mem range + let stats = memtable.stats(); + num_mem_rows += stats.num_rows(); + num_mem_series += stats.series_count(); + } else if stream_ctx.is_file_range_index(*index) { + // This is a file range. + let file_index = index.index - stream_ctx.input.num_memtables(); + let file = &stream_ctx.input.files[file_index]; + if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD || file.meta_ref().num_series == 0 { + // If the file doesn't have enough rows, or the number of series is unavailable, skips it. 
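// (Illustrative note: files below SPLIT_ROW_THRESHOLD, i.e. smaller than one
// default row group, neither veto nor enable splitting; they are simply left
// out of the estimate.)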
+ continue; + } + debug_assert!(file.meta_ref().num_rows > 0); + if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) { + // We can't split batches in a file. + return false; + } else { + num_files_to_split += 1; + } + } + // Skips non-file and non-mem ranges. + } + + if num_files_to_split > 0 { + // We mainly consider file ranges because they have enough data for sampling. + true + } else if num_mem_series > 0 && num_mem_rows > 0 { + // If we don't have files to scan, we check whether to split by the memtable. + can_split_series(num_mem_rows as u64, num_mem_series as u64) + } else { + false + } +} + +fn can_split_series(num_rows: u64, num_series: u64) -> bool { + assert!(num_series > 0); + assert!(num_rows > 0); + + // It doesn't have too many series or it will have enough rows for each batch. + num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD +} + /// Scans file ranges at `index`. pub(crate) async fn scan_file_ranges( stream_ctx: Arc, @@ -709,7 +792,6 @@ pub(crate) async fn scan_file_ranges( } /// Scans file ranges at `index` using flat reader that returns RecordBatch. -#[allow(dead_code)] pub(crate) async fn scan_flat_file_ranges( stream_ctx: Arc, part_metrics: PartitionMetrics, @@ -851,7 +933,6 @@ pub(crate) async fn maybe_scan_other_ranges( } } -#[allow(dead_code)] pub(crate) async fn maybe_scan_flat_other_ranges( context: &Arc, index: RowGroupIndex, @@ -866,3 +947,83 @@ pub(crate) async fn maybe_scan_flat_other_ranges( } .fail() } + +/// A stream wrapper that splits record batches from an inner stream. +pub(crate) struct SplitRecordBatchStream { + /// The inner stream that yields record batches. + inner: S, + /// Buffer for split batches. + batches: VecDeque, +} + +impl SplitRecordBatchStream { + /// Creates a new splitting stream wrapper. + pub(crate) fn new(inner: S) -> Self { + Self { + inner, + batches: VecDeque::new(), + } + } +} + +impl Stream for SplitRecordBatchStream +where + S: Stream> + Unpin, +{ + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + // First, check if we have buffered split batches + if let Some(batch) = self.batches.pop_front() { + return Poll::Ready(Some(Ok(batch))); + } + + // Poll the inner stream for the next batch + let record_batch = match futures::ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(batch)) => batch, + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => return Poll::Ready(None), + }; + + // Split the batch and buffer the results + split_record_batch(record_batch, &mut self.batches); + // Continue the loop to return the first split batch + } + } +} + +/// Splits the batch by timestamps. +/// +/// # Panics +/// Panics if the timestamp array is invalid. +pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeque) { + let batch_rows = record_batch.num_rows(); + if batch_rows == 0 { + return; + } + if batch_rows < 2 { + batches.push_back(record_batch); + return; + } + + let time_index_pos = time_index_column_index(record_batch.num_columns()); + let timestamps = record_batch.column(time_index_pos); + let (ts_values, _unit) = timestamp_array_to_primitive(timestamps).unwrap(); + let mut offsets = Vec::with_capacity(16); + offsets.push(0); + let values = ts_values.values(); + for (i, &value) in values.iter().take(batch_rows - 1).enumerate() { + if value > values[i + 1] { + offsets.push(i + 1); + } + } + offsets.push(values.len()); + + // Splits the batch by offsets. 
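// (Illustrative note on the invariant assumed by this split: within a single
// series the time index of a flat batch is sorted ascending, so a decrease
// between adjacent timestamps marks a series boundary; `offsets` then records
// each boundary plus the total length, and every slice produced below covers
// one run of non-decreasing timestamps.)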
+ for (i, &start) in offsets[..offsets.len() - 1].iter().enumerate() { + let end = offsets[i + 1]; + let rows_in_batch = end - start; + batches.push_back(record_batch.slice(start, rows_in_batch)); + } +} diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index 631c40b42a..8df8d6fb48 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -44,8 +44,9 @@ use crate::read::merge::MergeReaderBuilder; use crate::read::range::{RangeBuilderList, RangeMeta}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, scan_file_ranges, scan_flat_file_ranges, - scan_flat_mem_ranges, scan_mem_ranges, + PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_file_ranges, + scan_flat_file_ranges, scan_flat_mem_ranges, scan_mem_ranges, + should_split_flat_batches_for_merge, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; use crate::read::{ @@ -177,6 +178,7 @@ impl SeqScan { part_metrics, range_builder_list.clone(), &mut sources, + None, ) .await?; } @@ -210,6 +212,7 @@ impl SeqScan { part_metrics, range_builder_list.clone(), &mut sources, + None, ) .await?; } @@ -378,6 +381,7 @@ impl SeqScan { let partition_ranges = self.properties.partitions[partition].clone(); let compaction = self.stream_ctx.input.compaction; let distinguish_range = self.properties.distinguish_partition_range; + let file_scan_semaphore = if compaction { None } else { semaphore.clone() }; let stream = try_stream! { part_metrics.on_first_poll(); @@ -399,6 +403,7 @@ impl SeqScan { &part_metrics, range_builder_list.clone(), &mut sources, + file_scan_semaphore.clone(), ).await?; let mut metrics = ScannerMetrics::default(); @@ -475,6 +480,7 @@ impl SeqScan { let semaphore = self.new_semaphore(); let partition_ranges = self.properties.partitions[partition].clone(); let compaction = self.stream_ctx.input.compaction; + let file_scan_semaphore = if compaction { None } else { semaphore.clone() }; let stream = try_stream! { part_metrics.on_first_poll(); @@ -493,6 +499,7 @@ impl SeqScan { &part_metrics, range_builder_list.clone(), &mut sources, + file_scan_semaphore.clone(), ).await?; let mut metrics = ScannerMetrics::default(); @@ -602,6 +609,10 @@ impl SeqScan { } impl RegionScanner for SeqScan { + fn name(&self) -> &str { + "SeqScan" + } + fn properties(&self) -> &ScannerProperties { &self.properties } @@ -632,8 +643,12 @@ impl RegionScanner for SeqScan { Ok(()) } - fn has_predicate(&self) -> bool { - let predicate = self.stream_ctx.input.predicate(); + fn has_predicate_without_region(&self) -> bool { + let predicate = self + .stream_ctx + .input + .predicate_group() + .predicate_without_region(); predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) } @@ -678,6 +693,7 @@ pub(crate) async fn build_sources( part_metrics: &PartitionMetrics, range_builder_list: Arc, sources: &mut Vec, + semaphore: Option>, ) -> Result<()> { // Gets range meta. 
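// (Overview note for the change below, summarizing this patch: file ranges may
// now be opened concurrently when a semaphore is passed in, each task writes
// its stream back into `ordered_sources` at its original position so merge
// order is unchanged, and compaction keeps the old sequential path by passing
// `None`.)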
let range_meta = &stream_ctx.ranges[part_range.identifier]; @@ -695,35 +711,78 @@ pub(crate) async fn build_sources( } } - sources.reserve(range_meta.row_group_indices.len()); - for index in &range_meta.row_group_indices { - let stream = if stream_ctx.is_mem_range_index(*index) { + let read_type = if compaction { + "compaction" + } else { + "seq_scan_files" + }; + let num_indices = range_meta.row_group_indices.len(); + if num_indices == 0 { + return Ok(()); + } + + sources.reserve(num_indices); + let mut ordered_sources = Vec::with_capacity(num_indices); + ordered_sources.resize_with(num_indices, || None); + let mut file_scan_tasks = Vec::new(); + + for (position, index) in range_meta.row_group_indices.iter().enumerate() { + if stream_ctx.is_mem_range_index(*index) { let stream = scan_mem_ranges( stream_ctx.clone(), part_metrics.clone(), *index, range_meta.time_range, ); - Box::pin(stream) as _ + ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); } else if stream_ctx.is_file_range_index(*index) { - let read_type = if compaction { - "compaction" + if let Some(semaphore_ref) = semaphore.as_ref() { + // run in parallel, controlled by semaphore + let stream_ctx = stream_ctx.clone(); + let part_metrics = part_metrics.clone(); + let range_builder_list = range_builder_list.clone(); + let semaphore = Arc::clone(semaphore_ref); + let row_group_index = *index; + file_scan_tasks.push(async move { + let _permit = semaphore.acquire().await.unwrap(); + let stream = scan_file_ranges( + stream_ctx, + part_metrics, + row_group_index, + read_type, + range_builder_list, + ) + .await?; + Ok((position, Source::Stream(Box::pin(stream) as _))) + }); } else { - "seq_scan_files" - }; - let stream = scan_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - read_type, - range_builder_list.clone(), - ) - .await?; - Box::pin(stream) as _ + // no semaphore, run sequentially + let stream = scan_file_ranges( + stream_ctx.clone(), + part_metrics.clone(), + *index, + read_type, + range_builder_list.clone(), + ) + .await?; + ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); + } } else { - scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await? - }; - sources.push(Source::Stream(stream)); + let stream = + scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await?; + ordered_sources[position] = Some(Source::Stream(stream)); + } + } + + if !file_scan_tasks.is_empty() { + let results = futures::future::try_join_all(file_scan_tasks).await?; + for (position, source) in results { + ordered_sources[position] = Some(source); + } + } + + for source in ordered_sources.into_iter().flatten() { + sources.push(source); } Ok(()) } @@ -736,6 +795,7 @@ pub(crate) async fn build_flat_sources( part_metrics: &PartitionMetrics, range_builder_list: Arc, sources: &mut Vec, + semaphore: Option>, ) -> Result<()> { // Gets range meta. 
let range_meta = &stream_ctx.ranges[part_range.identifier]; @@ -753,31 +813,89 @@ pub(crate) async fn build_flat_sources( } } - sources.reserve(range_meta.row_group_indices.len()); - for index in &range_meta.row_group_indices { - let stream = if stream_ctx.is_mem_range_index(*index) { - let stream = scan_flat_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index); - Box::pin(stream) as _ - } else if stream_ctx.is_file_range_index(*index) { - let read_type = if compaction { - "compaction" - } else { - "seq_scan_files" - }; - let stream = scan_flat_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - read_type, - range_builder_list.clone(), - ) - .await?; - Box::pin(stream) as _ - } else { - scan_util::maybe_scan_flat_other_ranges(stream_ctx, *index, part_metrics).await? - }; - sources.push(stream); + let read_type = if compaction { + "compaction" + } else { + "seq_scan_files" + }; + let num_indices = range_meta.row_group_indices.len(); + if num_indices == 0 { + return Ok(()); } + + let should_split = should_split_flat_batches_for_merge(stream_ctx, range_meta); + sources.reserve(num_indices); + let mut ordered_sources = Vec::with_capacity(num_indices); + ordered_sources.resize_with(num_indices, || None); + let mut file_scan_tasks = Vec::new(); + + for (position, index) in range_meta.row_group_indices.iter().enumerate() { + if stream_ctx.is_mem_range_index(*index) { + let stream = scan_flat_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index); + ordered_sources[position] = Some(Box::pin(stream) as _); + } else if stream_ctx.is_file_range_index(*index) { + if let Some(semaphore_ref) = semaphore.as_ref() { + // run in parallel, controlled by semaphore + let stream_ctx = stream_ctx.clone(); + let part_metrics = part_metrics.clone(); + let range_builder_list = range_builder_list.clone(); + let semaphore = Arc::clone(semaphore_ref); + let row_group_index = *index; + file_scan_tasks.push(async move { + let _permit = semaphore.acquire().await.unwrap(); + let stream = scan_flat_file_ranges( + stream_ctx, + part_metrics, + row_group_index, + read_type, + range_builder_list, + ) + .await?; + Ok((position, Box::pin(stream) as _)) + }); + } else { + // no semaphore, run sequentially + let stream = scan_flat_file_ranges( + stream_ctx.clone(), + part_metrics.clone(), + *index, + read_type, + range_builder_list.clone(), + ) + .await?; + ordered_sources[position] = Some(Box::pin(stream) as _); + } + } else { + let stream = + scan_util::maybe_scan_flat_other_ranges(stream_ctx, *index, part_metrics).await?; + ordered_sources[position] = Some(stream); + } + } + + if !file_scan_tasks.is_empty() { + let results = futures::future::try_join_all(file_scan_tasks).await?; + for (position, stream) in results { + ordered_sources[position] = Some(stream); + } + } + + for stream in ordered_sources.into_iter().flatten() { + if should_split { + sources.push(Box::pin(SplitRecordBatchStream::new(stream))); + } else { + sources.push(stream); + } + } + + if should_split { + common_telemetry::debug!( + "Splitting record batches, region: {}, sources: {}, part_range: {:?}", + stream_ctx.input.region_metadata().region_id, + sources.len(), + part_range, + ); + } + Ok(()) } diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index 3a006dcb67..ecb40d438b 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -284,6 +284,10 @@ fn new_channel_list(num_partitions: usize) -> (SenderList, ReceiverList) { } impl RegionScanner for SeriesScan { + 
fn name(&self) -> &str { + "SeriesScan" + } + fn properties(&self) -> &ScannerProperties { &self.properties } @@ -314,8 +318,12 @@ impl RegionScanner for SeriesScan { Ok(()) } - fn has_predicate(&self) -> bool { - let predicate = self.stream_ctx.input.predicate(); + fn has_predicate_without_region(&self) -> bool { + let predicate = self + .stream_ctx + .input + .predicate_group() + .predicate_without_region(); predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) } @@ -419,6 +427,7 @@ impl SeriesDistributor { &part_metrics, range_builder_list.clone(), &mut sources, + self.semaphore.clone(), ) .await?; } @@ -503,6 +512,7 @@ impl SeriesDistributor { &part_metrics, range_builder_list.clone(), &mut sources, + self.semaphore.clone(), ) .await?; } diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs index b51f9f5fcc..e621c77e36 100644 --- a/src/mito2/src/read/stream.rs +++ b/src/mito2/src/read/stream.rs @@ -109,7 +109,10 @@ impl ConvertBatchStream { compute::concat_batches(output_schema.arrow_schema(), &self.buffer) .context(ArrowComputeSnafu)?; - RecordBatch::try_from_df_record_batch(output_schema, record_batch) + Ok(RecordBatch::from_df_record_batch( + output_schema, + record_batch, + )) } ScanBatch::RecordBatch(df_record_batch) => { // Safety: Only flat format returns this batch. diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index 4dc5d59b98..c0a48f60da 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -399,6 +399,10 @@ impl UnorderedScan { } impl RegionScanner for UnorderedScan { + fn name(&self) -> &str { + "UnorderedScan" + } + fn properties(&self) -> &ScannerProperties { &self.properties } @@ -427,8 +431,14 @@ impl RegionScanner for UnorderedScan { .map_err(BoxedError::new) } - fn has_predicate(&self) -> bool { - let predicate = self.stream_ctx.input.predicate(); + /// If this scanner have predicate other than region partition exprs + fn has_predicate_without_region(&self) -> bool { + let predicate = self + .stream_ctx + .input + .predicate_group() + .predicate_without_region(); + predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) } diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index aac7090174..2a85c40677 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -14,6 +14,7 @@ //! Mito region. +pub mod catchup; pub mod opener; pub mod options; pub(crate) mod version; @@ -21,7 +22,7 @@ pub(crate) mod version; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; use common_telemetry::{error, info, warn}; use crossbeam_utils::atomic::AtomicCell; @@ -46,10 +47,8 @@ use crate::manifest::action::{ RegionChange, RegionManifest, RegionMetaAction, RegionMetaActionList, }; use crate::manifest::manager::RegionManifestManager; -use crate::memtable::MemtableBuilderRef; use crate::region::version::{VersionControlRef, VersionRef}; use crate::request::{OnFailure, OptionOutputTx}; -use crate::sst::FormatType; use crate::sst::file_purger::FilePurgerRef; use crate::sst::location::{index_file_path, sst_file_path}; use crate::time_provider::TimeProviderRef; @@ -78,6 +77,8 @@ pub enum RegionLeaderState { Writable, /// The region is in staging mode - writable but no checkpoint/compaction. Staging, + /// The region is entering staging mode. - write requests will be stalled. 
+ EnteringStaging, /// The region is altering. Altering, /// The region is dropping. @@ -139,10 +140,14 @@ pub struct MitoRegion { pub(crate) topic_latest_entry_id: AtomicU64, /// The total bytes written to the region. pub(crate) written_bytes: Arc, - /// Memtable builder for the region. - pub(crate) memtable_builder: MemtableBuilderRef, - /// Format type of the SST file. - pub(crate) sst_format: FormatType, + /// The partition expression of the region in staging mode. + /// + /// During the staging mode, the region metadata in [`VersionControlRef`] is not updated, + /// so we need to store the partition expression separately. + /// TODO(weny): + /// 1. Reload the staging partition expr during region open. + /// 2. Rejects requests with mismatching partition expr. + pub(crate) staging_partition_expr: Mutex>, /// manifest stats stats: ManifestStats, } @@ -199,11 +204,6 @@ impl MitoRegion { self.last_compaction_millis.load(Ordering::Relaxed) } - /// Returns format type of the SST file. - pub(crate) fn sst_format(&self) -> FormatType { - self.sst_format - } - /// Update compaction time to current time. pub(crate) fn update_compaction_millis(&self) { let now = self.time_provider.current_time_millis(); @@ -336,11 +336,19 @@ impl MitoRegion { ) } + /// Sets the entering staging state. + pub(crate) fn set_entering_staging(&self) -> Result<()> { + self.compare_exchange_state( + RegionLeaderState::Writable, + RegionRoleState::Leader(RegionLeaderState::EnteringStaging), + ) + } + /// Exits the staging state back to writable. /// /// You should call this method in the worker loop. /// Transitions from Staging to Writable state. - fn exit_staging(&self) -> Result<()> { + pub fn exit_staging(&self) -> Result<()> { self.compare_exchange_state( RegionLeaderState::Staging, RegionRoleState::Leader(RegionLeaderState::Writable), @@ -459,17 +467,15 @@ impl MitoRegion { if self.state() == RegionRoleState::Leader(RegionLeaderState::Writable) { // Persist backfilled metadata if manifest is missing fields (e.g., partition_expr) let manifest_meta = &manager.manifest().metadata; - let current_meta = &self.version().metadata; + let current_version = self.version(); + let current_meta = ¤t_version.metadata; if manifest_meta.partition_expr.is_none() && current_meta.partition_expr.is_some() { let action = RegionMetaAction::Change(RegionChange { metadata: current_meta.clone(), - sst_format: self.sst_format(), + sst_format: current_version.options.sst_format.unwrap_or_default(), }); let result = manager - .update( - RegionMetaActionList::with_action(action), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) + .update(RegionMetaActionList::with_action(action), false) .await; match result { @@ -501,6 +507,16 @@ impl MitoRegion { } } + /// Switches the region state to `RegionRoleState::Leader(RegionLeaderState::Staging)` if the current state is `expect`. + /// Otherwise, logs an error. + pub(crate) fn switch_state_to_staging(&self, expect: RegionLeaderState) { + if let Err(e) = + self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Staging)) + { + error!(e; "failed to switch region state to staging, expect state is {:?}", expect); + } + } + /// Returns the region statistic. 
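// (Illustrative summary of the leader-state transitions wired in this patch:
// set_entering_staging() moves Writable -> EnteringStaging,
// switch_state_to_staging(expect) moves the region from the expected state on
// to Staging, and exit_staging() returns Staging -> Writable; write requests
// stall while the region is EnteringStaging.)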
pub(crate) fn region_statistic(&self) -> RegionStatistic { let version = self.version(); @@ -516,6 +532,7 @@ impl MitoRegion { let num_rows = version.ssts.num_rows() + version.memtables.num_rows(); let num_files = version.ssts.num_files(); let manifest_version = self.stats.manifest_version(); + let file_removed_cnt = self.stats.file_removed_cnt(); let topic_latest_entry_id = self.topic_latest_entry_id.load(Ordering::Relaxed); let written_bytes = self.written_bytes.load(Ordering::Relaxed); @@ -531,6 +548,7 @@ impl MitoRegion { manifest: RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt, }, data_topic_latest_entry_id: topic_latest_entry_id, metadata_topic_latest_entry_id: topic_latest_entry_id, @@ -565,6 +583,10 @@ impl MitoRegion { Ok(()) } + pub fn access_layer(&self) -> AccessLayerRef { + self.access_layer.clone() + } + /// Returns the SST entries of the region. pub async fn manifest_sst_entries(&self) -> Vec { let table_dir = self.table_dir(); @@ -578,19 +600,34 @@ impl MitoRegion { .flat_map(|level| level.files().map(|file| file.file_id().file_id())) .collect::>(); - self.manifest_ctx - .manifest() + let manifest_files = self.manifest_ctx.manifest().await.files.clone(); + let staging_files = self + .manifest_ctx + .staging_manifest() .await - .files + .map(|m| m.files.clone()) + .unwrap_or_default(); + let files = manifest_files + .into_iter() + .chain(staging_files.into_iter()) + .collect::>(); + + files .values() .map(|meta| { let region_id = self.region_id; let origin_region_id = meta.region_id; - let (index_file_path, index_file_size) = if meta.index_file_size > 0 { - let index_file_path = index_file_path(table_dir, meta.file_id(), path_type); - (Some(index_file_path), Some(meta.index_file_size)) + let (index_file_id, index_file_path, index_file_size) = if meta.index_file_size > 0 + { + let index_file_path = + index_file_path(table_dir, meta.index_file_id(), path_type); + ( + Some(meta.index_file_id().file_id().to_string()), + Some(index_file_path), + Some(meta.index_file_size), + ) } else { - (None, None) + (None, None, None) }; let visible = visible_ssts.contains(&meta.file_id); ManifestSstEntry { @@ -601,6 +638,7 @@ impl MitoRegion { region_group: region_id.region_group(), region_sequence: region_id.region_sequence(), file_id: meta.file_id.to_string(), + index_file_id, level: meta.level, file_path: sst_file_path(table_dir, meta.file_id(), path_type), file_size: meta.file_size, @@ -608,6 +646,7 @@ impl MitoRegion { index_file_size, num_rows: meta.num_rows, num_row_groups: meta.num_row_groups, + num_series: Some(meta.num_series), min_ts: meta.time_range.0, max_ts: meta.time_range.1, sequence: meta.sequence.map(|s| s.get()), @@ -649,9 +688,8 @@ impl MitoRegion { }; // Submit merged actions using the manifest manager's update method - // Pass the target state (Writable) so it saves to normal directory, not staging - let target_state = RegionRoleState::Leader(RegionLeaderState::Writable); - let new_version = manager.update(merged_actions.clone(), target_state).await?; + // Pass the `false` so it saves to normal directory, not staging + let new_version = manager.update(merged_actions.clone(), false).await?; info!( "Successfully submitted merged staged manifests for region {}, new version: {}", @@ -726,6 +764,7 @@ impl ManifestContext { &self, expect_state: RegionLeaderState, action_list: RegionMetaActionList, + is_staging: bool, ) -> Result { // Acquires the write lock of the manifest manager. 
let mut manager = self.manifest_manager.write().await; @@ -801,7 +840,7 @@ impl ManifestContext { } // Now we can update the manifest. - let version = manager.update(action_list, current_state).await.inspect_err( + let version = manager.update(action_list, is_staging).await.inspect_err( |e| error!(e; "Failed to update manifest, region_id: {}", manifest.metadata.region_id), )?; @@ -908,9 +947,17 @@ impl ManifestContext { } } + /// Returns the normal manifest of the region. pub(crate) async fn manifest(&self) -> Arc { self.manifest_manager.read().await.manifest() } + + /// Returns the staging manifest of the region. + pub(crate) async fn staging_manifest( + &self, + ) -> Option> { + self.manifest_manager.read().await.staging_manifest() + } } pub(crate) type ManifestContextRef = Arc; @@ -1030,6 +1077,24 @@ impl RegionMap { Ok(region) } + /// Gets staging region by region id. + /// + /// Returns error if the region does not exist or is not in staging state. + pub(crate) fn staging_region(&self, region_id: RegionId) -> Result { + let region = self + .get_region(region_id) + .context(RegionNotFoundSnafu { region_id })?; + ensure!( + region.is_staging(), + RegionStateSnafu { + region_id, + state: region.state(), + expect: RegionRoleState::Leader(RegionLeaderState::Staging), + } + ); + Ok(region) + } + /// Gets flushable region by region id. /// /// Returns error if the region does not exist or is not operable. @@ -1138,11 +1203,40 @@ impl OpeningRegions { pub(crate) type OpeningRegionsRef = Arc; +/// The regions that are catching up. +#[derive(Debug, Default)] +pub(crate) struct CatchupRegions { + regions: RwLock>, +} + +impl CatchupRegions { + /// Returns true if the region exists. + pub(crate) fn is_region_exists(&self, region_id: RegionId) -> bool { + let regions = self.regions.read().unwrap(); + regions.contains(®ion_id) + } + + /// Inserts a new region into the set. + pub(crate) fn insert_region(&self, region_id: RegionId) { + let mut regions = self.regions.write().unwrap(); + regions.insert(region_id); + } + + /// Remove region by id. + pub(crate) fn remove_region(&self, region_id: RegionId) { + let mut regions = self.regions.write().unwrap(); + regions.remove(®ion_id); + } +} + +pub(crate) type CatchupRegionsRef = Arc; + /// Manifest stats. 
#[derive(Default, Debug, Clone)] -pub(crate) struct ManifestStats { - total_manifest_size: Arc, - manifest_version: Arc, +pub struct ManifestStats { + pub(crate) total_manifest_size: Arc, + pub(crate) manifest_version: Arc, + pub(crate) file_removed_cnt: Arc, } impl ManifestStats { @@ -1153,12 +1247,16 @@ impl ManifestStats { fn manifest_version(&self) -> u64 { self.manifest_version.load(Ordering::Relaxed) } + + fn file_removed_cnt(&self) -> u64 { + self.file_removed_cnt.load(Ordering::Relaxed) + } } #[cfg(test)] mod tests { - use std::sync::Arc; use std::sync::atomic::AtomicU64; + use std::sync::{Arc, Mutex}; use common_datasource::compression::CompressionType; use common_test_util::temp_dir::create_temp_dir; @@ -1179,7 +1277,6 @@ mod tests { use crate::sst::FormatType; use crate::sst::index::intermediate::IntermediateManager; use crate::sst::index::puffin_manager::PuffinManagerFactory; - use crate::test_util::memtable_util::EmptyMemtableBuilder; use crate::test_util::scheduler_util::SchedulerEnv; use crate::test_util::version_util::VersionControlBuilder; use crate::time_provider::StdTimeProvider; @@ -1259,9 +1356,8 @@ mod tests { checkpoint_distance: 10, remove_file_options: Default::default(), }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(); @@ -1326,9 +1422,8 @@ mod tests { checkpoint_distance: 10, remove_file_options: Default::default(), }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(); @@ -1350,9 +1445,8 @@ mod tests { time_provider: Arc::new(StdTimeProvider), topic_latest_entry_id: Default::default(), written_bytes: Arc::new(AtomicU64::new(0)), - memtable_builder: Arc::new(EmptyMemtableBuilder::default()), - sst_format: FormatType::PrimaryKey, stats: ManifestStats::default(), + staging_partition_expr: Mutex::new(None), }; // Test initial state diff --git a/src/mito2/src/region/catchup.rs b/src/mito2/src/region/catchup.rs new file mode 100644 index 0000000000..0d8cfa8ed8 --- /dev/null +++ b/src/mito2/src/region/catchup.rs @@ -0,0 +1,167 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
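// A hypothetical usage sketch for the RegionCatchupTask defined below; the
// `region`, `wal`, flag and id values are assumed to come from the caller
// (e.g. the region worker), and every `with_*` argument is an Option as in
// the signatures below.
let mut task = RegionCatchupTask::new(region, wal, allow_stale_entries)
    .with_location_id(location_id)
    .with_expected_last_entry_id(expected_last_entry_id)
    .with_replay_checkpoint_entry_id(replay_checkpoint_entry_id)
    .with_entry_receiver(entry_receiver);
task.run().await?;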
+ +use std::sync::Arc; +use std::time::Instant; + +use common_telemetry::{info, warn}; +use snafu::ensure; +use store_api::logstore::LogStore; + +use crate::error::{self, Result}; +use crate::region::MitoRegion; +use crate::region::opener::replay_memtable; +use crate::wal::Wal; +use crate::wal::entry_distributor::WalEntryReceiver; + +pub struct RegionCatchupTask { + entry_receiver: Option, + region: Arc, + replay_checkpoint_entry_id: Option, + expected_last_entry_id: Option, + allow_stale_entries: bool, + location_id: Option, + wal: Wal, +} + +impl RegionCatchupTask { + pub fn new(region: Arc, wal: Wal, allow_stale_entries: bool) -> Self { + Self { + entry_receiver: None, + region, + replay_checkpoint_entry_id: None, + expected_last_entry_id: None, + allow_stale_entries, + location_id: None, + wal, + } + } + + /// Sets the location id. + pub(crate) fn with_location_id(mut self, location_id: Option) -> Self { + self.location_id = location_id; + self + } + + /// Sets the expected last entry id. + pub(crate) fn with_expected_last_entry_id( + mut self, + expected_last_entry_id: Option, + ) -> Self { + self.expected_last_entry_id = expected_last_entry_id; + self + } + + /// Sets the entry receiver. + pub(crate) fn with_entry_receiver(mut self, entry_receiver: Option) -> Self { + self.entry_receiver = entry_receiver; + self + } + + /// Sets the replay checkpoint entry id. + pub(crate) fn with_replay_checkpoint_entry_id( + mut self, + replay_checkpoint_entry_id: Option, + ) -> Self { + self.replay_checkpoint_entry_id = replay_checkpoint_entry_id; + self + } + + pub async fn run(&mut self) -> Result<()> { + if self.region.provider.is_remote_wal() { + self.remote_wal_catchup().await + } else { + self.local_wal_catchup().await + } + } + + async fn remote_wal_catchup(&mut self) -> Result<()> { + let flushed_entry_id = self.region.version_control.current().last_entry_id; + let replay_from_entry_id = self + .replay_checkpoint_entry_id + .unwrap_or(flushed_entry_id) + .max(flushed_entry_id); + let region_id = self.region.region_id; + info!( + "Trying to replay memtable for region: {region_id}, provider: {:?}, replay from entry id: {replay_from_entry_id}, flushed entry id: {flushed_entry_id}", + self.region.provider + ); + let timer = Instant::now(); + let wal_entry_reader = self + .entry_receiver + .take() + .map(|r| Box::new(r) as _) + .unwrap_or_else(|| { + self.wal + .wal_entry_reader(&self.region.provider, region_id, self.location_id) + }); + let on_region_opened = self.wal.on_region_opened(); + let last_entry_id = replay_memtable( + &self.region.provider, + wal_entry_reader, + region_id, + replay_from_entry_id, + &self.region.version_control, + self.allow_stale_entries, + on_region_opened, + ) + .await?; + info!( + "Elapsed: {:?}, region: {region_id}, provider: {:?} catchup finished. replay from entry id: {replay_from_entry_id}, flushed entry id: {flushed_entry_id}, last entry id: {last_entry_id}, expected: {:?}.", + timer.elapsed(), + self.region.provider, + self.expected_last_entry_id + ); + if let Some(expected_last_entry_id) = self.expected_last_entry_id { + ensure!( + // The replayed last entry id may be greater than the `expected_last_entry_id`. 
+ last_entry_id >= expected_last_entry_id, + error::UnexpectedSnafu { + reason: format!( + "Failed to catchup region {}, it was expected to replay to {}, but actually replayed to {}", + region_id, expected_last_entry_id, last_entry_id, + ), + } + ) + } + Ok(()) + } + + async fn local_wal_catchup(&mut self) -> Result<()> { + let version = self.region.version_control.current(); + let mut flushed_entry_id = version.last_entry_id; + let region_id = self.region.region_id; + let latest_entry_id = self + .wal + .store() + .latest_entry_id(&self.region.provider) + .unwrap_or_default(); + info!( + "Skips to replay memtable for region: {}, flushed entry id: {}, latest entry id: {}", + region_id, flushed_entry_id, latest_entry_id + ); + + if latest_entry_id > flushed_entry_id { + warn!( + "Found latest entry id is greater than flushed entry id, using latest entry id as flushed entry id, region: {}, latest entry id: {}, flushed entry id: {}", + region_id, latest_entry_id, flushed_entry_id + ); + flushed_entry_id = latest_entry_id; + self.region.version_control.set_entry_id(flushed_entry_id); + } + let on_region_opened = self.wal.on_region_opened(); + on_region_opened(region_id, flushed_entry_id, &self.region.provider).await?; + Ok(()) + } +} diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index af2fa093b8..5538d1bf21 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -16,8 +16,8 @@ use std::any::TypeId; use std::collections::HashMap; -use std::sync::Arc; use std::sync::atomic::{AtomicI64, AtomicU64}; +use std::sync::{Arc, Mutex}; use std::time::Instant; use common_telemetry::{debug, error, info, warn}; @@ -28,7 +28,7 @@ use log_store::kafka::log_store::KafkaLogStore; use log_store::noop::log_store::NoopLogStore; use log_store::raft_engine::log_store::RaftEngineLogStore; use object_store::manager::ObjectStoreManagerRef; -use object_store::util::{join_dir, normalize_dir}; +use object_store::util::normalize_dir; use snafu::{OptionExt, ResultExt, ensure}; use store_api::logstore::LogStore; use store_api::logstore::provider::Provider; @@ -41,6 +41,7 @@ use store_api::storage::{ColumnId, RegionId}; use crate::access_layer::AccessLayer; use crate::cache::CacheManagerRef; +use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey}; use crate::config::MitoConfig; use crate::error; use crate::error::{ @@ -48,25 +49,26 @@ use crate::error::{ Result, StaleLogEntrySnafu, }; use crate::manifest::action::RegionManifest; -use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; -use crate::manifest::storage::manifest_compress_type; +use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::memtable::MemtableBuilderProvider; use crate::memtable::bulk::part::BulkPart; -use crate::memtable::time_partition::TimePartitions; +use crate::memtable::time_partition::{TimePartitions, TimePartitionsRef}; +use crate::metrics::{CACHE_FILL_DOWNLOADED_FILES, CACHE_FILL_PENDING_FILES}; use crate::region::options::RegionOptions; use crate::region::version::{VersionBuilder, VersionControl, VersionControlRef}; use crate::region::{ - ManifestContext, ManifestStats, MitoRegion, RegionLeaderState, RegionRoleState, + ManifestContext, ManifestStats, MitoRegion, MitoRegionRef, RegionLeaderState, RegionRoleState, }; use crate::region_write_ctx::RegionWriteCtx; use crate::request::OptionOutputTx; use crate::schedule::scheduler::SchedulerRef; use crate::sst::FormatType; -use 
crate::sst::file_purger::create_local_file_purger; +use crate::sst::file::RegionFileId; +use crate::sst::file_purger::{FilePurgerRef, create_file_purger}; use crate::sst::file_ref::FileReferenceManagerRef; use crate::sst::index::intermediate::IntermediateManager; use crate::sst::index::puffin_manager::PuffinManagerFactory; -use crate::sst::location::region_dir_from_table_dir; +use crate::sst::location::{self, region_dir_from_table_dir}; use crate::time_provider::TimeProviderRef; use crate::wal::entry_reader::WalEntryReader; use crate::wal::{EntryId, Wal}; @@ -217,7 +219,7 @@ impl RegionOpener { mut self, config: &MitoConfig, wal: &Wal, - ) -> Result { + ) -> Result { let region_id = self.region_id; let region_dir = self.region_dir(); let metadata = self.build_metadata()?; @@ -252,31 +254,32 @@ impl RegionOpener { } } // Safety: must be set before calling this method. - let options = self.options.take().unwrap(); + let mut options = self.options.take().unwrap(); let object_store = get_object_store(&options.storage, &self.object_store_manager)?; let provider = self.provider::(&options.wal_options)?; let metadata = Arc::new(metadata); - // Set the sst_format based on options or flat_format flag + // Sets the sst_format based on options or flat_format flag let sst_format = if let Some(format) = options.sst_format { format } else if config.default_experimental_flat_format { + options.sst_format = Some(FormatType::Flat); FormatType::Flat } else { // Default to PrimaryKeyParquet for newly created regions + options.sst_format = Some(FormatType::PrimaryKey); FormatType::PrimaryKey }; // Create a manifest manager for this region and writes regions to the manifest file. let region_manifest_options = - Self::manifest_options(config, &options, ®ion_dir, &self.object_store_manager)?; + RegionManifestOptions::new(config, ®ion_dir, &object_store); // For remote WAL, we need to set flushed_entry_id to current topic's latest entry id. let flushed_entry_id = provider.initial_flushed_entry_id::(wal.store()); let manifest_manager = RegionManifestManager::new( metadata.clone(), flushed_entry_id, region_manifest_options, - self.stats.total_manifest_size.clone(), - self.stats.manifest_version.clone(), sst_format, + &self.stats, ) .await?; @@ -290,7 +293,10 @@ impl RegionOpener { part_duration, )); - debug!("Create region {} with options: {:?}", region_id, options); + debug!( + "Create region {} with options: {:?}, default_flat_format: {}", + region_id, options, config.default_experimental_flat_format + ); let version = VersionBuilder::new(metadata, mutable) .options(options) @@ -305,7 +311,7 @@ impl RegionOpener { )); let now = self.time_provider.current_time_millis(); - Ok(MitoRegion { + Ok(Arc::new(MitoRegion { region_id, version_control, access_layer: access_layer.clone(), @@ -314,7 +320,8 @@ impl RegionOpener { manifest_manager, RegionRoleState::Leader(RegionLeaderState::Writable), )), - file_purger: create_local_file_purger( + file_purger: create_file_purger( + config.gc.enable, self.purge_scheduler, access_layer, self.cache_manager, @@ -325,11 +332,10 @@ impl RegionOpener { last_compaction_millis: AtomicI64::new(now), time_provider: self.time_provider.clone(), topic_latest_entry_id: AtomicU64::new(0), - memtable_builder, written_bytes: Arc::new(AtomicU64::new(0)), - sst_format, stats: self.stats, - }) + staging_partition_expr: Mutex::new(None), + })) } /// Opens an existing region in read only mode. 
@@ -339,13 +345,13 @@ impl RegionOpener { mut self, config: &MitoConfig, wal: &Wal, - ) -> Result { + ) -> Result { let region_id = self.region_id; let region_dir = self.region_dir(); let region = self .maybe_open(config, wal) .await? - .context(EmptyRegionDirSnafu { + .with_context(|| EmptyRegionDirSnafu { region_id, region_dir: ®ion_dir, })?; @@ -397,21 +403,14 @@ impl RegionOpener { &mut self, config: &MitoConfig, wal: &Wal, - ) -> Result> { - let region_options = self.options.as_ref().unwrap().clone(); - - let region_manifest_options = Self::manifest_options( - config, - ®ion_options, - &self.region_dir(), - &self.object_store_manager, - )?; - let Some(manifest_manager) = RegionManifestManager::open( - region_manifest_options, - self.stats.total_manifest_size.clone(), - self.stats.manifest_version.clone(), - ) - .await? + ) -> Result> { + let now = Instant::now(); + let mut region_options = self.options.as_ref().unwrap().clone(); + let object_storage = get_object_store(®ion_options.storage, &self.object_store_manager)?; + let region_manifest_options = + RegionManifestOptions::new(config, &self.region_dir(), &object_storage); + let Some(manifest_manager) = + RegionManifestManager::open(region_manifest_options, &self.stats).await? else { return Ok(None); }; @@ -428,6 +427,8 @@ impl RegionOpener { } else { manifest.metadata.clone() }; + // Updates the region options with the manifest. + sanitize_region_options(&manifest, &mut region_options); let region_id = self.region_id; let provider = self.provider::(®ion_options.wal_options)?; @@ -450,12 +451,14 @@ impl RegionOpener { self.puffin_manager_factory.clone(), self.intermediate_manager.clone(), )); - let file_purger = create_local_file_purger( + let file_purger = create_file_purger( + config.gc.enable, self.purge_scheduler.clone(), access_layer.clone(), self.cache_manager.clone(), self.file_ref_manager.clone(), ); + // We should sanitize the region options before creating a new memtable. let memtable_builder = self .memtable_builder_provider .builder_for_options(®ion_options); @@ -472,14 +475,16 @@ impl RegionOpener { 0, part_duration, )); - let version = VersionBuilder::new(metadata, mutable) - .add_files(file_purger.clone(), manifest.files.values().cloned()) - .flushed_entry_id(manifest.flushed_entry_id) - .flushed_sequence(manifest.flushed_sequence) - .truncated_entry_id(manifest.truncated_entry_id) - .compaction_time_window(manifest.compaction_time_window) - .options(region_options) - .build(); + + // Updates region options by manifest before creating version. 
+ let version_builder = version_builder_from_manifest( + &manifest, + metadata, + file_purger.clone(), + mutable, + region_options, + ); + let version = version_builder.build(); let flushed_entry_id = version.flushed_entry_id; let version_control = Arc::new(VersionControl::new(version)); @@ -489,8 +494,12 @@ impl RegionOpener { .unwrap_or_default() .max(flushed_entry_id); info!( - "Start replaying memtable at replay_from_entry_id: {} for region {}, manifest version: {}, flushed entry id: {}", - replay_from_entry_id, region_id, manifest.manifest_version, flushed_entry_id + "Start replaying memtable at replay_from_entry_id: {} for region {}, manifest version: {}, flushed entry id: {}, elapsed: {:?}", + replay_from_entry_id, + region_id, + manifest.manifest_version, + flushed_entry_id, + now.elapsed() ); replay_memtable( &provider, @@ -512,8 +521,11 @@ impl RegionOpener { } } else { info!( - "Skip the WAL replay for region: {}, manifest version: {}, flushed_entry_id: {}", - region_id, manifest.manifest_version, flushed_entry_id + "Skip the WAL replay for region: {}, manifest version: {}, flushed_entry_id: {}, elapsed: {:?}", + region_id, + manifest.manifest_version, + flushed_entry_id, + now.elapsed() ); 0 @@ -534,13 +546,11 @@ impl RegionOpener { } let now = self.time_provider.current_time_millis(); - // Read sst_format from manifest - let sst_format = manifest.sst_format; let region = MitoRegion { region_id: self.region_id, - version_control, - access_layer, + version_control: version_control.clone(), + access_layer: access_layer.clone(), // Region is always opened in read only mode. manifest_ctx: Arc::new(ManifestContext::new( manifest_manager, @@ -553,33 +563,47 @@ impl RegionOpener { time_provider: self.time_provider.clone(), topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id), written_bytes: Arc::new(AtomicU64::new(0)), - memtable_builder, - sst_format, stats: self.stats.clone(), + // TODO(weny): reload the staging partition expr from the manifest. + staging_partition_expr: Mutex::new(None), }; + + let region = Arc::new(region); + + maybe_load_cache(®ion, config, &self.cache_manager); + Ok(Some(region)) } +} - /// Returns a new manifest options. - fn manifest_options( - config: &MitoConfig, - options: &RegionOptions, - region_dir: &str, - object_store_manager: &ObjectStoreManagerRef, - ) -> Result { - let object_store = get_object_store(&options.storage, object_store_manager)?; - Ok(RegionManifestOptions { - manifest_dir: new_manifest_dir(region_dir), - object_store, - // We don't allow users to set the compression algorithm as we use it as a file suffix. - // Currently, the manifest storage doesn't have good support for changing compression algorithms. - compress_type: manifest_compress_type(config.compress_manifest), - checkpoint_distance: config.manifest_checkpoint_distance, - remove_file_options: RemoveFileOptions { - keep_count: config.experimental_manifest_keep_removed_file_count, - keep_ttl: config.experimental_manifest_keep_removed_file_ttl, - }, - }) +/// Creates a version builder from a region manifest. 
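+///
+/// The builder is seeded with the manifest's files, flushed entry id and sequence,
+/// truncated entry id, and compaction time window, plus the given region options.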
+pub(crate) fn version_builder_from_manifest( + manifest: &RegionManifest, + metadata: RegionMetadataRef, + file_purger: FilePurgerRef, + mutable: TimePartitionsRef, + region_options: RegionOptions, +) -> VersionBuilder { + VersionBuilder::new(metadata, mutable) + .add_files(file_purger, manifest.files.values().cloned()) + .flushed_entry_id(manifest.flushed_entry_id) + .flushed_sequence(manifest.flushed_sequence) + .truncated_entry_id(manifest.truncated_entry_id) + .compaction_time_window(manifest.compaction_time_window) + .options(region_options) +} + +/// Updates region options by persistent options. +pub(crate) fn sanitize_region_options(manifest: &RegionManifest, options: &mut RegionOptions) { + let option_format = options.sst_format.unwrap_or_default(); + if option_format != manifest.sst_format { + common_telemetry::warn!( + "Overriding SST format from {:?} to {:?} for region {}", + option_format, + manifest.sst_format, + manifest.metadata.region_id, + ); + options.sst_format = Some(manifest.sst_format); } } @@ -601,6 +625,7 @@ pub fn get_object_store( } /// A loader for loading metadata from a region dir. +#[derive(Debug, Clone)] pub struct RegionMetadataLoader { config: Arc, object_store_manager: ObjectStoreManagerRef, @@ -621,7 +646,9 @@ impl RegionMetadataLoader { region_dir: &str, region_options: &RegionOptions, ) -> Result> { - let manifest = self.load_manifest(region_dir, region_options).await?; + let manifest = self + .load_manifest(region_dir, ®ion_options.storage) + .await?; Ok(manifest.map(|m| m.metadata.clone())) } @@ -629,20 +656,13 @@ impl RegionMetadataLoader { pub async fn load_manifest( &self, region_dir: &str, - region_options: &RegionOptions, + storage: &Option, ) -> Result>> { - let region_manifest_options = RegionOpener::manifest_options( - &self.config, - region_options, - region_dir, - &self.object_store_manager, - )?; - let Some(manifest_manager) = RegionManifestManager::open( - region_manifest_options, - Arc::new(AtomicU64::new(0)), - Arc::new(AtomicU64::new(0)), - ) - .await? + let object_store = get_object_store(storage, &self.object_store_manager)?; + let region_manifest_options = + RegionManifestOptions::new(&self.config, region_dir, &object_store); + let Some(manifest_manager) = + RegionManifestManager::open(region_manifest_options, &Default::default()).await? else { return Ok(None); }; @@ -805,7 +825,163 @@ where Ok(last_entry_id) } -/// Returns the directory to the manifest files. -pub(crate) fn new_manifest_dir(region_dir: &str) -> String { - join_dir(region_dir, "manifest") +/// A task to load and fill the region file cache. +pub(crate) struct RegionLoadCacheTask { + region: MitoRegionRef, +} + +impl RegionLoadCacheTask { + pub(crate) fn new(region: MitoRegionRef) -> Self { + Self { region } + } + + /// Fills the file cache with index files from the region. 
+ pub(crate) async fn fill_cache(&self, file_cache: FileCacheRef) { + let region_id = self.region.region_id; + let table_dir = self.region.access_layer.table_dir(); + let path_type = self.region.access_layer.path_type(); + let object_store = self.region.access_layer.object_store(); + let version_control = &self.region.version_control; + + // Collects IndexKeys, file sizes, and max timestamps for files that need to be downloaded + let mut files_to_download = Vec::new(); + let mut files_already_cached = 0; + + { + let version = version_control.current().version; + for level in version.ssts.levels() { + for file_handle in level.files.values() { + let file_meta = file_handle.meta_ref(); + if file_meta.exists_index() { + let puffin_key = IndexKey::new( + file_meta.region_id, + file_meta.index_file_id().file_id(), + FileType::Puffin, + ); + + if !file_cache.contains_key(&puffin_key) { + files_to_download.push(( + puffin_key, + file_meta.index_file_size, + file_meta.time_range.1, // max timestamp + )); + } else { + files_already_cached += 1; + } + } + } + } + // Releases the Version after the scope to avoid holding the memtables and file handles + // for a long time. + } + + // Sorts files by max timestamp in descending order to loads latest files first + files_to_download.sort_by(|a, b| b.2.cmp(&a.2)); + + let total_files = files_to_download.len() as i64; + + info!( + "Starting background index cache preload for region {}, total_files_to_download: {}, files_already_cached: {}", + region_id, total_files, files_already_cached + ); + + CACHE_FILL_PENDING_FILES.add(total_files); + + let mut files_downloaded = 0; + let mut files_skipped = 0; + + for (puffin_key, file_size, max_timestamp) in files_to_download { + let current_size = file_cache.puffin_cache_size(); + let capacity = file_cache.puffin_cache_capacity(); + let region_state = self.region.state(); + if !can_load_cache(region_state) { + info!( + "Stopping index cache by state: {:?}, region: {}, current_size: {}, capacity: {}", + region_state, region_id, current_size, capacity + ); + break; + } + + // Checks if adding this file would exceed capacity + if current_size + file_size > capacity { + info!( + "Stopping index cache preload due to capacity limit, region: {}, file_id: {}, current_size: {}, file_size: {}, capacity: {}, file_timestamp: {:?}", + region_id, puffin_key.file_id, current_size, file_size, capacity, max_timestamp + ); + files_skipped = (total_files - files_downloaded) as usize; + CACHE_FILL_PENDING_FILES.sub(total_files - files_downloaded); + break; + } + + let index_remote_path = location::index_file_path( + table_dir, + RegionFileId::new(puffin_key.region_id, puffin_key.file_id), + path_type, + ); + + match file_cache + .download(puffin_key, &index_remote_path, object_store, file_size) + .await + { + Ok(_) => { + debug!( + "Downloaded index file to write cache, region: {}, file_id: {}", + region_id, puffin_key.file_id + ); + files_downloaded += 1; + CACHE_FILL_DOWNLOADED_FILES.inc_by(1); + CACHE_FILL_PENDING_FILES.dec(); + } + Err(e) => { + warn!( + e; "Failed to download index file to write cache, region: {}, file_id: {}", + region_id, puffin_key.file_id + ); + CACHE_FILL_PENDING_FILES.dec(); + } + } + } + + info!( + "Completed background cache fill task for region {}, total_files: {}, files_downloaded: {}, files_already_cached: {}, files_skipped: {}", + region_id, total_files, files_downloaded, files_already_cached, files_skipped + ); + } +} + +/// Loads all index (Puffin) files from the version into the write cache. 
+fn maybe_load_cache( + region: &MitoRegionRef, + config: &MitoConfig, + cache_manager: &Option, +) { + let Some(cache_manager) = cache_manager else { + return; + }; + let Some(write_cache) = cache_manager.write_cache() else { + return; + }; + + let preload_enabled = config.preload_index_cache; + if !preload_enabled { + return; + } + + let task = RegionLoadCacheTask::new(region.clone()); + write_cache.load_region_cache(task); +} + +fn can_load_cache(state: RegionRoleState) -> bool { + match state { + RegionRoleState::Leader(RegionLeaderState::Writable) + | RegionRoleState::Leader(RegionLeaderState::Staging) + | RegionRoleState::Leader(RegionLeaderState::Altering) + | RegionRoleState::Leader(RegionLeaderState::EnteringStaging) + | RegionRoleState::Leader(RegionLeaderState::Editing) + | RegionRoleState::Follower => true, + // The region will be closed soon if it is downgrading. + RegionRoleState::Leader(RegionLeaderState::Downgrading) + | RegionRoleState::Leader(RegionLeaderState::Dropping) + | RegionRoleState::Leader(RegionLeaderState::Truncating) => false, + } } diff --git a/src/mito2/src/region/version.rs b/src/mito2/src/region/version.rs index c7438b196a..79391e324d 100644 --- a/src/mito2/src/region/version.rs +++ b/src/mito2/src/region/version.rs @@ -161,13 +161,14 @@ impl VersionControl { } /// Mark all opened files as deleted and set the delete marker in [VersionControlData] - pub(crate) fn mark_dropped(&self, memtable_builder: &MemtableBuilderRef) { + pub(crate) fn mark_dropped(&self) { let version = self.current().version; let part_duration = Some(version.memtables.mutable.part_duration()); let next_memtable_id = version.memtables.mutable.next_memtable_id(); + let memtable_builder = version.memtables.mutable.memtable_builder().clone(); let new_mutable = Arc::new(TimePartitions::new( version.metadata.clone(), - memtable_builder.clone(), + memtable_builder, next_memtable_id, part_duration, )); @@ -185,13 +186,14 @@ impl VersionControl { /// /// It replaces existing mutable memtable with a memtable that uses the /// new schema. Memtables of the version must be empty. - pub(crate) fn alter_schema(&self, metadata: RegionMetadataRef, builder: &MemtableBuilderRef) { + pub(crate) fn alter_schema(&self, metadata: RegionMetadataRef) { let version = self.current().version; let part_duration = Some(version.memtables.mutable.part_duration()); let next_memtable_id = version.memtables.mutable.next_memtable_id(); + let memtable_builder = version.memtables.mutable.memtable_builder().clone(); let new_mutable = Arc::new(TimePartitions::new( metadata.clone(), - builder.clone(), + memtable_builder, next_memtable_id, part_duration, )); @@ -208,19 +210,50 @@ impl VersionControl { version_data.version = new_version; } - /// Truncate current version. - pub(crate) fn truncate( + /// Alter schema and format of the region. + /// + /// It replaces existing mutable memtable with a memtable that uses the + /// new format. Memtables of the version must be empty. + pub(crate) fn alter_schema_and_format( &self, - truncate_kind: TruncateKind, - memtable_builder: &MemtableBuilderRef, + metadata: RegionMetadataRef, + options: RegionOptions, + memtable_builder: MemtableBuilderRef, ) { let version = self.current().version; + let part_duration = Some(version.memtables.mutable.part_duration()); + let next_memtable_id = version.memtables.mutable.next_memtable_id(); + // Use the new metadata to build `TimePartitions`. 
+ let new_mutable = Arc::new(TimePartitions::new( + metadata.clone(), + memtable_builder, + next_memtable_id, + part_duration, + )); + debug_assert!(version.memtables.mutable.is_empty()); + debug_assert!(version.memtables.immutables().is_empty()); + let new_version = Arc::new( + VersionBuilder::from_version(version) + .metadata(metadata) + .options(options) + .memtables(MemtableVersion::new(new_mutable)) + .build(), + ); + + let mut version_data = self.data.write().unwrap(); + version_data.version = new_version; + } + + /// Truncate current version. + pub(crate) fn truncate(&self, truncate_kind: TruncateKind) { + let version = self.current().version; let part_duration = version.memtables.mutable.part_duration(); let next_memtable_id = version.memtables.mutable.next_memtable_id(); + let memtable_builder = version.memtables.mutable.memtable_builder().clone(); let new_mutable = Arc::new(TimePartitions::new( version.metadata.clone(), - memtable_builder.clone(), + memtable_builder, next_memtable_id, Some(part_duration), )); @@ -230,7 +263,9 @@ impl VersionControl { truncated_sequence, } => { let new_version = Arc::new( - VersionBuilder::new(version.metadata.clone(), new_mutable) + VersionBuilder::from_version(version) + .memtables(MemtableVersion::new(new_mutable)) + .clear_files() .flushed_entry_id(truncated_entry_id) .flushed_sequence(truncated_sequence) .truncated_entry_id(Some(truncated_entry_id)) @@ -456,6 +491,12 @@ impl VersionBuilder { self } + /// Clear all files in the builder. + pub(crate) fn clear_files(mut self) -> Self { + self.ssts = Arc::new(SstVersion::new()); + self + } + /// Builds a new [Version] from the builder. /// It overwrites the window size by compaction option. pub(crate) fn build(self) -> Version { diff --git a/src/mito2/src/remap_manifest.rs b/src/mito2/src/remap_manifest.rs index 6800a4bf4d..79f816bb13 100644 --- a/src/mito2/src/remap_manifest.rs +++ b/src/mito2/src/remap_manifest.rs @@ -426,11 +426,14 @@ mod tests { level: 0, file_size: 1024, available_indexes: SmallVec::new(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 100, num_row_groups: 1, sequence: NonZeroU64::new(1), partition_expr, + num_series: 1, } } diff --git a/src/mito2/src/request.rs b/src/mito2/src/request.rs index ce013b15d3..39a0d3a3f8 100644 --- a/src/mito2/src/request.rs +++ b/src/mito2/src/request.rs @@ -20,12 +20,12 @@ use std::time::Instant; use api::helper::{ ColumnDataTypeWrapper, is_column_type_value_eq, is_semantic_type_eq, proto_value_type, - to_proto_value, }; use api::v1::column_def::options_from_column_schema; use api::v1::{ColumnDataType, ColumnSchema, OpType, Rows, SemanticType, Value, WriteHint}; use common_telemetry::info; use datatypes::prelude::DataType; +use partition::expr::PartitionExpr; use prometheus::HistogramTimer; use prost::Message; use smallvec::SmallVec; @@ -35,21 +35,24 @@ use store_api::codec::{PrimaryKeyEncoding, infer_primary_key_encoding_from_hint} use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState}; use store_api::region_request::{ - AffectedRows, RegionAlterRequest, RegionBuildIndexRequest, RegionBulkInsertsRequest, - RegionCatchupRequest, RegionCloseRequest, RegionCompactRequest, RegionCreateRequest, - RegionFlushRequest, RegionOpenRequest, RegionRequest, RegionTruncateRequest, + AffectedRows, EnterStagingRequest, RegionAlterRequest, RegionBuildIndexRequest, + RegionBulkInsertsRequest, RegionCatchupRequest, 
RegionCloseRequest, RegionCompactRequest, + RegionCreateRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest, + RegionTruncateRequest, }; -use store_api::storage::RegionId; +use store_api::storage::{FileId, RegionId}; use tokio::sync::oneshot::{self, Receiver, Sender}; use crate::error::{ CompactRegionSnafu, ConvertColumnDataTypeSnafu, CreateDefaultSnafu, Error, FillDefaultSnafu, - FlushRegionSnafu, InvalidRequestSnafu, Result, UnexpectedSnafu, + FlushRegionSnafu, InvalidPartitionExprSnafu, InvalidRequestSnafu, MissingPartitionExprSnafu, + Result, UnexpectedSnafu, }; -use crate::manifest::action::{RegionEdit, TruncateKind}; +use crate::manifest::action::{RegionEdit, RegionManifest, TruncateKind}; use crate::memtable::MemtableId; use crate::memtable::bulk::part::BulkPart; use crate::metrics::COMPACTION_ELAPSED_TOTAL; +use crate::region::options::RegionOptions; use crate::sst::file::FileMeta; use crate::sst::index::IndexBuildType; use crate::wal::EntryId; @@ -411,7 +414,7 @@ impl WriteRequest { }; // Convert default value into proto's value. - Ok(to_proto_value(default_value)) + Ok(api::helper::to_grpc_value(default_value)) } } @@ -599,9 +602,13 @@ pub(crate) enum WorkerRequest { request: RegionBulkInsertsRequest, sender: OptionOutputTx, }, + + /// Remap manifests request. + RemapManifests(RemapManifestsRequest), } impl WorkerRequest { + /// Creates a new open region request. pub(crate) fn new_open_region_request( region_id: RegionId, request: RegionOpenRequest, @@ -618,6 +625,21 @@ impl WorkerRequest { (worker_request, receiver) } + /// Creates a new catchup region request. + pub(crate) fn new_catchup_region_request( + region_id: RegionId, + request: RegionCatchupRequest, + entry_receiver: Option, + ) -> (WorkerRequest, Receiver>) { + let (sender, receiver) = oneshot::channel(); + let worker_request = WorkerRequest::Ddl(SenderDdlRequest { + region_id, + sender: sender.into(), + request: DdlRequest::Catchup((request, entry_receiver)), + }); + (worker_request, receiver) + } + /// Converts request from a [RegionRequest]. pub(crate) fn try_from_region_request( region_id: RegionId, @@ -642,7 +664,8 @@ impl WorkerRequest { } RegionRequest::Delete(v) => { let mut write_request = - WriteRequest::new(region_id, OpType::Delete, v.rows, region_metadata.clone())?; + WriteRequest::new(region_id, OpType::Delete, v.rows, region_metadata.clone())? + .with_hint(v.hint); if write_request.primary_key_encoding() == PrimaryKeyEncoding::Dense && let Some(region_metadata) = ®ion_metadata { @@ -701,7 +724,12 @@ impl WorkerRequest { RegionRequest::Catchup(v) => WorkerRequest::Ddl(SenderDdlRequest { region_id, sender: sender.into(), - request: DdlRequest::Catchup(v), + request: DdlRequest::Catchup((v, None)), + }), + RegionRequest::EnterStaging(v) => WorkerRequest::Ddl(SenderDdlRequest { + region_id, + sender: sender.into(), + request: DdlRequest::EnterStaging(v), }), RegionRequest::BulkInserts(region_bulk_inserts_request) => WorkerRequest::BulkInserts { metadata: region_metadata, @@ -743,6 +771,48 @@ impl WorkerRequest { receiver, ) } + + /// Converts [RemapManifestsRequest] from a [RemapManifestsRequest](store_api::region_engine::RemapManifestsRequest). + /// + /// # Errors + /// + /// Returns an error if the partition expression is invalid or missing. + /// Returns an error if the new partition expressions are not found for some regions. 
+ #[allow(clippy::type_complexity)] + pub(crate) fn try_from_remap_manifests_request( + store_api::region_engine::RemapManifestsRequest { + region_id, + input_regions, + region_mapping, + new_partition_exprs, + }: store_api::region_engine::RemapManifestsRequest, + ) -> Result<( + WorkerRequest, + Receiver>>, + )> { + let (sender, receiver) = oneshot::channel(); + let new_partition_exprs = new_partition_exprs + .into_iter() + .map(|(k, v)| { + Ok(( + k, + PartitionExpr::from_json_str(&v) + .context(InvalidPartitionExprSnafu { expr: v })? + .context(MissingPartitionExprSnafu { region_id: k })?, + )) + }) + .collect::>>()?; + + let request = RemapManifestsRequest { + region_id, + input_regions, + region_mapping, + new_partition_exprs, + sender, + }; + + Ok((WorkerRequest::RemapManifests(request), receiver)) + } } /// DDL request to a region. @@ -757,7 +827,8 @@ pub(crate) enum DdlRequest { Compact(RegionCompactRequest), BuildIndex(RegionBuildIndexRequest), Truncate(RegionTruncateRequest), - Catchup(RegionCatchupRequest), + Catchup((RegionCatchupRequest, Option)), + EnterStaging(EnterStagingRequest), } /// Sender and Ddl request. @@ -780,8 +851,9 @@ pub(crate) enum BackgroundNotify { FlushFailed(FlushFailed), /// Index build has finished. IndexBuildFinished(IndexBuildFinished), + /// Index build has been stopped (aborted or succeeded). + IndexBuildStopped(IndexBuildStopped), /// Index build has failed. - #[allow(dead_code)] IndexBuildFailed(IndexBuildFailed), /// Compaction has finished. CompactionFinished(CompactionFinished), @@ -793,6 +865,8 @@ pub(crate) enum BackgroundNotify { RegionChange(RegionChangeResult), /// Region edit result. RegionEdit(RegionEditResult), + /// Enter staging result. + EnterStaging(EnterStagingResult), } /// Notifies a flush job is finished. @@ -810,6 +884,8 @@ pub(crate) struct FlushFinished { pub(crate) edit: RegionEdit, /// Memtables to remove. pub(crate) memtables_to_remove: SmallVec<[MemtableId; 2]>, + /// Whether the region is in staging mode. + pub(crate) is_staging: bool, } impl FlushFinished { @@ -846,10 +922,17 @@ pub(crate) struct IndexBuildFinished { pub(crate) edit: RegionEdit, } +/// Notifies an index build job has been stopped. +#[derive(Debug)] +pub(crate) struct IndexBuildStopped { + #[allow(dead_code)] + pub(crate) region_id: RegionId, + pub(crate) file_id: FileId, +} + /// Notifies an index build job has failed. #[derive(Debug)] pub(crate) struct IndexBuildFailed { - #[allow(dead_code)] pub(crate) err: Arc, } @@ -923,6 +1006,21 @@ pub(crate) struct RegionChangeResult { pub(crate) result: Result<()>, /// Used for index build in schema change. pub(crate) need_index: bool, + /// New options for the region. + pub(crate) new_options: Option, +} + +/// Notifies the region the result of entering staging. +#[derive(Debug)] +pub(crate) struct EnterStagingResult { + /// Region id. + pub(crate) region_id: RegionId, + /// The new partition expression to apply. + pub(crate) partition_expr: String, + /// Result sender. + pub(crate) sender: OptionOutputTx, + /// Result from the manifest manager. + pub(crate) result: Result<()>, } /// Request to edit a region directly. @@ -945,6 +1043,8 @@ pub(crate) struct RegionEditResult { pub(crate) edit: RegionEdit, /// Result from the manifest manager. pub(crate) result: Result<()>, + /// Whether region state need to be set to Writable after handling this request. 
+ pub(crate) update_region_state: bool, } #[derive(Debug)] @@ -963,6 +1063,20 @@ pub(crate) struct RegionSyncRequest { pub(crate) sender: Sender>, } +#[derive(Debug)] +pub(crate) struct RemapManifestsRequest { + /// The [`RegionId`] of a staging region used to obtain table directory and storage configuration for the remap operation. + pub(crate) region_id: RegionId, + /// Regions to remap manifests from. + pub(crate) input_regions: Vec, + /// For each old region, which new regions should receive its files + pub(crate) region_mapping: HashMap>, + /// New partition expressions for the new regions. + pub(crate) new_partition_exprs: HashMap, + /// Result sender. + pub(crate) sender: Sender>>, +} + #[cfg(test)] mod tests { use api::v1::value::ValueData; diff --git a/src/mito2/src/row_converter.rs b/src/mito2/src/row_converter.rs deleted file mode 100644 index 2bafc49ca3..0000000000 --- a/src/mito2/src/row_converter.rs +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod dense; -mod sparse; -use std::fmt::Debug; -use std::sync::Arc; - -use common_recordbatch::filter::SimpleFilterEvaluator; -use datatypes::value::{Value, ValueRef}; -pub use dense::{DensePrimaryKeyCodec, SortField}; -use mito_codec::key_values::KeyValue; -pub use sparse::{SparsePrimaryKeyCodec, SparseValues, COLUMN_ID_ENCODE_SIZE}; -use store_api::codec::PrimaryKeyEncoding; -use store_api::metadata::{RegionMetadata, RegionMetadataRef}; -use store_api::storage::ColumnId; - -use crate::error::Result; - -/// Row value encoder/decoder. -pub trait PrimaryKeyCodecExt { - /// Encodes rows to bytes. - /// # Note - /// Ensure the length of row iterator matches the length of fields. - fn encode<'a, I>(&self, row: I) -> Result> - where - I: Iterator>, - { - let mut buffer = Vec::new(); - self.encode_to_vec(row, &mut buffer)?; - Ok(buffer) - } - - /// Encodes rows to specific vec. - /// # Note - /// Ensure the length of row iterator matches the length of fields. - fn encode_to_vec<'a, I>(&self, row: I, buffer: &mut Vec) -> Result<()> - where - I: Iterator>; -} - -pub trait PrimaryKeyFilter: Send + Sync { - /// Returns true if the primary key matches the filter. - fn matches(&mut self, pk: &[u8]) -> bool; -} - -/// Composite values decoded from primary key bytes. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum CompositeValues { - Dense(Vec<(ColumnId, Value)>), - Sparse(SparseValues), -} - -impl CompositeValues { - /// Extends the composite values with the given values. 
- pub fn extend(&mut self, values: &[(ColumnId, Value)]) { - match self { - CompositeValues::Dense(dense_values) => { - for (column_id, value) in values { - dense_values.push((*column_id, value.clone())); - } - } - CompositeValues::Sparse(sprase_value) => { - for (column_id, value) in values { - sprase_value.insert(*column_id, value.clone()); - } - } - } - } -} - -#[cfg(test)] -impl CompositeValues { - pub fn into_sparse(self) -> SparseValues { - match self { - CompositeValues::Sparse(v) => v, - _ => panic!("CompositeValues is not sparse"), - } - } - - pub fn into_dense(self) -> Vec { - match self { - CompositeValues::Dense(v) => v.into_iter().map(|(_, v)| v).collect(), - _ => panic!("CompositeValues is not dense"), - } - } -} - -pub trait PrimaryKeyCodec: Send + Sync + Debug { - /// Encodes a key value to bytes. - fn encode_key_value(&self, key_value: &KeyValue, buffer: &mut Vec) -> Result<()>; - - /// Encodes values to bytes. - fn encode_values(&self, values: &[(ColumnId, Value)], buffer: &mut Vec) -> Result<()>; - - /// Encodes values to bytes. - fn encode_value_refs( - &self, - values: &[(ColumnId, ValueRef)], - buffer: &mut Vec, - ) -> Result<()>; - - /// Returns the number of fields in the primary key. - fn num_fields(&self) -> Option; - - /// Returns a primary key filter factory. - fn primary_key_filter( - &self, - metadata: &RegionMetadataRef, - filters: Arc>, - ) -> Box; - - /// Returns the estimated size of the primary key. - fn estimated_size(&self) -> Option { - None - } - - /// Returns the encoding type of the primary key. - fn encoding(&self) -> PrimaryKeyEncoding; - - /// Decodes the primary key from the given bytes. - /// - /// Returns a [`CompositeValues`] that follows the primary key ordering. - fn decode(&self, bytes: &[u8]) -> Result; - - /// Decode the leftmost value from bytes. - fn decode_leftmost(&self, bytes: &[u8]) -> Result>; -} - -/// Builds a primary key codec from region metadata. -pub fn build_primary_key_codec(region_metadata: &RegionMetadata) -> Arc { - let fields = region_metadata.primary_key_columns().map(|col| { - ( - col.column_id, - SortField::new(col.column_schema.data_type.clone()), - ) - }); - build_primary_key_codec_with_fields(region_metadata.primary_key_encoding, fields) -} - -/// Builds a primary key codec from region metadata. 
-pub fn build_primary_key_codec_with_fields( - encoding: PrimaryKeyEncoding, - fields: impl Iterator, -) -> Arc { - match encoding { - PrimaryKeyEncoding::Dense => Arc::new(DensePrimaryKeyCodec::with_fields(fields.collect())), - PrimaryKeyEncoding::Sparse => { - Arc::new(SparsePrimaryKeyCodec::with_fields(fields.collect())) - } - } -} diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index 1d94e74eaa..f3f51bdc08 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -21,7 +21,9 @@ use common_base::readable_size::ReadableSize; use datatypes::arrow::datatypes::{ DataType as ArrowDataType, Field, FieldRef, Fields, Schema, SchemaRef, }; +use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::ConcreteDataType; +use datatypes::timestamp::timestamp_array_to_primitive; use serde::{Deserialize, Serialize}; use store_api::codec::PrimaryKeyEncoding; use store_api::metadata::RegionMetadata; @@ -29,6 +31,9 @@ use store_api::storage::consts::{ OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME, }; +use crate::read::Batch; +use crate::sst::parquet::flat_format::time_index_column_index; + pub mod file; pub mod file_purger; pub mod file_ref; @@ -241,3 +246,426 @@ fn plain_internal_fields() -> [FieldRef; 2] { Arc::new(Field::new(OP_TYPE_COLUMN_NAME, ArrowDataType::UInt8, false)), ] } + +/// Gets the estimated number of series from record batches. +/// +/// This struct tracks the last timestamp value to detect series boundaries +/// by observing when timestamps decrease (indicating a new series). +#[derive(Default)] +pub(crate) struct SeriesEstimator { + /// The last timestamp value seen + last_timestamp: Option, + /// The estimated number of series + series_count: u64, +} + +impl SeriesEstimator { + /// Updates the estimator with a new Batch. + /// + /// Since each Batch contains only one series, this increments the series count + /// and updates the last timestamp. + pub(crate) fn update(&mut self, batch: &Batch) { + let Some(last_ts) = batch.last_timestamp() else { + return; + }; + + // Checks if there's a boundary between the last batch and this batch + if let Some(prev_last_ts) = self.last_timestamp { + // If the first timestamp of this batch is less than the last timestamp + // we've seen, it indicates a new series + if let Some(first_ts) = batch.first_timestamp() + && first_ts.value() <= prev_last_ts + { + self.series_count += 1; + } + } else { + // First batch, counts as first series + self.series_count = 1; + } + + // Updates the last timestamp + self.last_timestamp = Some(last_ts.value()); + } + + /// Updates the estimator with a new record batch in flat format. + /// + /// This method examines the time index column to detect series boundaries. + pub(crate) fn update_flat(&mut self, record_batch: &RecordBatch) { + let batch_rows = record_batch.num_rows(); + if batch_rows == 0 { + return; + } + + let time_index_pos = time_index_column_index(record_batch.num_columns()); + let timestamps = record_batch.column(time_index_pos); + let Some((ts_values, _unit)) = timestamp_array_to_primitive(timestamps) else { + return; + }; + let values = ts_values.values(); + + // Checks if there's a boundary between the last batch and this batch + if let Some(last_ts) = self.last_timestamp { + if values[0] <= last_ts { + self.series_count += 1; + } + } else { + // First batch, counts as first series + self.series_count = 1; + } + + // Counts series boundaries within this batch. 
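+        // A non-increasing timestamp between adjacent rows marks the start of a new series.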
+ for i in 0..batch_rows - 1 { + // We assumes the same timestamp as a new series, which is different from + // how we split batches. + if values[i] >= values[i + 1] { + self.series_count += 1; + } + } + + // Updates the last timestamp + self.last_timestamp = Some(values[batch_rows - 1]); + } + + /// Returns the estimated number of series. + pub(crate) fn finish(&mut self) -> u64 { + self.last_timestamp = None; + let count = self.series_count; + self.series_count = 0; + + count + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use api::v1::OpType; + use datatypes::arrow::array::{ + BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder, + UInt32Array, UInt64Array, + }; + use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; + use datatypes::arrow::record_batch::RecordBatch; + + use super::*; + use crate::read::{Batch, BatchBuilder}; + + fn new_batch( + primary_key: &[u8], + timestamps: &[i64], + sequences: &[u64], + op_types: &[OpType], + ) -> Batch { + let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())); + let sequences = Arc::new(UInt64Array::from(sequences.to_vec())); + let mut op_type_builder = UInt8Builder::with_capacity(op_types.len()); + for op_type in op_types { + op_type_builder.append_value(*op_type as u8); + } + let op_types = Arc::new(UInt8Array::from( + op_types.iter().map(|op| *op as u8).collect::>(), + )); + + let mut builder = BatchBuilder::new(primary_key.to_vec()); + builder + .timestamps_array(timestamps) + .unwrap() + .sequences_array(sequences) + .unwrap() + .op_types_array(op_types) + .unwrap(); + builder.build().unwrap() + } + + fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch { + // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type] + let num_cols = 4; // time_index + 3 internal columns + let time_index_pos = time_index_column_index(num_cols); + assert_eq!(time_index_pos, 0); // For 4 columns, time index should be at position 0 + + let time_array = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())); + let pk_array = Arc::new(DictionaryArray::new( + UInt32Array::from(vec![0; timestamps.len()]), + Arc::new(BinaryArray::from(vec![b"test".as_slice()])), + )); + let seq_array = Arc::new(UInt64Array::from(vec![1; timestamps.len()])); + let op_array = Arc::new(UInt8Array::from(vec![1; timestamps.len()])); + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "time", + ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new_dictionary( + "__primary_key", + ArrowDataType::UInt32, + ArrowDataType::Binary, + false, + ), + Field::new("__sequence", ArrowDataType::UInt64, false), + Field::new("__op_type", ArrowDataType::UInt8, false), + ])); + + RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap() + } + + #[test] + fn test_series_estimator_empty_batch() { + let mut estimator = SeriesEstimator::default(); + let batch = new_batch(b"test", &[], &[], &[]); + estimator.update(&batch); + assert_eq!(0, estimator.finish()); + } + + #[test] + fn test_series_estimator_single_batch() { + let mut estimator = SeriesEstimator::default(); + let batch = new_batch( + b"test", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch); + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_multiple_batches_same_series() { + let mut estimator = SeriesEstimator::default(); + + // First batch with timestamps 1, 2, 3 + 
let batch1 = new_batch( + b"test", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + // Second batch with timestamps 4, 5, 6 (continuation) + let batch2 = new_batch( + b"test", + &[4, 5, 6], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_new_series_detected() { + let mut estimator = SeriesEstimator::default(); + + // First batch with timestamps 1, 2, 3 + let batch1 = new_batch( + b"pk0", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + // Second batch with timestamps 2, 3, 4 (timestamp goes back, new series) + let batch2 = new_batch( + b"pk1", + &[2, 3, 4], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_equal_timestamp_boundary() { + let mut estimator = SeriesEstimator::default(); + + // First batch ending at timestamp 5 + let batch1 = new_batch( + b"test", + &[1, 2, 5], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + // Second batch starting at timestamp 5 (equal, indicates new series) + let batch2 = new_batch( + b"test", + &[5, 6, 7], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_finish_resets_state() { + let mut estimator = SeriesEstimator::default(); + + let batch1 = new_batch( + b"test", + &[1, 2, 3], + &[1, 2, 3], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch1); + + assert_eq!(1, estimator.finish()); + + // After finish, state should be reset + let batch2 = new_batch( + b"test", + &[4, 5, 6], + &[4, 5, 6], + &[OpType::Put, OpType::Put, OpType::Put], + ); + estimator.update(&batch2); + + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_empty_batch() { + let mut estimator = SeriesEstimator::default(); + let record_batch = new_flat_record_batch(&[]); + estimator.update_flat(&record_batch); + assert_eq!(0, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_single_batch() { + let mut estimator = SeriesEstimator::default(); + let record_batch = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&record_batch); + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_series_boundary_within_batch() { + let mut estimator = SeriesEstimator::default(); + // Timestamps decrease from 3 to 2, indicating a series boundary + let record_batch = new_flat_record_batch(&[1, 2, 3, 2, 4, 5]); + estimator.update_flat(&record_batch); + // Should detect boundary at position 3 (3 >= 2) + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_multiple_boundaries_within_batch() { + let mut estimator = SeriesEstimator::default(); + // Multiple series boundaries: 5>=4, 6>=3 + let record_batch = new_flat_record_batch(&[1, 2, 5, 4, 6, 3, 7]); + estimator.update_flat(&record_batch); + assert_eq!(3, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_equal_timestamps() { + let mut estimator = SeriesEstimator::default(); + // Equal timestamps are considered as new series + let record_batch = new_flat_record_batch(&[1, 2, 2, 3, 3, 3, 4]); + estimator.update_flat(&record_batch); + // Boundaries at: 2>=2, 3>=3, 3>=3 + 
assert_eq!(4, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_multiple_batches_continuation() { + let mut estimator = SeriesEstimator::default(); + + // First batch: timestamps 1, 2, 3 + let batch1 = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&batch1); + + // Second batch: timestamps 4, 5, 6 (continuation) + let batch2 = new_flat_record_batch(&[4, 5, 6]); + estimator.update_flat(&batch2); + + assert_eq!(1, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_multiple_batches_new_series() { + let mut estimator = SeriesEstimator::default(); + + // First batch: timestamps 1, 2, 3 + let batch1 = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&batch1); + + // Second batch: timestamps 2, 3, 4 (goes back to 2, new series) + let batch2 = new_flat_record_batch(&[2, 3, 4]); + estimator.update_flat(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_boundary_at_batch_edge_equal() { + let mut estimator = SeriesEstimator::default(); + + // First batch ending at 5 + let batch1 = new_flat_record_batch(&[1, 2, 5]); + estimator.update_flat(&batch1); + + // Second batch starting at 5 (equal timestamp, new series) + let batch2 = new_flat_record_batch(&[5, 6, 7]); + estimator.update_flat(&batch2); + + assert_eq!(2, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_mixed_batches() { + let mut estimator = SeriesEstimator::default(); + + // Batch 1: single series [10, 20, 30] + let batch1 = new_flat_record_batch(&[10, 20, 30]); + estimator.update_flat(&batch1); + + // Batch 2: starts new series [5, 15], boundary within batch [15, 10, 25] + let batch2 = new_flat_record_batch(&[5, 15, 10, 25]); + estimator.update_flat(&batch2); + + // Batch 3: continues from 25 to [30, 35] + let batch3 = new_flat_record_batch(&[30, 35]); + estimator.update_flat(&batch3); + + // Expected: 1 (batch1) + 1 (batch2 start) + 1 (within batch2) = 3 + assert_eq!(3, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_descending_timestamps() { + let mut estimator = SeriesEstimator::default(); + // Strictly descending timestamps - each pair creates a boundary + let record_batch = new_flat_record_batch(&[10, 9, 8, 7, 6]); + estimator.update_flat(&record_batch); + // Boundaries: 10>=9, 9>=8, 8>=7, 7>=6 = 4 boundaries + 1 initial = 5 series + assert_eq!(5, estimator.finish()); + } + + #[test] + fn test_series_estimator_flat_finish_resets_state() { + let mut estimator = SeriesEstimator::default(); + + let batch1 = new_flat_record_batch(&[1, 2, 3]); + estimator.update_flat(&batch1); + + assert_eq!(1, estimator.finish()); + + // After finish, state should be reset + let batch2 = new_flat_record_batch(&[4, 5, 6]); + estimator.update_flat(&batch2); + + assert_eq!(1, estimator.finish()); + } +} diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index 4ddde55746..caead6e3f4 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -21,13 +21,14 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use common_base::readable_size::ReadableSize; -use common_telemetry::{error, info}; +use common_telemetry::{debug, error}; use common_time::Timestamp; use partition::expr::PartitionExpr; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; +use store_api::metadata::ColumnMetadata; use store_api::region_request::PathType; -use store_api::storage::{FileId, RegionId}; +use store_api::storage::{ColumnId, FileId, RegionId}; use crate::access_layer::AccessLayerRef; 
use crate::cache::CacheManagerRef; @@ -79,6 +80,8 @@ where pub type Level = u8; /// Maximum level of SSTs. pub const MAX_LEVEL: Level = 2; +/// Type to store index types for a column. +pub type IndexTypes = SmallVec<[IndexType; 4]>; /// Cross-region file id. /// @@ -143,9 +146,25 @@ pub struct FileMeta { /// Size of the file. pub file_size: u64, /// Available indexes of the file. - pub available_indexes: SmallVec<[IndexType; 4]>, + pub available_indexes: IndexTypes, + /// Created indexes of the file for each column. + /// + /// This is essentially a more granular, column-level version of `available_indexes`, + /// primarily used for manual index building in the asynchronous index construction mode. + /// + /// For backward compatibility, older `FileMeta` versions might only contain `available_indexes`. + /// In such cases, we cannot deduce specific column index information from `available_indexes` alone. + /// Therefore, defaulting this `indexes` field to an empty list during deserialization is a + /// reasonable and necessary step to ensure column information consistency. + pub indexes: Vec, /// Size of the index file. pub index_file_size: u64, + /// File ID of the index file. + /// + /// When this field is None, it means the index file id is the same as the file id. + /// Only meaningful when index_file_size > 0. + /// Used for rebuilding index files. + pub index_file_id: Option, /// Number of rows in the file. /// /// For historical reasons, this field might be missing in old files. Thus @@ -175,6 +194,10 @@ pub struct FileMeta { deserialize_with = "deserialize_partition_expr" )] pub partition_expr: Option, + /// Number of series in the file. + /// + /// The number is 0 if the series number is not available. + pub num_series: u64, } impl Debug for FileMeta { @@ -196,6 +219,7 @@ impl Debug for FileMeta { if !self.available_indexes.is_empty() { debug_struct .field("available_indexes", &self.available_indexes) + .field("indexes", &self.indexes) .field("index_file_size", &ReadableSize(self.index_file_size)); } debug_struct @@ -210,6 +234,7 @@ impl Debug for FileMeta { } }) .field("partition_expr", &self.partition_expr) + .field("num_series", &self.num_series) .finish() } } @@ -225,6 +250,24 @@ pub enum IndexType { BloomFilterIndex, } +/// Metadata of indexes created for a specific column in an SST file. +/// +/// This structure tracks which index types have been successfully created for a column. +/// It provides more granular, column-level index information compared to the file-level +/// `available_indexes` field in [`FileMeta`]. +/// +/// This is primarily used for: +/// - Manual index building in asynchronous index construction mode +/// - Verifying index consistency between files and region metadata +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[serde(default)] +pub struct ColumnIndexMetadata { + /// The column ID this index metadata applies to. + pub column_id: ColumnId, + /// List of index types that have been successfully created for this column. + pub created_indexes: IndexTypes, +} + impl FileMeta { pub fn exists_index(&self) -> bool { !self.available_indexes.is_empty() @@ -250,10 +293,54 @@ impl FileMeta { self.index_file_size } + /// Check whether the file index is consistent with the given region metadata. 
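+    ///
+    /// Returns `false` if any indexed column in `metadata` has no entry in `indexes`,
+    /// or if the entry lacks the index type required by the column options.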
+ pub fn is_index_consistent_with_region(&self, metadata: &[ColumnMetadata]) -> bool { + let id_to_indexes = self + .indexes + .iter() + .map(|index| (index.column_id, index.created_indexes.clone())) + .collect::>(); + for column in metadata { + if !column.column_schema.is_indexed() { + continue; + } + if let Some(indexes) = id_to_indexes.get(&column.column_id) { + if column.column_schema.is_inverted_indexed() + && !indexes.contains(&IndexType::InvertedIndex) + { + return false; + } + if column.column_schema.is_fulltext_indexed() + && !indexes.contains(&IndexType::FulltextIndex) + { + return false; + } + if column.column_schema.is_skipping_indexed() + && !indexes.contains(&IndexType::BloomFilterIndex) + { + return false; + } + } else { + return false; + } + } + true + } + /// Returns the cross-region file id. pub fn file_id(&self) -> RegionFileId { RegionFileId::new(self.region_id, self.file_id) } + + /// Returns the cross-region index file id. + /// If the index file id is not set, returns the file id. + pub fn index_file_id(&self) -> RegionFileId { + if let Some(index_file_id) = self.index_file_id { + RegionFileId::new(self.region_id, index_file_id) + } else { + self.file_id() + } + } } /// Handle to a SST file. @@ -289,9 +376,19 @@ impl FileHandle { RegionFileId::new(self.inner.meta.region_id, self.inner.meta.file_id) } + /// Returns the cross-region index file id. + /// If the index file id is not set, returns the file id. + pub fn index_file_id(&self) -> RegionFileId { + if let Some(index_file_id) = self.inner.meta.index_file_id { + RegionFileId::new(self.inner.meta.region_id, index_file_id) + } else { + self.file_id() + } + } + /// Returns the complete file path of the file. - pub fn file_path(&self, file_dir: &str, path_type: PathType) -> String { - location::sst_file_path(file_dir, self.file_id(), path_type) + pub fn file_path(&self, table_dir: &str, path_type: PathType) -> String { + location::sst_file_path(table_dir, self.file_id(), path_type) } /// Returns the time range of the file. @@ -374,22 +471,28 @@ impl FileHandleInner { /// Delete pub async fn delete_files( region_id: RegionId, - file_ids: &[FileId], + file_ids: &[(FileId, FileId)], delete_index: bool, access_layer: &AccessLayerRef, cache_manager: &Option, ) -> crate::error::Result<()> { // Remove meta of the file from cache. if let Some(cache) = &cache_manager { - for file_id in file_ids { + for (file_id, _) in file_ids { cache.remove_parquet_meta_data(RegionFileId::new(region_id, *file_id)); } } let mut deleted_files = Vec::with_capacity(file_ids.len()); - for file_id in file_ids { + for (file_id, index_file_id) in file_ids { let region_file_id = RegionFileId::new(region_id, *file_id); - match access_layer.delete_sst(®ion_file_id).await { + match access_layer + .delete_sst( + &RegionFileId::new(region_id, *file_id), + &RegionFileId::new(region_id, *index_file_id), + ) + .await + { Ok(_) => { deleted_files.push(*file_id); } @@ -399,21 +502,19 @@ pub async fn delete_files( } } - info!( + debug!( "Deleted {} files for region {}: {:?}", deleted_files.len(), region_id, deleted_files ); - for file_id in file_ids { - let region_file_id = RegionFileId::new(region_id, *file_id); - + for (file_id, index_file_id) in file_ids { if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) { // Removes index file from the cache. 
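The consistency check above reduces to: build a column-id to created-index map from the file (the elided turbofish is presumably `collect::<HashMap<_, _>>()`), then require every column the region declares as indexed to have the matching index type recorded; non-indexed columns are skipped, and a column missing from the file's list fails the check. A simplified, self-contained sketch with stand-in types:

use std::collections::{HashMap, HashSet};

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
enum Index {
    Inverted,
    Fulltext,
    Bloom,
}

struct ColumnRequirement {
    column_id: u32,
    required: Vec<Index>, // derived from the column schema's index options
}

fn is_consistent(file_indexes: &[(u32, Vec<Index>)], columns: &[ColumnRequirement]) -> bool {
    let created: HashMap<u32, HashSet<Index>> = file_indexes
        .iter()
        .map(|(id, idx)| (*id, idx.iter().copied().collect()))
        .collect();
    columns.iter().all(|col| {
        // A column without index options never makes the file inconsistent.
        col.required.is_empty()
            || created
                .get(&col.column_id)
                .is_some_and(|have| col.required.iter().all(|need| have.contains(need)))
    })
}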
if delete_index { write_cache - .remove(IndexKey::new(region_id, *file_id, FileType::Puffin)) + .remove(IndexKey::new(region_id, *index_file_id, FileType::Puffin)) .await; } @@ -426,11 +527,11 @@ pub async fn delete_files( // Purges index content in the stager. if let Err(e) = access_layer .puffin_manager_factory() - .purge_stager(region_file_id) + .purge_stager(RegionFileId::new(region_id, *index_file_id)) .await { error!(e; "Failed to purge stager with index file, file_id: {}, region: {}", - file_id, region_id); + index_file_id, region_id); } } Ok(()) @@ -440,6 +541,10 @@ pub async fn delete_files( mod tests { use std::str::FromStr; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ + ColumnSchema, FulltextAnalyzer, FulltextBackend, FulltextOptions, SkippingIndexOptions, + }; use datatypes::value::Value; use partition::expr::{PartitionExpr, col}; @@ -453,11 +558,17 @@ mod tests { level, file_size: 0, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, } } @@ -473,7 +584,7 @@ mod tests { fn test_deserialize_from_string() { let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55\",\ \"time_range\":[{\"value\":0,\"unit\":\"Millisecond\"},{\"value\":0,\"unit\":\"Millisecond\"}],\ - \"available_indexes\":[\"InvertedIndex\"],\"level\":0}"; + \"available_indexes\":[\"InvertedIndex\"],\"indexes\":[{\"column_id\": 0, \"created_indexes\": [\"InvertedIndex\"]}],\"level\":0}"; let file_meta = create_file_meta( FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(), 0, @@ -498,11 +609,17 @@ mod tests { level: 0, file_size: 0, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, sequence: None, partition_expr: Some(partition_expr.clone()), + num_series: 0, }; // Test serialization/deserialization @@ -614,4 +731,147 @@ mod tests { let file_meta_empty: FileMeta = serde_json::from_str(json_with_empty_expr).unwrap(); assert!(file_meta_empty.partition_expr.is_none()); } + + #[test] + fn test_file_meta_indexes_backward_compatibility() { + // Old FileMeta format without the 'indexes' field + let json_old_file_meta = r#"{ + "region_id": 0, + "file_id": "bc5896ec-e4d8-4017-a80d-f2de73188d55", + "time_range": [ + {"value": 0, "unit": "Millisecond"}, + {"value": 0, "unit": "Millisecond"} + ], + "available_indexes": ["InvertedIndex"], + "level": 0, + "file_size": 0, + "index_file_size": 0, + "num_rows": 0, + "num_row_groups": 0 + }"#; + + let deserialized_file_meta: FileMeta = serde_json::from_str(json_old_file_meta).unwrap(); + + // Verify backward compatibility: indexes field should default to empty vec + assert_eq!(deserialized_file_meta.indexes, vec![]); + + let expected_indexes: IndexTypes = SmallVec::from_iter([IndexType::InvertedIndex]); + assert_eq!(deserialized_file_meta.available_indexes, expected_indexes); + + assert_eq!( + deserialized_file_meta.file_id, + FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap() + ); + } + #[test] + fn test_is_index_consistent_with_region() { + fn new_column_meta( + id: ColumnId, + name: &str, + inverted: bool, + 
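The purge path now carries `(data_file_id, index_file_id)` pairs so the Puffin cache entry and the stager contents are evicted under the id the index was actually written with, while `None` still means "the index shares the data file's id". A small sketch of the fallback and the pairing, with string ids standing in for `FileId`:

fn index_id_or_data_id(data_id: &str, index_id: Option<&str>) -> String {
    // None: no rebuilt index, the index file uses the data file's id.
    index_id.unwrap_or(data_id).to_string()
}

fn files_to_delete(metas: &[(&str, Option<&str>)]) -> Vec<(String, String)> {
    metas
        .iter()
        .map(|(data_id, index_id)| {
            (data_id.to_string(), index_id_or_data_id(data_id, *index_id))
        })
        .collect()
}

fn main() {
    let pairs = files_to_delete(&[("sst-a", None), ("sst-b", Some("idx-b2"))]);
    assert_eq!(pairs[0], ("sst-a".to_string(), "sst-a".to_string()));
    assert_eq!(pairs[1], ("sst-b".to_string(), "idx-b2".to_string()));
}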
fulltext: bool, + skipping: bool, + ) -> ColumnMetadata { + let mut column_schema = + ColumnSchema::new(name, ConcreteDataType::string_datatype(), true); + if inverted { + column_schema = column_schema.with_inverted_index(true); + } + if fulltext { + column_schema = column_schema + .with_fulltext_options(FulltextOptions::new_unchecked( + true, + FulltextAnalyzer::English, + false, + FulltextBackend::Bloom, + 1000, + 0.01, + )) + .unwrap(); + } + if skipping { + column_schema = column_schema + .with_skipping_options(SkippingIndexOptions::new_unchecked( + 1024, + 0.01, + datatypes::schema::SkippingIndexType::BloomFilter, + )) + .unwrap(); + } + + ColumnMetadata { + column_schema, + semantic_type: api::v1::SemanticType::Tag, + column_id: id, + } + } + + // Case 1: Perfect match. File has exactly the required indexes. + let mut file_meta = FileMeta { + indexes: vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], + ..Default::default() + }; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; + assert!(file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 2: Superset match. File has more indexes than required. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([ + IndexType::InvertedIndex, + IndexType::BloomFilterIndex, + ]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; + assert!(file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 3: Missing index type. File has the column but lacks the required index type. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", true, true, false)]; // Requires fulltext too + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 4: Missing column. Region requires an index on a column not in the file's index list. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 2, // File only has index for column 2 + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; // Requires index on column 1 + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 5: No indexes required by region. Should always be consistent. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", false, false, false)]; // No index required + assert!(file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 6: Empty file indexes. Region requires an index. + file_meta.indexes = vec![]; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 7: Multiple columns, one is inconsistent. 
+ file_meta.indexes = vec![ + ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }, + ColumnIndexMetadata { + column_id: 2, // Column 2 is missing the required BloomFilterIndex + created_indexes: SmallVec::from_iter([IndexType::FulltextIndex]), + }, + ]; + let region_meta = vec![ + new_column_meta(1, "tag1", true, false, false), + new_column_meta(2, "tag2", false, true, true), // Requires Fulltext and BloomFilter + ]; + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + } } diff --git a/src/mito2/src/sst/file_purger.rs b/src/mito2/src/sst/file_purger.rs index 7bd0e6b515..bf1c2ee5c3 100644 --- a/src/mito2/src/sst/file_purger.rs +++ b/src/mito2/src/sst/file_purger.rs @@ -80,15 +80,16 @@ pub fn is_local_fs(sst_layer: &AccessLayerRef) -> bool { /// only manages the file references without deleting the actual files. /// pub fn create_file_purger( + gc_enabled: bool, scheduler: SchedulerRef, sst_layer: AccessLayerRef, cache_manager: Option, file_ref_manager: FileReferenceManagerRef, ) -> FilePurgerRef { - if is_local_fs(&sst_layer) { - Arc::new(LocalFilePurger::new(scheduler, sst_layer, cache_manager)) - } else { + if gc_enabled && !is_local_fs(&sst_layer) { Arc::new(ObjectStoreFilePurger { file_ref_manager }) + } else { + Arc::new(LocalFilePurger::new(scheduler, sst_layer, cache_manager)) } } @@ -128,7 +129,7 @@ impl LocalFilePurger { if let Err(e) = self.scheduler.schedule(Box::pin(async move { if let Err(e) = delete_files( file_meta.region_id, - &[file_meta.file_id], + &[(file_meta.file_id, file_meta.index_file_id().file_id())], file_meta.exists_index(), &sst_layer, &cache_manager, @@ -162,6 +163,7 @@ impl FilePurger for ObjectStoreFilePurger { // notice that no matter whether the file is deleted or not, we need to remove the reference // because the file is no longer in use nonetheless. 
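With the `gc_enabled` flag added to `create_file_purger`, the reference-tracking object-store purger is chosen only when GC is enabled and the SST layer is not a local filesystem; every other combination keeps the local purger that deletes files eagerly. The decision table as a tiny standalone function (booleans stand in for the real scheduler and access-layer handles):

#[derive(Debug, PartialEq)]
enum Purger {
    ObjectStoreRefTracking,
    LocalEagerDelete,
}

fn choose_purger(gc_enabled: bool, is_local_fs: bool) -> Purger {
    if gc_enabled && !is_local_fs {
        Purger::ObjectStoreRefTracking
    } else {
        Purger::LocalEagerDelete
    }
}

fn main() {
    assert_eq!(Purger::LocalEagerDelete, choose_purger(false, false));
    assert_eq!(Purger::LocalEagerDelete, choose_purger(true, true));
    assert_eq!(Purger::ObjectStoreRefTracking, choose_purger(true, false));
}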
self.file_ref_manager.remove_file(&file_meta); + // TODO(discord9): consider impl a .tombstone file to reduce files needed to list } fn new_file(&self, file_meta: &FileMeta) { @@ -183,7 +185,9 @@ mod tests { use super::*; use crate::access_layer::AccessLayer; use crate::schedule::scheduler::{LocalScheduler, Scheduler}; - use crate::sst::file::{FileHandle, FileMeta, FileTimeRange, IndexType, RegionFileId}; + use crate::sst::file::{ + ColumnIndexMetadata, FileHandle, FileMeta, FileTimeRange, IndexType, RegionFileId, + }; use crate::sst::index::intermediate::IntermediateManager; use crate::sst::index::puffin_manager::PuffinManagerFactory; use crate::sst::location; @@ -231,11 +235,14 @@ mod tests { level: 0, file_size: 4096, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, sequence: None, partition_expr: None, + num_series: 0, }, file_purger, ); @@ -297,11 +304,17 @@ mod tests { level: 0, file_size: 4096, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 4096, + index_file_id: None, num_rows: 1024, num_row_groups: 1, sequence: NonZeroU64::new(4096), partition_expr: None, + num_series: 0, }, file_purger, ); diff --git a/src/mito2/src/sst/file_ref.rs b/src/mito2/src/sst/file_ref.rs index c8b86ed0fd..e3cc38640c 100644 --- a/src/mito2/src/sst/file_ref.rs +++ b/src/mito2/src/sst/file_ref.rs @@ -17,38 +17,23 @@ use std::sync::Arc; use common_telemetry::debug; use dashmap::{DashMap, Entry}; -use serde::{Deserialize, Serialize}; -use store_api::ManifestVersion; -use store_api::storage::{FileId, RegionId, TableId}; +use store_api::storage::{FileRef, FileRefsManifest, RegionId}; use crate::error::Result; use crate::metrics::GC_REF_FILE_CNT; -use crate::region::RegionMapRef; +use crate::region::MitoRegionRef; use crate::sst::file::FileMeta; -#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct FileRef { - pub region_id: RegionId, - pub file_id: FileId, -} - -impl FileRef { - pub fn new(region_id: RegionId, file_id: FileId) -> Self { - Self { region_id, file_id } - } -} - -/// File references for a table. -/// It contains all files referenced by the table. +/// File references for a region. +/// It contains all files referenced by the region. #[derive(Debug, Clone, Default)] -pub struct TableFileRefs { +pub struct RegionFileRefs { /// (FileRef, Ref Count) meaning how many FileHandleInner is opened for this file. pub files: HashMap, } /// Manages all file references in one datanode. /// It keeps track of which files are referenced and group by table ids. -/// And periodically update the references to tmp file in object storage. /// This is useful for ensuring that files are not deleted while they are still in use by any /// query. #[derive(Debug)] @@ -56,33 +41,24 @@ pub struct FileReferenceManager { /// Datanode id. used to determine tmp ref file name. node_id: Option, /// TODO(discord9): use no hash hasher since table id is sequential. - files_per_table: DashMap, + files_per_region: DashMap, } pub type FileReferenceManagerRef = Arc; -/// The tmp file uploaded to object storage to record one table's file references. 
-#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] -pub struct TableFileRefsManifest { - pub file_refs: HashSet, - /// Manifest version when this manifest is read for it's files - pub manifest_version: HashMap, -} - impl FileReferenceManager { pub fn new(node_id: Option) -> Self { Self { node_id, - files_per_table: Default::default(), + files_per_region: Default::default(), } } - fn ref_file_set(&self, table_id: TableId) -> Option> { - let file_refs = if let Some(file_refs) = self.files_per_table.get(&table_id) { + fn ref_file_set(&self, region_id: RegionId) -> Option> { + let file_refs = if let Some(file_refs) = self.files_per_region.get(®ion_id) { file_refs.clone() } else { - // still return an empty manifest to indicate no files are referenced. - // and differentiate from error case where table_id not found. + // region id not found. return None; }; @@ -95,8 +71,8 @@ impl FileReferenceManager { let ref_file_set: HashSet = file_refs.files.keys().cloned().collect(); debug!( - "Get file refs for table {}, node {:?}, {} files", - table_id, + "Get file refs for region {}, node {:?}, {} files", + region_id, self.node_id, ref_file_set.len(), ); @@ -104,52 +80,47 @@ impl FileReferenceManager { Some(ref_file_set) } - /// Gets all ref files for the given table id, excluding those already in region manifest. - /// - /// It's safe if manifest version became outdated when gc worker is called, as gc worker will check the changes between those two versions and act accordingly to make sure to get the real truly tmp ref file sets at the time of old manifest version. - /// - /// TODO(discord9): Since query will only possible refer to files in latest manifest when it's started, the only true risks is files removed from manifest between old version(when reading refs) and new version(at gc worker), so in case of having outdated manifest version, gc worker should make sure not to delete those files(Until next gc round which will use the latest manifest version and handle those files normally). - /// or perhaps using a two-phase commit style process where it proposes a set of files for deletion and then verifies no new references have appeared before committing the delete. - /// - /// gc worker could do this: - /// 1. if can get the files that got removed from old manifest to new manifest, then shouldn't delete those files even if they are not in tmp ref file, other files can be normally handled(deleted if not in use, otherwise keep) - /// and report back allow next gc round to handle those files with newer tmp ref file sets. - /// 2. if can't get the files that got removed from old manifest to new manifest(possible if just did a checkpoint), - /// then can do nothing as can't sure whether a file is truly unused or just tmp ref file sets haven't report it, so need to report back and try next gc round to handle those files with newer tmp ref file sets. - /// - #[allow(unused)] - pub(crate) async fn get_snapshot_of_unmanifested_refs( + /// Gets all ref files for the given regions, meaning all open FileHandles for those regions + /// and from related regions' manifests. 
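The manager is now keyed by `RegionId` instead of `TableId`, but the bookkeeping is the same: one reference count per open `FileHandleInner`, with the file entry dropped at zero and the region entry dropped once it holds no files. A sketch of that invariant with plain `std` maps (`u64` stands in for `RegionId`/`FileId`; several generic parameters are elided in this rendering of the real structs):

use std::collections::HashMap;

#[derive(Default)]
struct RefBook {
    per_region: HashMap<u64, HashMap<u64, usize>>,
}

impl RefBook {
    fn add(&mut self, region: u64, file: u64) {
        *self
            .per_region
            .entry(region)
            .or_default()
            .entry(file)
            .or_insert(0) += 1;
    }

    fn remove(&mut self, region: u64, file: u64) {
        let mut drop_region = false;
        if let Some(files) = self.per_region.get_mut(&region) {
            if let Some(cnt) = files.get(&file).copied() {
                if cnt <= 1 {
                    files.remove(&file);
                } else {
                    files.insert(file, cnt - 1);
                }
            }
            drop_region = files.is_empty();
        }
        if drop_region {
            // Mirrors dropping the whole region entry once its last reference is gone.
            self.per_region.remove(&region);
        }
    }
}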
+ pub(crate) async fn get_snapshot_of_file_refs( &self, - table_id: TableId, - region_map: &RegionMapRef, - ) -> Result { - let Some(ref_files) = self.ref_file_set(table_id) else { - return Ok(Default::default()); - }; - let region_list = region_map.list_regions(); - let table_regions = region_list - .iter() - .filter(|r| r.region_id().table_id() == table_id) - .collect::>(); + query_regions: Vec, + related_regions: Vec<(MitoRegionRef, Vec)>, + ) -> Result { + let mut ref_files = HashMap::new(); + // get from in memory file handles + for region_id in query_regions.iter().map(|r| r.region_id()) { + if let Some(files) = self.ref_file_set(region_id) { + ref_files.insert(region_id, files.into_iter().map(|f| f.file_id).collect()); + } + } - let mut in_manifest_files = HashSet::new(); let mut manifest_version = HashMap::new(); - for r in &table_regions { + for r in &query_regions { let manifest = r.manifest_ctx.manifest().await; - let files = manifest.files.keys().cloned().collect::>(); - in_manifest_files.extend(files); manifest_version.insert(r.region_id(), manifest.manifest_version); } - let ref_files_excluding_in_manifest = ref_files - .iter() - .filter(|f| !in_manifest_files.contains(&f.file_id)) - .cloned() - .collect::>(); + // get file refs from related regions' manifests + for (related_region, queries) in &related_regions { + let queries = queries.iter().cloned().collect::>(); + let manifest = related_region.manifest_ctx.manifest().await; + for meta in manifest.files.values() { + if queries.contains(&meta.region_id) { + ref_files + .entry(meta.region_id) + .or_insert_with(HashSet::new) + .insert(meta.file_id); + } + } + // not sure if related region's manifest version is needed, but record it for now. + manifest_version.insert(related_region.region_id(), manifest.manifest_version); + } - Ok(TableFileRefsManifest { - file_refs: ref_files_excluding_in_manifest, + // simply return all ref files, no manifest version filtering for now. + Ok(FileRefsManifest { + file_refs: ref_files, manifest_version, }) } @@ -158,12 +129,12 @@ impl FileReferenceManager { /// Also records the access layer for the table if not exists. /// The access layer will be used to upload ref file to object storage. pub fn add_file(&self, file_meta: &FileMeta) { - let table_id = file_meta.region_id.table_id(); + let region_id = file_meta.region_id; let mut is_new = false; { let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id); - self.files_per_table - .entry(table_id) + self.files_per_region + .entry(region_id) .and_modify(|refs| { refs.files .entry(file_ref.clone()) @@ -173,7 +144,7 @@ impl FileReferenceManager { 1 }); }) - .or_insert_with(|| TableFileRefs { + .or_insert_with(|| RegionFileRefs { files: HashMap::from_iter([(file_ref, 1)]), }); } @@ -185,14 +156,14 @@ impl FileReferenceManager { /// Removes a file reference. /// If the reference count reaches zero, the file reference will be removed from the manager. 
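The new `get_snapshot_of_file_refs` merges two sources: file ids held by open handles of the queried regions, and file ids that related regions' manifests still attribute to a queried region, with manifest versions recorded alongside. A condensed sketch of the merge, using `u64` ids and plain containers in place of `MitoRegionRef` manifests and the returned `FileRefsManifest`:

use std::collections::{HashMap, HashSet};

fn snapshot(
    open_refs: &HashMap<u64, HashSet<u64>>,       // region -> files held by open handles
    related_manifests: &[(u64, Vec<(u64, u64)>)], // (related region, [(owner region, file)])
    queried: &[u64],
) -> HashMap<u64, HashSet<u64>> {
    let queried: HashSet<u64> = queried.iter().copied().collect();
    let mut out: HashMap<u64, HashSet<u64>> = HashMap::new();
    // In-memory references of the queried regions.
    for region in &queried {
        if let Some(files) = open_refs.get(region) {
            out.entry(*region).or_default().extend(files.iter().copied());
        }
    }
    // Files a related region's manifest still attributes to a queried region.
    for (_related, entries) in related_manifests {
        for (owner, file) in entries {
            if queried.contains(owner) {
                out.entry(*owner).or_default().insert(*file);
            }
        }
    }
    out
}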
pub fn remove_file(&self, file_meta: &FileMeta) { - let table_id = file_meta.region_id.table_id(); - let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id); + let region_id = file_meta.region_id; + let file_ref = FileRef::new(region_id, file_meta.file_id); let mut remove_table_entry = false; let mut remove_file_ref = false; let mut file_cnt = 0; - let table_ref = self.files_per_table.entry(table_id).and_modify(|refs| { + let region_ref = self.files_per_region.entry(region_id).and_modify(|refs| { let entry = refs.files.entry(file_ref.clone()).and_modify(|count| { if *count > 0 { *count -= 1; @@ -214,7 +185,7 @@ impl FileReferenceManager { } }); - if let Entry::Occupied(o) = table_ref + if let Entry::Occupied(o) = region_ref && remove_table_entry { o.remove_entry(); @@ -234,10 +205,10 @@ mod tests { use std::num::NonZeroU64; use smallvec::SmallVec; - use store_api::storage::RegionId; + use store_api::storage::{FileId, RegionId}; use super::*; - use crate::sst::file::{FileMeta, FileTimeRange, IndexType, RegionFileId}; + use crate::sst::file::{ColumnIndexMetadata, FileMeta, FileTimeRange, IndexType, RegionFileId}; #[tokio::test] async fn test_file_ref_mgr() { @@ -254,64 +225,85 @@ mod tests { level: 0, file_size: 4096, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 4096, + index_file_id: None, num_rows: 1024, num_row_groups: 1, sequence: NonZeroU64::new(4096), partition_expr: None, + num_series: 0, }; file_ref_mgr.add_file(&file_meta); assert_eq!( - file_ref_mgr.files_per_table.get(&0).unwrap().files, + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .unwrap() + .files, HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)]) ); file_ref_mgr.add_file(&file_meta); - let expected_table_ref_manifest = + let expected_region_ref_manifest = HashSet::from_iter([FileRef::new(file_meta.region_id, file_meta.file_id)]); assert_eq!( - file_ref_mgr.ref_file_set(0).unwrap(), - expected_table_ref_manifest + file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(), + expected_region_ref_manifest ); assert_eq!( - file_ref_mgr.files_per_table.get(&0).unwrap().files, + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .unwrap() + .files, HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 2)]) ); assert_eq!( - file_ref_mgr.ref_file_set(0).unwrap(), - expected_table_ref_manifest + file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(), + expected_region_ref_manifest ); file_ref_mgr.remove_file(&file_meta); assert_eq!( - file_ref_mgr.files_per_table.get(&0).unwrap().files, + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .unwrap() + .files, HashMap::from_iter([(FileRef::new(file_meta.region_id, file_meta.file_id), 1)]) ); assert_eq!( - file_ref_mgr.ref_file_set(0).unwrap(), - expected_table_ref_manifest + file_ref_mgr.ref_file_set(file_meta.region_id).unwrap(), + expected_region_ref_manifest ); file_ref_mgr.remove_file(&file_meta); assert!( - file_ref_mgr.files_per_table.get(&0).is_none(), + file_ref_mgr + .files_per_region + .get(&file_meta.region_id) + .is_none(), "{:?}", - file_ref_mgr.files_per_table + file_ref_mgr.files_per_region ); assert!( - file_ref_mgr.ref_file_set(0).is_none(), + file_ref_mgr.ref_file_set(file_meta.region_id).is_none(), "{:?}", - file_ref_mgr.files_per_table + file_ref_mgr.files_per_region ); } } diff --git 
a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 8ad7f6ef01..fec2e4552c 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -21,11 +21,13 @@ pub mod puffin_manager; mod statistics; pub(crate) mod store; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::num::NonZeroUsize; use std::sync::Arc; use bloom_filter::creator::BloomFilterIndexer; -use common_telemetry::{debug, info, warn}; +use common_telemetry::{debug, error, info, warn}; use datatypes::arrow::array::BinaryArray; use datatypes::arrow::record_batch::RecordBatch; use mito_codec::index::IndexValuesCodec; @@ -43,7 +45,10 @@ use crate::access_layer::{AccessLayerRef, FilePathProvider, OperationType, Regio use crate::cache::file_cache::{FileType, IndexKey}; use crate::cache::write_cache::{UploadTracker, WriteCacheRef}; use crate::config::{BloomFilterConfig, FulltextIndexConfig, InvertedIndexConfig}; -use crate::error::{BuildIndexAsyncSnafu, DecodeSnafu, Error, InvalidRecordBatchSnafu, Result}; +use crate::error::{ + BuildIndexAsyncSnafu, DecodeSnafu, Error, InvalidRecordBatchSnafu, RegionClosedSnafu, + RegionDroppedSnafu, RegionTruncatedSnafu, Result, +}; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::metrics::INDEX_CREATE_MEMORY_USAGE; use crate::read::{Batch, BatchReader}; @@ -51,10 +56,13 @@ use crate::region::options::IndexOptions; use crate::region::version::VersionControlRef; use crate::region::{ManifestContextRef, RegionLeaderState}; use crate::request::{ - BackgroundNotify, IndexBuildFailed, IndexBuildFinished, WorkerRequest, WorkerRequestWithTime, + BackgroundNotify, IndexBuildFailed, IndexBuildFinished, IndexBuildStopped, WorkerRequest, + WorkerRequestWithTime, }; use crate::schedule::scheduler::{Job, SchedulerRef}; -use crate::sst::file::{FileHandle, FileMeta, IndexType, RegionFileId}; +use crate::sst::file::{ + ColumnIndexMetadata, FileHandle, FileMeta, IndexType, IndexTypes, RegionFileId, +}; use crate::sst::file_purger::FilePurgerRef; use crate::sst::index::fulltext_index::creator::FulltextIndexer; use crate::sst::index::intermediate::IntermediateManager; @@ -62,6 +70,7 @@ use crate::sst::index::inverted_index::creator::InvertedIndexer; use crate::sst::parquet::SstInfo; use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::format::PrimaryKeyArray; +use crate::worker::WorkerListener; pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index"; pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index"; @@ -94,6 +103,35 @@ impl IndexOutput { } indexes } + + pub fn build_indexes(&self) -> Vec { + let mut map: HashMap = HashMap::new(); + + if self.inverted_index.is_available() { + for &col in &self.inverted_index.columns { + map.entry(col).or_default().push(IndexType::InvertedIndex); + } + } + if self.fulltext_index.is_available() { + for &col in &self.fulltext_index.columns { + map.entry(col).or_default().push(IndexType::FulltextIndex); + } + } + if self.bloom_filter.is_available() { + for &col in &self.bloom_filter.columns { + map.entry(col) + .or_default() + .push(IndexType::BloomFilterIndex); + } + } + + map.into_iter() + .map(|(column_id, created_indexes)| ColumnIndexMetadata { + column_id, + created_indexes, + }) + .collect::>() + } } /// Base output of the index creation. @@ -409,7 +447,7 @@ impl IndexerBuilderImpl { } /// Type of an index build task. 
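`build_indexes` pivots the output from per-index-type column lists into per-column index lists, which is exactly the shape `FileMeta::indexes` expects. The same pivot with plain `std` types (`u32` column ids, string index names standing in for `IndexType`):

use std::collections::HashMap;

fn pivot(outputs: &[(&'static str, Vec<u32>)]) -> Vec<(u32, Vec<&'static str>)> {
    let mut by_column: HashMap<u32, Vec<&'static str>> = HashMap::new();
    for (index_type, columns) in outputs {
        for col in columns {
            by_column.entry(*col).or_default().push(*index_type);
        }
    }
    by_column.into_iter().collect()
}

fn main() {
    let pivoted = pivot(&[("inverted", vec![1, 2]), ("bloom", vec![2])]);
    // Column 2 ends up with both index types attached.
    assert!(pivoted.iter().any(|(c, idx)| *c == 2 && idx.len() == 2));
}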
-#[derive(Debug, Clone, PartialEq, IntoStaticStr)] +#[derive(Debug, Clone, IntoStaticStr, PartialEq)] pub enum IndexBuildType { /// Build index when schema change. SchemaChange, @@ -425,6 +463,16 @@ impl IndexBuildType { fn as_str(&self) -> &'static str { self.into() } + + // Higher value means higher priority. + fn priority(&self) -> u8 { + match self { + IndexBuildType::Manual => 3, + IndexBuildType::SchemaChange => 2, + IndexBuildType::Flush => 1, + IndexBuildType::Compact => 0, + } + } } impl From for IndexBuildType { @@ -446,11 +494,13 @@ pub enum IndexBuildOutcome { /// Mpsc output result sender. pub type ResultMpscSender = Sender>; +#[derive(Clone)] pub struct IndexBuildTask { /// The file meta to build index for. pub file_meta: FileMeta, pub reason: IndexBuildType, pub access_layer: AccessLayerRef, + pub(crate) listener: WorkerListener, pub(crate) manifest_ctx: ManifestContextRef, pub write_cache: Option, pub file_purger: FilePurgerRef, @@ -463,14 +513,24 @@ pub struct IndexBuildTask { pub(crate) result_sender: ResultMpscSender, } +impl std::fmt::Debug for IndexBuildTask { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("IndexBuildTask") + .field("region_id", &self.file_meta.region_id) + .field("file_id", &self.file_meta.file_id) + .field("reason", &self.reason) + .finish() + } +} + impl IndexBuildTask { /// Notify the caller the job is success. - pub async fn on_success(&mut self, outcome: IndexBuildOutcome) { + pub async fn on_success(&self, outcome: IndexBuildOutcome) { let _ = self.result_sender.send(Ok(outcome)).await; } /// Send index build error to waiter. - pub async fn on_failure(&mut self, err: Arc) { + pub async fn on_failure(&self, err: Arc) { let _ = self .result_sender .send(Err(err.clone()).context(BuildIndexAsyncSnafu { @@ -486,6 +546,12 @@ impl IndexBuildTask { } async fn do_index_build(&mut self, version_control: VersionControlRef) { + self.listener + .on_index_build_begin(RegionFileId::new( + self.file_meta.region_id, + self.file_meta.file_id, + )) + .await; match self.index_build(version_control).await { Ok(outcome) => self.on_success(outcome).await, Err(e) => { @@ -495,7 +561,18 @@ impl IndexBuildTask { ); self.on_failure(e.into()).await } + } + let worker_request = WorkerRequest::Background { + region_id: self.file_meta.region_id, + notify: BackgroundNotify::IndexBuildStopped(IndexBuildStopped { + region_id: self.file_meta.region_id, + file_id: self.file_meta.file_id, + }), }; + let _ = self + .request_sender + .send(WorkerRequestWithTime::new(worker_request)) + .await; } // Checks if the SST file still exists in object store and version to avoid conflict with compaction. @@ -534,12 +611,24 @@ impl IndexBuildTask { &mut self, version_control: VersionControlRef, ) -> Result { - let mut indexer = self.indexer_builder.build(self.file_meta.file_id).await; + let index_file_id = if self.file_meta.index_file_size > 0 { + // Generate new file ID if index file exists to avoid overwrite. + FileId::random() + } else { + self.file_meta.file_id + }; + let mut indexer = self.indexer_builder.build(index_file_id).await; // Check SST file existence before building index to avoid failure of parquet reader. if !self.check_sst_file_exists(&version_control).await { // Calls abort to clean up index files. 
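Two points worth calling out in this hunk: tasks are ordered purely by the reason's priority (Manual > SchemaChange > Flush > Compact), so a max-heap of tasks always yields the most urgent build; and when `index_file_size > 0` a fresh index file id is drawn so the rebuilt index does not overwrite the existing one. A standalone sketch of the ordering with a plain enum and `BinaryHeap`:

use std::collections::BinaryHeap;

#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum Reason {
    Compact = 0,
    Flush = 1,
    SchemaChange = 2,
    Manual = 3,
}

fn main() {
    let mut heap = BinaryHeap::from(vec![Reason::Flush, Reason::Manual, Reason::Compact]);
    // The derived Ord follows declaration order, so Manual pops first.
    assert_eq!(Some(Reason::Manual), heap.pop());
    assert_eq!(Some(Reason::Flush), heap.pop());
    assert_eq!(Some(Reason::Compact), heap.pop());
}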
indexer.abort().await; + self.listener + .on_index_build_abort(RegionFileId::new( + self.file_meta.region_id, + self.file_meta.file_id, + )) + .await; return Ok(IndexBuildOutcome::Aborted(format!( "SST file not found during index build, region: {}, file_id: {}", self.file_meta.region_id, self.file_meta.file_id @@ -575,6 +664,12 @@ impl IndexBuildTask { if !self.check_sst_file_exists(&version_control).await { // Calls abort to clean up index files. indexer.abort().await; + self.listener + .on_index_build_abort(RegionFileId::new( + self.file_meta.region_id, + self.file_meta.file_id, + )) + .await; return Ok(IndexBuildOutcome::Aborted(format!( "SST file not found during index build, region: {}, file_id: {}", self.file_meta.region_id, self.file_meta.file_id @@ -582,9 +677,10 @@ impl IndexBuildTask { } // Upload index file if write cache is enabled. - self.maybe_upload_index_file(index_output.clone()).await?; + self.maybe_upload_index_file(index_output.clone(), index_file_id) + .await?; - let worker_request = match self.update_manifest(index_output).await { + let worker_request = match self.update_manifest(index_output, index_file_id).await { Ok(edit) => { let index_build_finished = IndexBuildFinished { region_id: self.file_meta.region_id, @@ -612,14 +708,18 @@ impl IndexBuildTask { Ok(IndexBuildOutcome::Finished) } - async fn maybe_upload_index_file(&self, output: IndexOutput) -> Result<()> { + async fn maybe_upload_index_file( + &self, + output: IndexOutput, + index_file_id: FileId, + ) -> Result<()> { if let Some(write_cache) = &self.write_cache { let file_id = self.file_meta.file_id; let region_id = self.file_meta.region_id; let remote_store = self.access_layer.object_store(); let mut upload_tracker = UploadTracker::new(region_id); let mut err = None; - let puffin_key = IndexKey::new(region_id, file_id, FileType::Puffin); + let puffin_key = IndexKey::new(region_id, index_file_id, FileType::Puffin); let puffin_path = RegionFilePathFactory::new( self.access_layer.table_dir().to_string(), self.access_layer.path_type(), @@ -653,9 +753,15 @@ impl IndexBuildTask { Ok(()) } - async fn update_manifest(&mut self, output: IndexOutput) -> Result { + async fn update_manifest( + &mut self, + output: IndexOutput, + index_file_id: FileId, + ) -> Result { self.file_meta.available_indexes = output.build_available_indexes(); + self.file_meta.indexes = output.build_indexes(); self.file_meta.index_file_size = output.file_size; + self.file_meta.index_file_id = Some(index_file_id); let edit = RegionEdit { files_to_add: vec![self.file_meta.clone()], files_to_remove: vec![], @@ -670,6 +776,7 @@ impl IndexBuildTask { .update_manifest( RegionLeaderState::Writable, RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())), + false, ) .await?; info!( @@ -681,26 +788,205 @@ impl IndexBuildTask { } } -#[derive(Clone)] -pub struct IndexBuildScheduler { - scheduler: SchedulerRef, +impl PartialEq for IndexBuildTask { + fn eq(&self, other: &Self) -> bool { + self.reason.priority() == other.reason.priority() + } } -impl IndexBuildScheduler { - pub fn new(scheduler: SchedulerRef) -> Self { - IndexBuildScheduler { scheduler } +impl Eq for IndexBuildTask {} + +impl PartialOrd for IndexBuildTask { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for IndexBuildTask { + fn cmp(&self, other: &Self) -> Ordering { + self.reason.priority().cmp(&other.reason.priority()) + } +} + +/// Tracks the index build status of a region scheduled by the [IndexBuildScheduler]. 
+pub struct IndexBuildStatus { + pub region_id: RegionId, + pub building_files: HashSet, + pub pending_tasks: BinaryHeap, +} + +impl IndexBuildStatus { + pub fn new(region_id: RegionId) -> Self { + IndexBuildStatus { + region_id, + building_files: HashSet::new(), + pending_tasks: BinaryHeap::new(), + } } - pub(crate) fn schedule_build( + async fn on_failure(self, err: Arc) { + for task in self.pending_tasks { + task.on_failure(err.clone()).await; + } + } +} + +pub struct IndexBuildScheduler { + /// Background job scheduler. + scheduler: SchedulerRef, + /// Tracks regions need to build index. + region_status: HashMap, + /// Limit of files allowed to build index concurrently for a region. + files_limit: usize, +} + +/// Manager background index build tasks of a worker. +impl IndexBuildScheduler { + pub fn new(scheduler: SchedulerRef, files_limit: usize) -> Self { + IndexBuildScheduler { + scheduler, + region_status: HashMap::new(), + files_limit, + } + } + + pub(crate) async fn schedule_build( &mut self, version_control: &VersionControlRef, task: IndexBuildTask, ) -> Result<()> { - // We should clone version control to expand the lifetime. - let job = task.into_index_build_job(version_control.clone()); - self.scheduler.schedule(job)?; + let status = self + .region_status + .entry(task.file_meta.region_id) + .or_insert_with(|| IndexBuildStatus::new(task.file_meta.region_id)); + + if status.building_files.contains(&task.file_meta.file_id) { + let region_file_id = + RegionFileId::new(task.file_meta.region_id, task.file_meta.file_id); + debug!( + "Aborting index build task since index is already being built for region file {:?}", + region_file_id + ); + task.on_success(IndexBuildOutcome::Aborted(format!( + "Index is already being built for region file {:?}", + region_file_id + ))) + .await; + task.listener.on_index_build_abort(region_file_id).await; + return Ok(()); + } + + status.pending_tasks.push(task); + + self.schedule_next_build_batch(version_control); Ok(()) } + + /// Schedule tasks until reaching the files limit or no more tasks. + fn schedule_next_build_batch(&mut self, version_control: &VersionControlRef) { + let mut building_count = 0; + for status in self.region_status.values() { + building_count += status.building_files.len(); + } + + while building_count < self.files_limit { + if let Some(task) = self.find_next_task() { + let region_id = task.file_meta.region_id; + let file_id = task.file_meta.file_id; + let job = task.into_index_build_job(version_control.clone()); + if self.scheduler.schedule(job).is_ok() { + if let Some(status) = self.region_status.get_mut(®ion_id) { + status.building_files.insert(file_id); + building_count += 1; + status + .pending_tasks + .retain(|t| t.file_meta.file_id != file_id); + } else { + error!( + "Region status not found when scheduling index build task, region: {}", + region_id + ); + } + } else { + error!( + "Failed to schedule index build job, region: {}, file_id: {}", + region_id, file_id + ); + } + } else { + // No more tasks to schedule. + break; + } + } + } + + /// Find the next task which has the highest priority to run. 
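The scheduler caps the number of files building concurrently across the worker, records duplicates against the building set, and drains the pending heaps in priority order whenever a slot frees up. A compressed sketch of that loop, with `u64` file ids and the priority folded into a single `u8` (the real scheduler also tracks per-region status and dispatches through `SchedulerRef`):

use std::collections::{BinaryHeap, HashSet};

struct MiniScheduler {
    limit: usize,
    building: HashSet<u64>,
    pending: BinaryHeap<(u8, u64)>, // (priority, file id)
}

impl MiniScheduler {
    fn submit(&mut self, priority: u8, file: u64) {
        if self.building.contains(&file) {
            return; // a duplicate build request is dropped, as in schedule_build
        }
        self.pending.push((priority, file));
        self.fill_slots();
    }

    fn on_stopped(&mut self, file: u64) {
        self.building.remove(&file);
        self.fill_slots();
    }

    fn fill_slots(&mut self) {
        while self.building.len() < self.limit {
            match self.pending.pop() {
                Some((_, file)) => {
                    self.building.insert(file);
                }
                None => break,
            }
        }
    }
}

fn main() {
    let mut s = MiniScheduler { limit: 2, building: HashSet::new(), pending: BinaryHeap::new() };
    s.submit(1, 10);
    s.submit(3, 11);
    s.submit(0, 12); // queued: limit already reached
    assert_eq!(2, s.building.len());
    s.on_stopped(10); // frees a slot, the pending file starts building
    assert!(s.building.contains(&12));
}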
+ fn find_next_task(&self) -> Option { + self.region_status + .iter() + .filter_map(|(_, status)| status.pending_tasks.peek()) + .max() + .cloned() + } + + pub(crate) fn on_task_stopped( + &mut self, + region_id: RegionId, + file_id: FileId, + version_control: &VersionControlRef, + ) { + if let Some(status) = self.region_status.get_mut(®ion_id) { + status.building_files.remove(&file_id); + if status.building_files.is_empty() && status.pending_tasks.is_empty() { + // No more tasks for this region, remove it. + self.region_status.remove(®ion_id); + } + } + + self.schedule_next_build_batch(version_control); + } + + pub(crate) async fn on_failure(&mut self, region_id: RegionId, err: Arc) { + error!( + err; "Index build scheduler encountered failure for region {}, removing all pending tasks.", + region_id + ); + let Some(status) = self.region_status.remove(®ion_id) else { + return; + }; + status.on_failure(err).await; + } + + /// Notifies the scheduler that the region is dropped. + pub(crate) async fn on_region_dropped(&mut self, region_id: RegionId) { + self.remove_region_on_failure( + region_id, + Arc::new(RegionDroppedSnafu { region_id }.build()), + ) + .await; + } + + /// Notifies the scheduler that the region is closed. + pub(crate) async fn on_region_closed(&mut self, region_id: RegionId) { + self.remove_region_on_failure(region_id, Arc::new(RegionClosedSnafu { region_id }.build())) + .await; + } + + /// Notifies the scheduler that the region is truncated. + pub(crate) async fn on_region_truncated(&mut self, region_id: RegionId) { + self.remove_region_on_failure( + region_id, + Arc::new(RegionTruncatedSnafu { region_id }.build()), + ) + .await; + } + + async fn remove_region_on_failure(&mut self, region_id: RegionId, err: Arc) { + let Some(status) = self.region_status.remove(®ion_id) else { + return; + }; + status.on_failure(err).await; + } } /// Decodes primary keys from a flat format RecordBatch. @@ -771,7 +1057,7 @@ mod tests { use tokio::sync::mpsc; use super::*; - use crate::access_layer::{FilePathProvider, SstWriteRequest, WriteType}; + use crate::access_layer::{FilePathProvider, Metrics, SstWriteRequest, WriteType}; use crate::cache::write_cache::WriteCache; use crate::config::{FulltextIndexConfig, IndexBuildMode, MitoConfig, Mode}; use crate::memtable::time_partition::TimePartitions; @@ -907,11 +1193,11 @@ mod tests { fulltext_index_config: Default::default(), bloom_filter_index_config: Default::default(), }; + let mut metrics = Metrics::new(WriteType::Flush); env.access_layer - .write_sst(write_request, &WriteOptions::default(), WriteType::Flush) + .write_sst(write_request, &WriteOptions::default(), &mut metrics) .await .unwrap() - .0 .remove(0) } @@ -1172,7 +1458,7 @@ mod tests { let env = SchedulerEnv::new().await; let (tx, _rx) = mpsc::channel(4); let (result_tx, mut result_rx) = mpsc::channel::>(4); - let mut scheduler = env.mock_index_build_scheduler(); + let mut scheduler = env.mock_index_build_scheduler(4); let metadata = Arc::new(sst_region_metadata()); let manifest_ctx = env.mock_manifest_context(metadata.clone()).await; let file_purger = Arc::new(NoopFilePurger {}); @@ -1192,6 +1478,7 @@ mod tests { }, reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1201,7 +1488,10 @@ mod tests { }; // Schedule the build task and check result. 
- scheduler.schedule_build(&version_control, task).unwrap(); + scheduler + .schedule_build(&version_control, task) + .await + .unwrap(); match result_rx.recv().await.unwrap() { Ok(outcome) => { if outcome == IndexBuildOutcome::Finished { @@ -1215,7 +1505,7 @@ mod tests { #[tokio::test] async fn test_index_build_task_sst_exist() { let env = SchedulerEnv::new().await; - let mut scheduler = env.mock_index_build_scheduler(); + let mut scheduler = env.mock_index_build_scheduler(4); let metadata = Arc::new(sst_region_metadata()); let manifest_ctx = env.mock_manifest_context(metadata.clone()).await; let region_id = metadata.region_id; @@ -1242,6 +1532,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1250,7 +1541,10 @@ mod tests { result_sender: result_tx, }; - scheduler.schedule_build(&version_control, task).unwrap(); + scheduler + .schedule_build(&version_control, task) + .await + .unwrap(); // The task should finish successfully. match result_rx.recv().await.unwrap() { @@ -1282,7 +1576,7 @@ mod tests { async fn schedule_index_build_task_with_mode(build_mode: IndexBuildMode) { let env = SchedulerEnv::new().await; - let mut scheduler = env.mock_index_build_scheduler(); + let mut scheduler = env.mock_index_build_scheduler(4); let metadata = Arc::new(sst_region_metadata()); let manifest_ctx = env.mock_manifest_context(metadata.clone()).await; let file_purger = Arc::new(NoopFilePurger {}); @@ -1309,6 +1603,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1317,7 +1612,10 @@ mod tests { result_sender: result_tx, }; - scheduler.schedule_build(&version_control, task).unwrap(); + scheduler + .schedule_build(&version_control, task) + .await + .unwrap(); let puffin_path = location::index_file_path( env.access_layer.table_dir(), @@ -1372,7 +1670,7 @@ mod tests { #[tokio::test] async fn test_index_build_task_no_index() { let env = SchedulerEnv::new().await; - let mut scheduler = env.mock_index_build_scheduler(); + let mut scheduler = env.mock_index_build_scheduler(4); let mut metadata = sst_region_metadata(); // Unset indexes in metadata to simulate no index scenario. metadata.column_metadatas.iter_mut().for_each(|col| { @@ -1405,6 +1703,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: None, file_purger, @@ -1413,7 +1712,10 @@ mod tests { result_sender: result_tx, }; - scheduler.schedule_build(&version_control, task).unwrap(); + scheduler + .schedule_build(&version_control, task) + .await + .unwrap(); // The task should finish successfully. 
match result_rx.recv().await.unwrap() { @@ -1430,7 +1732,7 @@ mod tests { #[tokio::test] async fn test_index_build_task_with_write_cache() { let env = SchedulerEnv::new().await; - let mut scheduler = env.mock_index_build_scheduler(); + let mut scheduler = env.mock_index_build_scheduler(4); let metadata = Arc::new(sst_region_metadata()); let manifest_ctx = env.mock_manifest_context(metadata.clone()).await; let file_purger = Arc::new(NoopFilePurger {}); @@ -1445,6 +1747,7 @@ mod tests { dir.path().to_str().unwrap(), ReadableSize::mb(10), None, + None, factory, intm_manager, ) @@ -1485,6 +1788,7 @@ mod tests { file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), manifest_ctx, write_cache: Some(write_cache.clone()), file_purger, @@ -1493,7 +1797,10 @@ mod tests { result_sender: result_tx, }; - scheduler.schedule_build(&version_control, task).unwrap(); + scheduler + .schedule_build(&version_control, task) + .await + .unwrap(); // The task should finish successfully. match result_rx.recv().await.unwrap() { @@ -1507,4 +1814,188 @@ mod tests { let index_key = IndexKey::new(region_id, file_meta.file_id, FileType::Puffin); assert!(write_cache.file_cache().contains_key(&index_key)); } + + async fn create_mock_task_for_schedule( + env: &SchedulerEnv, + file_id: FileId, + region_id: RegionId, + reason: IndexBuildType, + ) -> IndexBuildTask { + let metadata = Arc::new(sst_region_metadata()); + let manifest_ctx = env.mock_manifest_context(metadata.clone()).await; + let file_purger = Arc::new(NoopFilePurger {}); + let indexer_builder = mock_indexer_builder(metadata, env).await; + let (tx, _rx) = mpsc::channel(4); + let (result_tx, _result_rx) = mpsc::channel::>(4); + + IndexBuildTask { + file_meta: FileMeta { + region_id, + file_id, + file_size: 100, + ..Default::default() + }, + reason, + access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), + manifest_ctx, + write_cache: None, + file_purger, + indexer_builder, + request_sender: tx, + result_sender: result_tx, + } + } + + #[tokio::test] + async fn test_scheduler_comprehensive() { + let env = SchedulerEnv::new().await; + let mut scheduler = env.mock_index_build_scheduler(2); + let metadata = Arc::new(sst_region_metadata()); + let region_id = metadata.region_id; + let file_purger = Arc::new(NoopFilePurger {}); + + // Prepare multiple files for testing + let file_id1 = FileId::random(); + let file_id2 = FileId::random(); + let file_id3 = FileId::random(); + let file_id4 = FileId::random(); + let file_id5 = FileId::random(); + + let mut files = HashMap::new(); + for file_id in [file_id1, file_id2, file_id3, file_id4, file_id5] { + files.insert( + file_id, + FileMeta { + region_id, + file_id, + file_size: 100, + ..Default::default() + }, + ); + } + + let version_control = mock_version_control(metadata, file_purger, files).await; + + // Test 1: Basic scheduling + let task1 = + create_mock_task_for_schedule(&env, file_id1, region_id, IndexBuildType::Flush).await; + assert!( + scheduler + .schedule_build(&version_control, task1) + .await + .is_ok() + ); + assert!(scheduler.region_status.contains_key(®ion_id)); + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 1); + assert!(status.building_files.contains(&file_id1)); + + // Test 2: Duplicate file scheduling (should be skipped) + let task1_dup = + create_mock_task_for_schedule(&env, file_id1, region_id, IndexBuildType::Flush).await; + scheduler + 
.schedule_build(&version_control, task1_dup) + .await + .unwrap(); + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 1); // Still only one + + // Test 3: Fill up to limit (2 building tasks) + let task2 = + create_mock_task_for_schedule(&env, file_id2, region_id, IndexBuildType::Flush).await; + scheduler + .schedule_build(&version_control, task2) + .await + .unwrap(); + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 2); // Reached limit + assert_eq!(status.pending_tasks.len(), 0); + + // Test 4: Add tasks with different priorities to pending queue + // Now all new tasks will be pending since we reached the limit + let task3 = + create_mock_task_for_schedule(&env, file_id3, region_id, IndexBuildType::Compact).await; + let task4 = + create_mock_task_for_schedule(&env, file_id4, region_id, IndexBuildType::SchemaChange) + .await; + let task5 = + create_mock_task_for_schedule(&env, file_id5, region_id, IndexBuildType::Manual).await; + + scheduler + .schedule_build(&version_control, task3) + .await + .unwrap(); + scheduler + .schedule_build(&version_control, task4) + .await + .unwrap(); + scheduler + .schedule_build(&version_control, task5) + .await + .unwrap(); + + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 2); // Still at limit + assert_eq!(status.pending_tasks.len(), 3); // Three pending + + // Test 5: Task completion triggers scheduling next highest priority task (Manual) + scheduler.on_task_stopped(region_id, file_id1, &version_control); + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert!(!status.building_files.contains(&file_id1)); + assert_eq!(status.building_files.len(), 2); // Should schedule next task + assert_eq!(status.pending_tasks.len(), 2); // One less pending + // The highest priority task (Manual) should now be building + assert!(status.building_files.contains(&file_id5)); + + // Test 6: Complete another task, should schedule SchemaChange (second highest priority) + scheduler.on_task_stopped(region_id, file_id2, &version_control); + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 2); + assert_eq!(status.pending_tasks.len(), 1); // One less pending + assert!(status.building_files.contains(&file_id4)); // SchemaChange should be building + + // Test 7: Complete remaining tasks and cleanup + scheduler.on_task_stopped(region_id, file_id5, &version_control); + scheduler.on_task_stopped(region_id, file_id4, &version_control); + + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 1); // Last task (Compact) should be building + assert_eq!(status.pending_tasks.len(), 0); + assert!(status.building_files.contains(&file_id3)); + + scheduler.on_task_stopped(region_id, file_id3, &version_control); + + // Region should be removed when all tasks complete + assert!(!scheduler.region_status.contains_key(®ion_id)); + + // Test 8: Region dropped with pending tasks + let task6 = + create_mock_task_for_schedule(&env, file_id1, region_id, IndexBuildType::Flush).await; + let task7 = + create_mock_task_for_schedule(&env, file_id2, region_id, IndexBuildType::Flush).await; + let task8 = + create_mock_task_for_schedule(&env, file_id3, region_id, IndexBuildType::Manual).await; + + scheduler + .schedule_build(&version_control, task6) + .await + .unwrap(); + scheduler + .schedule_build(&version_control, task7) + .await + 
.unwrap(); + scheduler + .schedule_build(&version_control, task8) + .await + .unwrap(); + + assert!(scheduler.region_status.contains_key(®ion_id)); + let status = scheduler.region_status.get(®ion_id).unwrap(); + assert_eq!(status.building_files.len(), 2); + assert_eq!(status.pending_tasks.len(), 1); + + scheduler.on_region_dropped(region_id).await; + assert!(!scheduler.region_status.contains_key(®ion_id)); + } } diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder.rs b/src/mito2/src/sst/index/inverted_index/applier/builder.rs index 8bc5e8b6d1..078d1837fb 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder.rs @@ -350,7 +350,7 @@ mod tests { #[test] fn test_collect_and_basic() { - let (_d, facotry) = PuffinManagerFactory::new_for_test_block("test_collect_and_basic_"); + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_and_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -359,7 +359,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let expr = Expr::BinaryExpr(BinaryExpr { diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs index 0703e69db6..f9e9f22ba8 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/between.rs @@ -72,7 +72,7 @@ mod tests { #[test] fn test_collect_between_basic() { - let (_d, facotry) = PuffinManagerFactory::new_for_test_block("test_collect_between_basic_"); + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_between_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), @@ -80,7 +80,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let between = Between { @@ -113,7 +113,7 @@ mod tests { #[test] fn test_collect_between_negated() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_between_negated_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -122,7 +122,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let between = Between { @@ -138,7 +138,7 @@ mod tests { #[test] fn test_collect_between_field_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_between_field_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -147,7 +147,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let between = Between { @@ -180,7 +180,7 @@ mod tests { #[test] fn test_collect_between_type_mismatch() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_between_type_mismatch_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -189,7 +189,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let between = Between { @@ -206,7 +206,7 @@ mod tests { #[test] fn test_collect_between_nonexistent_column() { - let (_d, facotry) = + let (_d, factory) = 
PuffinManagerFactory::new_for_test_block("test_collect_between_nonexistent_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -215,7 +215,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let between = Between { diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs index c3a6dacf7b..1a02862996 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/comparison.rs @@ -227,7 +227,7 @@ mod tests { ), ]; - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_comparison_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -236,7 +236,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); for ((left, op, right), _) in &cases { @@ -255,7 +255,7 @@ mod tests { #[test] fn test_collect_comparison_type_mismatch() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_comparison_type_mismatch_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -264,7 +264,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let res = builder.collect_comparison_expr(&tag_column(), &Operator::Lt, &int64_lit(10)); @@ -274,7 +274,7 @@ mod tests { #[test] fn test_collect_comparison_field_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_comparison_field_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -283,7 +283,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); builder @@ -308,7 +308,7 @@ mod tests { #[test] fn test_collect_comparison_nonexistent_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_comparison_nonexistent_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -317,7 +317,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let res = builder.collect_comparison_expr( diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs index 5c2016e85d..310ea3786c 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/eq_list.rs @@ -136,7 +136,7 @@ mod tests { #[test] fn test_collect_eq_basic() { - let (_d, facotry) = PuffinManagerFactory::new_for_test_block("test_collect_eq_basic_"); + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_eq_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), @@ -144,7 +144,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); builder @@ -172,7 +172,7 @@ mod tests { #[test] fn test_collect_eq_field_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_eq_field_column_"); let metadata = test_region_metadata(); let mut builder = 
InvertedIndexApplierBuilder::new( @@ -181,7 +181,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); builder @@ -200,7 +200,7 @@ mod tests { #[test] fn test_collect_eq_nonexistent_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_eq_nonexistent_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -209,7 +209,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let res = builder.collect_eq(&nonexistent_column(), &string_lit("abc")); @@ -219,7 +219,7 @@ mod tests { #[test] fn test_collect_eq_type_mismatch() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_eq_type_mismatch_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -228,7 +228,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let res = builder.collect_eq(&tag_column(), &int64_lit(1)); @@ -238,7 +238,7 @@ mod tests { #[test] fn test_collect_or_eq_list_basic() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_or_eq_list_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -247,7 +247,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let eq_expr = DfExpr::BinaryExpr(BinaryExpr { @@ -296,7 +296,7 @@ mod tests { #[test] fn test_collect_or_eq_list_invalid_op() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_or_eq_list_invalid_op_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -305,7 +305,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let eq_expr = DfExpr::BinaryExpr(BinaryExpr { @@ -333,7 +333,7 @@ mod tests { #[test] fn test_collect_or_eq_list_multiple_columns() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_or_eq_list_multiple_columns_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -342,7 +342,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let eq_expr = DfExpr::BinaryExpr(BinaryExpr { diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs index abb2b95a19..297becd788 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/in_list.rs @@ -67,7 +67,7 @@ mod tests { #[test] fn test_collect_in_list_basic() { - let (_d, facotry) = PuffinManagerFactory::new_for_test_block("test_collect_in_list_basic_"); + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_in_list_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), @@ -75,7 +75,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let in_list = InList { @@ -98,7 +98,7 @@ mod tests { #[test] fn test_collect_in_list_negated() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_in_list_negated_"); let 
metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -107,7 +107,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let in_list = InList { @@ -122,7 +122,7 @@ mod tests { #[test] fn test_collect_in_list_field_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_in_list_field_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -131,7 +131,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let in_list = InList { @@ -154,7 +154,7 @@ mod tests { #[test] fn test_collect_in_list_type_mismatch() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_in_list_type_mismatch_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -163,7 +163,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let in_list = InList { @@ -179,7 +179,7 @@ mod tests { #[test] fn test_collect_in_list_nonexistent_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_collect_in_list_nonexistent_column_"); let metadata = test_region_metadata(); @@ -189,7 +189,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let in_list = InList { diff --git a/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs b/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs index 71a2631eed..2d2e9fe4ca 100644 --- a/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs +++ b/src/mito2/src/sst/index/inverted_index/applier/builder/regex_match.rs @@ -59,7 +59,7 @@ mod tests { #[test] fn test_regex_match_basic() { - let (_d, facotry) = PuffinManagerFactory::new_for_test_block("test_regex_match_basic_"); + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_regex_match_basic_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( "test".to_string(), @@ -67,7 +67,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); builder @@ -86,7 +86,7 @@ mod tests { #[test] fn test_regex_match_field_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_regex_match_field_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -95,7 +95,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); builder @@ -114,7 +114,7 @@ mod tests { #[test] fn test_regex_match_type_mismatch() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_regex_match_type_mismatch_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -123,7 +123,7 @@ mod tests { test_object_store(), &metadata, HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); builder @@ -135,7 +135,7 @@ mod tests { #[test] fn test_regex_match_type_nonexist_column() { - let (_d, facotry) = + let (_d, factory) = PuffinManagerFactory::new_for_test_block("test_regex_match_type_nonexist_column_"); let metadata = test_region_metadata(); let mut builder = InvertedIndexApplierBuilder::new( @@ -144,7 +144,7 @@ mod tests { test_object_store(), &metadata, 
HashSet::from_iter([1, 2, 3]), - facotry, + factory, ); let res = builder.collect_regex_match(&nonexistent_column(), &string_lit("abc")); diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 9b56ffd4ae..4553372569 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -84,6 +84,8 @@ pub struct SstInfo { pub file_metadata: Option>, /// Index Meta Data pub index_metadata: IndexOutput, + /// Number of series + pub num_series: u64, } #[cfg(test)] @@ -179,13 +181,14 @@ mod tests { ..Default::default() }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), NoopIndexBuilder, file_path, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -241,6 +244,7 @@ mod tests { ..Default::default() }; // Prepare data. + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -249,7 +253,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -327,6 +331,7 @@ mod tests { // write the sst file and get sst info // sst info contains the parquet metadata, which is converted from FileMetaData + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -335,7 +340,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -376,6 +381,7 @@ mod tests { ..Default::default() }; // Prepare data. + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -384,7 +390,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; writer @@ -435,6 +441,7 @@ mod tests { ..Default::default() }; // Prepare data. + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -443,7 +450,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; writer @@ -479,6 +486,7 @@ mod tests { ..Default::default() }; // Prepare data. 
+ let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), @@ -487,7 +495,7 @@ mod tests { FixedPathProvider { region_file_id: handle.file_id(), }, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -637,13 +645,14 @@ mod tests { table_dir: "test".to_string(), path_type: PathType::Bare, }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), NoopIndexBuilder, path_provider, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -714,13 +723,14 @@ mod tests { bloom_filter_index_config: Default::default(), }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), indexer_builder, file_path.clone(), - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -757,7 +767,9 @@ mod tests { level: 0, file_size: info.file_size, available_indexes: info.index_metadata.build_available_indexes(), + indexes: info.index_metadata.build_indexes(), index_file_size: info.index_metadata.file_size, + index_file_id: None, num_row_groups: info.num_row_groups, num_rows: info.num_rows as u64, sequence: None, @@ -766,6 +778,7 @@ mod tests { .expect("partition expression should be valid JSON"), None => None, }, + num_series: 0, }, Arc::new(NoopFilePurger), ); @@ -839,8 +852,8 @@ mod tests { object_store.clone(), ) .predicate(Some(Predicate::new(preds))) - .inverted_index_applier(inverted_index_applier.clone()) - .bloom_filter_index_applier(bloom_filter_applier.clone()) + .inverted_index_appliers([inverted_index_applier.clone(), None]) + .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); let mut metrics = ReaderMetrics::default(); @@ -895,8 +908,8 @@ mod tests { object_store.clone(), ) .predicate(Some(Predicate::new(preds))) - .inverted_index_applier(inverted_index_applier.clone()) - .bloom_filter_index_applier(bloom_filter_applier.clone()) + .inverted_index_appliers([inverted_index_applier.clone(), None]) + .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); let mut metrics = ReaderMetrics::default(); @@ -952,8 +965,8 @@ mod tests { object_store.clone(), ) .predicate(Some(Predicate::new(preds))) - .inverted_index_applier(inverted_index_applier.clone()) - .bloom_filter_index_applier(bloom_filter_applier.clone()) + .inverted_index_appliers([inverted_index_applier.clone(), None]) + .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) .cache(CacheStrategy::EnableAll(cache.clone())); let mut metrics = ReaderMetrics::default(); @@ -1089,13 +1102,14 @@ mod tests { bloom_filter_index_config: Default::default(), }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), indexer_builder, file_path.clone(), - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; @@ -1145,13 +1159,14 @@ mod tests { ..Default::default() }; + let mut metrics = Metrics::new(WriteType::Flush); let mut writer = ParquetWriter::new_with_object_store( object_store.clone(), metadata.clone(), IndexConfig::default(), NoopIndexBuilder, file_path, - Metrics::new(WriteType::Flush), + &mut metrics, ) .await; diff --git a/src/mito2/src/sst/parquet/file_range.rs 
b/src/mito2/src/sst/parquet/file_range.rs index 268391135b..689a8de599 100644 --- a/src/mito2/src/sst/parquet/file_range.rs +++ b/src/mito2/src/sst/parquet/file_range.rs @@ -26,6 +26,7 @@ use datatypes::arrow::buffer::BooleanBuffer; use datatypes::arrow::record_batch::RecordBatch; use mito_codec::row_converter::{CompositeValues, PrimaryKeyCodec}; use parquet::arrow::arrow_reader::RowSelection; +use parquet::file::metadata::ParquetMetaData; use snafu::{OptionExt, ResultExt}; use store_api::codec::PrimaryKeyEncoding; use store_api::storage::{ColumnId, TimeSeriesRowSelector}; @@ -44,6 +45,33 @@ use crate::sst::parquet::format::ReadFormat; use crate::sst::parquet::reader::{ FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext, }; + +/// Checks if a row group contains delete operations by examining the min value of op_type column. +/// +/// Returns `Ok(true)` if the row group contains delete operations, `Ok(false)` if it doesn't, +/// or an error if the statistics are not present or cannot be decoded. +pub(crate) fn row_group_contains_delete( + parquet_meta: &ParquetMetaData, + row_group_index: usize, + file_path: &str, +) -> Result { + let row_group_metadata = &parquet_meta.row_groups()[row_group_index]; + + // safety: The last column of SST must be op_type + let column_metadata = &row_group_metadata.columns().last().unwrap(); + let stats = column_metadata + .statistics() + .context(StatsNotPresentSnafu { file_path })?; + stats + .min_bytes_opt() + .context(StatsNotPresentSnafu { file_path })? + .try_into() + .map(i32::from_le_bytes) + .map(|min_op_type| min_op_type == OpType::Delete as i32) + .ok() + .context(DecodeStatsSnafu { file_path }) +} + /// A range of a parquet SST. Now it is a row group. /// We can read different file ranges in parallel. #[derive(Clone)] @@ -115,6 +143,9 @@ impl FileRange { false }; + // Compute skip_fields once for this row group + let skip_fields = self.context.should_skip_fields(self.row_group_idx); + let prune_reader = if use_last_row_reader { // Row group is PUT only, use LastRowReader to skip unnecessary rows. let reader = RowGroupLastRowCachedReader::new( @@ -123,12 +154,13 @@ impl FileRange { self.context.reader_builder.cache_strategy().clone(), RowGroupReader::new(self.context.clone(), parquet_reader), ); - PruneReader::new_with_last_row_reader(self.context.clone(), reader) + PruneReader::new_with_last_row_reader(self.context.clone(), reader, skip_fields) } else { // Row group contains DELETE, fallback to default reader. 
PruneReader::new_with_row_group_reader( self.context.clone(), RowGroupReader::new(self.context.clone(), parquet_reader), + skip_fields, ) }; @@ -143,9 +175,15 @@ impl FileRange { .build(self.row_group_idx, self.row_selection.clone()) .await?; + // Compute skip_fields once for this row group + let skip_fields = self.context.should_skip_fields(self.row_group_idx); + let flat_row_group_reader = FlatRowGroupReader::new(self.context.clone(), parquet_reader); - let flat_prune_reader = - FlatPruneReader::new_with_row_group_reader(self.context.clone(), flat_row_group_reader); + let flat_prune_reader = FlatPruneReader::new_with_row_group_reader( + self.context.clone(), + flat_row_group_reader, + skip_fields, + ); Ok(flat_prune_reader) } @@ -178,6 +216,7 @@ impl FileRangeContext { filters: Vec, read_format: ReadFormat, codec: Arc, + pre_filter_mode: PreFilterMode, ) -> Self { Self { reader_builder, @@ -186,6 +225,7 @@ impl FileRangeContext { read_format, codec, compat_batch: None, + pre_filter_mode, }, } } @@ -222,40 +262,50 @@ impl FileRangeContext { /// TRY THE BEST to perform pushed down predicate precisely on the input batch. /// Return the filtered batch. If the entire batch is filtered out, return None. - pub(crate) fn precise_filter(&self, input: Batch) -> Result> { - self.base.precise_filter(input) + pub(crate) fn precise_filter(&self, input: Batch, skip_fields: bool) -> Result> { + self.base.precise_filter(input, skip_fields) } /// Filters the input RecordBatch by the pushed down predicate and returns RecordBatch. - pub(crate) fn precise_filter_flat(&self, input: RecordBatch) -> Result> { - self.base.precise_filter_flat(input) + pub(crate) fn precise_filter_flat( + &self, + input: RecordBatch, + skip_fields: bool, + ) -> Result> { + self.base.precise_filter_flat(input, skip_fields) + } + + /// Determines whether to skip field filters based on PreFilterMode and row group delete status. + pub(crate) fn should_skip_fields(&self, row_group_idx: usize) -> bool { + match self.base.pre_filter_mode { + PreFilterMode::All => false, + PreFilterMode::SkipFields => true, + PreFilterMode::SkipFieldsOnDelete => { + // Check if this specific row group contains delete op + self.contains_delete(row_group_idx).unwrap_or(true) + } + } } //// Decodes parquet metadata and finds if row group contains delete op. pub(crate) fn contains_delete(&self, row_group_index: usize) -> Result { let metadata = self.reader_builder.parquet_metadata(); - let row_group_metadata = &metadata.row_groups()[row_group_index]; - - // safety: The last column of SST must be op_type - let column_metadata = &row_group_metadata.columns().last().unwrap(); - let stats = column_metadata.statistics().context(StatsNotPresentSnafu { - file_path: self.reader_builder.file_path(), - })?; - stats - .min_bytes_opt() - .context(StatsNotPresentSnafu { - file_path: self.reader_builder.file_path(), - })? - .try_into() - .map(i32::from_le_bytes) - .map(|min_op_type| min_op_type == OpType::Delete as i32) - .ok() - .context(DecodeStatsSnafu { - file_path: self.reader_builder.file_path(), - }) + row_group_contains_delete(metadata, row_group_index, self.reader_builder.file_path()) } } +/// Mode to pre-filter columns in a range. +#[derive(Debug, Clone, Copy)] +pub enum PreFilterMode { + /// Filters all columns. + All, + /// If the range doesn't contain delete op or doesn't have statistics, filters all columns. + /// Otherwise, skips filtering fields. + SkipFieldsOnDelete, + /// Always skip fields. 
+ SkipFields, +} + /// Common fields for a range to read and filter batches. pub(crate) struct RangeBase { /// Filters pushed down. @@ -266,6 +316,8 @@ pub(crate) struct RangeBase { pub(crate) codec: Arc, /// Optional helper to compat batches. pub(crate) compat_batch: Option, + /// Mode to pre-filter columns. + pub(crate) pre_filter_mode: PreFilterMode, } impl RangeBase { @@ -276,7 +328,15 @@ impl RangeBase { /// /// When a filter is referencing primary key column, this method will decode /// the primary key and put it into the batch. - pub(crate) fn precise_filter(&self, mut input: Batch) -> Result> { + /// + /// # Arguments + /// * `input` - The batch to filter + /// * `skip_fields` - Whether to skip field filters based on PreFilterMode and row group delete status + pub(crate) fn precise_filter( + &self, + mut input: Batch, + skip_fields: bool, + ) -> Result> { let mut mask = BooleanBuffer::new_set(input.num_rows()); // Run filter one by one and combine them result @@ -331,6 +391,10 @@ impl RangeBase { } } SemanticType::Field => { + // Skip field filters if skip_fields is true + if skip_fields { + continue; + } // Safety: Input is Batch so we are using primary key format. let Some(field_index) = self .read_format @@ -361,8 +425,16 @@ impl RangeBase { /// Filters the input RecordBatch by the pushed down predicate and returns RecordBatch. /// /// It assumes all necessary tags are already decoded from the primary key. - pub(crate) fn precise_filter_flat(&self, input: RecordBatch) -> Result> { - let mask = self.compute_filter_mask_flat(&input)?; + /// + /// # Arguments + /// * `input` - The RecordBatch to filter + /// * `skip_fields` - Whether to skip field filters based on PreFilterMode and row group delete status + pub(crate) fn precise_filter_flat( + &self, + input: RecordBatch, + skip_fields: bool, + ) -> Result> { + let mask = self.compute_filter_mask_flat(&input, skip_fields)?; // If mask is None, the entire batch is filtered out let Some(mask) = mask else { @@ -383,9 +455,14 @@ impl RangeBase { /// Computes the filter mask for the input RecordBatch based on pushed down predicates. /// /// Returns `None` if the entire batch is filtered out, otherwise returns the boolean mask. + /// + /// # Arguments + /// * `input` - The RecordBatch to compute mask for + /// * `skip_fields` - Whether to skip field filters based on PreFilterMode and row group delete status pub(crate) fn compute_filter_mask_flat( &self, input: &RecordBatch, + skip_fields: bool, ) -> Result> { let mut mask = BooleanBuffer::new_set(input.num_rows()); @@ -411,6 +488,11 @@ impl RangeBase { MaybeFilter::Pruned => return Ok(None), }; + // Skip field filters if skip_fields is true + if skip_fields && filter_ctx.semantic_type() == SemanticType::Field { + continue; + } + // Get the column directly by its projected index let column_idx = flat_format.projected_index_by_id(filter_ctx.column_id()); if let Some(idx) = column_idx { diff --git a/src/mito2/src/sst/parquet/metadata.rs b/src/mito2/src/sst/parquet/metadata.rs index 2cf1ecfda8..05c7aac462 100644 --- a/src/mito2/src/sst/parquet/metadata.rs +++ b/src/mito2/src/sst/parquet/metadata.rs @@ -12,17 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. 
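To make the new gating above easier to follow, here is a minimal, self-contained sketch (not the patch's code; the real `FileRangeContext` and filter contexts are replaced by plain values) of how a `PreFilterMode` is resolved into a per-row-group `skip_fields` flag. In `precise_filter` that flag only suppresses predicates on `SemanticType::Field` columns; tag and time-index predicates still run.

// Illustrative sketch only. `contains_delete` stands in for the result of
// `row_group_contains_delete`, which may be unavailable when the op_type
// column has no statistics.
#[derive(Clone, Copy)]
enum PreFilterMode {
    All,
    SkipFieldsOnDelete,
    SkipFields,
}

fn should_skip_fields(mode: PreFilterMode, contains_delete: Option<bool>) -> bool {
    match mode {
        PreFilterMode::All => false,
        PreFilterMode::SkipFields => true,
        // Missing or undecodable statistics fall back to skipping field filters.
        PreFilterMode::SkipFieldsOnDelete => contains_delete.unwrap_or(true),
    }
}

fn main() {
    assert!(!should_skip_fields(PreFilterMode::All, Some(true)));
    assert!(should_skip_fields(PreFilterMode::SkipFieldsOnDelete, None));
    assert!(!should_skip_fields(PreFilterMode::SkipFieldsOnDelete, Some(false)));
}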
+use std::result::Result as StdResult; + +use bytes::Bytes; +use futures::FutureExt; +use futures::future::BoxFuture; use object_store::ObjectStore; -use parquet::file::FOOTER_SIZE; +use parquet::arrow::async_reader::MetadataFetch; +use parquet::errors::{ParquetError, Result as ParquetResult}; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; -use snafu::ResultExt; +use snafu::{IntoError as _, ResultExt}; use crate::error::{self, Result}; /// The estimated size of the footer and metadata need to read from the end of parquet file. const DEFAULT_PREFETCH_SIZE: u64 = 64 * 1024; -/// Load the metadata of parquet file in an async way. pub(crate) struct MetadataLoader<'a> { // An object store that supports async read object_store: ObjectStore, @@ -46,111 +51,7 @@ impl<'a> MetadataLoader<'a> { } } - /// Async load the metadata of parquet file. - /// - /// Read [DEFAULT_PREFETCH_SIZE] from the end of parquet file at first, if File Metadata is in the - /// read range, decode it and return [ParquetMetaData], otherwise, read again to get the rest of the metadata. - /// - /// Parquet File Format: - /// ```text - /// ┌───────────────────────────────────┐ - /// |4-byte magic number "PAR1" | - /// |───────────────────────────────────| - /// |Column 1 Chunk 1 + Column Metadata | - /// |Column 2 Chunk 1 + Column Metadata | - /// |... | - /// |Column N Chunk M + Column Metadata | - /// |───────────────────────────────────| - /// |File Metadata | - /// |───────────────────────────────────| - /// |4-byte length of file metadata | - /// |4-byte magic number "PAR1" | - /// └───────────────────────────────────┘ - /// ``` - /// - /// Refer to https://github.com/apache/arrow-rs/blob/093a10e46203be1a0e94ae117854701bf58d4c79/parquet/src/arrow/async_reader/metadata.rs#L55-L106 - pub async fn load(&self) -> Result { - let object_store = &self.object_store; - let path = self.file_path; - let file_size = self.get_file_size().await?; - - if file_size < FOOTER_SIZE as u64 { - return error::InvalidParquetSnafu { - file: path, - reason: "file size is smaller than footer size", - } - .fail(); - } - - // Prefetch bytes for metadata from the end and process the footer - let buffer_start = file_size.saturating_sub(DEFAULT_PREFETCH_SIZE); - let buffer = object_store - .read_with(path) - .range(buffer_start..file_size) - .await - .context(error::OpenDalSnafu)? 
- .to_vec(); - let buffer_len = buffer.len(); - - let mut footer = [0; 8]; - footer.copy_from_slice(&buffer[buffer_len - FOOTER_SIZE..]); - - let footer_tail = ParquetMetaDataReader::decode_footer_tail(&footer).map_err(|e| { - error::InvalidParquetSnafu { - file: path, - reason: format!("failed to decode footer, {e}"), - } - .build() - })?; - let metadata_len = footer_tail.metadata_length() as u64; - - if file_size - (FOOTER_SIZE as u64) < metadata_len { - return error::InvalidParquetSnafu { - file: path, - reason: format!( - "the sum of Metadata length {} and Footer size {} is larger than file size {}", - metadata_len, FOOTER_SIZE, file_size - ), - } - .fail(); - } - - if (metadata_len as usize) <= buffer_len - FOOTER_SIZE { - // The whole metadata is in the first read - let metadata_start = buffer_len - metadata_len as usize - FOOTER_SIZE; - let metadata = ParquetMetaDataReader::decode_metadata( - &buffer[metadata_start..buffer_len - FOOTER_SIZE], - ) - .map_err(|e| { - error::InvalidParquetSnafu { - file: path, - reason: format!("failed to decode metadata, {e}"), - } - .build() - })?; - Ok(metadata) - } else { - // The metadata is out of buffer, need to make a second read - let metadata_start = file_size - metadata_len - FOOTER_SIZE as u64; - let data = object_store - .read_with(path) - .range(metadata_start..(file_size - FOOTER_SIZE as u64)) - .await - .context(error::OpenDalSnafu)? - .to_vec(); - - let metadata = ParquetMetaDataReader::decode_metadata(&data).map_err(|e| { - error::InvalidParquetSnafu { - file: path, - reason: format!("failed to decode metadata, {e}"), - } - .build() - })?; - Ok(metadata) - } - } - - /// Get the size of parquet file. + /// Get the size of parquet file. If file_size is 0, stat the object store to get the size. async fn get_file_size(&self) -> Result { let file_size = match self.file_size { 0 => self @@ -163,4 +64,55 @@ impl<'a> MetadataLoader<'a> { }; Ok(file_size) } + + pub async fn load(&self) -> Result { + let path = self.file_path; + let file_size = self.get_file_size().await?; + let reader = + ParquetMetaDataReader::new().with_prefetch_hint(Some(DEFAULT_PREFETCH_SIZE as usize)); + + let fetch = ObjectStoreFetch { + object_store: &self.object_store, + file_path: self.file_path, + }; + + reader + .load_and_finish(fetch, file_size) + .await + .map_err(|e| match unbox_external_error(e) { + Ok(os_err) => error::OpenDalSnafu {}.into_error(os_err), + Err(parquet_err) => error::ReadParquetSnafu { path }.into_error(parquet_err), + }) + } +} + +/// Unpack ParquetError to get object_store::Error if possible. 
+fn unbox_external_error(e: ParquetError) -> StdResult { + match e { + ParquetError::External(boxed_err) => match boxed_err.downcast::() { + Ok(os_err) => Ok(*os_err), + Err(parquet_error) => Err(ParquetError::External(parquet_error)), + }, + other => Err(other), + } +} + +struct ObjectStoreFetch<'a> { + object_store: &'a ObjectStore, + file_path: &'a str, +} + +impl MetadataFetch for ObjectStoreFetch<'_> { + fn fetch(&mut self, range: std::ops::Range) -> BoxFuture<'_, ParquetResult> { + async move { + let data = self + .object_store + .read_with(self.file_path) + .range(range) + .await + .map_err(|e| ParquetError::External(Box::new(e)))?; + Ok(data.to_bytes()) + } + .boxed() + } } diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 60cf654380..2c77145e5b 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -55,7 +55,9 @@ use crate::sst::file::FileHandle; use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplierRef; use crate::sst::index::fulltext_index::applier::FulltextIndexApplierRef; use crate::sst::index::inverted_index::applier::InvertedIndexApplierRef; -use crate::sst::parquet::file_range::{FileRangeContext, FileRangeContextRef}; +use crate::sst::parquet::file_range::{ + FileRangeContext, FileRangeContextRef, PreFilterMode, row_group_contains_delete, +}; use crate::sst::parquet::format::{ReadFormat, need_override_sequence}; use crate::sst::parquet::metadata::MetadataLoader; use crate::sst::parquet::row_group::InMemoryRowGroup; @@ -91,7 +93,7 @@ macro_rules! handle_index_error { /// Parquet SST reader builder. pub struct ParquetReaderBuilder { /// SST directory. - file_dir: String, + table_dir: String, /// Path type for generating file paths. path_type: PathType, file_handle: FileHandle, @@ -106,9 +108,9 @@ pub struct ParquetReaderBuilder { /// Strategy to cache SST data. cache_strategy: CacheStrategy, /// Index appliers. - inverted_index_applier: Option, - bloom_filter_index_applier: Option, - fulltext_index_applier: Option, + inverted_index_appliers: [Option; 2], + bloom_filter_index_appliers: [Option; 2], + fulltext_index_appliers: [Option; 2], /// Expected metadata of the region while reading the SST. /// This is usually the latest metadata of the region. The reader use /// it get the correct column id of a column by name. @@ -117,30 +119,33 @@ pub struct ParquetReaderBuilder { flat_format: bool, /// Whether this reader is for compaction. compaction: bool, + /// Mode to pre-filter columns. + pre_filter_mode: PreFilterMode, } impl ParquetReaderBuilder { /// Returns a new [ParquetReaderBuilder] to read specific SST. pub fn new( - file_dir: String, + table_dir: String, path_type: PathType, file_handle: FileHandle, object_store: ObjectStore, ) -> ParquetReaderBuilder { ParquetReaderBuilder { - file_dir, + table_dir, path_type, file_handle, object_store, predicate: None, projection: None, cache_strategy: CacheStrategy::Disabled, - inverted_index_applier: None, - bloom_filter_index_applier: None, - fulltext_index_applier: None, + inverted_index_appliers: [None, None], + bloom_filter_index_appliers: [None, None], + fulltext_index_appliers: [None, None], expected_metadata: None, flat_format: false, compaction: false, + pre_filter_mode: PreFilterMode::All, } } @@ -167,33 +172,33 @@ impl ParquetReaderBuilder { self } - /// Attaches the inverted index applier to the builder. + /// Attaches the inverted index appliers to the builder. 
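The `unbox_external_error` helper above relies on the standard library's `Box<dyn Error + Send + Sync>::downcast` to recover the concrete OpenDAL error wrapped in `ParquetError::External`. A standalone illustration of that pattern, using a hypothetical `StorageError` in place of the real error types:

use std::error::Error;
use std::fmt;

// Hypothetical error type standing in for object_store::Error.
#[derive(Debug)]
struct StorageError(String);

impl fmt::Display for StorageError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "storage error: {}", self.0)
    }
}

impl Error for StorageError {}

// Returns the concrete error if the boxed error is a StorageError,
// otherwise hands the box back to the caller unchanged.
fn recover(err: Box<dyn Error + Send + Sync>) -> Result<StorageError, Box<dyn Error + Send + Sync>> {
    err.downcast::<StorageError>().map(|boxed| *boxed)
}

fn main() {
    let boxed: Box<dyn Error + Send + Sync> = Box::new(StorageError("timeout".into()));
    assert!(recover(boxed).is_ok());
}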
#[must_use] - pub(crate) fn inverted_index_applier( + pub(crate) fn inverted_index_appliers( mut self, - index_applier: Option, + index_appliers: [Option; 2], ) -> Self { - self.inverted_index_applier = index_applier; + self.inverted_index_appliers = index_appliers; self } - /// Attaches the bloom filter index applier to the builder. + /// Attaches the bloom filter index appliers to the builder. #[must_use] - pub(crate) fn bloom_filter_index_applier( + pub(crate) fn bloom_filter_index_appliers( mut self, - index_applier: Option, + index_appliers: [Option; 2], ) -> Self { - self.bloom_filter_index_applier = index_applier; + self.bloom_filter_index_appliers = index_appliers; self } - /// Attaches the fulltext index applier to the builder. + /// Attaches the fulltext index appliers to the builder. #[must_use] - pub(crate) fn fulltext_index_applier( + pub(crate) fn fulltext_index_appliers( mut self, - index_applier: Option, + index_appliers: [Option; 2], ) -> Self { - self.fulltext_index_applier = index_applier; + self.fulltext_index_appliers = index_appliers; self } @@ -218,6 +223,13 @@ impl ParquetReaderBuilder { self } + /// Sets the pre-filter mode. + #[must_use] + pub(crate) fn pre_filter_mode(mut self, pre_filter_mode: PreFilterMode) -> Self { + self.pre_filter_mode = pre_filter_mode; + self + } + /// Builds a [ParquetReader]. /// /// This needs to perform IO operation. @@ -237,7 +249,7 @@ impl ParquetReaderBuilder { ) -> Result<(FileRangeContext, RowGroupSelection)> { let start = Instant::now(); - let file_path = self.file_handle.file_path(&self.file_dir, self.path_type); + let file_path = self.file_handle.file_path(&self.table_dir, self.path_type); let file_size = self.file_handle.meta_ref().file_size; // Loads parquet metadata of the file. @@ -321,7 +333,13 @@ impl ParquetReaderBuilder { let codec = build_primary_key_codec(read_format.metadata()); - let context = FileRangeContext::new(reader_builder, filters, read_format, codec); + let context = FileRangeContext::new( + reader_builder, + filters, + read_format, + codec, + self.pre_filter_mode, + ); metrics.build_cost += start.elapsed(); @@ -407,7 +425,16 @@ impl ParquetReaderBuilder { let mut output = RowGroupSelection::new(row_group_size, num_rows as _); - self.prune_row_groups_by_minmax(read_format, parquet_meta, &mut output, metrics); + // Compute skip_fields once for all pruning operations + let skip_fields = self.compute_skip_fields(parquet_meta); + + self.prune_row_groups_by_minmax( + read_format, + parquet_meta, + &mut output, + metrics, + skip_fields, + ); if output.is_empty() { return output; } @@ -418,6 +445,7 @@ impl ParquetReaderBuilder { num_row_groups, &mut output, metrics, + skip_fields, ) .await; if output.is_empty() { @@ -429,14 +457,21 @@ impl ParquetReaderBuilder { num_row_groups, &mut output, metrics, + skip_fields, ) .await; if output.is_empty() { return output; } - self.prune_row_groups_by_bloom_filter(row_group_size, parquet_meta, &mut output, metrics) - .await; + self.prune_row_groups_by_bloom_filter( + row_group_size, + parquet_meta, + &mut output, + metrics, + skip_fields, + ) + .await; if output.is_empty() { return output; } @@ -447,6 +482,7 @@ impl ParquetReaderBuilder { parquet_meta, &mut output, metrics, + skip_fields, ) .await; } @@ -460,50 +496,61 @@ impl ParquetReaderBuilder { num_row_groups: usize, output: &mut RowGroupSelection, metrics: &mut ReaderFilterMetrics, + skip_fields: bool, ) -> bool { - let Some(index_applier) = &self.fulltext_index_applier else { - return false; - }; if 
!self.file_handle.meta_ref().fulltext_index_available() { return false; } - let predicate_key = index_applier.predicate_key(); - // Fast path: return early if the result is in the cache. - let cached = self - .cache_strategy - .index_result_cache() - .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); - if let Some(result) = cached.as_ref() - && all_required_row_groups_searched(output, result) - { - apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_FULLTEXT); - return true; - } - - // Slow path: apply the index from the file. - let file_size_hint = self.file_handle.meta_ref().index_file_size(); - let apply_res = index_applier - .apply_fine(self.file_handle.file_id(), Some(file_size_hint)) - .await; - let selection = match apply_res { - Ok(Some(res)) => RowGroupSelection::from_row_ids(res, row_group_size, num_row_groups), - Ok(None) => return false, - Err(err) => { - handle_index_error!(err, self.file_handle, INDEX_TYPE_FULLTEXT); - return false; - } + let mut pruned = false; + // If skip_fields is true, only apply the first applier (for tags). + let appliers = if skip_fields { + &self.fulltext_index_appliers[..1] + } else { + &self.fulltext_index_appliers[..] }; + for index_applier in appliers.iter().flatten() { + let predicate_key = index_applier.predicate_key(); + // Fast path: return early if the result is in the cache. + let cached = self + .cache_strategy + .index_result_cache() + .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); + if let Some(result) = cached.as_ref() + && all_required_row_groups_searched(output, result) + { + apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_FULLTEXT); + pruned = true; + continue; + } - self.apply_index_result_and_update_cache( - predicate_key, - self.file_handle.file_id().file_id(), - selection, - output, - metrics, - INDEX_TYPE_FULLTEXT, - ); - true + // Slow path: apply the index from the file. + let file_size_hint = self.file_handle.meta_ref().index_file_size(); + let apply_res = index_applier + .apply_fine(self.file_handle.file_id(), Some(file_size_hint)) + .await; + let selection = match apply_res { + Ok(Some(res)) => { + RowGroupSelection::from_row_ids(res, row_group_size, num_row_groups) + } + Ok(None) => continue, + Err(err) => { + handle_index_error!(err, self.file_handle, INDEX_TYPE_FULLTEXT); + continue; + } + }; + + self.apply_index_result_and_update_cache( + predicate_key, + self.file_handle.file_id().file_id(), + selection, + output, + metrics, + INDEX_TYPE_FULLTEXT, + ); + pruned = true; + } + pruned } /// Applies index to prune row groups. @@ -517,53 +564,62 @@ impl ParquetReaderBuilder { num_row_groups: usize, output: &mut RowGroupSelection, metrics: &mut ReaderFilterMetrics, + skip_fields: bool, ) -> bool { - let Some(index_applier) = &self.inverted_index_applier else { - return false; - }; if !self.file_handle.meta_ref().inverted_index_available() { return false; } - let predicate_key = index_applier.predicate_key(); - // Fast path: return early if the result is in the cache. - let cached = self - .cache_strategy - .index_result_cache() - .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); - if let Some(result) = cached.as_ref() - && all_required_row_groups_searched(output, result) - { - apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_INVERTED); - return true; - } - - // Slow path: apply the index from the file. 
- let file_size_hint = self.file_handle.meta_ref().index_file_size(); - let apply_res = index_applier - .apply(self.file_handle.file_id(), Some(file_size_hint)) - .await; - let selection = match apply_res { - Ok(output) => RowGroupSelection::from_inverted_index_apply_output( - row_group_size, - num_row_groups, - output, - ), - Err(err) => { - handle_index_error!(err, self.file_handle, INDEX_TYPE_INVERTED); - return false; - } + let mut pruned = false; + // If skip_fields is true, only apply the first applier (for tags). + let appliers = if skip_fields { + &self.inverted_index_appliers[..1] + } else { + &self.inverted_index_appliers[..] }; + for index_applier in appliers.iter().flatten() { + let predicate_key = index_applier.predicate_key(); + // Fast path: return early if the result is in the cache. + let cached = self + .cache_strategy + .index_result_cache() + .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); + if let Some(result) = cached.as_ref() + && all_required_row_groups_searched(output, result) + { + apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_INVERTED); + pruned = true; + continue; + } - self.apply_index_result_and_update_cache( - predicate_key, - self.file_handle.file_id().file_id(), - selection, - output, - metrics, - INDEX_TYPE_INVERTED, - ); - true + // Slow path: apply the index from the file. + let file_size_hint = self.file_handle.meta_ref().index_file_size(); + let apply_res = index_applier + .apply(self.file_handle.file_id(), Some(file_size_hint)) + .await; + let selection = match apply_res { + Ok(output) => RowGroupSelection::from_inverted_index_apply_output( + row_group_size, + num_row_groups, + output, + ), + Err(err) => { + handle_index_error!(err, self.file_handle, INDEX_TYPE_INVERTED); + continue; + } + }; + + self.apply_index_result_and_update_cache( + predicate_key, + self.file_handle.file_id().file_id(), + selection, + output, + metrics, + INDEX_TYPE_INVERTED, + ); + pruned = true; + } + pruned } async fn prune_row_groups_by_bloom_filter( @@ -572,65 +628,76 @@ impl ParquetReaderBuilder { parquet_meta: &ParquetMetaData, output: &mut RowGroupSelection, metrics: &mut ReaderFilterMetrics, + skip_fields: bool, ) -> bool { - let Some(index_applier) = &self.bloom_filter_index_applier else { - return false; - }; if !self.file_handle.meta_ref().bloom_filter_index_available() { return false; } - let predicate_key = index_applier.predicate_key(); - // Fast path: return early if the result is in the cache. - let cached = self - .cache_strategy - .index_result_cache() - .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); - if let Some(result) = cached.as_ref() - && all_required_row_groups_searched(output, result) - { - apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_BLOOM); - return true; - } - - // Slow path: apply the index from the file. - let file_size_hint = self.file_handle.meta_ref().index_file_size(); - let rgs = parquet_meta.row_groups().iter().enumerate().map(|(i, rg)| { - ( - rg.num_rows() as usize, - // Optimize: only search the row group that required by `output` and not stored in `cached`. 
- output.contains_non_empty_row_group(i) - && cached - .as_ref() - .map(|c| !c.contains_row_group(i)) - .unwrap_or(true), - ) - }); - let apply_res = index_applier - .apply(self.file_handle.file_id(), Some(file_size_hint), rgs) - .await; - let mut selection = match apply_res { - Ok(apply_output) => RowGroupSelection::from_row_ranges(apply_output, row_group_size), - Err(err) => { - handle_index_error!(err, self.file_handle, INDEX_TYPE_BLOOM); - return false; - } + let mut pruned = false; + // If skip_fields is true, only apply the first applier (for tags). + let appliers = if skip_fields { + &self.bloom_filter_index_appliers[..1] + } else { + &self.bloom_filter_index_appliers[..] }; + for index_applier in appliers.iter().flatten() { + let predicate_key = index_applier.predicate_key(); + // Fast path: return early if the result is in the cache. + let cached = self + .cache_strategy + .index_result_cache() + .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); + if let Some(result) = cached.as_ref() + && all_required_row_groups_searched(output, result) + { + apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_BLOOM); + pruned = true; + continue; + } - // New searched row groups are added to `selection`, concat them with `cached`. - if let Some(cached) = cached.as_ref() { - selection.concat(cached); + // Slow path: apply the index from the file. + let file_size_hint = self.file_handle.meta_ref().index_file_size(); + let rgs = parquet_meta.row_groups().iter().enumerate().map(|(i, rg)| { + ( + rg.num_rows() as usize, + // Optimize: only search the row group that required by `output` and not stored in `cached`. + output.contains_non_empty_row_group(i) + && cached + .as_ref() + .map(|c| !c.contains_row_group(i)) + .unwrap_or(true), + ) + }); + let apply_res = index_applier + .apply(self.file_handle.file_id(), Some(file_size_hint), rgs) + .await; + let mut selection = match apply_res { + Ok(apply_output) => { + RowGroupSelection::from_row_ranges(apply_output, row_group_size) + } + Err(err) => { + handle_index_error!(err, self.file_handle, INDEX_TYPE_BLOOM); + continue; + } + }; + + // New searched row groups are added to `selection`, concat them with `cached`. + if let Some(cached) = cached.as_ref() { + selection.concat(cached); + } + + self.apply_index_result_and_update_cache( + predicate_key, + self.file_handle.file_id().file_id(), + selection, + output, + metrics, + INDEX_TYPE_BLOOM, + ); + pruned = true; } - - self.apply_index_result_and_update_cache( - predicate_key, - self.file_handle.file_id().file_id(), - selection, - output, - metrics, - INDEX_TYPE_BLOOM, - ); - true + pruned } async fn prune_row_groups_by_fulltext_bloom( @@ -639,68 +706,96 @@ impl ParquetReaderBuilder { parquet_meta: &ParquetMetaData, output: &mut RowGroupSelection, metrics: &mut ReaderFilterMetrics, + skip_fields: bool, ) -> bool { - let Some(index_applier) = &self.fulltext_index_applier else { - return false; - }; if !self.file_handle.meta_ref().fulltext_index_available() { return false; } - let predicate_key = index_applier.predicate_key(); - // Fast path: return early if the result is in the cache. 
- let cached = self - .cache_strategy - .index_result_cache() - .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); - if let Some(result) = cached.as_ref() - && all_required_row_groups_searched(output, result) - { - apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_FULLTEXT); - return true; - } - - // Slow path: apply the index from the file. - let file_size_hint = self.file_handle.meta_ref().index_file_size(); - let rgs = parquet_meta.row_groups().iter().enumerate().map(|(i, rg)| { - ( - rg.num_rows() as usize, - // Optimize: only search the row group that required by `output` and not stored in `cached`. - output.contains_non_empty_row_group(i) - && cached - .as_ref() - .map(|c| !c.contains_row_group(i)) - .unwrap_or(true), - ) - }); - let apply_res = index_applier - .apply_coarse(self.file_handle.file_id(), Some(file_size_hint), rgs) - .await; - let mut selection = match apply_res { - Ok(Some(apply_output)) => { - RowGroupSelection::from_row_ranges(apply_output, row_group_size) - } - Ok(None) => return false, - Err(err) => { - handle_index_error!(err, self.file_handle, INDEX_TYPE_FULLTEXT); - return false; - } + let mut pruned = false; + // If skip_fields is true, only apply the first applier (for tags). + let appliers = if skip_fields { + &self.fulltext_index_appliers[..1] + } else { + &self.fulltext_index_appliers[..] }; + for index_applier in appliers.iter().flatten() { + let predicate_key = index_applier.predicate_key(); + // Fast path: return early if the result is in the cache. + let cached = self + .cache_strategy + .index_result_cache() + .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id())); + if let Some(result) = cached.as_ref() + && all_required_row_groups_searched(output, result) + { + apply_selection_and_update_metrics(output, result, metrics, INDEX_TYPE_FULLTEXT); + pruned = true; + continue; + } - // New searched row groups are added to `selection`, concat them with `cached`. - if let Some(cached) = cached.as_ref() { - selection.concat(cached); + // Slow path: apply the index from the file. + let file_size_hint = self.file_handle.meta_ref().index_file_size(); + let rgs = parquet_meta.row_groups().iter().enumerate().map(|(i, rg)| { + ( + rg.num_rows() as usize, + // Optimize: only search the row group that required by `output` and not stored in `cached`. + output.contains_non_empty_row_group(i) + && cached + .as_ref() + .map(|c| !c.contains_row_group(i)) + .unwrap_or(true), + ) + }); + let apply_res = index_applier + .apply_coarse(self.file_handle.file_id(), Some(file_size_hint), rgs) + .await; + let mut selection = match apply_res { + Ok(Some(apply_output)) => { + RowGroupSelection::from_row_ranges(apply_output, row_group_size) + } + Ok(None) => continue, + Err(err) => { + handle_index_error!(err, self.file_handle, INDEX_TYPE_FULLTEXT); + continue; + } + }; + + // New searched row groups are added to `selection`, concat them with `cached`. + if let Some(cached) = cached.as_ref() { + selection.concat(cached); + } + + self.apply_index_result_and_update_cache( + predicate_key, + self.file_handle.file_id().file_id(), + selection, + output, + metrics, + INDEX_TYPE_FULLTEXT, + ); + pruned = true; } + pruned + } - self.apply_index_result_and_update_cache( - predicate_key, - self.file_handle.file_id().file_id(), - selection, - output, - metrics, - INDEX_TYPE_FULLTEXT, - ); - true + /// Computes whether to skip field columns when building statistics based on PreFilterMode. 
+ fn compute_skip_fields(&self, parquet_meta: &ParquetMetaData) -> bool { + match self.pre_filter_mode { + PreFilterMode::All => false, + PreFilterMode::SkipFields => true, + PreFilterMode::SkipFieldsOnDelete => { + // Check if any row group contains delete op + let file_path = self.file_handle.file_path(&self.table_dir, self.path_type); + (0..parquet_meta.num_row_groups()).any(|rg_idx| { + row_group_contains_delete(parquet_meta, rg_idx, &file_path) + .inspect_err(|e| { + warn!(e; "Failed to decode min value of op_type, fallback to not skipping fields"); + }) + .unwrap_or(false) + }) + } + } } /// Prunes row groups by min-max index. @@ -710,6 +805,7 @@ impl ParquetReaderBuilder { parquet_meta: &ParquetMetaData, output: &mut RowGroupSelection, metrics: &mut ReaderFilterMetrics, + skip_fields: bool, ) -> bool { let Some(predicate) = &self.predicate else { return false; @@ -719,8 +815,12 @@ impl ParquetReaderBuilder { let region_meta = read_format.metadata(); let row_groups = parquet_meta.row_groups(); - let stats = - RowGroupPruningStats::new(row_groups, read_format, self.expected_metadata.clone()); + let stats = RowGroupPruningStats::new( + row_groups, + read_format, + self.expected_metadata.clone(), + skip_fields, + ); let prune_schema = self .expected_metadata .as_ref() @@ -1151,10 +1251,12 @@ impl BatchReader for ParquetReader { .await?; // Resets the parquet reader. - reader.reset_source(Source::RowGroup(RowGroupReader::new( - self.context.clone(), - parquet_reader, - ))); + // Compute skip_fields for this row group + let skip_fields = self.context.should_skip_fields(row_group_idx); + reader.reset_source( + Source::RowGroup(RowGroupReader::new(self.context.clone(), parquet_reader)), + skip_fields, + ); if let Some(batch) = reader.next_batch().await? { return Ok(Some(batch)); } @@ -1207,9 +1309,12 @@ impl ParquetReader { .reader_builder() .build(row_group_idx, Some(row_selection)) .await?; + // Compute skip_fields once for this row group + let skip_fields = context.should_skip_fields(row_group_idx); ReaderState::Readable(PruneReader::new_with_row_group_reader( context.clone(), RowGroupReader::new(context.clone(), parquet_reader), + skip_fields, )) } else { ReaderState::Exhausted(ReaderMetrics::default()) @@ -1227,7 +1332,6 @@ impl ParquetReader { self.context.read_format().metadata() } - #[cfg(test)] pub fn parquet_metadata(&self) -> Arc { self.context.reader_builder().parquet_meta.clone() } diff --git a/src/mito2/src/sst/parquet/stats.rs b/src/mito2/src/sst/parquet/stats.rs index e16d5de76c..7c5da69d4b 100644 --- a/src/mito2/src/sst/parquet/stats.rs +++ b/src/mito2/src/sst/parquet/stats.rs @@ -18,6 +18,7 @@ use std::borrow::Borrow; use std::collections::HashSet; use std::sync::Arc; +use api::v1::SemanticType; use datafusion_common::pruning::PruningStatistics; use datafusion_common::{Column, ScalarValue}; use datatypes::arrow::array::{ArrayRef, BooleanArray, UInt64Array}; @@ -38,6 +39,8 @@ pub(crate) struct RowGroupPruningStats<'a, T> { /// of the metadata in the SST to get the column id of a column as the SST may have /// different columns. expected_metadata: Option, + /// If true, skip columns with Field semantic type during pruning. 
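The pruning paths above all share the same shape once the appliers become two-element arrays: pick which appliers participate, then try each one and fall through on error. A condensed sketch, with `A` as a hypothetical applier type; that index 0 holds the tag-column applier and index 1 the field-column applier is an assumption drawn from the "(for tags)" comments:

// Sketch of the applier-selection pattern used by the pruning methods above.
fn active_appliers<A>(appliers: &[Option<A>; 2], skip_fields: bool) -> &[Option<A>] {
    if skip_fields {
        // Only the first applier (tag columns) participates.
        &appliers[..1]
    } else {
        &appliers[..]
    }
}

// Each applier is tried independently; an error moves on to the next applier
// instead of aborting pruning, mirroring the `continue` branches above.
fn prune_all<A>(
    appliers: &[Option<A>; 2],
    skip_fields: bool,
    mut apply: impl FnMut(&A) -> Result<bool, String>,
) -> bool {
    let mut pruned = false;
    for applier in active_appliers(appliers, skip_fields).iter().flatten() {
        match apply(applier) {
            Ok(did_prune) => pruned |= did_prune,
            Err(_e) => continue,
        }
    }
    pruned
}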
+ skip_fields: bool, } impl<'a, T> RowGroupPruningStats<'a, T> { @@ -46,22 +49,32 @@ impl<'a, T> RowGroupPruningStats<'a, T> { row_groups: &'a [T], read_format: &'a ReadFormat, expected_metadata: Option, + skip_fields: bool, ) -> Self { Self { row_groups, read_format, expected_metadata, + skip_fields, } } /// Returns the column id of specific column name if we need to read it. /// Prefers the column id in the expected metadata if it exists. + /// Returns None if skip_fields is true and the column is a Field. fn column_id_to_prune(&self, name: &str) -> Option { let metadata = self .expected_metadata .as_ref() .unwrap_or_else(|| self.read_format.metadata()); - metadata.column_by_name(name).map(|col| col.column_id) + let col = metadata.column_by_name(name)?; + + // Skip field columns when skip_fields is enabled + if self.skip_fields && col.semantic_type == SemanticType::Field { + return None; + } + + Some(col.column_id) } /// Returns the default value of all row groups for `column` according to the metadata. diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index 01e1e95a9c..5247e2eec8 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -40,7 +40,7 @@ use parquet::schema::types::ColumnPath; use smallvec::smallvec; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; -use store_api::storage::consts::SEQUENCE_COLUMN_NAME; +use store_api::storage::consts::{OP_TYPE_COLUMN_NAME, SEQUENCE_COLUMN_NAME}; use store_api::storage::{FileId, SequenceNumber}; use tokio::io::AsyncWrite; use tokio_util::compat::{Compat, FuturesAsyncWriteCompatExt}; @@ -57,10 +57,12 @@ use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index} use crate::sst::parquet::format::PrimaryKeyWriteFormat; use crate::sst::parquet::helper::parse_parquet_metadata; use crate::sst::parquet::{PARQUET_METADATA_KEY, SstInfo, WriteOptions}; -use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions}; +use crate::sst::{ + DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator, +}; /// Parquet SST writer. -pub struct ParquetWriter { +pub struct ParquetWriter<'a, F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> { /// Path provider that creates SST and index file paths according to file id. path_provider: P, writer: Option>>, @@ -79,7 +81,7 @@ pub struct ParquetWriter, /// Write metrics - metrics: Metrics, + metrics: &'a mut Metrics, } pub trait WriterFactory { @@ -105,7 +107,7 @@ impl WriterFactory for ObjectStoreWriterFactory { } } -impl ParquetWriter +impl<'a, I, P> ParquetWriter<'a, ObjectStoreWriterFactory, I, P> where P: FilePathProvider, I: IndexerBuilder, @@ -116,8 +118,8 @@ where index_config: IndexConfig, indexer_builder: I, path_provider: P, - metrics: Metrics, - ) -> ParquetWriter { + metrics: &'a mut Metrics, + ) -> ParquetWriter<'a, ObjectStoreWriterFactory, I, P> { ParquetWriter::new( ObjectStoreWriterFactory { object_store }, metadata, @@ -135,7 +137,7 @@ where } } -impl ParquetWriter +impl<'a, F, I, P> ParquetWriter<'a, F, I, P> where F: WriterFactory, I: IndexerBuilder, @@ -148,8 +150,8 @@ where index_config: IndexConfig, indexer_builder: I, path_provider: P, - metrics: Metrics, - ) -> ParquetWriter { + metrics: &'a mut Metrics, + ) -> ParquetWriter<'a, F, I, P> { let init_file = FileId::random(); let indexer = indexer_builder.build(init_file).await; @@ -176,7 +178,7 @@ where ) -> Result<()> { // maybe_init_writer will re-create a new file. 
if let Some(mut current_writer) = mem::take(&mut self.writer) { - let stats = mem::take(stats); + let mut stats = mem::take(stats); // At least one row has been written. assert!(stats.num_rows > 0); @@ -211,6 +213,7 @@ where // convert FileMetaData to ParquetMetaData let parquet_metadata = parse_parquet_metadata(file_meta)?; + let num_series = stats.series_estimator.finish(); ssts.push(SstInfo { file_id: self.current_file, time_range, @@ -219,6 +222,7 @@ where num_row_groups: parquet_metadata.num_row_groups() as u64, file_metadata: Some(Arc::new(parquet_metadata)), index_metadata: index_output, + num_series, }); self.current_file = FileId::random(); self.bytes_written.store(0, Ordering::Relaxed) @@ -384,12 +388,14 @@ where .clone(), ]); let seq_col = ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]); + let op_type_col = ColumnPath::new(vec![OP_TYPE_COLUMN_NAME.to_string()]); builder .set_column_encoding(seq_col.clone(), Encoding::DELTA_BINARY_PACKED) .set_column_dictionary_enabled(seq_col, false) .set_column_encoding(ts_col.clone(), Encoding::DELTA_BINARY_PACKED) .set_column_dictionary_enabled(ts_col, false) + .set_column_compression(op_type_col, Compression::UNCOMPRESSED) } async fn write_next_batch( @@ -483,11 +489,6 @@ where Ok(self.writer.as_mut().unwrap()) } } - - /// Consumes write and return the collected metrics. - pub fn into_metrics(self) -> Metrics { - self.metrics - } } #[derive(Default)] @@ -496,6 +497,8 @@ struct SourceStats { num_rows: usize, /// Time range of fetched batches. time_range: Option<(Timestamp, Timestamp)>, + /// Series estimator for computing num_series. + series_estimator: SeriesEstimator, } impl SourceStats { @@ -505,6 +508,7 @@ impl SourceStats { } self.num_rows += batch.num_rows(); + self.series_estimator.update(batch); // Safety: batch is not empty. 
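`SourceStats` now feeds every batch into a `SeriesEstimator` (defined elsewhere in `crate::sst` and not shown in this diff) and reports the result as `num_series` in `SstInfo`. As a rough mental model only, assuming batches reach the writer sorted and grouped by primary key, such an estimator can simply count primary-key changes:

// Illustrative sketch; not the actual crate::sst::SeriesEstimator.
#[derive(Default)]
struct SeriesCounter {
    last_key: Option<Vec<u8>>,
    num_series: u64,
}

impl SeriesCounter {
    // Call once per batch with the batch's primary key.
    fn update(&mut self, primary_key: &[u8]) {
        if self.last_key.as_deref() != Some(primary_key) {
            self.num_series += 1;
            self.last_key = Some(primary_key.to_vec());
        }
    }

    // Returns the number of distinct series observed.
    fn finish(&self) -> u64 {
        self.num_series
    }
}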
let (min_in_batch, max_in_batch) = ( batch.first_timestamp().unwrap(), @@ -524,6 +528,7 @@ impl SourceStats { } self.num_rows += record_batch.num_rows(); + self.series_estimator.update_flat(record_batch); // Get the timestamp column by index let time_index_col_idx = time_index_column_index(record_batch.num_columns()); diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index 4aac6f2e4b..bcb1db4331 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -39,7 +39,7 @@ use common_meta::cache::{new_schema_cache, new_table_schema_cache}; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; use common_meta::kv_backend::memory::MemoryKvBackend; -use common_telemetry::warn; +use common_telemetry::{debug, warn}; use common_test_util::temp_dir::{TempDir, create_temp_dir}; use common_wal::options::{KafkaWalOptions, WAL_OPTIONS_KEY, WalOptions}; use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array}; @@ -50,6 +50,7 @@ use log_store::raft_engine::log_store::RaftEngineLogStore; use log_store::test_util::log_store_util; use moka::future::CacheBuilder; use object_store::ObjectStore; +use object_store::layers::mock::MockLayer; use object_store::manager::{ObjectStoreManager, ObjectStoreManagerRef}; use object_store::services::Fs; use rskafka::client::partition::{Compression, UnknownTopicHandling}; @@ -90,7 +91,7 @@ pub(crate) fn raft_engine_log_store_factory() -> Option { Some(LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory)) } -pub(crate) fn kafka_log_store_factory() -> Option { +pub fn kafka_log_store_factory() -> Option { let _ = dotenv::dotenv(); let Ok(broker_endpoints) = std::env::var("GT_KAFKA_ENDPOINTS") else { warn!("env GT_KAFKA_ENDPOINTS not found"); @@ -130,7 +131,7 @@ pub(crate) fn multiple_log_store_factories(#[case] factory: Option) {} #[template] @@ -140,7 +141,7 @@ pub(crate) fn single_kafka_log_store_factory(#[case] factory: Option) {} #[derive(Clone)] -pub(crate) struct RaftEngineLogStoreFactory; +pub struct RaftEngineLogStoreFactory; impl RaftEngineLogStoreFactory { async fn create_log_store>(&self, wal_path: P) -> RaftEngineLogStore { @@ -148,7 +149,7 @@ impl RaftEngineLogStoreFactory { } } -pub(crate) async fn prepare_test_for_kafka_log_store(factory: &LogStoreFactory) -> Option { +pub async fn prepare_test_for_kafka_log_store(factory: &LogStoreFactory) -> Option { if let LogStoreFactory::Kafka(factory) = factory { let topic = uuid::Uuid::new_v4().to_string(); let client = factory.client().await; @@ -186,7 +187,7 @@ pub(crate) async fn append_noop_record(client: &Client, topic: &str) { .unwrap(); } #[derive(Clone)] -pub(crate) struct KafkaLogStoreFactory { +pub struct KafkaLogStoreFactory { broker_endpoints: Vec, } @@ -204,7 +205,7 @@ impl KafkaLogStoreFactory { } #[derive(Clone)] -pub(crate) enum LogStoreFactory { +pub enum LogStoreFactory { RaftEngine(RaftEngineLogStoreFactory), Kafka(KafkaLogStoreFactory), } @@ -221,13 +222,14 @@ pub struct TestEnv { data_home: TempDir, intermediate_manager: IntermediateManager, puffin_manager: PuffinManagerFactory, - log_store: Option, + pub(crate) log_store: Option, log_store_factory: LogStoreFactory, - object_store_manager: Option, + pub(crate) object_store_manager: Option, schema_metadata_manager: SchemaMetadataManagerRef, file_ref_manager: FileReferenceManagerRef, kv_backend: KvBackendRef, partition_expr_fetcher: PartitionExprFetcherRef, + object_store_mock_layer: Option, } impl TestEnv { @@ -264,15 +266,22 @@ 
impl TestEnv { file_ref_manager: Arc::new(FileReferenceManager::new(None)), kv_backend, partition_expr_fetcher: noop_partition_expr_fetcher(), + object_store_mock_layer: None, } } /// Overwrites the original `log_store_factory`. - pub(crate) fn with_log_store_factory(mut self, log_store_factory: LogStoreFactory) -> TestEnv { + pub fn with_log_store_factory(mut self, log_store_factory: LogStoreFactory) -> TestEnv { self.log_store_factory = log_store_factory; self } + /// Sets the original `object_store_mock_layer`. + pub fn with_mock_layer(mut self, mock_layer: MockLayer) -> TestEnv { + self.object_store_mock_layer = Some(mock_layer); + self + } + pub fn get_object_store(&self) -> Option { self.object_store_manager .as_ref() @@ -287,7 +296,7 @@ impl TestEnv { self.object_store_manager.clone() } - async fn new_mito_engine(&self, config: MitoConfig) -> MitoEngine { + pub(crate) async fn new_mito_engine(&self, config: MitoConfig) -> MitoEngine { async fn create( zelf: &TestEnv, config: MitoConfig, @@ -541,37 +550,53 @@ impl TestEnv { /// Returns the log store and object store manager. async fn create_log_and_object_store_manager(&self) -> (LogStoreImpl, ObjectStoreManager) { + let log_store = self.create_log_store().await; + let object_store_manager = self.create_object_store_manager(); + + (log_store, object_store_manager) + } + + pub(crate) async fn create_log_store(&self) -> LogStoreImpl { let data_home = self.data_home.path(); let wal_path = data_home.join("wal"); - let object_store_manager = self.create_object_store_manager(); match &self.log_store_factory { LogStoreFactory::RaftEngine(factory) => { let log_store = factory.create_log_store(wal_path).await; - ( - LogStoreImpl::RaftEngine(Arc::new(log_store)), - object_store_manager, - ) + + LogStoreImpl::RaftEngine(Arc::new(log_store)) } LogStoreFactory::Kafka(factory) => { let log_store = factory.create_log_store().await; - ( - LogStoreImpl::Kafka(Arc::new(log_store)), - object_store_manager, - ) + LogStoreImpl::Kafka(Arc::new(log_store)) } } } - fn create_object_store_manager(&self) -> ObjectStoreManager { + pub(crate) fn create_object_store_manager(&self) -> ObjectStoreManager { let data_home = self.data_home.path(); let data_path = data_home.join("data").as_path().display().to_string(); let builder = Fs::default().root(&data_path); - let object_store = ObjectStore::new(builder).unwrap().finish(); + + let object_store = if let Some(mock_layer) = self.object_store_mock_layer.as_ref() { + debug!("create object store with mock layer"); + ObjectStore::new(builder) + .unwrap() + .layer(mock_layer.clone()) + .finish() + } else { + ObjectStore::new(builder).unwrap().finish() + }; ObjectStoreManager::new("default", object_store) } + pub(crate) fn create_in_memory_object_store_manager(&self) -> ObjectStoreManager { + let builder = object_store::services::Memory::default(); + let object_store = ObjectStore::new(builder).unwrap().finish(); + ObjectStoreManager::new("memory", object_store) + } + /// If `initial_metadata` is `Some`, creates a new manifest. If `initial_metadata` /// is `None`, opens an existing manifest and returns `None` if no such manifest. 
pub async fn create_manifest_manager( @@ -608,14 +633,13 @@ impl TestEnv { metadata, 0, manifest_opts, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .map(Some) } else { - RegionManifestManager::open(manifest_opts, Default::default(), Default::default()).await + RegionManifestManager::open(manifest_opts, &Default::default()).await } } @@ -629,6 +653,7 @@ impl TestEnv { local_store, capacity, None, + None, self.puffin_manager.clone(), self.intermediate_manager.clone(), ) @@ -648,6 +673,7 @@ impl TestEnv { path, capacity, None, + None, self.puffin_manager.clone(), self.intermediate_manager.clone(), ) @@ -1068,6 +1094,19 @@ pub fn build_rows(start: usize, end: usize) -> Vec { .collect() } +/// Build rows with schema (string, ts_millis) in range `[start, end)`. +/// `start`, `end` are in second resolution. +pub fn build_delete_rows(start: usize, end: usize) -> Vec { + (start..end) + .map(|i| { + row(vec![ + ValueData::StringValue(i.to_string()), + ValueData::TimestampMillisecondValue(i as i64 * 1000), + ]) + }) + .collect() +} + /// Build rows with schema (string, f64, f64, ts_millis). /// - `key`: A string key that is common across all rows. /// - `timestamps`: Array of timestamp values. @@ -1165,7 +1204,7 @@ pub async fn delete_rows(engine: &MitoEngine, region_id: RegionId, rows: Rows) { let result = engine .handle_request( region_id, - RegionRequest::Delete(RegionDeleteRequest { rows }), + RegionRequest::Delete(RegionDeleteRequest { rows, hint: None }), ) .await .unwrap(); diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs index 2174ac7b9f..7ddac4ee0d 100644 --- a/src/mito2/src/test_util/memtable_util.rs +++ b/src/mito2/src/test_util/memtable_util.rs @@ -38,9 +38,8 @@ use crate::memtable::bulk::part::BulkPart; use crate::memtable::partition_tree::data::{DataBatch, DataBuffer, timestamp_array_to_i64_slice}; use crate::memtable::{ BoxedBatchIterator, KeyValues, Memtable, MemtableBuilder, MemtableId, MemtableRanges, - MemtableRef, MemtableStats, + MemtableRef, MemtableStats, RangesOptions, }; -use crate::read::scan_region::PredicateGroup; /// Empty memtable for test. #[derive(Debug, Default)] @@ -97,9 +96,7 @@ impl Memtable for EmptyMemtable { fn ranges( &self, _projection: Option<&[ColumnId]>, - _predicate: PredicateGroup, - _sequence: Option, - _for_flush: bool, + _options: RangesOptions, ) -> Result { Ok(MemtableRanges::default()) } diff --git a/src/mito2/src/test_util/scheduler_util.rs b/src/mito2/src/test_util/scheduler_util.rs index 3fbfbd0ad1..712649b4d6 100644 --- a/src/mito2/src/test_util/scheduler_util.rs +++ b/src/mito2/src/test_util/scheduler_util.rs @@ -111,10 +111,10 @@ impl SchedulerEnv { } /// Creates a new index build scheduler. - pub(crate) fn mock_index_build_scheduler(&self) -> IndexBuildScheduler { + pub(crate) fn mock_index_build_scheduler(&self, files_limit: usize) -> IndexBuildScheduler { let scheduler = self.get_scheduler(); - IndexBuildScheduler::new(scheduler) + IndexBuildScheduler::new(scheduler, files_limit) } /// Creates a new manifest context. 
@@ -133,9 +133,8 @@ impl SchedulerEnv { checkpoint_distance: 10, remove_file_options: Default::default(), }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(), diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index 5eacf06bd5..8299c9e3da 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs @@ -124,9 +124,12 @@ pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64) level: 0, file_size: 0, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: None, partition_expr: None, }, diff --git a/src/mito2/src/test_util/version_util.rs b/src/mito2/src/test_util/version_util.rs index 86cc11eaf5..68fe723ab0 100644 --- a/src/mito2/src/test_util/version_util.rs +++ b/src/mito2/src/test_util/version_util.rs @@ -102,9 +102,12 @@ impl VersionControlBuilder { level: 0, file_size: 0, // We don't care file size. available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: NonZeroU64::new(start_ms as u64), partition_expr: match &self.metadata.partition_expr { Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str) @@ -190,9 +193,12 @@ pub(crate) fn apply_edit( level: 0, file_size: 0, // We don't care file size. available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, + index_file_id: None, num_rows: 0, num_row_groups: 0, + num_series: 0, sequence: NonZeroU64::new(*start_ms as u64), partition_expr: match &version_control.current().version.metadata.partition_expr { Some(json_str) => partition::expr::PartitionExpr::from_json_str(json_str) diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 87c25cd964..d80a83264a 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -21,10 +21,12 @@ mod handle_close; mod handle_compaction; mod handle_create; mod handle_drop; +mod handle_enter_staging; mod handle_flush; mod handle_manifest; mod handle_open; mod handle_rebuild_index; +mod handle_remap; mod handle_truncate; mod handle_write; @@ -58,10 +60,14 @@ use crate::compaction::CompactionScheduler; use crate::config::MitoConfig; use crate::error::{self, CreateDirSnafu, JoinSnafu, Result, WorkerStoppedSnafu}; use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef}; +use crate::gc::{GcLimiter, GcLimiterRef}; use crate::memtable::MemtableBuilderProvider; use crate::metrics::{REGION_COUNT, REQUEST_WAIT_TIME, WRITE_STALLING}; use crate::region::opener::PartitionExprFetcherRef; -use crate::region::{MitoRegionRef, OpeningRegions, OpeningRegionsRef, RegionMap, RegionMapRef}; +use crate::region::{ + CatchupRegions, CatchupRegionsRef, MitoRegionRef, OpeningRegions, OpeningRegionsRef, RegionMap, + RegionMapRef, +}; use crate::request::{ BackgroundNotify, DdlRequest, SenderBulkRequest, SenderDdlRequest, SenderWriteRequest, WorkerRequest, WorkerRequestWithTime, @@ -138,6 +144,8 @@ pub(crate) struct WorkerGroup { cache_manager: CacheManagerRef, /// File reference manager. file_ref_manager: FileReferenceManagerRef, + /// Gc limiter to limit concurrent gc jobs. 
+ gc_limiter: GcLimiterRef, } impl WorkerGroup { @@ -196,6 +204,7 @@ impl WorkerGroup { .build(), ); let time_provider = Arc::new(StdTimeProvider); + let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job)); let workers = (0..config.num_workers) .map(|id| { @@ -234,6 +243,7 @@ impl WorkerGroup { purge_scheduler, cache_manager, file_ref_manager, + gc_limiter, }) } @@ -275,6 +285,11 @@ impl WorkerGroup { self.worker(region_id).is_region_opening(region_id) } + /// Returns true if the specific region is catching up. + pub(crate) fn is_region_catching_up(&self, region_id: RegionId) -> bool { + self.worker(region_id).is_region_catching_up(region_id) + } + /// Returns region of specific `region_id`. /// /// This method should not be public. @@ -291,6 +306,10 @@ impl WorkerGroup { self.file_ref_manager.clone() } + pub(crate) fn gc_limiter(&self) -> GcLimiterRef { + self.gc_limiter.clone() + } + /// Get worker for specific `region_id`. pub(crate) fn worker(&self, region_id: RegionId) -> &RegionWorker { let index = region_id_to_index(region_id, self.workers.len()); @@ -361,6 +380,7 @@ impl WorkerGroup { .write_cache(write_cache) .build(), ); + let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job)); let workers = (0..config.num_workers) .map(|id| { WorkerStarter { @@ -398,6 +418,7 @@ impl WorkerGroup { purge_scheduler, cache_manager, file_ref_manager, + gc_limiter, }) } @@ -412,7 +433,7 @@ fn region_id_to_index(id: RegionId, num_workers: usize) -> usize { % num_workers } -async fn write_cache_from_config( +pub async fn write_cache_from_config( config: &MitoConfig, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, @@ -431,6 +452,7 @@ async fn write_cache_from_config( &config.write_cache_path, config.write_cache_size, config.write_cache_ttl, + Some(config.index_cache_percent), puffin_manager_factory, intermediate_manager, ) @@ -476,6 +498,7 @@ impl WorkerStarter { fn start(self) -> Result { let regions = Arc::new(RegionMap::default()); let opening_regions = Arc::new(OpeningRegions::default()); + let catchup_regions = Arc::new(CatchupRegions::default()); let (sender, receiver) = mpsc::channel(self.config.worker_channel_size); let running = Arc::new(AtomicBool::new(true)); @@ -485,6 +508,7 @@ impl WorkerStarter { id: self.id, config: self.config.clone(), regions: regions.clone(), + catchup_regions: catchup_regions.clone(), dropping_regions: Arc::new(RegionMap::default()), opening_regions: opening_regions.clone(), sender: sender.clone(), @@ -498,7 +522,10 @@ impl WorkerStarter { ), purge_scheduler: self.purge_scheduler.clone(), write_buffer_manager: self.write_buffer_manager, - index_build_scheduler: IndexBuildScheduler::new(self.index_build_job_pool), + index_build_scheduler: IndexBuildScheduler::new( + self.index_build_job_pool, + self.config.max_background_index_builds, + ), flush_scheduler: FlushScheduler::new(self.flush_job_pool), compaction_scheduler: CompactionScheduler::new( self.compact_job_pool, @@ -534,6 +561,7 @@ impl WorkerStarter { id: self.id, regions, opening_regions, + catchup_regions, sender, handle: Mutex::new(Some(handle)), running, @@ -549,6 +577,8 @@ pub(crate) struct RegionWorker { regions: RegionMapRef, /// The opening regions. opening_regions: OpeningRegionsRef, + /// The catching up regions. + catchup_regions: CatchupRegionsRef, /// Request sender. sender: Sender, /// Handle to the worker thread. 
@@ -619,6 +649,11 @@ impl RegionWorker { self.opening_regions.is_region_exists(region_id) } + /// Returns true if the region is catching up. + fn is_region_catching_up(&self, region_id: RegionId) -> bool { + self.catchup_regions.is_region_exists(region_id) + } + /// Returns region of specific `region_id`. fn get_region(&self, region_id: RegionId) -> Option { self.regions.get_region(region_id) @@ -629,6 +664,12 @@ impl RegionWorker { pub(crate) fn opening_regions(&self) -> &OpeningRegionsRef { &self.opening_regions } + + #[cfg(test)] + /// Returns the [CatchupRegionsRef]. + pub(crate) fn catchup_regions(&self) -> &CatchupRegionsRef { + &self.catchup_regions + } } impl Drop for RegionWorker { @@ -726,6 +767,8 @@ struct RegionWorkerLoop { dropping_regions: RegionMapRef, /// Regions that are opening. opening_regions: OpeningRegionsRef, + /// Regions that are catching up. + catchup_regions: CatchupRegionsRef, /// Request sender. sender: Sender, /// Request receiver. @@ -961,6 +1004,9 @@ impl RegionWorkerLoop { ); } } + WorkerRequest::RemapManifests(req) => { + self.handle_remap_manifests_request(req); + } } } @@ -994,8 +1040,7 @@ impl RegionWorkerLoop { continue; } DdlRequest::Flush(req) => { - self.handle_flush_request(ddl.region_id, req, ddl.sender) - .await; + self.handle_flush_request(ddl.region_id, req, ddl.sender); continue; } DdlRequest::Compact(req) => { @@ -1013,7 +1058,20 @@ impl RegionWorkerLoop { .await; continue; } - DdlRequest::Catchup(req) => self.handle_catchup_request(ddl.region_id, req).await, + DdlRequest::Catchup((req, wal_entry_receiver)) => { + self.handle_catchup_request(ddl.region_id, req, wal_entry_receiver, ddl.sender) + .await; + continue; + } + DdlRequest::EnterStaging(req) => { + self.handle_enter_staging_request( + ddl.region_id, + req.partition_expr, + ddl.sender, + ) + .await; + continue; + } }; ddl.sender.send(res); @@ -1048,6 +1106,9 @@ impl RegionWorkerLoop { BackgroundNotify::IndexBuildFinished(req) => { self.handle_index_build_finished(region_id, req).await } + BackgroundNotify::IndexBuildStopped(req) => { + self.handle_index_build_stopped(region_id, req).await + } BackgroundNotify::IndexBuildFailed(req) => { self.handle_index_build_failed(region_id, req).await } @@ -1059,6 +1120,7 @@ impl RegionWorkerLoop { BackgroundNotify::RegionChange(req) => { self.handle_manifest_region_change_result(req).await } + BackgroundNotify::EnterStaging(req) => self.handle_enter_staging_result(req).await, BackgroundNotify::RegionEdit(req) => self.handle_region_edit_result(req).await, } } @@ -1220,10 +1282,17 @@ impl WorkerListener { } } - pub(crate) async fn on_index_build_success(&self, _region_file_id: RegionFileId) { + pub(crate) async fn on_enter_staging_result_begin(&self, _region_id: RegionId) { #[cfg(any(test, feature = "test"))] if let Some(listener) = &self.listener { - listener.on_index_build_success(_region_file_id).await; + listener.on_enter_staging_result_begin(_region_id).await; + } + } + + pub(crate) async fn on_index_build_finish(&self, _region_file_id: RegionFileId) { + #[cfg(any(test, feature = "test"))] + if let Some(listener) = &self.listener { + listener.on_index_build_finish(_region_file_id).await; } } @@ -1233,6 +1302,13 @@ impl WorkerListener { listener.on_index_build_begin(_region_file_id).await; } } + + pub(crate) async fn on_index_build_abort(&self, _region_file_id: RegionFileId) { + #[cfg(any(test, feature = "test"))] + if let Some(listener) = &self.listener { + listener.on_index_build_abort(_region_file_id).await; + } + } } #[cfg(test)] diff --git 
a/src/mito2/src/worker/handle_alter.rs b/src/mito2/src/worker/handle_alter.rs index e02c4cd33c..a8a4a3f46c 100644 --- a/src/mito2/src/worker/handle_alter.rs +++ b/src/mito2/src/worker/handle_alter.rs @@ -21,7 +21,7 @@ use common_base::readable_size::ReadableSize; use common_telemetry::info; use common_telemetry::tracing::warn; use humantime_serde::re::humantime; -use snafu::ResultExt; +use snafu::{ResultExt, ensure}; use store_api::metadata::{ InvalidSetRegionOptionRequestSnafu, MetadataError, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, @@ -35,9 +35,10 @@ use crate::flush::FlushReason; use crate::manifest::action::RegionChange; use crate::region::MitoRegionRef; use crate::region::options::CompactionOptions::Twcs; -use crate::region::options::TwcsOptions; +use crate::region::options::{RegionOptions, TwcsOptions}; use crate::region::version::VersionRef; use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest}; +use crate::sst::FormatType; use crate::worker::RegionWorkerLoop; impl RegionWorkerLoop { @@ -57,36 +58,38 @@ impl RegionWorkerLoop { info!("Try to alter region: {}, request: {:?}", region_id, request); - // Get the version before alter. - let version = region.version(); + // Gets the version before alter. + let mut version = region.version(); // fast path for memory state changes like options. - match request.kind { - AlterKind::SetRegionOptions { options } => { - match self.handle_alter_region_options(region, version, options) { - Ok(_) => sender.send(Ok(0)), - Err(e) => sender.send(Err(e).context(InvalidMetadataSnafu)), - } - return; - } + let set_options = match &request.kind { + AlterKind::SetRegionOptions { options } => options.clone(), AlterKind::UnsetRegionOptions { keys } => { // Converts the keys to SetRegionOption. // // It passes an empty string to achieve the purpose of unset - match self.handle_alter_region_options( - region, - version, - keys.iter().map(Into::into).collect(), - ) { - Ok(_) => sender.send(Ok(0)), - Err(e) => sender.send(Err(e).context(InvalidMetadataSnafu)), - } - return; + keys.iter().map(Into::into).collect() + } + _ => Vec::new(), + }; + if !set_options.is_empty() { + match self.handle_alter_region_options_fast(®ion, version, set_options) { + Ok(new_version) => { + let Some(new_version) = new_version else { + // We don't have options to alter after flush. + sender.send(Ok(0)); + return; + }; + version = new_version; + } + Err(e) => { + sender.send(Err(e).context(InvalidMetadataSnafu)); + return; + } } - _ => {} } - // Validate request. + // Validates request. if let Err(e) = request.validate(&version.metadata) { // Invalid request. sender.send(Err(e).context(InvalidRegionRequestSnafu)); @@ -110,7 +113,13 @@ impl RegionWorkerLoop { info!("Flush region: {} before alteration", region_id); // Try to submit a flush task. 
- let task = self.new_flush_task(®ion, FlushReason::Alter, None, self.config.clone()); + let task = self.new_flush_task( + ®ion, + FlushReason::Alter, + None, + self.config.clone(), + region.is_staging(), + ); if let Err(e) = self.flush_scheduler .schedule_flush(region.region_id, ®ion.version_control, task) @@ -132,14 +141,15 @@ impl RegionWorkerLoop { } info!( - "Try to alter region {}, version.metadata: {:?}, request: {:?}", - region_id, version.metadata, request, + "Try to alter region {}, version.metadata: {:?}, version.options: {:?}, request: {:?}", + region_id, version.metadata, version.options, request, ); - self.handle_alter_region_metadata(region, version, request, sender); + self.handle_alter_region_with_empty_memtable(region, version, request, sender); } - /// Handles region metadata changes. - fn handle_alter_region_metadata( + // TODO(yingwen): Optional new options and sst format. + /// Handles region metadata and format changes when the region memtable is empty. + fn handle_alter_region_with_empty_memtable( &mut self, region: MitoRegionRef, version: VersionRef, @@ -147,6 +157,7 @@ impl RegionWorkerLoop { sender: OptionOutputTx, ) { let need_index = need_change_index(&request.kind); + let new_options = new_region_options_on_empty_memtable(&version.options, &request.kind); let new_meta = match metadata_after_alteration(&version.metadata, request) { Ok(new_meta) => new_meta, Err(e) => { @@ -157,19 +168,30 @@ impl RegionWorkerLoop { // Persist the metadata to region's manifest. let change = RegionChange { metadata: new_meta, - sst_format: region.sst_format(), + sst_format: new_options + .as_ref() + .unwrap_or(&version.options) + .sst_format + .unwrap_or_default(), }; - self.handle_manifest_region_change(region, change, need_index, sender); + self.handle_manifest_region_change(region, change, need_index, new_options, sender); } /// Handles requests that changes region options, like TTL. It only affects memory state /// since changes are persisted in the `DatanodeTableValue` in metasrv. - fn handle_alter_region_options( + /// + /// If the options require empty memtable, it only does validation. + /// + /// Returns a new version with the updated options if it needs further alteration. + fn handle_alter_region_options_fast( &mut self, - region: MitoRegionRef, + region: &MitoRegionRef, version: VersionRef, options: Vec, - ) -> std::result::Result<(), MetadataError> { + ) -> std::result::Result, MetadataError> { + assert!(!options.is_empty()); + + let mut all_options_altered = true; let mut current_options = version.options.clone(); for option in options { match option { @@ -190,13 +212,68 @@ impl RegionWorkerLoop { region.region_id, )?; } + SetRegionOption::Format(format_str) => { + let new_format = format_str.parse::().map_err(|_| { + store_api::metadata::InvalidRegionRequestSnafu { + region_id: region.region_id, + err: format!("Invalid format type: {}", format_str), + } + .build() + })?; + // If the format is unchanged, we also consider the option is altered. + if new_format != current_options.sst_format.unwrap_or_default() { + all_options_altered = false; + + // Validates the format type. 
+ ensure!( + new_format == FormatType::Flat, + store_api::metadata::InvalidRegionRequestSnafu { + region_id: region.region_id, + err: "Only allow changing format type to flat", + } + ); + } + } } } region.version_control.alter_options(current_options); - Ok(()) + if all_options_altered { + Ok(None) + } else { + Ok(Some(region.version())) + } } } +/// Returns the new region options if there are updates to the options. +fn new_region_options_on_empty_memtable( + current_options: &RegionOptions, + kind: &AlterKind, +) -> Option { + let AlterKind::SetRegionOptions { options } = kind else { + return None; + }; + + if options.is_empty() { + return None; + } + + let mut current_options = current_options.clone(); + for option in options { + match option { + SetRegionOption::Ttl(_) | SetRegionOption::Twsc(_, _) => (), + SetRegionOption::Format(format_str) => { + // Safety: handle_alter_region_options_fast() has validated this. + let new_format = format_str.parse::().unwrap(); + assert_eq!(FormatType::Flat, new_format); + + current_options.sst_format = Some(new_format); + } + } + } + Some(current_options) +} + /// Creates a metadata after applying the alter `request` to the old `metadata`. /// /// Returns an error if the `request` is invalid. diff --git a/src/mito2/src/worker/handle_catchup.rs b/src/mito2/src/worker/handle_catchup.rs index caabb6ae55..8ba8b75b29 100644 --- a/src/mito2/src/worker/handle_catchup.rs +++ b/src/mito2/src/worker/handle_catchup.rs @@ -17,17 +17,18 @@ use std::sync::Arc; use common_telemetry::tracing::warn; -use common_telemetry::{debug, info}; -use snafu::ensure; +use common_telemetry::{debug, error, info}; use store_api::logstore::LogStore; use store_api::region_engine::{RegionRole, SettableRegionRoleState}; -use store_api::region_request::{AffectedRows, RegionCatchupRequest}; +use store_api::region_request::RegionCatchupRequest; use store_api::storage::RegionId; -use tokio::time::Instant; use crate::error::{self, Result}; use crate::region::MitoRegion; -use crate::region::opener::{RegionOpener, replay_memtable}; +use crate::region::catchup::RegionCatchupTask; +use crate::region::opener::RegionOpener; +use crate::request::OptionOutputTx; +use crate::wal::entry_distributor::WalEntryReceiver; use crate::worker::RegionWorkerLoop; impl RegionWorkerLoop { @@ -35,16 +36,110 @@ impl RegionWorkerLoop { &mut self, region_id: RegionId, request: RegionCatchupRequest, - ) -> Result { + entry_receiver: Option, + sender: OptionOutputTx, + ) { let Some(region) = self.regions.get_region(region_id) else { - return error::RegionNotFoundSnafu { region_id }.fail(); + sender.send(Err(error::RegionNotFoundSnafu { region_id }.build())); + return; }; if region.is_writable() { debug!("Region {region_id} is writable, skip catchup"); - return Ok(0); + sender.send(Ok(0)); + return; } - // Note: Currently, We protect the split brain by ensuring the mutable table is empty. + + if self.catchup_regions.is_region_exists(region_id) { + warn!("Region {region_id} under catching up"); + sender.send(Err(error::RegionBusySnafu { region_id }.build())); + return; + } + + // If the memtable is not empty or the manifest has been updated, we need to reopen the region. 
+ let region = match self.reopen_region_if_needed(region).await { + Ok(region) => region, + Err(e) => { + sender.send(Err(e)); + return; + } + }; + + self.catchup_regions.insert_region(region_id); + let catchup_regions = self.catchup_regions.clone(); + let wal = self.wal.clone(); + let allow_stale_entries = self.config.allow_stale_entries; + common_runtime::spawn_global(async move { + let mut task = RegionCatchupTask::new(region.clone(), wal, allow_stale_entries) + .with_entry_receiver(entry_receiver) + .with_expected_last_entry_id(request.entry_id) + .with_location_id(request.location_id) + .with_replay_checkpoint_entry_id(request.checkpoint.map(|c| c.entry_id)); + + match task.run().await { + Ok(_) => { + if request.set_writable { + region.set_role(RegionRole::Leader); + // Finalize leadership: persist backfilled metadata. + if let Err(err) = region + .set_role_state_gracefully(SettableRegionRoleState::Leader) + .await + { + error!(err; "Failed to set region {region_id} to leader"); + } + } + sender.send(Ok(0)); + catchup_regions.remove_region(region_id); + } + Err(err) => { + error!(err; "Failed to catchup region {region_id}"); + sender.send(Err(err)); + catchup_regions.remove_region(region_id); + } + } + }); + } + + /// Reopens a region. + pub(crate) async fn reopen_region( + &mut self, + region: &Arc, + ) -> Result> { + let region_id = region.region_id; + let manifest_version = region.manifest_ctx.manifest_version().await; + let flushed_entry_id = region.version_control.current().last_entry_id; + info!( + "Reopening the region: {region_id}, manifest version: {manifest_version}, flushed entry id: {flushed_entry_id}" + ); + let reopened_region = RegionOpener::new( + region_id, + region.table_dir(), + region.access_layer.path_type(), + self.memtable_builder_provider.clone(), + self.object_store_manager.clone(), + self.purge_scheduler.clone(), + self.puffin_manager_factory.clone(), + self.intermediate_manager.clone(), + self.time_provider.clone(), + self.file_ref_manager.clone(), + self.partition_expr_fetcher.clone(), + ) + .cache(Some(self.cache_manager.clone())) + .options(region.version().options.clone())? + .skip_wal_replay(true) + .open(&self.config, &self.wal) + .await?; + debug_assert!(!reopened_region.is_writable()); + self.regions.insert_region(reopened_region.clone()); + + Ok(reopened_region) + } + + async fn reopen_region_if_needed( + &mut self, + region: Arc, + ) -> Result> { + // Note: Currently, We protect the split brain by ensuring the memtable table is empty. // It's expensive to execute catch-up requests without `set_writable=true` multiple times. 
let version = region.version(); let is_empty_memtable = version.memtables.is_empty(); @@ -64,121 +159,6 @@ impl RegionWorkerLoop { region }; - if region.provider.is_remote_wal() { - let flushed_entry_id = region.version_control.current().last_entry_id; - let replay_from_entry_id = request - .checkpoint - .map(|c| c.entry_id) - .unwrap_or_default() - .max(flushed_entry_id); - info!( - "Trying to replay memtable for region: {region_id}, provider: {:?}, replay from entry id: {replay_from_entry_id}, flushed entry id: {flushed_entry_id}", - region.provider - ); - let timer = Instant::now(); - let wal_entry_reader = - self.wal - .wal_entry_reader(®ion.provider, region_id, request.location_id); - let on_region_opened = self.wal.on_region_opened(); - let last_entry_id = replay_memtable( - ®ion.provider, - wal_entry_reader, - region_id, - replay_from_entry_id, - ®ion.version_control, - self.config.allow_stale_entries, - on_region_opened, - ) - .await?; - info!( - "Elapsed: {:?}, region: {region_id}, provider: {:?} catchup finished. replay from entry id: {replay_from_entry_id}, flushed entry id: {flushed_entry_id}, last entry id: {last_entry_id}, expected: {:?}.", - timer.elapsed(), - region.provider, - request.entry_id - ); - if let Some(expected_last_entry_id) = request.entry_id { - ensure!( - // The replayed last entry id may be greater than the `expected_last_entry_id`. - last_entry_id >= expected_last_entry_id, - error::UnexpectedSnafu { - reason: format!( - "failed to set region {} to writable, it was expected to replayed to {}, but actually replayed to {}", - region_id, expected_last_entry_id, last_entry_id, - ), - } - ) - } - } else { - let version = region.version_control.current(); - let mut flushed_entry_id = version.last_entry_id; - - let latest_entry_id = self - .wal - .store() - .latest_entry_id(®ion.provider) - .unwrap_or_default(); - warn!( - "Skips to replay memtable for region: {}, flushed entry id: {}, latest entry id: {}", - region.region_id, flushed_entry_id, latest_entry_id - ); - - if latest_entry_id > flushed_entry_id { - warn!( - "Found latest entry id is greater than flushed entry id, using latest entry id as flushed entry id, region: {}, latest entry id: {}, flushed entry id: {}", - region_id, latest_entry_id, flushed_entry_id - ); - flushed_entry_id = latest_entry_id; - region.version_control.set_entry_id(flushed_entry_id); - } - let on_region_opened = self.wal.on_region_opened(); - on_region_opened(region_id, flushed_entry_id, ®ion.provider).await?; - } - - if request.set_writable { - region.set_role(RegionRole::Leader); - // Finalize leadership: persist backfilled metadata. - region - .set_role_state_gracefully(SettableRegionRoleState::Leader) - .await?; - } - - Ok(0) - } - - /// Reopens a region. 
- pub(crate) async fn reopen_region( - &mut self, - region: &Arc, - ) -> Result> { - let region_id = region.region_id; - let manifest_version = region.manifest_ctx.manifest_version().await; - let flushed_entry_id = region.version_control.current().last_entry_id; - info!( - "Reopening the region: {region_id}, manifest version: {manifest_version}, flushed entry id: {flushed_entry_id}" - ); - let reopened_region = Arc::new( - RegionOpener::new( - region_id, - region.table_dir(), - region.access_layer.path_type(), - self.memtable_builder_provider.clone(), - self.object_store_manager.clone(), - self.purge_scheduler.clone(), - self.puffin_manager_factory.clone(), - self.intermediate_manager.clone(), - self.time_provider.clone(), - self.file_ref_manager.clone(), - self.partition_expr_fetcher.clone(), - ) - .cache(Some(self.cache_manager.clone())) - .options(region.version().options.clone())? - .skip_wal_replay(true) - .open(&self.config, &self.wal) - .await?, - ); - debug_assert!(!reopened_region.is_writable()); - self.regions.insert_region(reopened_region.clone()); - - Ok(reopened_region) + Ok(region) } } diff --git a/src/mito2/src/worker/handle_close.rs b/src/mito2/src/worker/handle_close.rs index 8e33fcb1eb..1568ae0799 100644 --- a/src/mito2/src/worker/handle_close.rs +++ b/src/mito2/src/worker/handle_close.rs @@ -38,6 +38,8 @@ impl RegionWorkerLoop { self.flush_scheduler.on_region_closed(region_id); // Clean compaction status. self.compaction_scheduler.on_region_closed(region_id); + // clean index build status. + self.index_build_scheduler.on_region_closed(region_id).await; info!("Region {} closed, worker: {}", region_id, self.id); diff --git a/src/mito2/src/worker/handle_create.rs b/src/mito2/src/worker/handle_create.rs index 3c5f091a1a..9e812ba88f 100644 --- a/src/mito2/src/worker/handle_create.rs +++ b/src/mito2/src/worker/handle_create.rs @@ -14,8 +14,6 @@ //! Handling create request. -use std::sync::Arc; - use common_telemetry::info; use store_api::logstore::LogStore; use store_api::metadata::RegionMetadataBuilder; @@ -84,7 +82,7 @@ impl RegionWorkerLoop { self.region_count.inc(); // Insert the MitoRegion into the RegionMap. - self.regions.insert_region(Arc::new(region)); + self.regions.insert_region(region); Ok(0) } diff --git a/src/mito2/src/worker/handle_drop.rs b/src/mito2/src/worker/handle_drop.rs index fd90ef7f3e..84337bd9d0 100644 --- a/src/mito2/src/worker/handle_drop.rs +++ b/src/mito2/src/worker/handle_drop.rs @@ -83,11 +83,13 @@ where self.flush_scheduler.on_region_dropped(region_id); // Notifies compaction scheduler. self.compaction_scheduler.on_region_dropped(region_id); + // notifies index build scheduler. + self.index_build_scheduler + .on_region_dropped(region_id) + .await; // Marks region version as dropped - region - .version_control - .mark_dropped(®ion.memtable_builder); + region.version_control.mark_dropped(); info!( "Region {} is dropped logically, but some files are not deleted yet", region_id diff --git a/src/mito2/src/worker/handle_enter_staging.rs b/src/mito2/src/worker/handle_enter_staging.rs new file mode 100644 index 0000000000..6dee72525e --- /dev/null +++ b/src/mito2/src/worker/handle_enter_staging.rs @@ -0,0 +1,249 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Instant; + +use common_telemetry::{error, info, warn}; +use store_api::logstore::LogStore; +use store_api::region_request::EnterStagingRequest; +use store_api::storage::RegionId; + +use crate::error::{RegionNotFoundSnafu, Result, StagingPartitionExprMismatchSnafu}; +use crate::flush::FlushReason; +use crate::manifest::action::{RegionChange, RegionMetaAction, RegionMetaActionList}; +use crate::region::{MitoRegionRef, RegionLeaderState}; +use crate::request::{ + BackgroundNotify, DdlRequest, EnterStagingResult, OptionOutputTx, SenderDdlRequest, + WorkerRequest, WorkerRequestWithTime, +}; +use crate::worker::RegionWorkerLoop; + +impl RegionWorkerLoop { + pub(crate) async fn handle_enter_staging_request( + &mut self, + region_id: RegionId, + partition_expr: String, + mut sender: OptionOutputTx, + ) { + let Some(region) = self.regions.writable_region_or(region_id, &mut sender) else { + return; + }; + + // If the region is already in staging mode, verify the partition expr matches. + if region.is_staging() { + let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone(); + // If the partition expr mismatch, return error. + if staging_partition_expr.as_ref() != Some(&partition_expr) { + sender.send(Err(StagingPartitionExprMismatchSnafu { + manifest_expr: staging_partition_expr, + request_expr: partition_expr, + } + .build())); + return; + } + + // If the partition expr matches, return success. + sender.send(Ok(0)); + return; + } + + let version = region.version(); + if !version.memtables.is_empty() { + // If memtable is not empty, we can't enter staging directly and need to flush + // all memtables first. + info!("Flush region: {} before entering staging", region_id); + debug_assert!(!region.is_staging()); + let task = self.new_flush_task( + ®ion, + FlushReason::EnterStaging, + None, + self.config.clone(), + region.is_staging(), + ); + if let Err(e) = + self.flush_scheduler + .schedule_flush(region.region_id, ®ion.version_control, task) + { + // Unable to flush the region, send error to waiter. + sender.send(Err(e)); + return; + } + + // Safety: We have requested flush. + self.flush_scheduler + .add_ddl_request_to_pending(SenderDdlRequest { + region_id, + sender, + request: DdlRequest::EnterStaging(EnterStagingRequest { partition_expr }), + }); + + return; + } + + self.handle_enter_staging(region, partition_expr, sender); + } + + async fn enter_staging(region: &MitoRegionRef, partition_expr: String) -> Result<()> { + let now = Instant::now(); + // First step: clear all staging manifest files. + { + let mut manager = region.manifest_ctx.manifest_manager.write().await; + manager + .clear_staging_manifest_and_dir() + .await + .inspect_err(|e| { + error!( + e; + "Failed to clear staging manifest files for region {}", + region.region_id + ); + })?; + + info!( + "Cleared all staging manifest files for region {}, elapsed: {:?}", + region.region_id, + now.elapsed(), + ); + } + + // Second step: write new staging manifest. 
+ let mut new_meta = (*region.metadata()).clone(); + new_meta.partition_expr = Some(partition_expr.clone()); + let sst_format = region.version().options.sst_format.unwrap_or_default(); + let change = RegionChange { + metadata: Arc::new(new_meta), + sst_format, + }; + let action_list = RegionMetaActionList::with_action(RegionMetaAction::Change(change)); + region + .manifest_ctx + .update_manifest(RegionLeaderState::EnteringStaging, action_list, true) + .await?; + + Ok(()) + } + + fn handle_enter_staging( + &self, + region: MitoRegionRef, + partition_expr: String, + sender: OptionOutputTx, + ) { + if let Err(e) = region.set_entering_staging() { + sender.send(Err(e)); + return; + } + + let listener = self.listener.clone(); + let request_sender = self.sender.clone(); + common_runtime::spawn_global(async move { + let now = Instant::now(); + let result = Self::enter_staging(®ion, partition_expr.clone()).await; + match result { + Ok(_) => { + info!( + "Created staging manifest for region {}, elapsed: {:?}", + region.region_id, + now.elapsed(), + ); + } + Err(ref e) => { + // Unset the staging manifest + region + .manifest_ctx + .manifest_manager + .write() + .await + .unset_staging_manifest(); + error!( + "Failed to create staging manifest for region {}: {:?}, elapsed: {:?}", + region.region_id, + e, + now.elapsed(), + ); + } + } + + let notify = WorkerRequest::Background { + region_id: region.region_id, + notify: BackgroundNotify::EnterStaging(EnterStagingResult { + region_id: region.region_id, + sender, + result, + partition_expr, + }), + }; + listener + .on_enter_staging_result_begin(region.region_id) + .await; + + if let Err(res) = request_sender + .send(WorkerRequestWithTime::new(notify)) + .await + { + warn!( + "Failed to send enter staging result back to the worker, region_id: {}, res: {:?}", + region.region_id, res + ); + } + }); + } + + /// Handles enter staging result. + pub(crate) async fn handle_enter_staging_result( + &mut self, + enter_staging_result: EnterStagingResult, + ) { + let region = match self.regions.get_region(enter_staging_result.region_id) { + Some(region) => region, + None => { + self.reject_region_stalled_requests(&enter_staging_result.region_id); + enter_staging_result.sender.send( + RegionNotFoundSnafu { + region_id: enter_staging_result.region_id, + } + .fail(), + ); + return; + } + }; + + if enter_staging_result.result.is_ok() { + info!( + "Updating region {} staging partition expr to {}", + region.region_id, enter_staging_result.partition_expr + ); + Self::update_region_staging_partition_expr( + ®ion, + enter_staging_result.partition_expr, + ); + region.switch_state_to_staging(RegionLeaderState::EnteringStaging); + } else { + region.switch_state_to_writable(RegionLeaderState::EnteringStaging); + } + enter_staging_result + .sender + .send(enter_staging_result.result.map(|_| 0)); + // Handles the stalled requests. 
+ self.handle_region_stalled_requests(&enter_staging_result.region_id) + .await; + } + + fn update_region_staging_partition_expr(region: &MitoRegionRef, partition_expr: String) { + let mut staging_partition_expr = region.staging_partition_expr.lock().unwrap(); + debug_assert!(staging_partition_expr.is_none()); + *staging_partition_expr = Some(partition_expr); + } +} diff --git a/src/mito2/src/worker/handle_flush.rs b/src/mito2/src/worker/handle_flush.rs index 04dbb4ae78..8b9e750ffb 100644 --- a/src/mito2/src/worker/handle_flush.rs +++ b/src/mito2/src/worker/handle_flush.rs @@ -76,8 +76,13 @@ impl RegionWorkerLoop { if region.last_flush_millis() < min_last_flush_time { // If flush time of this region is earlier than `min_last_flush_time`, we can flush this region. - let task = - self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone()); + let task = self.new_flush_task( + region, + FlushReason::EngineFull, + None, + self.config.clone(), + region.is_staging(), + ); self.flush_scheduler.schedule_flush( region.region_id, ®ion.version_control, @@ -91,8 +96,13 @@ impl RegionWorkerLoop { if let Some(region) = max_mem_region && !self.flush_scheduler.is_flush_requested(region.region_id) { - let task = - self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone()); + let task = self.new_flush_task( + region, + FlushReason::EngineFull, + None, + self.config.clone(), + region.is_staging(), + ); self.flush_scheduler .schedule_flush(region.region_id, ®ion.version_control, task)?; } @@ -107,6 +117,7 @@ impl RegionWorkerLoop { reason: FlushReason, row_group_size: Option, engine_config: Arc, + is_staging: bool, ) -> RegionFlushTask { RegionFlushTask { region_id: region.region_id, @@ -121,13 +132,14 @@ impl RegionWorkerLoop { manifest_ctx: region.manifest_ctx.clone(), index_options: region.version().options.index_options.clone(), flush_semaphore: self.flush_semaphore.clone(), + is_staging, } } } impl RegionWorkerLoop { /// Handles manual flush request. - pub(crate) async fn handle_flush_request( + pub(crate) fn handle_flush_request( &mut self, region_id: RegionId, request: RegionFlushRequest, @@ -147,8 +159,13 @@ impl RegionWorkerLoop { FlushReason::Manual }; - let mut task = - self.new_flush_task(®ion, reason, request.row_group_size, self.config.clone()); + let mut task = self.new_flush_task( + ®ion, + reason, + request.row_group_size, + self.config.clone(), + region.is_staging(), + ); task.push_sender(sender); if let Err(e) = self.flush_scheduler @@ -178,6 +195,7 @@ impl RegionWorkerLoop { FlushReason::Periodically, None, self.config.clone(), + region.is_staging(), ); self.flush_scheduler.schedule_flush( region.region_id, @@ -208,11 +226,8 @@ impl RegionWorkerLoop { } }; - // Check if region is currently in staging mode - let is_staging = region.manifest_ctx.current_state() - == crate::region::RegionRoleState::Leader(crate::region::RegionLeaderState::Staging); - - if is_staging { + if request.is_staging { + // Skip the region metadata update. 
info!( "Skipping region metadata update for region {} in staging mode", region_id diff --git a/src/mito2/src/worker/handle_manifest.rs b/src/mito2/src/worker/handle_manifest.rs index be4bde6583..433c440639 100644 --- a/src/mito2/src/worker/handle_manifest.rs +++ b/src/mito2/src/worker/handle_manifest.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use common_telemetry::{info, warn}; use store_api::logstore::LogStore; +use store_api::metadata::RegionMetadataRef; use store_api::storage::RegionId; use crate::cache::CacheManagerRef; @@ -31,8 +32,11 @@ use crate::error::{RegionBusySnafu, RegionNotFoundSnafu, Result}; use crate::manifest::action::{ RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList, RegionTruncate, }; +use crate::memtable::MemtableBuilderProvider; use crate::metrics::WRITE_CACHE_INFLIGHT_DOWNLOAD; -use crate::region::version::VersionBuilder; +use crate::region::opener::{sanitize_region_options, version_builder_from_manifest}; +use crate::region::options::RegionOptions; +use crate::region::version::VersionControlRef; use crate::region::{MitoRegionRef, RegionLeaderState, RegionRoleState}; use crate::request::{ BackgroundNotify, BuildIndexRequest, OptionOutputTx, RegionChangeResult, RegionEditRequest, @@ -102,15 +106,12 @@ impl RegionWorkerLoop { }; if change_result.result.is_ok() { - // Apply the metadata to region's version. - region - .version_control - .alter_schema(change_result.new_meta, ®ion.memtable_builder); - - let version = region.version(); - info!( - "Region {} is altered, metadata is {:?}, options: {:?}", - region.region_id, version.metadata, version.options, + // Updates the region metadata and format. + Self::update_region_version( + ®ion.version_control, + change_result.new_meta, + change_result.new_options, + &self.memtable_builder_provider, ); } @@ -164,6 +165,10 @@ impl RegionWorkerLoop { } }; let version = region.version(); + let mut region_options = version.options.clone(); + let old_format = region_options.sst_format.unwrap_or_default(); + // Updates the region options with the manifest. + sanitize_region_options(&manifest, &mut region_options); if !version.memtables.is_empty() { let current = region.version_control.current(); warn!( @@ -171,23 +176,35 @@ impl RegionWorkerLoop { region.region_id, manifest.manifest_version, current.last_entry_id ); } - let region_options = version.options.clone(); + + // We should sanitize the region options before creating a new memtable. + let memtable_builder = if old_format != region_options.sst_format.unwrap_or_default() { + // Format changed, also needs to replace the memtable builder. + Some( + self.memtable_builder_provider + .builder_for_options(®ion_options), + ) + } else { + None + }; let new_mutable = Arc::new( region .version() .memtables .mutable - .new_with_part_duration(version.compaction_time_window), + .new_with_part_duration(version.compaction_time_window, memtable_builder), ); + // Here it assumes the leader has backfilled the partition_expr of the metadata. 
let metadata = manifest.metadata.clone(); - let version = VersionBuilder::new(metadata, new_mutable) - .add_files(region.file_purger.clone(), manifest.files.values().cloned()) - .flushed_entry_id(manifest.flushed_entry_id) - .flushed_sequence(manifest.flushed_sequence) - .truncated_entry_id(manifest.truncated_entry_id) - .compaction_time_window(manifest.compaction_time_window) - .options(region_options) - .build(); + + let version_builder = version_builder_from_manifest( + &manifest, + metadata, + region.file_purger.clone(), + new_mutable, + region_options, + ); + let version = version_builder.build(); region.version_control.overwrite_current(Arc::new(version)); let updated = manifest.manifest_version > original_manifest_version; @@ -249,6 +266,8 @@ impl RegionWorkerLoop { sender, edit, result, + // we always need to restore region state after region edit + update_region_state: true, }), }; @@ -292,8 +311,10 @@ impl RegionWorkerLoop { ); } - // Sets the region as writable. - region.switch_state_to_writable(RegionLeaderState::Editing); + if edit_result.update_region_state { + // Sets the region as writable. + region.switch_state_to_writable(RegionLeaderState::Editing); + } let _ = edit_result.sender.send(edit_result.result); @@ -325,6 +346,7 @@ impl RegionWorkerLoop { let request_sender = self.sender.clone(); let manifest_ctx = region.manifest_ctx.clone(); + let is_staging = region.is_staging(); // Updates manifest in background. common_runtime::spawn_global(async move { @@ -333,7 +355,7 @@ impl RegionWorkerLoop { RegionMetaActionList::with_action(RegionMetaAction::Truncate(truncate.clone())); let result = manifest_ctx - .update_manifest(RegionLeaderState::Truncating, action_list) + .update_manifest(RegionLeaderState::Truncating, action_list, is_staging) .await .map(|_| ()); @@ -360,6 +382,7 @@ impl RegionWorkerLoop { region: MitoRegionRef, change: RegionChange, need_index: bool, + new_options: Option, sender: OptionOutputTx, ) { // Marks the region as altering. @@ -369,6 +392,7 @@ impl RegionWorkerLoop { } let listener = self.listener.clone(); let request_sender = self.sender.clone(); + let is_staging = region.is_staging(); // Now the region is in altering state. common_runtime::spawn_global(async move { let new_meta = change.metadata.clone(); @@ -376,7 +400,7 @@ impl RegionWorkerLoop { let result = region .manifest_ctx - .update_manifest(RegionLeaderState::Altering, action_list) + .update_manifest(RegionLeaderState::Altering, action_list, is_staging) .await .map(|_| ()); let notify = WorkerRequest::Background { @@ -387,6 +411,7 @@ impl RegionWorkerLoop { result, new_meta, need_index, + new_options, }), }; listener @@ -404,6 +429,32 @@ impl RegionWorkerLoop { } }); } + + fn update_region_version( + version_control: &VersionControlRef, + new_meta: RegionMetadataRef, + new_options: Option, + memtable_builder_provider: &MemtableBuilderProvider, + ) { + let options_changed = new_options.is_some(); + let region_id = new_meta.region_id; + if let Some(new_options) = new_options { + // Needs to update the region with new format and memtables. + // Creates a new memtable builder for the new options as it may change the memtable type. + let new_memtable_builder = memtable_builder_provider.builder_for_options(&new_options); + version_control.alter_schema_and_format(new_meta, new_options, new_memtable_builder); + } else { + // Only changes the schema. 
+ version_control.alter_schema(new_meta); + } + + let version_data = version_control.current(); + let version = version_data.version; + info!( + "Region {} is altered, metadata is {:?}, options: {:?}, options_changed: {}", + region_id, version.metadata, version.options, options_changed, + ); + } } /// Checks the edit, writes and applies it. @@ -414,6 +465,7 @@ async fn edit_region( listener: WorkerListener, ) -> Result<()> { let region_id = region.region_id; + let is_staging = region.is_staging(); if let Some(write_cache) = cache_manager.write_cache() { for file_meta in &edit.files_to_add { let write_cache = write_cache.clone(); @@ -427,8 +479,11 @@ async fn edit_region( let is_index_exist = file_meta.exists_index(); let index_file_size = file_meta.index_file_size(); - let index_file_index_key = - IndexKey::new(region_id, file_meta.file_id, FileType::Puffin); + let index_file_index_key = IndexKey::new( + region_id, + file_meta.index_file_id().file_id(), + FileType::Puffin, + ); let index_remote_path = location::index_file_path( layer.table_dir(), file_meta.file_id(), @@ -480,7 +535,7 @@ async fn edit_region( let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit)); region .manifest_ctx - .update_manifest(RegionLeaderState::Editing, action_list) + .update_manifest(RegionLeaderState::Editing, action_list, is_staging) .await .map(|_| ()) } diff --git a/src/mito2/src/worker/handle_open.rs b/src/mito2/src/worker/handle_open.rs index 4d71289c08..e50e166d47 100644 --- a/src/mito2/src/worker/handle_open.rs +++ b/src/mito2/src/worker/handle_open.rs @@ -15,6 +15,7 @@ //! Handling open request. use std::sync::Arc; +use std::time::Instant; use common_telemetry::info; use object_store::util::join_path; @@ -119,6 +120,7 @@ impl RegionWorkerLoop { } }; + let now = Instant::now(); let regions = self.regions.clone(); let wal = self.wal.clone(); let config = self.config.clone(); @@ -129,11 +131,16 @@ impl RegionWorkerLoop { common_runtime::spawn_global(async move { match opener.open(&config, &wal).await { Ok(region) => { - info!("Region {} is opened, worker: {}", region_id, worker_id); + info!( + "Region {} is opened, worker: {}, elapsed: {:?}", + region_id, + worker_id, + now.elapsed() + ); region_count.inc(); // Insert the Region into the RegionMap. 
- regions.insert_region(Arc::new(region)); + regions.insert_region(region); let senders = opening_regions.remove_sender(region_id); for sender in senders { diff --git a/src/mito2/src/worker/handle_rebuild_index.rs b/src/mito2/src/worker/handle_rebuild_index.rs index 71f9bc206f..5030cd77cd 100644 --- a/src/mito2/src/worker/handle_rebuild_index.rs +++ b/src/mito2/src/worker/handle_rebuild_index.rs @@ -22,9 +22,12 @@ use store_api::region_request::RegionBuildIndexRequest; use store_api::storage::{FileId, RegionId}; use tokio::sync::mpsc; +use crate::cache::CacheStrategy; use crate::error::Result; use crate::region::MitoRegionRef; -use crate::request::{BuildIndexRequest, IndexBuildFailed, IndexBuildFinished, OptionOutputTx}; +use crate::request::{ + BuildIndexRequest, IndexBuildFailed, IndexBuildFinished, IndexBuildStopped, OptionOutputTx, +}; use crate::sst::file::{FileHandle, RegionFileId}; use crate::sst::index::{ IndexBuildOutcome, IndexBuildTask, IndexBuildType, IndexerBuilderImpl, ResultMpscSender, @@ -71,6 +74,7 @@ impl RegionWorkerLoop { file_meta: file.meta_ref().clone(), reason: build_type, access_layer: access_layer.clone(), + listener: self.listener.clone(), manifest_ctx: region.manifest_ctx.clone(), write_cache: self.cache_manager.write_cache().cloned(), file_purger: file.file_purger(), @@ -81,7 +85,6 @@ impl RegionWorkerLoop { } /// Handles manual build index requests. - /// TODO(SNC123): Support admin function of manual index building later. pub(crate) async fn handle_build_index_request( &mut self, region_id: RegionId, @@ -122,10 +125,16 @@ impl RegionWorkerLoop { .collect(); let build_tasks = if request.file_metas.is_empty() { - // NOTE: Currently, rebuilding the index will reconstruct the index for all - // files in the region, which is a simplified approach and is not yet available for - // production use; further optimization is required. - all_files.values().cloned().collect::>() + // If no specific files are provided, find files whose index is inconsistent with the region metadata. + all_files + .values() + .filter(|file| { + !file + .meta_ref() + .is_index_consistent_with_region(&version.metadata.column_metadatas) + }) + .cloned() + .collect::>() } else { request .file_metas @@ -171,9 +180,7 @@ impl RegionWorkerLoop { ); let _ = self .index_build_scheduler - .schedule_build(®ion.version_control, task); - self.listener - .on_index_build_begin(RegionFileId::new(region_id, file_handle.meta_ref().file_id)) + .schedule_build(®ion.version_control, task) .await; } // Wait for all index build tasks to finish and notify the caller. @@ -205,14 +212,22 @@ impl RegionWorkerLoop { } }; + // Clean old puffin-related cache for all rebuilt files. + let cache_strategy = CacheStrategy::EnableAll(self.cache_manager.clone()); + for file_meta in &request.edit.files_to_add { + let region_file_id = RegionFileId::new(region_id, file_meta.file_id); + cache_strategy.evict_puffin_cache(region_file_id).await; + } + region.version_control.apply_edit( Some(request.edit.clone()), &[], region.file_purger.clone(), ); + for file_meta in &request.edit.files_to_add { self.listener - .on_index_build_success(RegionFileId::new(region_id, file_meta.file_id)) + .on_index_build_finish(RegionFileId::new(region_id, file_meta.file_id)) .await; } } @@ -223,6 +238,27 @@ impl RegionWorkerLoop { request: IndexBuildFailed, ) { error!(request.err; "Index build failed for region: {}", region_id); - // TODO(SNC123): Implement error handling logic after IndexBuildScheduler optimization. 
+ self.index_build_scheduler + .on_failure(region_id, request.err.clone()) + .await; + } + + pub(crate) async fn handle_index_build_stopped( + &mut self, + region_id: RegionId, + request: IndexBuildStopped, + ) { + let Some(region) = self.regions.get_region(region_id) else { + warn!( + "Region not found for index build stopped, region_id: {}", + region_id + ); + return; + }; + self.index_build_scheduler.on_task_stopped( + region_id, + request.file_id, + ®ion.version_control, + ); } } diff --git a/src/mito2/src/worker/handle_remap.rs b/src/mito2/src/worker/handle_remap.rs new file mode 100644 index 0000000000..3039e04a1c --- /dev/null +++ b/src/mito2/src/worker/handle_remap.rs @@ -0,0 +1,125 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::time::Instant; + +use common_error::ext::BoxedError; +use common_telemetry::info; +use futures::future::try_join_all; +use partition::expr::PartitionExpr; +use snafu::{OptionExt, ResultExt}; +use store_api::region_request::PathType; +use store_api::storage::RegionId; + +use crate::error::{FetchManifestsSnafu, InvalidRequestSnafu, MissingManifestSnafu, Result}; +use crate::manifest::action::RegionManifest; +use crate::region::MitoRegionRef; +use crate::region::opener::RegionMetadataLoader; +use crate::remap_manifest::RemapManifest; +use crate::request::RemapManifestsRequest; +use crate::sst::location::region_dir_from_table_dir; +use crate::worker::RegionWorkerLoop; + +impl RegionWorkerLoop { + pub(crate) fn handle_remap_manifests_request(&mut self, request: RemapManifestsRequest) { + let region_id = request.region_id; + let sender = request.sender; + let region = match self.regions.staging_region(region_id) { + Ok(region) => region, + Err(e) => { + let _ = sender.send(Err(e)); + return; + } + }; + + let same_table = request + .input_regions + .iter() + .map(|r| r.table_id()) + .all(|t| t == region_id.table_id()); + + if !same_table { + let _ = sender.send( + InvalidRequestSnafu { + region_id, + reason: "Input regions must be from the same table", + } + .fail(), + ); + return; + } + + let region_metadata_loader = + RegionMetadataLoader::new(self.config.clone(), self.object_store_manager.clone()); + common_runtime::spawn_global(async move { + let result = Self::fetch_and_remap_manifests( + region, + region_metadata_loader, + request.input_regions, + request.new_partition_exprs, + request.region_mapping, + ) + .await; + + let _ = sender.send(result); + }); + } + + async fn fetch_and_remap_manifests( + region: MitoRegionRef, + region_metadata_loader: RegionMetadataLoader, + input_regions: Vec, + new_partition_exprs: HashMap, + region_mapping: HashMap>, + ) -> Result> { + let mut tasks = Vec::with_capacity(input_regions.len()); + let region_options = region.version().options.clone(); + let table_dir = region.table_dir(); + + let now = Instant::now(); + for input_region in &input_regions { + let region_dir = region_dir_from_table_dir(table_dir, *input_region, PathType::Bare); 
+ let storage = region_options.storage.clone(); + let moved_region_metadata_loader = region_metadata_loader.clone(); + tasks.push(async move { + moved_region_metadata_loader + .load_manifest(®ion_dir, &storage) + .await + }); + } + + let results = try_join_all(tasks) + .await + .map_err(BoxedError::new) + .context(FetchManifestsSnafu)?; + let manifests = results + .into_iter() + .zip(input_regions) + .map(|(manifest_res, region_id)| { + let manifest = manifest_res.context(MissingManifestSnafu { region_id })?; + Ok((region_id, (*manifest).clone())) + }) + .collect::>>()?; + let mut mapper = RemapManifest::new(manifests, new_partition_exprs, region_mapping); + let remap_result = mapper.remap_manifests()?; + info!( + "Remap manifests cost: {:?}, region: {}", + now.elapsed(), + region.region_id + ); + + Ok(remap_result.new_manifests) + } +} diff --git a/src/mito2/src/worker/handle_truncate.rs b/src/mito2/src/worker/handle_truncate.rs index 16a1b5a59a..0867560a7b 100644 --- a/src/mito2/src/worker/handle_truncate.rs +++ b/src/mito2/src/worker/handle_truncate.rs @@ -129,7 +129,7 @@ impl RegionWorkerLoop { // Applies the truncate action to the region. region .version_control - .truncate(truncate_result.kind.clone(), ®ion.memtable_builder); + .truncate(truncate_result.kind.clone()); } Err(e) => { // Unable to truncate the region. @@ -142,6 +142,10 @@ impl RegionWorkerLoop { self.flush_scheduler.on_region_truncated(region_id); // Notifies compaction scheduler. self.compaction_scheduler.on_region_truncated(region_id); + // Notifies index build scheduler. + self.index_build_scheduler + .on_region_truncated(region_id) + .await; if let TruncateKind::All { truncated_entry_id, diff --git a/src/mito2/src/worker/handle_write.rs b/src/mito2/src/worker/handle_write.rs index e86aa67630..c338eef88f 100644 --- a/src/mito2/src/worker/handle_write.rs +++ b/src/mito2/src/worker/handle_write.rs @@ -241,6 +241,12 @@ impl RegionWorkerLoop { // No such region. continue; }; + #[cfg(test)] + debug!( + "Handling write request for region {}, state: {:?}", + region_id, + region.state() + ); match region.state() { RegionRoleState::Leader(RegionLeaderState::Writable) | RegionRoleState::Leader(RegionLeaderState::Staging) => { @@ -263,6 +269,16 @@ impl RegionWorkerLoop { self.stalled_requests.push(sender_req); continue; } + RegionRoleState::Leader(RegionLeaderState::EnteringStaging) => { + debug!( + "Region {} is entering staging, add request to pending writes", + region.region_id + ); + self.stalling_count.add(1); + WRITE_STALL_TOTAL.inc(); + self.stalled_requests.push(sender_req); + continue; + } state => { // The region is not writable. sender_req.sender.send( @@ -388,17 +404,14 @@ impl RegionWorkerLoop { let need_fill_missing_columns = region_ctx.version().metadata.schema_version != bulk_req.region_metadata.schema_version; - // Only fill missing columns if primary key is dense encoded. - if need_fill_missing_columns { - // todo(hl): support filling default columns - bulk_req.sender.send( - InvalidRequestSnafu { - region_id, - reason: "Schema mismatch", - } - .fail(), - ); - return; + // Fill missing columns if needed + if need_fill_missing_columns + && let Err(e) = bulk_req + .request + .fill_missing_columns(®ion_ctx.version().metadata) + { + bulk_req.sender.send(Err(e)); + continue; } // Collect requests by region. 
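The remap handler above fetches every input region's manifest concurrently and pairs the results back with their region ids. A minimal sketch of that pattern, with a hypothetical load_manifest and a plain String-backed Manifest standing in for the real manifest loader:

use std::collections::HashMap;

use futures::future::try_join_all;

struct Manifest(String);

async fn load_manifest(region_id: u64) -> Result<Manifest, String> {
    // Placeholder: the real loader reads the region manifest from object storage.
    Ok(Manifest(format!("manifest-of-region-{region_id}")))
}

async fn load_all(region_ids: Vec<u64>) -> Result<HashMap<u64, Manifest>, String> {
    // One future per input region; try_join_all awaits them together and keeps input order,
    // so zipping with the ids afterwards is safe. Any failure aborts the whole batch.
    let tasks = region_ids.iter().map(|id| load_manifest(*id));
    let manifests = try_join_all(tasks).await?;
    Ok(region_ids.into_iter().zip(manifests).collect())
}

fn main() {
    let manifests = futures::executor::block_on(load_all(vec![1, 2, 3])).unwrap();
    println!("{}", manifests[&1].0);
}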
diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 8a1a877567..cb02ca1499 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [features] services-memory = ["opendal/services-memory"] +testing = ["derive_builder"] [dependencies] bytes.workspace = true @@ -16,6 +17,7 @@ common-base.workspace = true common-error.workspace = true common-macro.workspace = true common-telemetry.workspace = true +derive_builder = { workspace = true, optional = true } futures.workspace = true humantime-serde.workspace = true lazy_static.workspace = true diff --git a/src/object-store/src/layers.rs b/src/object-store/src/layers.rs index 7b111927e2..00b18a70de 100644 --- a/src/object-store/src/layers.rs +++ b/src/object-store/src/layers.rs @@ -13,6 +13,8 @@ // limitations under the License. mod lru_cache; +#[cfg(feature = "testing")] +pub mod mock; pub use lru_cache::*; pub use opendal::layers::*; diff --git a/src/object-store/src/layers/mock.rs b/src/object-store/src/layers/mock.rs new file mode 100644 index 0000000000..0ee0f73b21 --- /dev/null +++ b/src/object-store/src/layers/mock.rs @@ -0,0 +1,217 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; +use std::sync::Arc; + +use derive_builder::Builder; +pub use oio::*; +pub use opendal::raw::{ + Access, Layer, LayeredAccess, OpDelete, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead, + RpWrite, oio, +}; +pub use opendal::{Buffer, Error, ErrorKind, Metadata, Result}; + +pub type MockWriterFactory = Arc oio::Writer + Send + Sync>; +pub type MockReaderFactory = Arc oio::Reader + Send + Sync>; +pub type MockListerFactory = Arc oio::Lister + Send + Sync>; +pub type MockDeleterFactory = Arc oio::Deleter + Send + Sync>; + +#[derive(Builder)] +pub struct MockLayer { + #[builder(setter(strip_option), default)] + writer_factory: Option, + #[builder(setter(strip_option), default)] + reader_factory: Option, + #[builder(setter(strip_option), default)] + lister_factory: Option, + #[builder(setter(strip_option), default)] + deleter_factory: Option, +} + +impl Clone for MockLayer { + fn clone(&self) -> Self { + Self { + writer_factory: self.writer_factory.clone(), + reader_factory: self.reader_factory.clone(), + lister_factory: self.lister_factory.clone(), + deleter_factory: self.deleter_factory.clone(), + } + } +} + +impl Layer for MockLayer { + type LayeredAccess = MockAccessor; + + fn layer(&self, inner: A) -> Self::LayeredAccess { + MockAccessor { + inner, + writer_factory: self.writer_factory.clone(), + reader_factory: self.reader_factory.clone(), + lister_factory: self.lister_factory.clone(), + deleter_factory: self.deleter_factory.clone(), + } + } +} + +pub struct MockAccessor { + inner: A, + writer_factory: Option, + reader_factory: Option, + lister_factory: Option, + deleter_factory: Option, +} + +impl Debug for MockAccessor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + 
f.debug_struct("MockAccessor") + .field("inner", &self.inner) + .finish() + } +} + +pub struct MockReader { + inner: oio::Reader, +} + +impl oio::Read for MockReader { + async fn read(&mut self) -> Result { + self.inner.read().await + } +} + +pub struct MockWriter { + inner: oio::Writer, +} + +impl oio::Write for MockWriter { + async fn write(&mut self, bs: Buffer) -> Result<()> { + self.inner.write(bs).await + } + + async fn close(&mut self) -> Result { + self.inner.close().await + } + + async fn abort(&mut self) -> Result<()> { + self.inner.abort().await + } +} + +pub struct MockLister { + inner: oio::Lister, +} + +impl oio::List for MockLister { + async fn next(&mut self) -> Result> { + self.inner.next().await + } +} + +pub struct MockDeleter { + inner: oio::Deleter, +} + +impl oio::Delete for MockDeleter { + fn delete(&mut self, path: &str, args: OpDelete) -> Result<()> { + self.inner.delete(path, args) + } + + async fn flush(&mut self) -> Result { + self.inner.flush().await + } +} + +impl LayeredAccess for MockAccessor { + type Inner = A; + type Reader = MockReader; + type Writer = MockWriter; + type Lister = MockLister; + type Deleter = MockDeleter; + + fn inner(&self) -> &Self::Inner { + &self.inner + } + + async fn read(&self, path: &str, args: OpRead) -> Result<(RpRead, Self::Reader)> { + if let Some(reader_factory) = self.reader_factory.as_ref() { + let (rp_read, reader) = self.inner.read(path, args.clone()).await?; + let reader = reader_factory(path, args, Box::new(reader)); + Ok((rp_read, MockReader { inner: reader })) + } else { + self.inner.read(path, args).await.map(|(rp_read, reader)| { + ( + rp_read, + MockReader { + inner: Box::new(reader), + }, + ) + }) + } + } + + async fn write(&self, path: &str, args: OpWrite) -> Result<(RpWrite, Self::Writer)> { + if let Some(writer_factory) = self.writer_factory.as_ref() { + let (rp_write, writer) = self.inner.write(path, args.clone()).await?; + let writer = writer_factory(path, args, Box::new(writer)); + Ok((rp_write, MockWriter { inner: writer })) + } else { + self.inner + .write(path, args) + .await + .map(|(rp_write, writer)| { + ( + rp_write, + MockWriter { + inner: Box::new(writer), + }, + ) + }) + } + } + + async fn delete(&self) -> Result<(RpDelete, Self::Deleter)> { + if let Some(deleter_factory) = self.deleter_factory.as_ref() { + let (rp_delete, deleter) = self.inner.delete().await?; + let deleter = deleter_factory(Box::new(deleter)); + Ok((rp_delete, MockDeleter { inner: deleter })) + } else { + self.inner.delete().await.map(|(rp_delete, deleter)| { + ( + rp_delete, + MockDeleter { + inner: Box::new(deleter), + }, + ) + }) + } + } + + async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> { + if let Some(lister_factory) = self.lister_factory.as_ref() { + let (rp_list, lister) = self.inner.list(path, args.clone()).await?; + let lister = lister_factory(path, args, Box::new(lister)); + Ok((rp_list, MockLister { inner: lister })) + } else { + self.inner.list(path, args).await.map(|(rp_list, lister)| { + ( + rp_list, + MockLister { + inner: Box::new(lister), + }, + ) + }) + } + } +} diff --git a/src/object-store/src/util.rs b/src/object-store/src/util.rs index 00858eb74a..a402b8237c 100644 --- a/src/object-store/src/util.rs +++ b/src/object-store/src/util.rs @@ -16,6 +16,7 @@ use std::fmt::Display; use std::path; use std::time::Duration; +use common_error::root_source; use common_telemetry::{debug, error, info, warn}; use opendal::layers::{ LoggingInterceptor, LoggingLayer, RetryInterceptor, 
RetryLayer, TracingLayer, @@ -174,11 +175,12 @@ impl LoggingInterceptor for DefaultLoggingInterceptor { err: Option<&opendal::Error>, ) { if let Some(err) = err { + let root = root_source(err); // Print error if it's unexpected, otherwise in error. if err.kind() == ErrorKind::Unexpected { error!( target: LOGGING_TARGET, - "service={} name={} {}: {operation} {message} {err:#?}", + "service={} name={} {}: {operation} {message} {err:#?}, root={root:#?}", info.scheme(), info.name(), LoggingContext(context), @@ -186,7 +188,7 @@ impl LoggingInterceptor for DefaultLoggingInterceptor { } else { debug!( target: LOGGING_TARGET, - "service={} name={} {}: {operation} {message} {err}", + "service={} name={} {}: {operation} {message} {err}, root={root:?}", info.scheme(), info.name(), LoggingContext(context), diff --git a/src/operator/Cargo.toml b/src/operator/Cargo.toml index d883c15689..82ddb12e20 100644 --- a/src/operator/Cargo.toml +++ b/src/operator/Cargo.toml @@ -36,6 +36,7 @@ common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true common-sql.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true datafusion.workspace = true @@ -46,6 +47,7 @@ file-engine.workspace = true futures.workspace = true futures-util.workspace = true humantime.workspace = true +itertools.workspace = true jsonb.workspace = true lazy_static.workspace = true meta-client.workspace = true diff --git a/src/operator/src/bulk_insert.rs b/src/operator/src/bulk_insert.rs index 15b92958b4..a06cc9503c 100644 --- a/src/operator/src/bulk_insert.rs +++ b/src/operator/src/bulk_insert.rs @@ -66,6 +66,7 @@ impl Inserter { return Ok(0); } + // TODO(yingwen): Fill record batch impure default values. // notify flownode to update dirty timestamps if flow is configured. 
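The logging change above appends the error's root cause to each opendal error log. Assuming common_error::root_source simply walks the std::error::Error::source() chain, a self-contained sketch of such a helper looks like this:

use std::error::Error;

fn root_source(err: &(dyn Error + 'static)) -> &(dyn Error + 'static) {
    // Follow source() until the innermost cause is reached.
    let mut cur = err;
    while let Some(next) = cur.source() {
        cur = next;
    }
    cur
}

fn main() {
    let io = std::io::Error::new(std::io::ErrorKind::Other, "disk full");
    // A wrapper error whose source() points at this io error would also report "disk full".
    println!("root cause: {}", root_source(&io));
}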
self.maybe_update_flow_dirty_window(table_info.clone(), record_batch.clone()); diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index 2ba71444e7..68576db582 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -578,7 +578,7 @@ pub enum Error { #[snafu(implicit)] location: Location, #[snafu(source)] - error: datafusion::error::DataFusionError, + error: common_datasource::error::Error, }, #[snafu(display( diff --git a/src/operator/src/expr_helper.rs b/src/operator/src/expr_helper.rs index 3fa9a0ae1f..4b7e0946cd 100644 --- a/src/operator/src/expr_helper.rs +++ b/src/operator/src/expr_helper.rs @@ -762,7 +762,8 @@ pub(crate) fn to_alter_table_expr( target_type, } => { let target_type = - sql_data_type_to_concrete_data_type(&target_type).context(ParseSqlSnafu)?; + sql_data_type_to_concrete_data_type(&target_type, &Default::default()) + .context(ParseSqlSnafu)?; let (target_type, target_type_extension) = ColumnDataTypeWrapper::try_from(target_type) .map(|w| w.to_parts()) .context(ColumnDataTypeSnafu)?; diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index cb63b07772..201d5d99f4 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -29,14 +29,15 @@ use catalog::CatalogManagerRef; use client::{OutputData, OutputMeta}; use common_catalog::consts::{ PARENT_SPAN_ID_COLUMN, SERVICE_NAME_COLUMN, TRACE_ID_COLUMN, TRACE_TABLE_NAME, - TRACE_TABLE_NAME_SESSION_KEY, default_engine, trace_services_table_name, + TRACE_TABLE_NAME_SESSION_KEY, default_engine, trace_operations_table_name, + trace_services_table_name, }; use common_grpc_expr::util::ColumnExpr; use common_meta::cache::TableFlownodeSetCacheRef; use common_meta::node_manager::{AffectedRows, NodeManagerRef}; use common_meta::peer::Peer; use common_query::Output; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_telemetry::tracing_context::TracingContext; use common_telemetry::{error, info, warn}; use datatypes::schema::SkippingIndexOptions; @@ -53,7 +54,8 @@ use store_api::metric_engine_consts::{ LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY, }; use store_api::mito_engine_options::{ - APPEND_MODE_KEY, COMPACTION_TYPE, COMPACTION_TYPE_TWCS, MERGE_MODE_KEY, TWCS_TIME_WINDOW, + APPEND_MODE_KEY, COMPACTION_TYPE, COMPACTION_TYPE_TWCS, MERGE_MODE_KEY, TTL_KEY, + TWCS_TIME_WINDOW, }; use store_api::storage::{RegionId, TableId}; use table::TableRef; @@ -351,10 +353,11 @@ impl Inserter { &self, insert: &Insert, ctx: &QueryContextRef, + statement_executor: &StatementExecutor, ) -> Result { let (inserts, table_info) = StatementToRegion::new(self.catalog_manager.as_ref(), &self.partition_manager, ctx) - .convert(insert, ctx) + .convert(insert, ctx, statement_executor) .await?; let table_infos = @@ -618,11 +621,16 @@ impl Inserter { // note that auto create table shouldn't be ttl instant table // for it's a very unexpected behavior and should be set by user explicitly for mut create_table in create_tables { - if create_table.table_name == trace_services_table_name(trace_table_name) { - // Disable append mode for trace services table since it requires upsert behavior. + if create_table.table_name == trace_services_table_name(trace_table_name) + || create_table.table_name == trace_operations_table_name(trace_table_name) + { + // Disable append mode for auxiliary tables (services/operations) since they require upsert behavior. 
create_table .table_options .insert(APPEND_MODE_KEY.to_string(), "false".to_string()); + // Remove `ttl` key from table options if it exists + create_table.table_options.remove(TTL_KEY); + let table = self .create_physical_table(create_table, None, ctx, statement_executor) .await?; @@ -718,14 +726,14 @@ impl Inserter { // schema with timestamp and field column let default_schema = vec![ ColumnSchema { - column_name: GREPTIME_TIMESTAMP.to_string(), + column_name: greptime_timestamp().to_string(), datatype: ColumnDataType::TimestampMillisecond as _, semantic_type: SemanticType::Timestamp as _, datatype_extension: None, options: None, }, ColumnSchema { - column_name: GREPTIME_VALUE.to_string(), + column_name: greptime_value().to_string(), datatype: ColumnDataType::Float64 as _, semantic_type: SemanticType::Field as _, datatype_extension: None, diff --git a/src/operator/src/req_convert/common.rs b/src/operator/src/req_convert/common.rs index 37529d55c6..63226ef8e4 100644 --- a/src/operator/src/req_convert/common.rs +++ b/src/operator/src/req_convert/common.rs @@ -223,7 +223,15 @@ fn push_column_to_rows(column: Column, rows: &mut [Row]) -> Result<()> { } } - )* }} + )* _ => { + return InvalidInsertRequestSnafu { + reason: format!( + "Column '{}' with type {:?} is not supported in row inserts.", + column.column_name, column_type + ), + } + .fail(); + } }} } push_column_values_match_types!( diff --git a/src/operator/src/req_convert/insert/fill_impure_default.rs b/src/operator/src/req_convert/insert/fill_impure_default.rs index 0de49611d9..0e39bc7241 100644 --- a/src/operator/src/req_convert/insert/fill_impure_default.rs +++ b/src/operator/src/req_convert/insert/fill_impure_default.rs @@ -36,6 +36,7 @@ pub fn find_all_impure_columns(table_info: &TableInfo) -> Vec { .collect() } +// TODO(yingwen): Support Bulk insert request. /// Fill impure default values in the request pub struct ImpureDefaultFiller { impure_columns: HashMap, @@ -62,7 +63,7 @@ impl ImpureDefaultFiller { column.default_constraint() ), })?; - let grpc_default_value = api::helper::to_proto_value(default_value); + let grpc_default_value = api::helper::to_grpc_value(default_value); let def = column_schemas_to_defs(vec![column], &pk_names)?.swap_remove(0); let grpc_column_schema = api::v1::ColumnSchema { column_name: def.name, diff --git a/src/operator/src/req_convert/insert/stmt_to_region.rs b/src/operator/src/req_convert/insert/stmt_to_region.rs index aca31b289a..ef4e7cac8e 100644 --- a/src/operator/src/req_convert/insert/stmt_to_region.rs +++ b/src/operator/src/req_convert/insert/stmt_to_region.rs @@ -12,13 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use api::helper::{ColumnDataTypeWrapper, value_to_grpc_value}; +use std::cell::LazyCell; +use std::collections::HashMap; + +use api::helper::{ColumnDataTypeWrapper, to_grpc_value}; +use api::v1::alter_table_expr::Kind; use api::v1::column_def::options_from_column_schema; use api::v1::region::InsertRequests as RegionInsertRequests; -use api::v1::{ColumnSchema as GrpcColumnSchema, Row, Rows, Value as GrpcValue}; +use api::v1::{ + AlterTableExpr, ColumnSchema as GrpcColumnSchema, ModifyColumnType, ModifyColumnTypes, Row, + Rows, +}; use catalog::CatalogManager; +use common_telemetry::info; use common_time::Timezone; +use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, SchemaRef}; +use datatypes::types::JsonType; +use datatypes::value::Value; use partition::manager::PartitionRuleManager; use session::context::{QueryContext, QueryContextRef}; use snafu::{OptionExt, ResultExt, ensure}; @@ -30,12 +41,13 @@ use table::metadata::TableInfoRef; use crate::error::{ CatalogSnafu, ColumnDataTypeSnafu, ColumnDefaultValueSnafu, ColumnNoneDefaultValueSnafu, - ColumnNotFoundSnafu, InvalidSqlSnafu, MissingInsertBodySnafu, ParseSqlSnafu, Result, - SchemaReadOnlySnafu, TableNotFoundSnafu, + ColumnNotFoundSnafu, InvalidInsertRequestSnafu, InvalidSqlSnafu, MissingInsertBodySnafu, + ParseSqlSnafu, Result, SchemaReadOnlySnafu, TableNotFoundSnafu, }; use crate::insert::InstantAndNormalInsertRequests; use crate::req_convert::common::partitioner::Partitioner; use crate::req_convert::insert::semantic_type; +use crate::statement::StatementExecutor; const DEFAULT_PLACEHOLDER_VALUE: &str = "default"; @@ -62,12 +74,12 @@ impl<'a> StatementToRegion<'a> { &self, stmt: &Insert, query_ctx: &QueryContextRef, + statement_executor: &StatementExecutor, ) -> Result<(InstantAndNormalInsertRequests, TableInfoRef)> { let name = stmt.table_name().context(ParseSqlSnafu)?; let (catalog, schema, table_name) = self.get_full_name(name)?; - let table = self.get_table(&catalog, &schema, &table_name).await?; + let mut table = self.get_table(&catalog, &schema, &table_name).await?; let table_schema = table.schema(); - let table_info = table.table_info(); ensure!( !common_catalog::consts::is_readonly_schema(&schema), @@ -94,7 +106,6 @@ impl<'a> StatementToRegion<'a> { Ok(()) })?; - let mut schema = Vec::with_capacity(column_count); let mut rows = vec![ Row { values: Vec::with_capacity(column_count) @@ -102,17 +113,57 @@ impl<'a> StatementToRegion<'a> { row_count ]; - for (i, column_name) in column_names.into_iter().enumerate() { - let column_schema = table_schema - .column_schema_by_name(column_name) - .with_context(|| ColumnNotFoundSnafu { - msg: format!("Column {} not found in table {}", column_name, &table_name), - })?; + fn find_insert_columns<'a>( + table: &'a TableRef, + column_names: &[&String], + ) -> Result> { + let schema = table.schema_ref(); + column_names + .iter() + .map(|name| { + schema + .column_schema_by_name(name) + .context(ColumnNotFoundSnafu { msg: *name }) + }) + .collect::>>() + } + let mut insert_columns = find_insert_columns(&table, &column_names)?; + let converter = SqlRowConverter::new(&insert_columns, query_ctx); + + // Convert the SQL values to GreptimeDB values, and merge a "largest" JSON types of all + // values on the way by `JsonColumnTypeUpdater`. 
+ let mut updater = JsonColumnTypeUpdater::new(statement_executor, query_ctx); + let value_rows = converter.convert(&mut updater, &sql_rows)?; + + // If the JSON values have a "larger" json type than the one in the table schema, modify + // the column's json type first, by executing an "alter table" DDL. + if updater + .maybe_update_column_type(&catalog, &schema, &table_name, &insert_columns) + .await? + { + // Update with the latest schema, if changed. + table = self.get_table(&catalog, &schema, &table_name).await?; + insert_columns = find_insert_columns(&table, &column_names)?; + } + + // Finally convert GreptimeDB values to GRPC values, ready to do insertion on Datanode. + for (i, row) in value_rows.into_iter().enumerate() { + for value in row { + let grpc_value = to_grpc_value(value); + rows[i].values.push(grpc_value); + } + } + + let table_info = table.table_info(); + let mut schema = Vec::with_capacity(column_count); + for column_schema in insert_columns { let (datatype, datatype_extension) = ColumnDataTypeWrapper::try_from(column_schema.data_type.clone()) .context(ColumnDataTypeSnafu)? .to_parts(); + + let column_name = &column_schema.name; let semantic_type = semantic_type(&table_info, column_name)?; let grpc_column_schema = GrpcColumnSchema { @@ -123,16 +174,6 @@ impl<'a> StatementToRegion<'a> { options: options_from_column_schema(column_schema), }; schema.push(grpc_column_schema); - - for (sql_row, grpc_row) in sql_rows.iter().zip(rows.iter_mut()) { - let value = sql_value_to_grpc_value( - column_schema, - &sql_row[i], - Some(&query_ctx.timezone()), - query_ctx.auto_string_to_numeric(), - )?; - grpc_row.values.push(value); - } } let requests = Partitioner::new(self.partition_manager) @@ -194,6 +235,147 @@ impl<'a> StatementToRegion<'a> { } } +struct SqlRowConverter<'a, 'b> { + insert_columns: &'a [&'a ColumnSchema], + query_context: &'b QueryContextRef, +} + +impl<'a, 'b> SqlRowConverter<'a, 'b> { + fn new(insert_columns: &'a [&'a ColumnSchema], query_context: &'b QueryContextRef) -> Self { + Self { + insert_columns, + query_context, + } + } + + fn convert( + &self, + updater: &mut JsonColumnTypeUpdater<'_, 'a>, + sql_rows: &[Vec], + ) -> Result>> { + let timezone = Some(&self.query_context.timezone()); + let auto_string_to_numeric = self.query_context.auto_string_to_numeric(); + + let mut value_rows = Vec::with_capacity(sql_rows.len()); + for sql_row in sql_rows { + let mut value_row = Vec::with_capacity(self.insert_columns.len()); + + for (insert_column, sql_value) in self.insert_columns.iter().zip(sql_row) { + let value = + sql_value_to_value(insert_column, sql_value, timezone, auto_string_to_numeric)?; + + updater.merge_types(insert_column, &value)?; + + value_row.push(value); + } + value_rows.push(value_row); + } + Ok(value_rows) + } +} + +struct JsonColumnTypeUpdater<'a, 'b> { + statement_executor: &'a StatementExecutor, + query_context: &'a QueryContextRef, + merged_value_types: LazyCell>, +} + +impl<'a, 'b> JsonColumnTypeUpdater<'a, 'b> { + fn new(statement_executor: &'a StatementExecutor, query_context: &'a QueryContextRef) -> Self { + Self { + statement_executor, + query_context, + merged_value_types: LazyCell::new(Default::default), + } + } + + fn merge_types(&mut self, column_schema: &'b ColumnSchema, value: &Value) -> Result<()> { + if !matches!(value, Value::Json(_)) { + return Ok(()); + } + + if let ConcreteDataType::Json(value_type) = value.data_type() { + let merged_type = self + .merged_value_types + .entry(&column_schema.name) + .or_insert_with(|| 
value_type.clone()); + + if !merged_type.is_include(&value_type) { + merged_type.merge(&value_type).map_err(|e| { + InvalidInsertRequestSnafu { + reason: format!(r#"cannot merge "{value_type}" into "{merged_type}": {e}"#), + } + .build() + })?; + } + } + Ok(()) + } + + async fn maybe_update_column_type( + self, + catalog: &str, + schema: &str, + table: &str, + insert_columns: &[&ColumnSchema], + ) -> Result { + let mut has_update = false; + for (column_name, merged_type) in self.merged_value_types.iter() { + let Some(column_type) = insert_columns + .iter() + .find_map(|x| (&x.name == column_name).then(|| x.data_type.as_json())) + .flatten() + else { + continue; + }; + if column_type.is_include(merged_type) { + continue; + } + + let new_column_type = { + let mut x = column_type.clone(); + x.merge(merged_type) + .map_err(|e| { + InvalidInsertRequestSnafu { + reason: format!( + r#"cannot merge "{merged_type}" into "{column_type}": {e}"# + ), + } + .build() + }) + .map(|()| x) + }?; + info!( + "updating table {}.{}.{} column {} json type: {} => {}", + catalog, schema, table, column_name, column_type, new_column_type, + ); + + let (target_type, target_type_extension) = + ColumnDataTypeWrapper::try_from(ConcreteDataType::Json(new_column_type)) + .context(ColumnDataTypeSnafu)? + .into_parts(); + let alter_expr = AlterTableExpr { + catalog_name: catalog.to_string(), + schema_name: schema.to_string(), + table_name: table.to_string(), + kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes { + modify_column_types: vec![ModifyColumnType { + column_name: column_name.to_string(), + target_type: target_type as i32, + target_type_extension, + }], + })), + }; + self.statement_executor + .alter_table_inner(alter_expr, self.query_context.clone()) + .await?; + + has_update = true; + } + Ok(has_update) + } +} + fn column_names<'a>(stmt: &'a Insert, table_schema: &'a SchemaRef) -> Vec<&'a String> { if !stmt.columns().is_empty() { stmt.columns() @@ -209,12 +391,12 @@ fn column_names<'a>(stmt: &'a Insert, table_schema: &'a SchemaRef) -> Vec<&'a St /// Converts SQL value to gRPC value according to the column schema. /// If `auto_string_to_numeric` is true, tries to cast the string value to numeric values, /// and fills the default value if the cast fails. -fn sql_value_to_grpc_value( +fn sql_value_to_value( column_schema: &ColumnSchema, sql_val: &SqlValue, timezone: Option<&Timezone>, auto_string_to_numeric: bool, -) -> Result { +) -> Result { let column = &column_schema.name; let value = if replace_default(sql_val) { let default_value = column_schema @@ -237,9 +419,25 @@ fn sql_value_to_grpc_value( ) .context(crate::error::SqlCommonSnafu)? }; + validate(&value)?; + Ok(value) +} - let grpc_value = value_to_grpc_value(value); - Ok(grpc_value) +fn validate(value: &Value) -> Result<()> { + match value { + Value::Json(value) => { + // Json object will be stored as Arrow struct in parquet, and it has the restriction: + // "Parquet does not support writing empty structs". 
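JsonColumnTypeUpdater above only issues an ALTER TABLE when the merged value type is not already covered by the column's JSON type. The sketch below captures just that control flow with a deliberately tiny, hypothetical JsonKind ordering; the real JsonType merge is structural and far richer:

#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
enum JsonKind {
    Bool,
    Number,
    String,
}

fn widen(column: JsonKind, value: JsonKind) -> Option<JsonKind> {
    // Return the new column type only when a schema change is actually needed.
    (value > column).then_some(value)
}

fn main() {
    assert_eq!(widen(JsonKind::Number, JsonKind::Bool), None); // already covered
    assert_eq!(widen(JsonKind::Bool, JsonKind::String), Some(JsonKind::String)); // alter needed
}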
+ ensure!( + !value.is_empty_object(), + InvalidInsertRequestSnafu { + reason: "empty json object is not supported, consider adding a dummy field" + } + ); + Ok(()) + } + _ => Ok(()), + } } fn replace_default(sql_val: &SqlValue) -> bool { diff --git a/src/operator/src/request.rs b/src/operator/src/request.rs index 1bca461842..a5ed045313 100644 --- a/src/operator/src/request.rs +++ b/src/operator/src/request.rs @@ -15,19 +15,19 @@ use std::sync::Arc; use api::v1::region::region_request::Body as RegionRequestBody; -use api::v1::region::{CompactRequest, FlushRequest, RegionRequestHeader}; +use api::v1::region::{BuildIndexRequest, CompactRequest, FlushRequest, RegionRequestHeader}; use catalog::CatalogManagerRef; use common_catalog::build_db_string; use common_meta::node_manager::{AffectedRows, NodeManagerRef}; use common_meta::peer::Peer; use common_telemetry::tracing_context::TracingContext; -use common_telemetry::{error, info}; +use common_telemetry::{debug, error, info}; use futures_util::future; use partition::manager::{PartitionInfo, PartitionRuleManagerRef}; use session::context::QueryContextRef; use snafu::prelude::*; use store_api::storage::RegionId; -use table::requests::{CompactTableRequest, FlushTableRequest}; +use table::requests::{BuildIndexTableRequest, CompactTableRequest, FlushTableRequest}; use crate::error::{ CatalogSnafu, FindRegionLeaderSnafu, FindTablePartitionRuleSnafu, JoinTaskSnafu, @@ -90,6 +90,43 @@ impl Requester { .await } + /// Handle the request to build index for table. + pub async fn handle_table_build_index( + &self, + request: BuildIndexTableRequest, + ctx: QueryContextRef, + ) -> Result { + let partitions = self + .get_table_partitions( + &request.catalog_name, + &request.schema_name, + &request.table_name, + ) + .await?; + + let requests = partitions + .into_iter() + .map(|partition| { + RegionRequestBody::BuildIndex(BuildIndexRequest { + region_id: partition.id.into(), + }) + }) + .collect(); + + info!( + "Handle table manual build index for table {}", + request.table_name + ); + debug!("Request details: {:?}", request); + + self.do_request( + requests, + Some(build_db_string(&request.catalog_name, &request.schema_name)), + &ctx, + ) + .await + } + /// Handle the request to compact table. 
pub async fn handle_table_compaction( &self, @@ -201,6 +238,7 @@ impl Requester { let region_id = match req { RegionRequestBody::Flush(req) => req.region_id, RegionRequestBody::Compact(req) => req.region_id, + RegionRequestBody::BuildIndex(req) => req.region_id, _ => { error!("Unsupported region request: {:?}", req); return UnsupportedRegionRequestSnafu {}.fail(); diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index 5dd39681b6..cff6ee0711 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -46,12 +46,13 @@ use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; use common_meta::procedure_executor::ProcedureExecutorRef; use common_query::Output; -use common_telemetry::tracing; +use common_telemetry::{debug, tracing, warn}; use common_time::Timestamp; use common_time::range::TimestampRange; use datafusion_expr::LogicalPlan; use datatypes::prelude::ConcreteDataType; use humantime::format_duration; +use itertools::Itertools; use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef}; use query::QueryEngineRef; use query::parser::QueryStatement; @@ -87,6 +88,22 @@ use crate::insert::InserterRef; use crate::statement::copy_database::{COPY_DATABASE_TIME_END_KEY, COPY_DATABASE_TIME_START_KEY}; use crate::statement::set::set_allow_query_fallback; +/// A configurator that customizes or enhances a [`StatementExecutor`]. +#[async_trait::async_trait] +pub trait StatementExecutorConfigurator: Send + Sync { + async fn configure( + &self, + executor: StatementExecutor, + ctx: ExecutorConfigureContext, + ) -> std::result::Result; +} + +pub type StatementExecutorConfiguratorRef = Arc; + +pub struct ExecutorConfigureContext { + pub kv_backend: KvBackendRef, +} + #[derive(Clone)] pub struct StatementExecutor { catalog_manager: CatalogManagerRef, @@ -105,15 +122,6 @@ pub struct StatementExecutor { pub type StatementExecutorRef = Arc; -/// Trait for creating [`TriggerQuerier`] instance. -#[cfg(feature = "enterprise")] -pub trait TriggerQuerierFactory: Send + Sync { - fn create(&self, kv_backend: KvBackendRef) -> TriggerQuerierRef; -} - -#[cfg(feature = "enterprise")] -pub type TriggerQuerierFactoryRef = Arc; - /// Trait for querying trigger info, such as `SHOW CREATE TRIGGER` etc. #[cfg(feature = "enterprise")] #[async_trait::async_trait] @@ -452,6 +460,13 @@ impl StatementExecutor { fn set_variables(&self, set_var: SetVariables, query_ctx: QueryContextRef) -> Result { let var_name = set_var.variable.to_string().to_uppercase(); + debug!( + "Trying to set {}={} for session: {} ", + var_name, + set_var.value.iter().map(|e| e.to_string()).join(", "), + query_ctx.conn_info() + ); + match var_name.as_str() { "READ_PREFERENCE" => set_read_preference(set_var.value, query_ctx)?, @@ -473,6 +488,11 @@ impl StatementExecutor { "@@SESSION.MAX_EXECUTION_TIME" | "MAX_EXECUTION_TIME" => match query_ctx.channel() { Channel::Mysql => set_query_timeout(set_var.value, query_ctx)?, Channel::Postgres => { + warn!( + "Unsupported set variable {} for channel {:?}", + var_name, + query_ctx.channel() + ); query_ctx.set_warning(format!("Unsupported set variable {}", var_name)) } _ => { @@ -482,16 +502,23 @@ impl StatementExecutor { .fail(); } }, - "STATEMENT_TIMEOUT" => { - if query_ctx.channel() == Channel::Postgres { - set_query_timeout(set_var.value, query_ctx)? 
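The SET handling above now downgrades unsupported variables to a warning for MySQL and Postgres sessions, so client connection setup is not broken, while other channels still fail hard. A minimal sketch of that policy, with Channel and the warning type as simplified stand-ins:

#[derive(Clone, Copy)]
enum Channel {
    Mysql,
    Postgres,
    Grpc,
}

fn handle_unknown_set(channel: Channel, var_name: &str) -> Result<Option<String>, String> {
    match channel {
        // Warn and keep the connection usable.
        Channel::Mysql | Channel::Postgres => {
            Ok(Some(format!("Unsupported set variable {var_name}")))
        }
        // Everything else is treated as a real error.
        _ => Err(format!("Unsupported set variable {var_name}")),
    }
}

fn main() {
    assert!(handle_unknown_set(Channel::Postgres, "foo").is_ok());
    assert!(handle_unknown_set(Channel::Grpc, "foo").is_err());
}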
- } else { + "STATEMENT_TIMEOUT" => match query_ctx.channel() { + Channel::Postgres => set_query_timeout(set_var.value, query_ctx)?, + Channel::Mysql => { + warn!( + "Unsupported set variable {} for channel {:?}", + var_name, + query_ctx.channel() + ); + query_ctx.set_warning(format!("Unsupported set variable {}", var_name)); + } + _ => { return NotSupportedSnafu { feat: format!("Unsupported set variable {}", var_name), } .fail(); } - } + }, "SEARCH_PATH" => { if query_ctx.channel() == Channel::Postgres { set_search_path(set_var.value, query_ctx)? @@ -503,14 +530,16 @@ impl StatementExecutor { } } _ => { - // for postgres, we give unknown SET statements a warning with - // success, this is prevent the SET call becoming a blocker - // of connection establishment - // - if query_ctx.channel() == Channel::Postgres { - query_ctx.set_warning(format!("Unsupported set variable {}", var_name)); - } else if query_ctx.channel() == Channel::Mysql && var_name.starts_with("@@") { - // Just ignore `SET @@` commands for MySQL + if query_ctx.channel() == Channel::Postgres || query_ctx.channel() == Channel::Mysql + { + // For unknown SET statements, we give a warning with success. + // This prevents the SET call from becoming a blocker of MySQL/Postgres clients' + // connection establishment. + warn!( + "Unsupported set variable {} for channel {:?}", + var_name, + query_ctx.channel() + ); query_ctx.set_warning(format!("Unsupported set variable {}", var_name)); } else { return NotSupportedSnafu { diff --git a/src/operator/src/statement/copy_database.rs b/src/operator/src/statement/copy_database.rs index c7cf0b47b0..cd8eeb6d79 100644 --- a/src/operator/src/statement/copy_database.rs +++ b/src/operator/src/statement/copy_database.rs @@ -12,14 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; +use std::sync::Arc; use client::{Output, OutputData, OutputMeta}; +use common_catalog::format_full_table_name; use common_datasource::file_format::Format; use common_datasource::lister::{Lister, Source}; use common_datasource::object_store::build_backend; +use common_stat::get_total_cpu_cores; use common_telemetry::{debug, error, info, tracing}; +use futures::future::try_join_all; use object_store::Entry; use regex::Regex; use session::context::QueryContextRef; @@ -27,6 +32,7 @@ use snafu::{OptionExt, ResultExt, ensure}; use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME}; use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest}; use table::table_reference::TableReference; +use tokio::sync::Semaphore; use crate::error; use crate::error::{CatalogSnafu, InvalidCopyDatabasePathSnafu}; @@ -35,6 +41,16 @@ use crate::statement::StatementExecutor; pub(crate) const COPY_DATABASE_TIME_START_KEY: &str = "start_time"; pub(crate) const COPY_DATABASE_TIME_END_KEY: &str = "end_time"; pub(crate) const CONTINUE_ON_ERROR_KEY: &str = "continue_on_error"; +pub(crate) const PARALLELISM_KEY: &str = "parallelism"; + +/// Get parallelism from options, default to total CPU cores. 
+fn parse_parallelism_from_option_map(options: &HashMap) -> usize { + options + .get(PARALLELISM_KEY) + .and_then(|v| v.parse::().ok()) + .unwrap_or_else(get_total_cpu_cores) + .max(1) +} impl StatementExecutor { #[tracing::instrument(skip_all)] @@ -51,22 +67,26 @@ impl StatementExecutor { } ); + let parallelism = parse_parallelism_from_option_map(&req.with); info!( - "Copy database {}.{} to dir: {}, time: {:?}", - req.catalog_name, req.schema_name, req.location, req.time_range + "Copy database {}.{} to dir: {}, time: {:?}, parallelism: {}", + req.catalog_name, req.schema_name, req.location, req.time_range, parallelism ); let table_names = self .catalog_manager .table_names(&req.catalog_name, &req.schema_name, Some(&ctx)) .await .context(CatalogSnafu)?; + let num_tables = table_names.len(); let suffix = Format::try_from(&req.with) .context(error::ParseFileFormatSnafu)? .suffix(); - let mut exported_rows = 0; - for table_name in table_names { + let mut tasks = Vec::with_capacity(num_tables); + let semaphore = Arc::new(Semaphore::new(parallelism)); + + for (i, table_name) in table_names.into_iter().enumerate() { let table = self .get_table(&TableReference { catalog: &req.catalog_name, @@ -89,33 +109,40 @@ impl StatementExecutor { { continue; } + + let semaphore_moved = semaphore.clone(); let mut table_file = req.location.clone(); table_file.push_str(&table_name); table_file.push_str(suffix); - info!( - "Copy table: {}.{}.{} to {}", - req.catalog_name, req.schema_name, table_name, table_file - ); + let table_no = i + 1; + let moved_ctx = ctx.clone(); + let full_table_name = + format_full_table_name(&req.catalog_name, &req.schema_name, &table_name); + let copy_table_req = CopyTableRequest { + catalog_name: req.catalog_name.clone(), + schema_name: req.schema_name.clone(), + table_name, + location: table_file.clone(), + with: req.with.clone(), + connection: req.connection.clone(), + pattern: None, + direction: CopyDirection::Export, + timestamp_range: req.time_range, + limit: None, + }; - let exported = self - .copy_table_to( - CopyTableRequest { - catalog_name: req.catalog_name.clone(), - schema_name: req.schema_name.clone(), - table_name, - location: table_file, - with: req.with.clone(), - connection: req.connection.clone(), - pattern: None, - direction: CopyDirection::Export, - timestamp_range: req.time_range, - limit: None, - }, - ctx.clone(), - ) - .await?; - exported_rows += exported; + tasks.push(async move { + let _permit = semaphore_moved.acquire().await.unwrap(); + info!( + "Copy table({}/{}): {} to {}", + table_no, num_tables, full_table_name, table_file + ); + self.copy_table_to(copy_table_req, moved_ctx).await + }); } + + let results = try_join_all(tasks).await?; + let exported_rows = results.into_iter().sum(); Ok(Output::new_with_affected_rows(exported_rows)) } @@ -134,9 +161,10 @@ impl StatementExecutor { } ); + let parallelism = parse_parallelism_from_option_map(&req.with); info!( - "Copy database {}.{} from dir: {}, time: {:?}", - req.catalog_name, req.schema_name, req.location, req.time_range + "Copy database {}.{} from dir: {}, time: {:?}, parallelism: {}", + req.catalog_name, req.schema_name, req.location, req.time_range, parallelism ); let suffix = Format::try_from(&req.with) .context(error::ParseFileFormatSnafu)? 
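parse_parallelism_from_option_map above defaults to the machine's CPU core count and clamps the result to at least 1. A self-contained version of the same logic, with an explicit default_cores argument standing in for get_total_cpu_cores():

use std::collections::HashMap;

fn parse_parallelism(options: &HashMap<String, String>, default_cores: usize) -> usize {
    options
        .get("parallelism")
        .and_then(|v| v.parse::<usize>().ok())
        .unwrap_or(default_cores)
        .max(1)
}

fn main() {
    let mut opts = HashMap::new();
    assert_eq!(parse_parallelism(&opts, 8), 8); // missing -> default
    opts.insert("parallelism".to_string(), "0".to_string());
    assert_eq!(parse_parallelism(&opts, 8), 1); // zero -> clamped to 1
    opts.insert("parallelism".to_string(), "abc".to_string());
    assert_eq!(parse_parallelism(&opts, 8), 8); // unparsable -> default
}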
@@ -150,8 +178,8 @@ impl StatementExecutor { .and_then(|v| bool::from_str(v).ok()) .unwrap_or(false); - let mut rows_inserted = 0; - let mut insert_cost = 0; + let mut tasks = Vec::with_capacity(entries.len()); + let semaphore = Arc::new(Semaphore::new(parallelism)); for e in entries { let table_name = match parse_file_name_to_copy(&e) { @@ -165,6 +193,7 @@ impl StatementExecutor { } } }; + let req = CopyTableRequest { catalog_name: req.catalog_name.clone(), schema_name: req.schema_name.clone(), @@ -177,23 +206,36 @@ impl StatementExecutor { timestamp_range: None, limit: None, }; - debug!("Copy table, arg: {:?}", req); - match self.copy_table_from(req, ctx.clone()).await { - Ok(o) => { - let (rows, cost) = o.extract_rows_and_cost(); - rows_inserted += rows; - insert_cost += cost; - } - Err(err) => { - if continue_on_error { - error!(err; "Failed to import file to table: {}", table_name); - continue; - } else { - return Err(err); + let moved_ctx = ctx.clone(); + let moved_table_name = table_name.clone(); + let moved_semaphore = semaphore.clone(); + tasks.push(async move { + let _permit = moved_semaphore.acquire().await.unwrap(); + debug!("Copy table, arg: {:?}", req); + match self.copy_table_from(req, moved_ctx).await { + Ok(o) => { + let (rows, cost) = o.extract_rows_and_cost(); + Ok((rows, cost)) + } + Err(err) => { + if continue_on_error { + error!(err; "Failed to import file to table: {}", moved_table_name); + Ok((0, 0)) + } else { + Err(err) + } } } - } + }); } + + let results = try_join_all(tasks).await?; + let (rows_inserted, insert_cost) = results + .into_iter() + .fold((0, 0), |(acc_rows, acc_cost), (rows, cost)| { + (acc_rows + rows, acc_cost + cost) + }); + Ok(Output::new( OutputData::AffectedRows(rows_inserted), OutputMeta::new_with_cost(insert_cost), @@ -229,15 +271,18 @@ async fn list_files_to_copy(req: &CopyDatabaseRequest, suffix: &str) -> error::R #[cfg(test)] mod tests { - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; + use common_stat::get_total_cpu_cores; use object_store::ObjectStore; use object_store::services::Fs; use object_store::util::normalize_dir; use path_slash::PathExt; use table::requests::CopyDatabaseRequest; - use crate::statement::copy_database::{list_files_to_copy, parse_file_name_to_copy}; + use crate::statement::copy_database::{ + list_files_to_copy, parse_file_name_to_copy, parse_parallelism_from_option_map, + }; #[tokio::test] async fn test_list_files_and_parse_table_name() { @@ -276,4 +321,16 @@ mod tests { listed ); } + + #[test] + fn test_parse_parallelism_from_option_map() { + let options = HashMap::new(); + assert_eq!( + parse_parallelism_from_option_map(&options), + get_total_cpu_cores() + ); + + let options = HashMap::from([("parallelism".to_string(), "0".to_string())]); + assert_eq!(parse_parallelism_from_option_map(&options), 1); + } } diff --git a/src/operator/src/statement/copy_table_from.rs b/src/operator/src/statement/copy_table_from.rs index da120ff1bf..35cfdc7830 100644 --- a/src/operator/src/statement/copy_table_from.rs +++ b/src/operator/src/statement/copy_table_from.rs @@ -20,8 +20,9 @@ use std::sync::Arc; use client::{Output, OutputData, OutputMeta}; use common_base::readable_size::ReadableSize; use common_datasource::file_format::csv::CsvFormat; +use common_datasource::file_format::json::JsonFormat; use common_datasource::file_format::orc::{ReaderAdapter, infer_orc_schema, new_orc_stream_reader}; -use common_datasource::file_format::{FileFormat, Format}; +use common_datasource::file_format::{FileFormat, 
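Both COPY DATABASE directions above fan out one task per table and bound concurrency with a tokio Semaphore before summing the per-table results. A hedged, self-contained sketch of that pattern; copy_one_table stands in for the real copy_table_to/copy_table_from call, and the example assumes the tokio and futures crates:

use std::sync::Arc;

use futures::future::try_join_all;
use tokio::sync::Semaphore;

async fn copy_one_table(table: String) -> Result<usize, String> {
    // Placeholder for exporting or importing a single table; returns affected rows.
    Ok(table.len())
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let tables = vec!["a".to_string(), "bb".to_string(), "ccc".to_string()];
    let parallelism = 2;
    let semaphore = Arc::new(Semaphore::new(parallelism));

    let tasks = tables.into_iter().map(|table| {
        let semaphore = semaphore.clone();
        async move {
            // Each task holds a permit for its whole duration, bounding concurrency.
            let _permit = semaphore.acquire().await.expect("semaphore closed");
            copy_one_table(table).await
        }
    });

    let exported: usize = try_join_all(tasks).await?.into_iter().sum();
    println!("exported {exported} rows");
    Ok(())
}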
Format, file_to_stream}; use common_datasource::lister::{Lister, Source}; use common_datasource::object_store::{FS_SCHEMA, build_backend, parse_url}; use common_datasource::util::find_dir_and_filename; @@ -29,14 +30,9 @@ use common_query::{OutputCost, OutputRows}; use common_recordbatch::DfSendableRecordBatchStream; use common_recordbatch::adapter::RecordBatchStreamTypeAdapter; use common_telemetry::{debug, tracing}; -use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{ - CsvSource, FileGroup, FileScanConfigBuilder, FileSource, FileStream, JsonSource, -}; +use datafusion::datasource::physical_plan::{CsvSource, FileSource, JsonSource}; use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder; use datafusion::parquet::arrow::arrow_reader::ArrowReaderMetadata; -use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_expr::Expr; use datatypes::arrow::compute::can_cast_types; use datatypes::arrow::datatypes::{DataType as ArrowDataType, Schema, SchemaRef}; @@ -55,6 +51,7 @@ use crate::statement::StatementExecutor; const DEFAULT_BATCH_SIZE: usize = 8192; const DEFAULT_READ_BUFFER: usize = 256 * 1024; + enum FileMetadata { Parquet { schema: SchemaRef, @@ -67,6 +64,7 @@ enum FileMetadata { }, Json { schema: SchemaRef, + format: JsonFormat, path: String, }, Csv { @@ -147,6 +145,7 @@ impl StatementExecutor { .await .context(error::InferSchemaSnafu { path: &path })?, ), + format, path, }), Format::Parquet(_) => { @@ -195,33 +194,6 @@ impl StatementExecutor { } } - async fn build_file_stream( - &self, - store: &ObjectStore, - filename: &str, - file_schema: SchemaRef, - file_source: Arc, - projection: Option>, - ) -> Result { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - file_source.clone(), - ) - .with_file_group(FileGroup::new(vec![PartitionedFile::new(filename, 0)])) - .with_projection(projection) - .build(); - - let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone())); - let file_opener = file_source - .with_projection(&config) - .create_file_opener(store, &config, 0); - let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new()) - .context(error::BuildFileStreamSnafu)?; - - Ok(Box::pin(stream)) - } - async fn build_read_stream( &self, compat_schema: SchemaRef, @@ -245,16 +217,16 @@ impl StatementExecutor { let csv_source = CsvSource::new(format.has_header, format.delimiter, b'"') .with_schema(schema.clone()) .with_batch_size(DEFAULT_BATCH_SIZE); - - let stream = self - .build_file_stream( - object_store, - path, - schema.clone(), - csv_source, - Some(projection), - ) - .await?; + let stream = file_to_stream( + object_store, + path, + schema.clone(), + csv_source, + Some(projection), + format.compression_type, + ) + .await + .context(error::BuildFileStreamSnafu)?; Ok(Box::pin( // The projection is already applied in the CSV reader when we created the stream, @@ -264,7 +236,11 @@ impl StatementExecutor { .context(error::PhysicalExprSnafu)?, )) } - FileMetadata::Json { path, schema } => { + FileMetadata::Json { + path, + format, + schema, + } => { let output_schema = Arc::new( compat_schema .project(&projection) @@ -274,16 +250,16 @@ impl StatementExecutor { let json_source = JsonSource::new() .with_schema(schema.clone()) .with_batch_size(DEFAULT_BATCH_SIZE); - - let stream = self - .build_file_stream( - object_store, - path, - schema.clone(), - json_source, - 
Some(projection), - ) - .await?; + let stream = file_to_stream( + object_store, + path, + schema.clone(), + json_source, + Some(projection), + format.compression_type, + ) + .await + .context(error::BuildFileStreamSnafu)?; Ok(Box::pin( // The projection is already applied in the JSON reader when we created the stream, diff --git a/src/operator/src/statement/copy_table_to.rs b/src/operator/src/statement/copy_table_to.rs index d542f8acbb..3e982373c4 100644 --- a/src/operator/src/statement/copy_table_to.rs +++ b/src/operator/src/statement/copy_table_to.rs @@ -76,12 +76,13 @@ impl StatementExecutor { ) .await .context(error::WriteStreamToFileSnafu { path }), - Format::Json(_) => stream_to_json( + Format::Json(format) => stream_to_json( Box::pin(DfRecordBatchStreamAdapter::new(stream)), object_store, path, threshold, WRITE_CONCURRENCY, + format, ) .await .context(error::WriteStreamToFileSnafu { path }), diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs index 3b626d13d0..89ca7f2b78 100644 --- a/src/operator/src/statement/ddl.rs +++ b/src/operator/src/statement/ddl.rs @@ -26,13 +26,13 @@ use api::v1::{ }; use catalog::CatalogManagerRef; use chrono::Utc; +use common_base::regex_pattern::NAME_PATTERN_REG; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_readonly_schema}; use common_catalog::{format_full_flow_name, format_full_table_name}; use common_error::ext::BoxedError; use common_meta::cache_invalidator::Context; use common_meta::ddl::create_flow::FlowType; use common_meta::instruction::CacheIdent; -use common_meta::key::NAME_PATTERN; use common_meta::key::schema_name::{SchemaName, SchemaNameKey}; use common_meta::procedure_executor::ExecutorContext; #[cfg(feature = "enterprise")] @@ -52,14 +52,12 @@ use datafusion_expr::LogicalPlan; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{RawSchema, Schema}; use datatypes::value::Value; -use lazy_static::lazy_static; use partition::expr::{Operand, PartitionExpr, RestrictedOp}; use partition::multi_dim::MultiDimPartitionRule; use query::parser::QueryStatement; use query::plan::extract_and_rewrite_full_table_names; use query::query_engine::DefaultSerializer; use query::sql::create_table_stmt; -use regex::Regex; use session::context::QueryContextRef; use session::table_name::table_idents_to_full_name; use snafu::{OptionExt, ResultExt, ensure}; @@ -96,10 +94,6 @@ use crate::expr_helper; use crate::statement::StatementExecutor; use crate::statement::show::create_partitions_stmt; -lazy_static! { - pub static ref NAME_PATTERN_REG: Regex = Regex::new(&format!("^{NAME_PATTERN}$")).unwrap(); -} - impl StatementExecutor { pub fn catalog_manager(&self) -> CatalogManagerRef { self.catalog_manager.clone() @@ -222,13 +216,13 @@ impl StatementExecutor { .table_options .contains_key(LOGICAL_TABLE_METADATA_KEY) { + if let Some(partitions) = partitions.as_ref() + && !partitions.exprs.is_empty() + { + self.validate_logical_table_partition_rule(create_table, partitions, &query_ctx) + .await?; + } // Create logical tables - ensure!( - partitions.is_none(), - InvalidPartitionRuleSnafu { - reason: "logical table in metric engine should not have partition rule, it will be inherited from physical table", - } - ); self.create_logical_tables(std::slice::from_ref(create_table), query_ctx) .await? 
.into_iter() @@ -405,6 +399,73 @@ impl StatementExecutor { .collect()) } + async fn validate_logical_table_partition_rule( + &self, + create_table: &CreateTableExpr, + partitions: &Partitions, + query_ctx: &QueryContextRef, + ) -> Result<()> { + let (_, mut logical_partition_exprs) = + parse_partitions_for_logical_validation(create_table, partitions, query_ctx)?; + + let physical_table_name = create_table + .table_options + .get(LOGICAL_TABLE_METADATA_KEY) + .with_context(|| CreateLogicalTablesSnafu { + reason: format!( + "expect `{LOGICAL_TABLE_METADATA_KEY}` option on creating logical table" + ), + })?; + + let physical_table = self + .catalog_manager + .table( + &create_table.catalog_name, + &create_table.schema_name, + physical_table_name, + Some(query_ctx), + ) + .await + .context(CatalogSnafu)? + .context(TableNotFoundSnafu { + table_name: physical_table_name.clone(), + })?; + + let physical_table_info = physical_table.table_info(); + let partition_rule = self + .partition_manager + .find_table_partition_rule(&physical_table_info) + .await + .context(error::FindTablePartitionRuleSnafu { + table_name: physical_table_name.clone(), + })?; + + let multi_dim_rule = partition_rule + .as_ref() + .as_any() + .downcast_ref::() + .context(InvalidPartitionRuleSnafu { + reason: "physical table partition rule is not range-based", + })?; + + // TODO(ruihang): project physical partition exprs to logical partition column + let mut physical_partition_exprs = multi_dim_rule.exprs().to_vec(); + logical_partition_exprs.sort_unstable(); + physical_partition_exprs.sort_unstable(); + + ensure!( + physical_partition_exprs == logical_partition_exprs, + InvalidPartitionRuleSnafu { + reason: format!( + "logical table partition rule must match the corresponding physical table's\n logical table partition exprs:\t\t {:?}\n physical table partition exprs:\t {:?}", + logical_partition_exprs, physical_partition_exprs + ), + } + ); + + Ok(()) + } + #[cfg(feature = "enterprise")] #[tracing::instrument(skip_all)] pub async fn create_trigger( @@ -1616,6 +1677,51 @@ pub fn parse_partitions( Ok((partition_exprs, partition_columns)) } +fn parse_partitions_for_logical_validation( + create_table: &CreateTableExpr, + partitions: &Partitions, + query_ctx: &QueryContextRef, +) -> Result<(Vec, Vec)> { + let partition_columns = partitions + .column_list + .iter() + .map(|ident| ident.value.clone()) + .collect::>(); + + let column_name_and_type = partition_columns + .iter() + .map(|pc| { + let column = create_table + .column_defs + .iter() + .find(|c| &c.name == pc) + .context(ColumnNotFoundSnafu { msg: pc.clone() })?; + let column_name = &column.name; + let data_type = ConcreteDataType::from( + ColumnDataTypeWrapper::try_new(column.data_type, column.datatype_extension.clone()) + .context(ColumnDataTypeSnafu)?, + ); + Ok((column_name, data_type)) + }) + .collect::>>()?; + + let mut partition_exprs = Vec::with_capacity(partitions.exprs.len()); + for expr in &partitions.exprs { + let partition_expr = convert_one_expr(expr, &column_name_and_type, &query_ctx.timezone())?; + partition_exprs.push(partition_expr); + } + + MultiDimPartitionRule::try_new( + partition_columns.clone(), + vec![], + partition_exprs.clone(), + true, + ) + .context(InvalidPartitionSnafu)?; + + Ok((partition_columns, partition_exprs)) +} + /// Verifies an alter and returns whether it is necessary to perform the alter. 
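validate_logical_table_partition_rule above compares the logical and physical partition expressions as sets by sorting both lists before checking equality, so the order in the DDL does not matter. The same idea in isolation, using strings in place of PartitionExpr:

fn same_partition_exprs(mut logical: Vec<String>, mut physical: Vec<String>) -> bool {
    // Sort both sides so the comparison ignores declaration order.
    logical.sort_unstable();
    physical.sort_unstable();
    logical == physical
}

fn main() {
    let logical = vec!["host < 'h1'".to_string(), "host >= 'h1'".to_string()];
    let physical = vec!["host >= 'h1'".to_string(), "host < 'h1'".to_string()];
    assert!(same_partition_exprs(logical, physical));
}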
/// /// # Returns diff --git a/src/operator/src/statement/dml.rs b/src/operator/src/statement/dml.rs index 827bfd8b66..41169398ab 100644 --- a/src/operator/src/statement/dml.rs +++ b/src/operator/src/statement/dml.rs @@ -28,7 +28,7 @@ impl StatementExecutor { if insert.can_extract_values() { // Fast path: plain insert ("insert with literal values") is executed directly self.inserter - .handle_statement_insert(insert.as_ref(), &query_ctx) + .handle_statement_insert(insert.as_ref(), &query_ctx, self) .await } else { // Slow path: insert with subquery. Execute using query engine. diff --git a/src/operator/src/table.rs b/src/operator/src/table.rs index 52c37bb401..13ed57200c 100644 --- a/src/operator/src/table.rs +++ b/src/operator/src/table.rs @@ -23,8 +23,8 @@ use session::context::QueryContextRef; use snafu::ResultExt; use store_api::storage::RegionId; use table::requests::{ - CompactTableRequest, DeleteRequest as TableDeleteRequest, FlushTableRequest, - InsertRequest as TableInsertRequest, + BuildIndexTableRequest, CompactTableRequest, DeleteRequest as TableDeleteRequest, + FlushTableRequest, InsertRequest as TableInsertRequest, }; use crate::delete::DeleterRef; @@ -97,6 +97,18 @@ impl TableMutationHandler for TableMutationOperator { .context(query_error::TableMutationSnafu) } + async fn build_index( + &self, + request: BuildIndexTableRequest, + ctx: QueryContextRef, + ) -> QueryResult { + self.requester + .handle_table_build_index(request, ctx) + .await + .map_err(BoxedError::new) + .context(query_error::TableMutationSnafu) + } + async fn flush_region( &self, region_id: RegionId, diff --git a/src/partition/src/collider.rs b/src/partition/src/collider.rs index 1bd5000f9d..c426e84575 100644 --- a/src/partition/src/collider.rs +++ b/src/partition/src/collider.rs @@ -173,6 +173,9 @@ impl<'a> Collider<'a> { for (column, mut column_values) in values { column_values.sort_unstable(); column_values.dedup(); // Remove duplicates + + // allowed because we have carefully implemented `Hash` to eliminate the mutable + #[allow(clippy::mutable_key_type)] let mut value_map = HashMap::with_capacity(column_values.len()); let mut start_value = ZERO; for value in column_values { diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index de98213972..6774842ef1 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -24,7 +24,7 @@ use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; use coerce::{coerce_columns, coerce_value}; -use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_query::prelude::{greptime_timestamp, greptime_value}; use common_telemetry::warn; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; @@ -48,7 +48,6 @@ use crate::etl::transform::index::Index; use crate::etl::transform::{Transform, Transforms}; use crate::{PipelineContext, truthy, unwrap_or_continue_if_err}; -const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; /// fields not in the columns will be discarded @@ -138,10 +137,7 @@ impl GreptimeTransformer { let default = None; let transform = Transform { - fields: Fields::one(Field::new( - DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - None, - )), + fields: 
Fields::one(Field::new(greptime_timestamp().to_string(), None)), type_, default, index: Some(Index::Time), @@ -347,7 +343,7 @@ fn calc_ts(p_ctx: &PipelineContext, values: &VrlValue) -> Result { let ts = values .as_object() - .and_then(|m| m.get(GREPTIME_TIMESTAMP)) + .and_then(|m| m.get(greptime_timestamp())) .and_then(|ts| ts.try_into_i64().ok()) .unwrap_or_default(); Ok(Some(ValueData::TimestampMillisecondValue(ts))) @@ -395,7 +391,7 @@ pub(crate) fn values_to_row( // skip ts column let ts_column_name = custom_ts .as_ref() - .map_or(DEFAULT_GREPTIME_TIMESTAMP_COLUMN, |ts| ts.get_column_name()); + .map_or(greptime_timestamp(), |ts| ts.get_column_name()); let values = values.into_object().context(ValueMustBeMapSnafu)?; @@ -416,7 +412,7 @@ pub(crate) fn values_to_row( } fn decide_semantic(p_ctx: &PipelineContext, column_name: &str) -> i32 { - if p_ctx.channel == Channel::Prometheus && column_name != GREPTIME_VALUE { + if p_ctx.channel == Channel::Prometheus && column_name != greptime_value() { SemanticType::Tag as i32 } else { SemanticType::Field as i32 @@ -563,7 +559,7 @@ fn identity_pipeline_inner( schema_info.schema.push(ColumnSchema { column_name: custom_ts .map(|ts| ts.get_column_name().to_string()) - .unwrap_or_else(|| DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()), + .unwrap_or_else(|| greptime_timestamp().to_string()), datatype: custom_ts.map(|c| c.get_datatype()).unwrap_or_else(|| { if pipeline_ctx.channel == Channel::Prometheus { ColumnDataType::TimestampMillisecond diff --git a/src/pipeline/src/manager/table.rs b/src/pipeline/src/manager/table.rs index ad9a8c4ac5..d3e63e634e 100644 --- a/src/pipeline/src/manager/table.rs +++ b/src/pipeline/src/manager/table.rs @@ -19,6 +19,8 @@ use api::v1::{ ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows, SemanticType, }; +use arrow::array::{Array, AsArray}; +use arrow::datatypes::TimestampNanosecondType; use common_query::OutputData; use common_recordbatch::util as record_util; use common_telemetry::{debug, info}; @@ -27,9 +29,7 @@ use datafusion::datasource::DefaultTableSource; use datafusion::logical_expr::col; use datafusion_common::TableReference; use datafusion_expr::{DmlStatement, LogicalPlan}; -use datatypes::prelude::ScalarVector; use datatypes::timestamp::TimestampNanosecond; -use datatypes::vectors::{StringVector, TimestampNanosecondVector, Vector}; use itertools::Itertools; use operator::insert::InserterRef; use operator::statement::StatementExecutorRef; @@ -527,8 +527,7 @@ impl PipelineTable { for r in records { let pipeline_content_column = r.column(0); let pipeline_content = pipeline_content_column - .as_any() - .downcast_ref::() + .as_string_opt::() .with_context(|| CastTypeSnafu { msg: format!( "can't downcast {:?} array into string vector", @@ -537,20 +536,19 @@ impl PipelineTable { })?; let pipeline_schema_column = r.column(1); - let pipeline_schema = pipeline_schema_column - .as_any() - .downcast_ref::() - .with_context(|| CastTypeSnafu { - msg: format!( - "can't downcast {:?} array into string vector", - pipeline_schema_column.data_type() - ), - })?; + let pipeline_schema = + pipeline_schema_column + .as_string_opt::() + .with_context(|| CastTypeSnafu { + msg: format!( + "expecting pipeline schema column of type string, actual: {}", + pipeline_schema_column.data_type() + ), + })?; let pipeline_created_at_column = r.column(2); let pipeline_created_at = pipeline_created_at_column - .as_any() - .downcast_ref::() + .as_primitive_opt::() .with_context(|| CastTypeSnafu { 
msg: format!( "can't downcast {:?} array into scalar vector", @@ -572,9 +570,9 @@ impl PipelineTable { let len = pipeline_content.len(); for i in 0..len { re.push(( - pipeline_content.get_data(i).unwrap().to_string(), - pipeline_schema.get_data(i).unwrap().to_string(), - pipeline_created_at.get_data(i).unwrap(), + pipeline_content.value(i).to_string(), + pipeline_schema.value(i).to_string(), + TimestampNanosecond::new(pipeline_created_at.value(i)), )); } } diff --git a/src/pipeline/tests/date.rs b/src/pipeline/tests/date.rs index fc9e726b61..0164dd4c22 100644 --- a/src/pipeline/tests/date.rs +++ b/src/pipeline/tests/date.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; use lazy_static::lazy_static; @@ -35,7 +36,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index a24e374532..b948110511 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -14,6 +14,7 @@ mod common; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; use pipeline::{PipelineContext, setup_pipeline}; @@ -51,7 +52,7 @@ transform: make_string_column_schema("a".to_string()), make_string_column_schema("b".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -99,7 +100,7 @@ transform: make_string_column_schema("a".to_string()), make_string_column_schema("b".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -142,7 +143,7 @@ transform: make_string_column_schema("a".to_string()), make_string_column_schema("b".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -185,7 +186,7 @@ transform: make_string_column_schema("key3".to_string()), make_string_column_schema("key5".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -236,7 +237,7 @@ transform: let expected_schema = vec![ make_string_column_schema("key1".to_string()), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/epoch.rs b/src/pipeline/tests/epoch.rs index 84662793b9..ead018ad42 100644 --- a/src/pipeline/tests/epoch.rs +++ b/src/pipeline/tests/epoch.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; @@ -128,7 +129,7 @@ transform: make_time_field("input_nanosecond", ColumnDataType::TimestampNanosecond), make_time_field("input_nano", ColumnDataType::TimestampNanosecond), common::make_column_schema( - "greptime_timestamp".to_string(), + 
greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -187,7 +188,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -238,7 +239,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/join.rs b/src/pipeline/tests/join.rs index 3625160361..dbc966404f 100644 --- a/src/pipeline/tests/join.rs +++ b/src/pipeline/tests/join.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, ColumnSchema, SemanticType}; use lazy_static::lazy_static; @@ -38,7 +39,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/letter.rs b/src/pipeline/tests/letter.rs index d6d9a2cccb..307da50867 100644 --- a/src/pipeline/tests/letter.rs +++ b/src/pipeline/tests/letter.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; use lazy_static::lazy_static; @@ -27,7 +28,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -125,7 +126,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -175,7 +176,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/on_failure.rs b/src/pipeline/tests/on_failure.rs index 2662a3fa96..d7df1ad7fa 100644 --- a/src/pipeline/tests/on_failure.rs +++ b/src/pipeline/tests/on_failure.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
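The PipelineTable refactor earlier in this patch swaps downcasts through GreptimeDB vector types for arrow's AsArray helpers. A minimal sketch of that downcast pattern on standalone arrow arrays follows; the column contents are invented for illustration:

    use std::sync::Arc;

    use arrow::array::{ArrayRef, AsArray, StringArray, TimestampNanosecondArray};
    use arrow::datatypes::TimestampNanosecondType;

    /// Reads the first row of a (content, created_at) column pair.
    /// `as_string_opt` / `as_primitive_opt` return `None` instead of panicking on a
    /// type mismatch, which maps naturally onto the CastTypeSnafu handling above.
    fn read_first_row(content: &ArrayRef, created_at: &ArrayRef) -> Option<(String, i64)> {
        let content = content.as_string_opt::<i32>()?;
        let created_at = created_at.as_primitive_opt::<TimestampNanosecondType>()?;
        Some((content.value(0).to_string(), created_at.value(0)))
    }

    fn main() {
        let content: ArrayRef = Arc::new(StringArray::from(vec!["pipeline body"]));
        let created_at: ArrayRef =
            Arc::new(TimestampNanosecondArray::from(vec![1_700_000_000_000_000_000i64]));
        assert!(read_first_row(&content, &created_at).is_some());
    }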
+use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::{U8Value, U16Value}; use greptime_proto::v1::{ColumnDataType, SemanticType}; @@ -46,7 +47,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -87,7 +88,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -123,7 +124,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -175,7 +176,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index 0e6019ab47..ca94dbe3f0 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -855,7 +855,7 @@ transform: row.0.values.into_iter().for_each(|v| { if let ValueData::TimestampNanosecondValue(v) = v.value_data.unwrap() { let now = chrono::Utc::now().timestamp_nanos_opt().unwrap(); - assert!(now - v < 1_000_000); + assert!(now - v < 5_000_000); } }); } diff --git a/src/pipeline/tests/regex.rs b/src/pipeline/tests/regex.rs index a8a7daaf5c..a0a3944c8e 100644 --- a/src/pipeline/tests/regex.rs +++ b/src/pipeline/tests/regex.rs @@ -15,6 +15,7 @@ mod common; use api::v1::ColumnSchema; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; use lazy_static::lazy_static; @@ -27,7 +28,7 @@ lazy_static! { SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -156,7 +157,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/simple_extract.rs b/src/pipeline/tests/simple_extract.rs index ee2fbcbcae..2a93e5d135 100644 --- a/src/pipeline/tests/simple_extract.rs +++ b/src/pipeline/tests/simple_extract.rs @@ -16,6 +16,7 @@ mod common; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema, SemanticType}; +use common_query::prelude::greptime_timestamp; use lazy_static::lazy_static; lazy_static! { @@ -26,7 +27,7 @@ lazy_static! 
{ SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/pipeline/tests/urlencoding.rs b/src/pipeline/tests/urlencoding.rs index dd0c4ffe9f..b8366aa044 100644 --- a/src/pipeline/tests/urlencoding.rs +++ b/src/pipeline/tests/urlencoding.rs @@ -14,6 +14,7 @@ mod common; +use common_query::prelude::greptime_timestamp; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, SemanticType}; @@ -54,7 +55,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), @@ -100,7 +101,7 @@ transform: SemanticType::Field, ), common::make_column_schema( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ColumnDataType::TimestampNanosecond, SemanticType::Timestamp, ), diff --git a/src/plugins/Cargo.toml b/src/plugins/Cargo.toml index 14df62c4fa..658e1c95e3 100644 --- a/src/plugins/Cargo.toml +++ b/src/plugins/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] auth.workspace = true +catalog.workspace = true clap.workspace = true cli.workspace = true common-base.workspace = true @@ -17,6 +18,7 @@ common-meta.workspace = true datanode.workspace = true flow.workspace = true frontend.workspace = true +meta-client.workspace = true meta-srv.workspace = true serde.workspace = true snafu.workspace = true diff --git a/src/plugins/src/flownode.rs b/src/plugins/src/flownode.rs index 6b56b008da..9fbb018030 100644 --- a/src/plugins/src/flownode.rs +++ b/src/plugins/src/flownode.rs @@ -30,3 +30,20 @@ pub async fn setup_flownode_plugins( pub async fn start_flownode_plugins(_plugins: Plugins) -> Result<()> { Ok(()) } + +pub mod context { + use std::sync::Arc; + + use catalog::CatalogManagerRef; + use common_meta::FlownodeId; + use common_meta::kv_backend::KvBackendRef; + use flow::FrontendClient; + + /// The context for `GrpcBuilderConfiguratorRef` in flownode. + pub struct GrpcConfigureContext { + pub kv_backend: KvBackendRef, + pub fe_client: Arc, + pub flownode_id: FlownodeId, + pub catalog_manager: CatalogManagerRef, + } +} diff --git a/src/plugins/src/frontend.rs b/src/plugins/src/frontend.rs index 85049d8f80..0d1c1af7b9 100644 --- a/src/plugins/src/frontend.rs +++ b/src/plugins/src/frontend.rs @@ -40,3 +40,25 @@ pub async fn setup_frontend_plugins( pub async fn start_frontend_plugins(_plugins: Plugins) -> Result<()> { Ok(()) } + +pub mod context { + use std::sync::Arc; + + use flow::FrontendClient; + use meta_client::MetaClientRef; + + /// The context for [`catalog::kvbackend::CatalogManagerConfiguratorRef`] in standalone or + /// distributed. + pub enum CatalogManagerConfigureContext { + Distributed(DistributedCatalogManagerConfigureContext), + Standalone(StandaloneCatalogManagerConfigureContext), + } + + pub struct DistributedCatalogManagerConfigureContext { + pub meta_client: MetaClientRef, + } + + pub struct StandaloneCatalogManagerConfigureContext { + pub fe_client: Arc, + } +} diff --git a/src/plugins/src/lib.rs b/src/plugins/src/lib.rs index 9a979a23a1..c973cb3131 100644 --- a/src/plugins/src/lib.rs +++ b/src/plugins/src/lib.rs @@ -13,12 +13,12 @@ // limitations under the License. 
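The pipeline and PromQL test updates in this patch replace the hard-coded "greptime_timestamp" and "greptime_value" literals with the new common_query::prelude accessors. Their actual implementation lives in common_query and is not part of this diff; a minimal stand-in matching the defaults removed above could look like the following (the real functions may differ, for example by consulting configuration):

    /// Default time-index column name used when none is configured.
    pub fn greptime_timestamp() -> &'static str {
        "greptime_timestamp"
    }

    /// Default value column name used by the Prometheus integration.
    pub fn greptime_value() -> &'static str {
        "greptime_value"
    }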
mod cli; -mod datanode; -mod flownode; -mod frontend; +pub mod datanode; +pub mod flownode; +pub mod frontend; mod meta_srv; mod options; -mod standalone; +pub mod standalone; pub use cli::SubCommand; pub use datanode::{setup_datanode_plugins, start_datanode_plugins}; diff --git a/src/plugins/src/standalone.rs b/src/plugins/src/standalone.rs index 97b1c22aa7..0cb7ee60e5 100644 --- a/src/plugins/src/standalone.rs +++ b/src/plugins/src/standalone.rs @@ -33,3 +33,18 @@ pub async fn setup_standalone_plugins( pub async fn start_standalone_plugins(_plugins: Plugins) -> Result<()> { Ok(()) } + +pub mod context { + use std::sync::Arc; + + use catalog::CatalogManagerRef; + use common_meta::kv_backend::KvBackendRef; + use flow::FrontendClient; + + /// The context for [`common_meta::ddl_manager::DdlManagerConfiguratorRef`] in standalone. + pub struct DdlManagerConfigureContext { + pub kv_backend: KvBackendRef, + pub fe_client: Arc, + pub catalog_manager: CatalogManagerRef, + } +} diff --git a/src/promql/src/extension_plan/histogram_fold.rs b/src/promql/src/extension_plan/histogram_fold.rs index e80d4a7676..369754899f 100644 --- a/src/promql/src/extension_plan/histogram_fold.rs +++ b/src/promql/src/extension_plan/histogram_fold.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use std::task::Poll; use std::time::Instant; -use common_recordbatch::RecordBatch as GtRecordBatch; use common_telemetry::warn; use datafusion::arrow::array::AsArray; use datafusion::arrow::compute::{self, SortOptions, concat_batches}; @@ -41,9 +40,8 @@ use datafusion::physical_plan::{ }; use datafusion::prelude::{Column, Expr}; use datatypes::prelude::{ConcreteDataType, DataType as GtDataType}; -use datatypes::schema::Schema as GtSchema; use datatypes::value::{OrderedF64, ValueRef}; -use datatypes::vectors::MutableVector; +use datatypes::vectors::{Helper, MutableVector}; use futures::{Stream, StreamExt, ready}; /// `HistogramFold` will fold the conventional (non-native) histogram ([1]) for later @@ -560,36 +558,29 @@ impl HistogramFoldStream { let mut remaining_rows = self.input_buffered_rows; let mut cursor = 0; - let gt_schema = GtSchema::try_from(self.input.schema()).unwrap(); - let batch = GtRecordBatch::try_from_df_record_batch(Arc::new(gt_schema), batch).unwrap(); + // TODO(LFC): Try to get rid of the Arrow array to vector conversion here. 
+ let vectors = Helper::try_into_vectors(batch.columns()) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; while remaining_rows >= bucket_num { // "sample" normal columns for normal_index in &self.normal_indices { - let val = batch.column(*normal_index).get(cursor); + let val = vectors[*normal_index].get(cursor); self.output_buffer[*normal_index].push_value_ref(&val.as_value_ref()); } // "fold" `le` and field columns let le_array = batch.column(self.le_column_index); + let le_array = le_array.as_string::(); let field_array = batch.column(self.field_column_index); + let field_array = field_array.as_primitive::(); let mut bucket = vec![]; let mut counters = vec![]; for bias in 0..bucket_num { - let le_str_val = le_array.get(cursor + bias); - let le_str_val_ref = le_str_val.as_value_ref(); - let le_str = le_str_val_ref - .try_into_string() - .unwrap() - .expect("le column should not be nullable"); + let le_str = le_array.value(cursor + bias); let le = le_str.parse::().unwrap(); bucket.push(le); - let counter = field_array - .get(cursor + bias) - .as_value_ref() - .try_into_f64() - .unwrap() - .expect("field column should not be nullable"); + let counter = field_array.value(cursor + bias); counters.push(counter); } // ignore invalid data @@ -600,7 +591,7 @@ impl HistogramFoldStream { self.output_buffered_rows += 1; } - let remaining_input_batch = batch.into_df_record_batch().slice(cursor, remaining_rows); + let remaining_input_batch = batch.slice(cursor, remaining_rows); self.input_buffered_rows = remaining_input_batch.num_rows(); self.input_buffer.push(remaining_input_batch); diff --git a/src/query/Cargo.toml b/src/query/Cargo.toml index 344d7bd5fc..89a356e9a1 100644 --- a/src/query/Cargo.toml +++ b/src/query/Cargo.toml @@ -33,6 +33,7 @@ common-plugins.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true datafusion.workspace = true diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index ab27666c01..f22a209de7 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -682,13 +682,14 @@ impl QueryExecutor for DatafusionQueryEngine { mod tests { use std::sync::Arc; + use arrow::array::{ArrayRef, UInt64Array}; use catalog::RegisterTableRequest; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID}; use common_recordbatch::util; use datafusion::prelude::{col, lit}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; - use datatypes::vectors::{Helper, UInt32Vector, UInt64Vector, VectorRef}; + use datatypes::vectors::{Helper, UInt32Vector, VectorRef}; use session::context::{QueryContext, QueryContextBuilder}; use table::table::numbers::{NUMBERS_TABLE_NAME, NumbersTable}; @@ -770,10 +771,8 @@ mod tests { assert_eq!(1, batch.num_columns()); assert_eq!(batch.column(0).len(), 1); - assert_eq!( - *batch.column(0), - Arc::new(UInt64Vector::from_slice([4950])) as VectorRef - ); + let expected = Arc::new(UInt64Array::from_iter_values([4950])) as ArrayRef; + assert_eq!(batch.column(0), &expected); } _ => unreachable!(), } diff --git a/src/query/src/dist_plan/analyzer.rs b/src/query/src/dist_plan/analyzer.rs index 34e035644b..0bdf4dd70a 100644 --- a/src/query/src/dist_plan/analyzer.rs +++ b/src/query/src/dist_plan/analyzer.rs @@ -30,6 +30,7 @@ use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder, Subquery, col as co use 
datafusion_optimizer::analyzer::AnalyzerRule; use datafusion_optimizer::simplify_expressions::SimplifyExpressions; use datafusion_optimizer::{OptimizerConfig, OptimizerRule}; +use promql::extension_plan::SeriesDivide; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; use table::metadata::TableType; use table::table::adapter::DfTableProviderAdapter; @@ -380,7 +381,23 @@ impl PlanRewriter { } match Categorizer::check_plan(plan, self.partition_cols.clone())? { - Commutativity::Commutative => {} + Commutativity::Commutative => { + // PATCH: we should reconsider SORT's commutativity instead of doing this trick. + // explain: for a fully commutative SeriesDivide, its child Sort plan only serves it. I.e., that + // Sort plan is also fully commutative, instead of conditional commutative. So we can remove + // the generated MergeSort from stage safely. + if let LogicalPlan::Extension(ext_a) = plan + && ext_a.node.name() == SeriesDivide::name() + && let Some(LogicalPlan::Extension(ext_b)) = self.stage.last() + && ext_b.node.name() == MergeSortLogicalPlan::name() + { + // revert last `ConditionalCommutative` result for Sort plan in this case. + // `update_column_requirements` left unchanged because Sort won't generate + // new columns or remove existing columns. + self.stage.pop(); + self.expand_on_next_part_cond_trans_commutative = false; + } + } Commutativity::PartialCommutative => { if let Some(plan) = partial_commutative_transformer(plan) { // notice this plan is parent of current node, so `self.level - 1` when updating column requirements diff --git a/src/query/src/dist_plan/merge_scan.rs b/src/query/src/dist_plan/merge_scan.rs index aebf9a457d..a4dd5243a7 100644 --- a/src/query/src/dist_plan/merge_scan.rs +++ b/src/query/src/dist_plan/merge_scan.rs @@ -20,40 +20,35 @@ use ahash::{HashMap, HashSet}; use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SortOptions}; use async_stream::stream; use common_catalog::parse_catalog_and_schema_from_db_string; -use common_error::ext::BoxedError; use common_plugins::GREPTIME_EXEC_READ_COST; use common_query::request::QueryRequest; -use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchMetrics}; -use common_recordbatch::error::ExternalSnafu; -use common_recordbatch::{ - DfSendableRecordBatchStream, RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream, -}; +use common_recordbatch::adapter::RecordBatchMetrics; use common_telemetry::tracing_context::TracingContext; +use datafusion::arrow::record_batch::RecordBatch; use datafusion::execution::{SessionState, TaskContext}; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::metrics::{ Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, MetricsSet, Time, }; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, }; -use datafusion_common::{Column as ColumnExpr, Result}; +use datafusion_common::{Column as ColumnExpr, DataFusionError, Result}; use datafusion_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{Distribution, EquivalenceProperties, PhysicalSortExpr}; -use datatypes::schema::{Schema, SchemaRef}; use futures_util::StreamExt; use greptime_proto::v1::region::RegionRequestHeader; use meter_core::data::ReadItem; use meter_macros::read_meter; use 
session::context::QueryContextRef; -use snafu::ResultExt; use store_api::storage::RegionId; use table::table_name::TableName; use tokio::time::Instant; use crate::dist_plan::analyzer::AliasMapping; -use crate::error::ConvertSchemaSnafu; use crate::metrics::{MERGE_SCAN_ERRORS_TOTAL, MERGE_SCAN_POLL_ELAPSED, MERGE_SCAN_REGIONS}; use crate::region_query::RegionQueryHandlerRef; @@ -140,7 +135,6 @@ pub struct MergeScanExec { table: TableName, regions: Vec, plan: LogicalPlan, - schema: SchemaRef, arrow_schema: ArrowSchemaRef, region_query_handler: RegionQueryHandlerRef, metric: ExecutionPlanMetricsSet, @@ -159,7 +153,6 @@ impl std::fmt::Debug for MergeScanExec { f.debug_struct("MergeScanExec") .field("table", &self.table) .field("regions", &self.regions) - .field("schema", &self.schema) .field("plan", &self.plan) .finish() } @@ -238,12 +231,10 @@ impl MergeScanExec { EmissionType::Incremental, Boundedness::Bounded, ); - let schema = Self::arrow_schema_to_schema(arrow_schema.clone())?; Ok(Self { table, regions, plan, - schema, arrow_schema, region_query_handler, metric: ExecutionPlanMetricsSet::new(), @@ -265,7 +256,7 @@ impl MergeScanExec { let regions = self.regions.clone(); let region_query_handler = self.region_query_handler.clone(); let metric = MergeScanMetric::new(&self.metric); - let schema = self.schema.clone(); + let arrow_schema = self.arrow_schema.clone(); let query_ctx = self.query_ctx.clone(); let sub_stage_metrics_moved = self.sub_stage_metrics.clone(); let partition_metrics_moved = self.partition_metrics.clone(); @@ -318,9 +309,8 @@ impl MergeScanExec { .await .map_err(|e| { MERGE_SCAN_ERRORS_TOTAL.inc(); - BoxedError::new(e) - }) - .context(ExternalSnafu)?; + DataFusionError::External(Box::new(e)) + })?; let do_get_cost = do_get_start.elapsed(); ready_timer.stop(); @@ -331,10 +321,11 @@ impl MergeScanExec { let poll_elapsed = poll_timer.elapsed(); poll_duration += poll_elapsed; - let batch = batch?; - // reconstruct batch using `self.schema` - // to remove metadata and correct column name - let batch = RecordBatch::new(schema.clone(), batch.columns().iter().cloned())?; + let batch = batch.map_err(|e| DataFusionError::External(Box::new(e)))?; + let batch = RecordBatch::try_new( + arrow_schema.clone(), + batch.into_df_record_batch().columns().to_vec(), + )?; metric.record_output_batch_rows(batch.num_rows()); if let Some(mut first_consume_timer) = first_consume_timer.take() { first_consume_timer.stop(); @@ -410,12 +401,10 @@ impl MergeScanExec { } })); - Ok(Box::pin(RecordBatchStreamWrapper { - schema: self.schema.clone(), + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.arrow_schema.clone(), stream, - output_ordering: None, - metrics: Default::default(), - })) + ))) } pub fn try_with_new_distribution(&self, distribution: Distribution) -> Option { @@ -453,7 +442,6 @@ impl MergeScanExec { table: self.table.clone(), regions: self.regions.clone(), plan: self.plan.clone(), - schema: self.schema.clone(), arrow_schema: self.arrow_schema.clone(), region_query_handler: self.region_query_handler.clone(), metric: self.metric.clone(), @@ -471,11 +459,6 @@ impl MergeScanExec { }) } - fn arrow_schema_to_schema(arrow_schema: ArrowSchemaRef) -> Result { - let schema = Schema::try_from(arrow_schema).context(ConvertSchemaSnafu)?; - Ok(Arc::new(schema)) - } - pub fn sub_stage_metrics(&self) -> Vec { self.sub_stage_metrics .lock() @@ -614,10 +597,8 @@ impl ExecutionPlan for MergeScanExec { &self, partition: usize, context: Arc, - ) -> Result { - Ok(Box::pin(DfRecordBatchStreamAdapter::new( - 
self.to_stream(context, partition)?, - ))) + ) -> Result { + self.to_stream(context, partition) } fn metrics(&self) -> Option { diff --git a/src/query/src/dummy_catalog.rs b/src/query/src/dummy_catalog.rs index 798ae52549..907b5e8c99 100644 --- a/src/query/src/dummy_catalog.rs +++ b/src/query/src/dummy_catalog.rs @@ -185,7 +185,8 @@ impl TableProvider for DummyTableProvider { .handle_query(self.region_id, request.clone()) .await .map_err(|e| DataFusionError::External(Box::new(e)))?; - let mut scan_exec = RegionScanExec::new(scanner, request)?; + let query_memory_permit = self.engine.register_query_memory_permit(); + let mut scan_exec = RegionScanExec::new(scanner, request, query_memory_permit)?; if let Some(query_ctx) = &self.query_ctx { scan_exec.set_explain_verbose(query_ctx.explain_verbose()); } diff --git a/src/query/src/error.rs b/src/query/src/error.rs index 8cf64dbffc..4649b7fe49 100644 --- a/src/query/src/error.rs +++ b/src/query/src/error.rs @@ -353,6 +353,13 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(transparent)] + Datatypes { + source: datatypes::error::Error, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { @@ -406,9 +413,10 @@ impl ErrorExt for Error { MissingTableMutationHandler { .. } => StatusCode::Unexpected, GetRegionMetadata { .. } => StatusCode::RegionNotReady, TableReadOnly { .. } => StatusCode::Unsupported, - GetFulltextOptions { source, .. } | GetSkippingIndexOptions { source, .. } => { - source.status_code() - } + + GetFulltextOptions { source, .. } + | GetSkippingIndexOptions { source, .. } + | Datatypes { source, .. } => source.status_code(), } } diff --git a/src/query/src/metrics.rs b/src/query/src/metrics.rs index 290f368a8f..e0d02e9a3d 100644 --- a/src/query/src/metrics.rs +++ b/src/query/src/metrics.rs @@ -62,6 +62,18 @@ lazy_static! { "query push down fallback errors total" ) .unwrap(); + + pub static ref QUERY_MEMORY_POOL_USAGE_BYTES: IntGauge = register_int_gauge!( + "greptime_query_memory_pool_usage_bytes", + "current query memory pool usage in bytes" + ) + .unwrap(); + + pub static ref QUERY_MEMORY_POOL_REJECTED_TOTAL: IntCounter = register_int_counter!( + "greptime_query_memory_pool_rejected_total", + "total number of query memory allocations rejected" + ) + .unwrap(); } /// A stream to call the callback once a RecordBatch stream is done. 
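With the MergeScanExec changes above, the merged stream is handed back as a plain DataFusion SendableRecordBatchStream by wrapping an arrow RecordBatch stream in RecordBatchStreamAdapter, rather than going through GreptimeDB's Schema/RecordBatch wrappers. A minimal sketch of that wrapping pattern, with a single hard-coded batch standing in for the region responses:

    use std::sync::Arc;

    use datafusion::arrow::array::Int64Array;
    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::arrow::record_batch::RecordBatch;
    use datafusion::error::Result;
    use datafusion::physical_plan::SendableRecordBatchStream;
    use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
    use futures::stream;

    fn merge_scan_like_stream() -> Result<SendableRecordBatchStream> {
        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![Arc::new(Int64Array::from(vec![1, 2, 3]))],
        )?;
        // Upstream failures would surface as DataFusionError (e.g. via
        // DataFusionError::External), mirroring the error mapping above.
        let batches: Vec<Result<RecordBatch>> = vec![Ok(batch)];
        Ok(Box::pin(RecordBatchStreamAdapter::new(
            schema,
            stream::iter(batches),
        )))
    }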
diff --git a/src/query/src/optimizer/parallelize_scan.rs b/src/query/src/optimizer/parallelize_scan.rs index c6baecc4b6..b346fc06ef 100644 --- a/src/query/src/optimizer/parallelize_scan.rs +++ b/src/query/src/optimizer/parallelize_scan.rs @@ -62,7 +62,9 @@ impl ParallelizeScan { plan.as_any().downcast_ref::() { let expected_partition_num = config.execution.target_partitions; - if region_scan_exec.is_partition_set() { + if region_scan_exec.is_partition_set() + || region_scan_exec.scanner_type().as_str() == "SinglePartition" + { return Ok(Transformed::no(plan)); } diff --git a/src/query/src/optimizer/test_util.rs b/src/query/src/optimizer/test_util.rs index cc5712b8a5..5c24915bde 100644 --- a/src/query/src/optimizer/test_util.rs +++ b/src/query/src/optimizer/test_util.rs @@ -29,7 +29,8 @@ use store_api::metadata::{ }; use store_api::region_engine::{ RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, - SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, + RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, + SettableRegionRoleState, SyncManifestResponse, }; use store_api::region_request::RegionRequest; use store_api::storage::{ConcreteDataType, RegionId, ScanRequest, SequenceNumber}; @@ -117,6 +118,13 @@ impl RegionEngine for MetaRegionEngine { unimplemented!() } + async fn remap_manifests( + &self, + _request: RemapManifestsRequest, + ) -> Result { + unimplemented!() + } + fn role(&self, _region_id: RegionId) -> Option { None } diff --git a/src/query/src/options.rs b/src/query/src/options.rs index 25e1a0a2a0..50ca1177a5 100644 --- a/src/query/src/options.rs +++ b/src/query/src/options.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_base::memory_limit::MemoryLimit; use serde::{Deserialize, Serialize}; /// Query engine config @@ -22,6 +23,10 @@ pub struct QueryOptions { pub parallelism: usize, /// Whether to allow query fallback when push down fails. pub allow_query_fallback: bool, + /// Memory pool size for query execution. Setting it to 0 disables the limit (unbounded). + /// Supports absolute size (e.g., "2GB") or percentage (e.g., "50%"). + /// When this limit is reached, queries will fail with ResourceExhausted error. 
+ pub memory_pool_size: MemoryLimit, } #[allow(clippy::derivable_impls)] @@ -30,6 +35,7 @@ impl Default for QueryOptions { Self { parallelism: 0, allow_query_fallback: false, + memory_pool_size: MemoryLimit::default(), } } } diff --git a/src/query/src/part_sort.rs b/src/query/src/part_sort.rs index 64ba76a149..ebf4fddc1e 100644 --- a/src/query/src/part_sort.rs +++ b/src/query/src/part_sort.rs @@ -30,14 +30,18 @@ use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream}; use datafusion::common::arrow::compute::sort_to_indices; use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion::execution::{RecordBatchStream, TaskContext}; +use datafusion::physical_plan::execution_plan::CardinalityEffect; +use datafusion::physical_plan::filter_pushdown::{ + ChildFilterDescription, FilterDescription, FilterPushdownPhase, +}; use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, TopK, TopKDynamicFilters, }; use datafusion_common::{DataFusionError, internal_err}; -use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; +use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use futures::{Stream, StreamExt}; use itertools::Itertools; use parking_lot::RwLock; @@ -61,6 +65,10 @@ pub struct PartSortExec { metrics: ExecutionPlanMetricsSet, partition_ranges: Vec>, properties: PlanProperties, + /// Filter matching the state of the sort for dynamic filter pushdown. + /// If `limit` is `Some`, this will also be set and a TopK operator may be used. + /// If `limit` is `None`, this will be `None`. + filter: Option>>, } impl PartSortExec { @@ -79,6 +87,10 @@ impl PartSortExec { properties.boundedness, ); + let filter = limit + .is_some() + .then(|| Self::create_filter(expression.expr.clone())); + Self { expression, limit, @@ -86,9 +98,17 @@ impl PartSortExec { metrics, partition_ranges, properties, + filter, } } + /// Add or reset `self.filter` to a new `TopKDynamicFilters`. + fn create_filter(expr: Arc) -> Arc> { + Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new( + DynamicFilterPhysicalExpr::new(vec![expr], lit(true)), + )))) + } + pub fn to_stream( &self, context: Arc, @@ -113,6 +133,7 @@ impl PartSortExec { input_stream, self.partition_ranges[partition].clone(), partition, + self.filter.clone(), )?) 
as _; Ok(df_stream) @@ -192,6 +213,51 @@ impl ExecutionPlan for PartSortExec { fn benefits_from_input_partitioning(&self) -> Vec { vec![false] } + + fn cardinality_effect(&self) -> CardinalityEffect { + if self.limit.is_none() { + CardinalityEffect::Equal + } else { + CardinalityEffect::LowerEqual + } + } + + fn gather_filters_for_pushdown( + &self, + phase: FilterPushdownPhase, + parent_filters: Vec>, + _config: &datafusion::config::ConfigOptions, + ) -> datafusion_common::Result { + if !matches!(phase, FilterPushdownPhase::Post) { + return FilterDescription::from_children(parent_filters, &self.children()); + } + + let mut child = ChildFilterDescription::from_child(&parent_filters, &self.input)?; + + if let Some(filter) = &self.filter { + child = child.with_self_filter(filter.read().expr()); + } + + Ok(FilterDescription::new().with_child(child)) + } + + fn reset_state(self: Arc) -> datafusion_common::Result> { + // shared dynamic filter needs to be reset + let new_filter = self + .limit + .is_some() + .then(|| Self::create_filter(self.expression.expr.clone())); + + Ok(Arc::new(Self { + expression: self.expression.clone(), + limit: self.limit, + input: self.input.clone(), + metrics: self.metrics.clone(), + partition_ranges: self.partition_ranges.clone(), + properties: self.properties.clone(), + filter: new_filter, + })) + } } enum PartSortBuffer { @@ -240,11 +306,16 @@ impl PartSortStream { input: DfSendableRecordBatchStream, partition_ranges: Vec, partition: usize, + filter: Option>>, ) -> datafusion_common::Result { let buffer = if let Some(limit) = limit { - let filter = Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new( - DynamicFilterPhysicalExpr::new(vec![], lit(true)), - )))); + let Some(filter) = filter else { + return internal_err!( + "TopKDynamicFilters must be provided when limit is set at {}", + snafu::location!() + ); + }; + PartSortBuffer::Top( TopK::try_new( partition, diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index a1dc1b640a..5cc26cee05 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -22,7 +22,7 @@ use catalog::table_source::DfTableSourceProvider; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_function::function::FunctionContext; -use common_query::prelude::GREPTIME_VALUE; +use common_query::prelude::greptime_value; use datafusion::common::DFSchemaRef; use datafusion::datasource::DefaultTableSource; use datafusion::functions_aggregate::average::avg_udaf; @@ -2576,7 +2576,7 @@ impl PromPlanner { self.ctx.time_index_column = Some(SPECIAL_TIME_FUNCTION.to_string()); self.ctx.reset_table_name_and_schema(); self.ctx.tag_columns = vec![]; - self.ctx.field_columns = vec![GREPTIME_VALUE.to_string()]; + self.ctx.field_columns = vec![greptime_value().to_string()]; Ok(LogicalPlan::Extension(Extension { node: Arc::new( EmptyMetric::new( @@ -2584,7 +2584,7 @@ impl PromPlanner { self.ctx.end, self.ctx.interval, SPECIAL_TIME_FUNCTION.to_string(), - GREPTIME_VALUE.to_string(), + greptime_value().to_string(), Some(lit), ) .context(DataFusionPlanningSnafu)?, @@ -3433,6 +3433,7 @@ mod test { use catalog::memory::{MemoryCatalogManager, new_memory_catalog_manager}; use common_base::Plugins; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; + use common_query::prelude::greptime_timestamp; use common_query::test_util::DummyDecoder; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; @@ -3543,14 +3544,14 @@ mod test { } 
columns.push( ColumnSchema::new( - "greptime_timestamp".to_string(), + greptime_timestamp().to_string(), ConcreteDataType::timestamp_millisecond_datatype(), false, ) .with_time_index(true), ); columns.push(ColumnSchema::new( - "greptime_value".to_string(), + greptime_value().to_string(), ConcreteDataType::float64_datatype(), true, )); diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index 7b53f385e8..9328f5f736 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; use std::fmt; +use std::num::NonZeroUsize; use std::sync::{Arc, RwLock}; use async_trait::async_trait; @@ -25,13 +26,18 @@ use common_function::handlers::{ FlowServiceHandlerRef, ProcedureServiceHandlerRef, TableMutationHandlerRef, }; use common_function::state::FunctionState; +use common_stat::get_total_memory_bytes; use common_telemetry::warn; use datafusion::catalog::TableFunction; use datafusion::dataframe::DataFrame; use datafusion::error::Result as DfResult; use datafusion::execution::SessionStateBuilder; use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState}; -use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion::execution::memory_pool::{ + GreedyMemoryPool, MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation, + TrackConsumersPool, +}; +use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; use datafusion::physical_optimizer::PhysicalOptimizerRule; use datafusion::physical_optimizer::optimizer::PhysicalOptimizer; use datafusion::physical_optimizer::sanity_checker::SanityCheckPlan; @@ -49,6 +55,7 @@ use crate::QueryEngineContext; use crate::dist_plan::{ DistExtensionPlanner, DistPlannerAnalyzer, DistPlannerOptions, MergeSortExtensionPlanner, }; +use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES}; use crate::optimizer::ExtensionAnalyzerRule; use crate::optimizer::constant_term::MatchesConstantTermOptimizer; use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule; @@ -100,7 +107,18 @@ impl QueryEngineState { plugins: Plugins, options: QueryOptionsNew, ) -> Self { - let runtime_env = Arc::new(RuntimeEnv::default()); + let total_memory = get_total_memory_bytes().max(0) as u64; + let memory_pool_size = options.memory_pool_size.resolve(total_memory) as usize; + let runtime_env = if memory_pool_size > 0 { + Arc::new( + RuntimeEnvBuilder::new() + .with_memory_pool(Arc::new(MetricsMemoryPool::new(memory_pool_size))) + .build() + .expect("Failed to build RuntimeEnv"), + ) + } else { + Arc::new(RuntimeEnv::default()) + }; let mut session_config = SessionConfig::new().with_create_default_catalog_and_schema(false); if options.parallelism > 0 { session_config = session_config.with_target_partitions(options.parallelism); @@ -420,3 +438,72 @@ impl DfQueryPlanner { } } } + +/// A wrapper around TrackConsumersPool that records metrics. +/// +/// This wrapper intercepts all memory pool operations and updates +/// Prometheus metrics for monitoring query memory usage and rejections. 
+#[derive(Debug)] +struct MetricsMemoryPool { + inner: Arc>, +} + +impl MetricsMemoryPool { + // Number of top memory consumers to report in OOM error messages + const TOP_CONSUMERS_TO_REPORT: usize = 5; + + fn new(limit: usize) -> Self { + Self { + inner: Arc::new(TrackConsumersPool::new( + GreedyMemoryPool::new(limit), + NonZeroUsize::new(Self::TOP_CONSUMERS_TO_REPORT).unwrap(), + )), + } + } + + #[inline] + fn update_metrics(&self) { + QUERY_MEMORY_POOL_USAGE_BYTES.set(self.inner.reserved() as i64); + } +} + +impl MemoryPool for MetricsMemoryPool { + fn register(&self, consumer: &MemoryConsumer) { + self.inner.register(consumer); + } + + fn unregister(&self, consumer: &MemoryConsumer) { + self.inner.unregister(consumer); + } + + fn grow(&self, reservation: &MemoryReservation, additional: usize) { + self.inner.grow(reservation, additional); + self.update_metrics(); + } + + fn shrink(&self, reservation: &MemoryReservation, shrink: usize) { + self.inner.shrink(reservation, shrink); + self.update_metrics(); + } + + fn try_grow( + &self, + reservation: &MemoryReservation, + additional: usize, + ) -> datafusion_common::Result<()> { + let result = self.inner.try_grow(reservation, additional); + if result.is_err() { + QUERY_MEMORY_POOL_REJECTED_TOTAL.inc(); + } + self.update_metrics(); + result + } + + fn reserved(&self) -> usize { + self.inner.reserved() + } + + fn memory_limit(&self) -> MemoryLimit { + self.inner.memory_limit() + } +} diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs index 6b6ee2ed07..39ef00e1fb 100644 --- a/src/query/src/sql.rs +++ b/src/query/src/sql.rs @@ -15,6 +15,7 @@ mod show_create_table; use std::collections::HashMap; +use std::ops::ControlFlow; use std::sync::Arc; use catalog::CatalogManagerRef; @@ -34,7 +35,7 @@ use common_datasource::util::find_dir_and_filename; use common_meta::SchemaOptions; use common_meta::key::flow::flow_info::FlowInfoValue; use common_query::Output; -use common_query::prelude::GREPTIME_TIMESTAMP; +use common_query::prelude::greptime_timestamp; use common_recordbatch::RecordBatches; use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_time::Timestamp; @@ -52,7 +53,7 @@ use regex::Regex; use session::context::{Channel, QueryContextRef}; pub use show_create_table::create_table_stmt; use snafu::{OptionExt, ResultExt, ensure}; -use sql::ast::Ident; +use sql::ast::{Ident, visit_expressions_mut}; use sql::parser::ParserContext; use sql::statements::OptionMap; use sql::statements::create::{CreateDatabase, CreateFlow, CreateView, Partitions, SqlOrTql}; @@ -73,7 +74,6 @@ use crate::planner::DfLogicalPlanner; const SCHEMAS_COLUMN: &str = "Database"; const OPTIONS_COLUMN: &str = "Options"; -const TABLES_COLUMN: &str = "Tables"; const VIEWS_COLUMN: &str = "Views"; const FLOWS_COLUMN: &str = "Flows"; const FIELD_COLUMN: &str = "Field"; @@ -210,6 +210,29 @@ pub async fn show_databases( .await } +/// Replaces column identifier references in a SQL expression. +/// Used for backward compatibility where old column names should work with new ones. 
+fn replace_column_in_expr(expr: &mut sqlparser::ast::Expr, from_column: &str, to_column: &str) { + let _ = visit_expressions_mut(expr, |e| { + match e { + sqlparser::ast::Expr::Identifier(ident) => { + if ident.value.eq_ignore_ascii_case(from_column) { + ident.value = to_column.to_string(); + } + } + sqlparser::ast::Expr::CompoundIdentifier(idents) => { + if let Some(last) = idents.last_mut() + && last.value.eq_ignore_ascii_case(from_column) + { + last.value = to_column.to_string(); + } + } + _ => {} + } + ControlFlow::<()>::Continue(()) + }); +} + /// Cast a `show` statement execution into a query from tables in `information_schema`. /// - `table_name`: the table name in `information_schema`, /// - `projects`: query projection, a list of `(column, renamed_column)`, @@ -540,15 +563,15 @@ pub async fn show_tables( query_ctx.current_schema() }; - // (dennis): MySQL rename `table_name` to `Tables_in_{schema}`, but we use `Tables` instead. - // I don't want to modify this currently, our dashboard may depend on it. + // MySQL renames `table_name` to `Tables_in_{schema}` for protocol compatibility + let tables_column = format!("Tables_in_{}", schema_name); let projects = if stmt.full { vec![ - (tables::TABLE_NAME, TABLES_COLUMN), + (tables::TABLE_NAME, tables_column.as_str()), (tables::TABLE_TYPE, TABLE_TYPE_COLUMN), ] } else { - vec![(tables::TABLE_NAME, TABLES_COLUMN)] + vec![(tables::TABLE_NAME, tables_column.as_str())] }; let filters = vec![ col(tables::TABLE_SCHEMA).eq(lit(schema_name.clone())), @@ -557,6 +580,16 @@ pub async fn show_tables( let like_field = Some(tables::TABLE_NAME); let sort = vec![col(tables::TABLE_NAME).sort(true, true)]; + // Transform the WHERE clause for backward compatibility: + // Replace "Tables" with "Tables_in_{schema}" to support old queries + let kind = match stmt.kind { + ShowKind::Where(mut filter) => { + replace_column_in_expr(&mut filter, "Tables", &tables_column); + ShowKind::Where(filter) + } + other => other, + }; + query_from_information_schema_table( query_engine, catalog_manager, @@ -567,7 +600,7 @@ pub async fn show_tables( filters, like_field, sort, - stmt.kind, + kind, ) .await } @@ -1195,14 +1228,14 @@ pub fn file_column_schemas_to_table( let timestamp_type = ConcreteDataType::timestamp_millisecond_datatype(); let default_zero = Value::Timestamp(Timestamp::new_millisecond(0)); - let timestamp_column_schema = ColumnSchema::new(GREPTIME_TIMESTAMP, timestamp_type, false) + let timestamp_column_schema = ColumnSchema::new(greptime_timestamp(), timestamp_type, false) .with_time_index(true) .with_default_constraint(Some(ColumnDefaultConstraint::Value(default_zero))) .unwrap(); if let Some(column_schema) = column_schemas .iter_mut() - .find(|column_schema| column_schema.name == GREPTIME_TIMESTAMP) + .find(|column_schema| column_schema.name == greptime_timestamp()) { // Replace the column schema with the default one *column_schema = timestamp_column_schema; @@ -1210,7 +1243,7 @@ pub fn file_column_schemas_to_table( column_schemas.push(timestamp_column_schema); } - (column_schemas, GREPTIME_TIMESTAMP.to_string()) + (column_schemas, greptime_timestamp().to_string()) } /// This function checks if the column schemas from a file can be matched with @@ -1440,8 +1473,7 @@ mod test { .. 
}) => { let record = record.take().first().cloned().unwrap(); - let data = record.column(0); - Ok(data.get(0).to_string()) + Ok(record.iter_column_as_string(0).next().unwrap().unwrap()) } Ok(_) => unreachable!(), Err(e) => Err(e), diff --git a/src/query/src/sql/show_create_table.rs b/src/query/src/sql/show_create_table.rs index 5466bb91e6..2c566af508 100644 --- a/src/query/src/sql/show_create_table.rs +++ b/src/query/src/sql/show_create_table.rs @@ -16,7 +16,9 @@ use std::collections::HashMap; +use arrow_schema::extension::ExtensionType; use common_meta::SchemaOptions; +use datatypes::extension::json::JsonExtensionType; use datatypes::schema::{ COLUMN_FULLTEXT_OPT_KEY_ANALYZER, COLUMN_FULLTEXT_OPT_KEY_BACKEND, COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, COLUMN_FULLTEXT_OPT_KEY_FALSE_POSITIVE_RATE, @@ -159,6 +161,15 @@ fn create_column(column_schema: &ColumnSchema, quote_style: char) -> Result()? { + let settings = json_extension + .metadata() + .json_structure_settings + .clone() + .unwrap_or_default(); + extensions.set_json_structure_settings(settings); + } + Ok(Column { column_def: ColumnDef { name: Ident::with_quote(quote_style, name), diff --git a/src/query/src/tests.rs b/src/query/src/tests.rs index c70381d32f..4b12464b73 100644 --- a/src/query/src/tests.rs +++ b/src/query/src/tests.rs @@ -26,6 +26,7 @@ mod query_engine_test; mod time_range_filter_test; mod function; +mod vec_avg_test; mod vec_product_test; mod vec_sum_test; diff --git a/src/query/src/tests/function.rs b/src/query/src/tests/function.rs index b383daf521..9f6ce0137e 100644 --- a/src/query/src/tests/function.rs +++ b/src/query/src/tests/function.rs @@ -18,7 +18,7 @@ use common_function::scalars::vector::impl_conv::veclit_to_binlit; use common_recordbatch::RecordBatch; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::vectors::BinaryVector; +use datatypes::vectors::{BinaryVector, Helper}; use rand::Rng; use table::test_util::MemTable; @@ -64,5 +64,6 @@ pub fn get_value_from_batches(column_name: &str, batches: Vec) -> V assert_eq!(batch.column(0).len(), 1); let v = batch.column(0); assert_eq!(1, v.len()); + let v = Helper::try_into_vector(v).unwrap(); v.get(0) } diff --git a/src/query/src/tests/query_engine_test.rs b/src/query/src/tests/query_engine_test.rs index 797d2cf26a..a96abc36d7 100644 --- a/src/query/src/tests/query_engine_test.rs +++ b/src/query/src/tests/query_engine_test.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use arrow::array::{ArrayRef, UInt32Array}; use catalog::RegisterTableRequest; use catalog::memory::MemoryCatalogManager; use common_base::Plugins; @@ -97,11 +98,10 @@ async fn test_datafusion_query_engine() -> Result<()> { let batch = &numbers[0]; assert_eq!(1, batch.num_columns()); assert_eq!(batch.column(0).len(), limit); - let expected: Vec = (0u32..limit as u32).collect(); - assert_eq!( - *batch.column(0), - Arc::new(UInt32Vector::from_slice(expected)) as VectorRef - ); + let expected = Arc::new(UInt32Array::from_iter_values( + (0u32..limit as u32).collect::>(), + )) as ArrayRef; + assert_eq!(batch.column(0), &expected); Ok(()) } diff --git a/src/query/src/tests/vec_avg_test.rs b/src/query/src/tests/vec_avg_test.rs new file mode 100644 index 0000000000..672cbeaa27 --- /dev/null +++ b/src/query/src/tests/vec_avg_test.rs @@ -0,0 +1,60 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::ops::AddAssign; + +use common_function::scalars::vector::impl_conv::{as_veclit, veclit_to_binlit}; +use datafusion_common::ScalarValue; +use datatypes::prelude::Value; +use nalgebra::{Const, DVectorView, Dyn, OVector}; + +use crate::tests::{exec_selection, function}; + +#[tokio::test] +async fn test_vec_avg_aggregator() -> Result<(), common_query::error::Error> { + common_telemetry::init_default_ut_logging(); + let engine = function::create_query_engine_for_vector10x3(); + let sql = "select VEC_AVG(vector) as vec_avg from vectors"; + let result = exec_selection(engine.clone(), sql).await; + let value = function::get_value_from_batches("vec_avg", result); + + let mut expected_value = None; + + let sql = "SELECT vector FROM vectors"; + let vectors = exec_selection(engine, sql).await; + + let column = vectors[0].column(0); + let len = column.len(); + for i in 0..column.len() { + let v = ScalarValue::try_from_array(&column, i)?; + let vector = as_veclit(&v)?; + let Some(vector) = vector else { + expected_value = None; + break; + }; + expected_value + .get_or_insert_with(|| OVector::zeros_generic(Dyn(3), Const::<1>)) + .add_assign(&DVectorView::from_slice(&vector, vector.len())); + } + let expected_value = match expected_value.map(|mut v| { + v /= len as f32; + veclit_to_binlit(v.as_slice()) + }) { + None => Value::Null, + Some(bytes) => Value::from(bytes), + }; + assert_eq!(value, expected_value); + + Ok(()) +} diff --git a/src/query/src/tests/vec_product_test.rs b/src/query/src/tests/vec_product_test.rs index 53eb0d3272..26c275a5cc 100644 --- a/src/query/src/tests/vec_product_test.rs +++ b/src/query/src/tests/vec_product_test.rs @@ -32,7 +32,7 @@ async fn test_vec_product_aggregator() -> Result<(), common_query::error::Error> let sql = "SELECT vector FROM vectors"; let vectors = exec_selection(engine, sql).await; - let column = vectors[0].column(0).to_arrow_array(); + let column = vectors[0].column(0); for i in 0..column.len() { let v = ScalarValue::try_from_array(&column, i)?; let vector = as_veclit(&v)?; diff --git a/src/query/src/tests/vec_sum_test.rs b/src/query/src/tests/vec_sum_test.rs index 2c488c3c53..389bb0724d 100644 --- a/src/query/src/tests/vec_sum_test.rs +++ b/src/query/src/tests/vec_sum_test.rs @@ -34,7 +34,7 @@ async fn test_vec_sum_aggregator() -> Result<(), common_query::error::Error> { let sql = "SELECT vector FROM vectors"; let vectors = exec_selection(engine, sql).await; - let column = vectors[0].column(0).to_arrow_array(); + let column = vectors[0].column(0); for i in 0..column.len() { let v = ScalarValue::try_from_array(&column, i)?; let vector = as_veclit(&v)?; diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index ee7d4fbdd4..b5e6371785 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -81,17 +81,13 @@ notify.workspace = true object-pool = "0.5" once_cell.workspace = true openmetrics-parser = "0.4" -simd-json.workspace = true -socket2 = "0.5" -# use crates.io version once the following PRs is merged into the nextest release -# 1. 
fix: Use After Free in PacketReader in https://github.com/databendlabs/opensrv/pull/67 -# 2. Use ring, instead of aws-lc-rs in https://github.com/databendlabs/opensrv/pull/72 -opensrv-mysql = { git = "https://github.com/datafuselabs/opensrv", rev = "a1fb4da215c8693c7e4f62be249a01b7fec52997" } +opensrv-mysql = { git = "https://github.com/datafuselabs/opensrv", tag = "v0.10.0" } opentelemetry-proto.workspace = true operator.workspace = true otel-arrow-rust.workspace = true parking_lot.workspace = true -pgwire = { version = "0.34", default-features = false, features = [ +pg_interval = "0.4" +pgwire = { version = "0.36.1", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } @@ -113,8 +109,10 @@ rustls-pki-types = "1.0" serde.workspace = true serde_json.workspace = true session.workspace = true +simd-json.workspace = true snafu.workspace = true snap = "1" +socket2 = "0.5" sql.workspace = true store-api.workspace = true strum.workspace = true @@ -128,6 +126,7 @@ tonic-reflection = "0.13" tower = { workspace = true, features = ["full"] } tower-http = { version = "0.6", features = ["full"] } tracing.workspace = true +tracing-opentelemetry.workspace = true urlencoding = "2.1" uuid.workspace = true vrl.workspace = true diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index daf2abc7c2..bf2a1dbea8 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.11.6 +v0.11.8 diff --git a/src/servers/src/configurator.rs b/src/servers/src/configurator.rs index 2b2e47d1e7..e8ba8264bd 100644 --- a/src/servers/src/configurator.rs +++ b/src/servers/src/configurator.rs @@ -15,16 +15,45 @@ use std::sync::Arc; use axum::Router as HttpRouter; +use common_error::ext::BoxedError; use tonic::transport::server::Router as GrpcRouter; -pub trait Configurator: Send + Sync { - fn config_http(&self, route: HttpRouter) -> HttpRouter { - route - } +use crate::grpc::builder::GrpcServerBuilder; - fn config_grpc(&self, route: GrpcRouter) -> GrpcRouter { - route - } +/// A configurator that customizes or enhances an HTTP router. +#[async_trait::async_trait] +pub trait HttpConfigurator: Send + Sync { + /// Configures the given HTTP router using the provided context. + async fn configure_http( + &self, + route: HttpRouter, + ctx: C, + ) -> std::result::Result; } -pub type ConfiguratorRef = Arc; +pub type HttpConfiguratorRef = Arc>; + +/// A configurator that customizes or enhances a gRPC router. +#[async_trait::async_trait] +pub trait GrpcRouterConfigurator: Send + Sync { + /// Configures the given gRPC router using the provided context. + async fn configure_grpc_router( + &self, + route: GrpcRouter, + ctx: C, + ) -> std::result::Result; +} + +pub type GrpcRouterConfiguratorRef = Arc>; + +/// A configurator that customizes or enhances a [`GrpcServerBuilder`]. +#[async_trait::async_trait] +pub trait GrpcBuilderConfigurator: Send + Sync { + async fn configure( + &self, + builder: GrpcServerBuilder, + ctx: C, + ) -> std::result::Result; +} + +pub type GrpcBuilderConfiguratorRef = Arc>; diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index c7e5c5d07a..2e39f80c85 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -229,14 +229,33 @@ pub enum Error { error: prost::DecodeError, }, - #[snafu(display("Failed to decode OTLP request"))] + #[snafu(display( + "Failed to decode OTLP request (content-type: {content_type}): {error}. The endpoint only accepts 'application/x-protobuf' format." 
+ ))] DecodeOtlpRequest { + content_type: String, #[snafu(implicit)] location: Location, #[snafu(source)] error: prost::DecodeError, }, + #[snafu(display("Failed to decode Loki request: {error}"))] + DecodeLokiRequest { + #[snafu(implicit)] + location: Location, + #[snafu(source)] + error: prost::DecodeError, + }, + + #[snafu(display( + "Unsupported content type 'application/json'. OTLP endpoint only supports 'application/x-protobuf'. Please configure your OTLP exporter to use protobuf encoding." + ))] + UnsupportedJsonContentType { + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "OTLP metric input have incompatible existing tables, please refer to docs for details" ))] @@ -269,21 +288,6 @@ pub enum Error { error: std::io::Error, }, - #[snafu(display("Failed to send prometheus remote request"))] - SendPromRemoteRequest { - #[snafu(implicit)] - location: Location, - #[snafu(source)] - error: reqwest::Error, - }, - - #[snafu(display("Invalid export metrics config, msg: {}", msg))] - InvalidExportMetricsConfig { - msg: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to compress prometheus remote request"))] CompressPromRemoteRequest { #[snafu(implicit)] @@ -661,7 +665,6 @@ impl ErrorExt for Error { | StartHttp { .. } | StartGrpc { .. } | TcpBind { .. } - | SendPromRemoteRequest { .. } | BuildHttpResponse { .. } | Arrow { .. } | FileWatch { .. } => StatusCode::Internal, @@ -694,11 +697,12 @@ impl ErrorExt for Error { | InvalidOpentsdbJsonRequest { .. } | DecodePromRemoteRequest { .. } | DecodeOtlpRequest { .. } + | DecodeLokiRequest { .. } + | UnsupportedJsonContentType { .. } | CompressPromRemoteRequest { .. } | DecompressSnappyPromRemoteRequest { .. } | DecompressZstdPromRemoteRequest { .. } | InvalidPromRemoteRequest { .. } - | InvalidExportMetricsConfig { .. } | InvalidFlightTicket { .. } | InvalidPrepareStatement { .. } | DataFrame { .. } diff --git a/src/servers/src/export_metrics.rs b/src/servers/src/export_metrics.rs deleted file mode 100644 index aac7e8dda4..0000000000 --- a/src/servers/src/export_metrics.rs +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Duration; - -use common_base::Plugins; -use common_telemetry::metric::{MetricFilter, convert_metric_to_write_request}; -use common_telemetry::{error, info}; -use common_time::Timestamp; -use prost::Message; -use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; -use serde::{Deserialize, Serialize}; -use session::context::QueryContextBuilder; -use snafu::{ResultExt, ensure}; -use tokio::time::{self, Interval}; - -use crate::error::{InvalidExportMetricsConfigSnafu, Result, SendPromRemoteRequestSnafu}; -use crate::prom_store::{snappy_compress, to_grpc_row_insert_requests}; -use crate::query_handler::PromStoreProtocolHandlerRef; - -/// Use to export the metrics generated by greptimedb. 
-/// -/// Encoded to Prometheus [RemoteWrite format](https://prometheus.io/docs/concepts/remote_write_spec/), -/// and send to Prometheus remote-write compatible receiver (e.g. send to `greptimedb` itself) -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] -#[serde(default)] -pub struct ExportMetricsOption { - pub enable: bool, - #[serde(with = "humantime_serde")] - pub write_interval: Duration, - pub self_import: Option, - pub remote_write: Option, -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default)] -#[serde(default)] -pub struct RemoteWriteOption { - pub url: String, - pub headers: HashMap, -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] -#[serde(default)] -pub struct SelfImportOption { - pub db: String, -} - -impl Default for SelfImportOption { - fn default() -> Self { - Self { - db: "greptime_metrics".to_string(), - } - } -} - -impl Default for ExportMetricsOption { - fn default() -> Self { - Self { - enable: false, - write_interval: Duration::from_secs(30), - self_import: None, - remote_write: None, - } - } -} - -#[derive(Default, Clone)] -pub struct ExportMetricsTask { - config: ExportMetricsOption, - filter: Option, - headers: HeaderMap, - pub send_by_handler: bool, -} - -impl ExportMetricsTask { - pub fn try_new( - config: &ExportMetricsOption, - plugins: Option<&Plugins>, - ) -> Result> { - if !config.enable { - return Ok(None); - } - let filter = plugins.map(|p| p.get::()).unwrap_or(None); - ensure!( - config.write_interval.as_secs() != 0, - InvalidExportMetricsConfigSnafu { - msg: "Expected export metrics write_interval greater than zero" - } - ); - ensure!( - (config.remote_write.is_none() && config.self_import.is_some()) - || (config.remote_write.is_some() && config.self_import.is_none()), - InvalidExportMetricsConfigSnafu { - msg: "Only one of `self_import` or `remote_write` can be used as the export method" - } - ); - if let Some(self_import) = &config.self_import { - ensure!( - !self_import.db.is_empty(), - InvalidExportMetricsConfigSnafu { - msg: "Expected `self_import` metrics `db` not empty" - } - ); - } - let mut headers = HeaderMap::new(); - if let Some(remote_write) = &config.remote_write { - ensure!( - !remote_write.url.is_empty(), - InvalidExportMetricsConfigSnafu { - msg: "Expected `remote_write` metrics `url` not empty" - } - ); - // construct http header - remote_write.headers.iter().try_for_each(|(k, v)| { - let header = match TryInto::::try_into(k) { - Ok(header) => header, - Err(_) => { - return InvalidExportMetricsConfigSnafu { - msg: format!("Export metrics: invalid HTTP header name: {}", k), - } - .fail(); - } - }; - match TryInto::::try_into(v) { - Ok(value) => headers.insert(header, value), - Err(_) => { - return InvalidExportMetricsConfigSnafu { - msg: format!("Export metrics: invalid HTTP header value: {}", v), - } - .fail(); - } - }; - Ok(()) - })?; - } - Ok(Some(Self { - config: config.clone(), - filter, - headers, - send_by_handler: config.self_import.is_some(), - })) - } - - pub fn start(&self, handler: Option) -> Result<()> { - if !self.config.enable { - return Ok(()); - } - let interval = time::interval(self.config.write_interval); - let filter = self.filter.clone(); - let _handle = if let Some(self_import) = &self.config.self_import { - ensure!( - handler.is_some(), - InvalidExportMetricsConfigSnafu { - msg: "Only `frontend` or `standalone` can use `self_import` as export method." 
- } - ); - common_runtime::spawn_global(write_system_metric_by_handler( - self_import.db.clone(), - handler.unwrap(), - filter, - interval, - )) - } else if let Some(remote_write) = &self.config.remote_write { - common_runtime::spawn_global(write_system_metric_by_network( - self.headers.clone(), - remote_write.url.clone(), - filter, - interval, - )) - } else { - unreachable!() - }; - Ok(()) - } -} - -/// Send metrics collected by standard Prometheus [RemoteWrite format](https://prometheus.io/docs/concepts/remote_write_spec/) -pub async fn write_system_metric_by_network( - headers: HeaderMap, - endpoint: String, - filter: Option, - mut interval: Interval, -) { - info!( - "Start export metrics task to endpoint: {}, interval: {}s", - endpoint, - interval.period().as_secs() - ); - // Pass the first tick. Because the first tick completes immediately. - interval.tick().await; - let client = reqwest::Client::new(); - loop { - interval.tick().await; - let metric_families = prometheus::gather(); - let request = convert_metric_to_write_request( - metric_families, - filter.as_ref(), - Timestamp::current_millis().value(), - ); - let resp = match snappy_compress(&request.encode_to_vec()) { - Ok(body) => client - .post(endpoint.as_str()) - .header("X-Prometheus-Remote-Write-Version", "0.1.0") - .header("Content-Type", "application/x-protobuf") - .headers(headers.clone()) - .body(body) - .send() - .await - .context(SendPromRemoteRequestSnafu), - Err(e) => Err(e), - }; - match resp { - Ok(resp) => { - if !resp.status().is_success() { - error!("report export metrics error, msg: {:#?}", resp); - } - } - Err(e) => error!(e; "report export metrics failed"), - }; - } -} - -/// Send metrics collected by our internal handler -/// for case `frontend` and `standalone` dispose it's own metrics, -/// reducing compression and network transmission overhead. -pub async fn write_system_metric_by_handler( - db: String, - handler: PromStoreProtocolHandlerRef, - filter: Option, - mut interval: Interval, -) { - info!( - "Start export metrics task by handler, interval: {}s", - interval.period().as_secs() - ); - // Pass the first tick. Because the first tick completes immediately. 
- interval.tick().await; - let ctx = Arc::new(QueryContextBuilder::default().current_schema(db).build()); - loop { - interval.tick().await; - let metric_families = prometheus::gather(); - let request = convert_metric_to_write_request( - metric_families, - filter.as_ref(), - Timestamp::current_millis().value(), - ); - - let (requests, samples) = match to_grpc_row_insert_requests(&request) { - Ok((requests, samples)) => (requests, samples), - Err(e) => { - error!(e; "Failed to convert gathered metrics to RowInsertRequests"); - continue; - } - }; - - if let Err(e) = handler.write(requests, ctx.clone(), false).await { - error!(e; "report export metrics by handler failed"); - } else { - crate::metrics::PROM_STORE_REMOTE_WRITE_SAMPLES - .with_label_values(&[ctx.get_db_string().as_str()]) - .inc_by(samples as u64); - } - } -} - -#[cfg(test)] -mod test { - use std::time::Duration; - - use crate::export_metrics::{ - ExportMetricsOption, ExportMetricsTask, RemoteWriteOption, SelfImportOption, - }; - - #[tokio::test] - async fn test_config() { - // zero write_interval - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - write_interval: Duration::from_secs(0), - ..Default::default() - }, - None - ) - .is_err() - ); - // none self_import and remote_write - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - ..Default::default() - }, - None - ) - .is_err() - ); - // both self_import and remote_write - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: Some(SelfImportOption::default()), - remote_write: Some(RemoteWriteOption::default()), - ..Default::default() - }, - None - ) - .is_err() - ); - // empty db - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: Some(SelfImportOption { - db: String::default() - }), - remote_write: None, - ..Default::default() - }, - None - ) - .is_err() - ); - // empty url - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: None, - remote_write: Some(RemoteWriteOption { - url: String::default(), - ..Default::default() - }), - ..Default::default() - }, - None - ) - .is_err() - ); - // self import but no handle - let s = ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: Some(SelfImportOption::default()), - ..Default::default() - }, - None, - ) - .unwrap() - .unwrap(); - assert!(s.start(None).is_err()); - } -} diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 404b087535..543054ae54 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -32,9 +32,7 @@ use common_telemetry::{debug, error, info}; use common_time::Timestamp; use common_time::timestamp::TimeUnit; use datatypes::data_type::DataType; -use datatypes::prelude::ConcreteDataType; use datatypes::schema::SchemaRef; -use datatypes::types::jsonb_to_serde_json; use event::{LogState, LogValidatorRef}; use futures::FutureExt; use http::{HeaderValue, Method}; @@ -52,11 +50,11 @@ use tower_http::trace::TraceLayer; use self::authorize::AuthState; use self::result::table_result::TableResponse; -use crate::configurator::ConfiguratorRef; +use crate::configurator::HttpConfiguratorRef; use crate::elasticsearch; use crate::error::{ - AddressBindSnafu, AlreadyStartedSnafu, ConvertSqlValueSnafu, Error, InternalIoSnafu, - InvalidHeaderValueSnafu, Result, ToJsonSnafu, + AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu, + OtherSnafu, Result, }; use 
crate::http::influxdb::{influxdb_health, influxdb_ping, influxdb_write_v1, influxdb_write_v2}; use crate::http::otlp::OtlpState; @@ -89,6 +87,7 @@ pub mod authorize; #[cfg(feature = "dashboard")] mod dashboard; pub mod dyn_log; +pub mod dyn_trace; pub mod event; pub mod extractor; pub mod handler; @@ -108,6 +107,7 @@ pub mod result; mod timeout; pub mod utils; +use result::HttpOutputWriter; pub(crate) use timeout::DynamicTimeoutLayer; mod hints; @@ -297,30 +297,10 @@ impl HttpRecordsOutput { } else { let num_rows = recordbatches.iter().map(|r| r.num_rows()).sum::(); let mut rows = Vec::with_capacity(num_rows); - let schemas = schema.column_schemas(); - let num_cols = schema.column_schemas().len(); - rows.resize_with(num_rows, || Vec::with_capacity(num_cols)); - let mut finished_row_cursor = 0; for recordbatch in recordbatches { - for (col_idx, col) in recordbatch.columns().iter().enumerate() { - // safety here: schemas length is equal to the number of columns in the recordbatch - let schema = &schemas[col_idx]; - for row_idx in 0..recordbatch.num_rows() { - let value = col.get(row_idx); - // TODO(sunng87): is this duplicated with `map_json_type_to_string` in recordbatch? - let value = if let ConcreteDataType::Json(_json_type) = &schema.data_type - && let datatypes::value::Value::Binary(bytes) = value - { - jsonb_to_serde_json(bytes.as_ref()).context(ConvertSqlValueSnafu)? - } else { - serde_json::Value::try_from(col.get(row_idx)).context(ToJsonSnafu)? - }; - - rows[row_idx + finished_row_cursor].push(value); - } - } - finished_row_cursor += recordbatch.num_rows(); + let mut writer = HttpOutputWriter::new(schema.num_columns(), None); + writer.write(recordbatch, &mut rows)?; } Ok(HttpRecordsOutput { @@ -908,6 +888,7 @@ impl HttpServer { Router::new() // handler for changing log level dynamically .route("/log_level", routing::post(dyn_log::dyn_log_handler)) + .route("/enable_trace", routing::post(dyn_trace::dyn_trace_handler)) .nest( "/prof", Router::new() @@ -924,6 +905,11 @@ impl HttpServer { .route( "/mem/status", routing::get(mem_prof::heap_prof_status_handler), + ) // jemalloc gdump flag status and toggle + .route( + "/mem/gdump", + routing::get(mem_prof::gdump_status_handler) + .post(mem_prof::gdump_toggle_handler), ), ), )) @@ -1220,8 +1206,11 @@ impl Server for HttpServer { ); let mut app = self.make_app(); - if let Some(configurator) = self.plugins.get::() { - app = configurator.config_http(app); + if let Some(configurator) = self.plugins.get::>() { + app = configurator + .configure_http(app, ()) + .await + .context(OtherSnafu)?; } let app = self.build(app)?; let listener = tokio::net::TcpListener::bind(listening) diff --git a/src/servers/src/http/dyn_log.rs b/src/servers/src/http/dyn_log.rs index b82ecdadd6..e9a58c2d74 100644 --- a/src/servers/src/http/dyn_log.rs +++ b/src/servers/src/http/dyn_log.rs @@ -15,7 +15,7 @@ use axum::http::StatusCode; use axum::response::IntoResponse; use common_telemetry::tracing_subscriber::filter; -use common_telemetry::{RELOAD_HANDLE, info}; +use common_telemetry::{LOG_RELOAD_HANDLE, info}; use snafu::OptionExt; use crate::error::{InternalSnafu, InvalidParameterSnafu, Result}; @@ -29,7 +29,7 @@ pub async fn dyn_log_handler(level: String) -> Result { .build() })?; let mut old_filter = None; - RELOAD_HANDLE + LOG_RELOAD_HANDLE .get() .context(InternalSnafu { err_msg: "Reload handle not initialized", diff --git a/src/servers/src/http/dyn_trace.rs b/src/servers/src/http/dyn_trace.rs new file mode 100644 index 0000000000..dcdb74c56a --- /dev/null +++ 
b/src/servers/src/http/dyn_trace.rs @@ -0,0 +1,54 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use axum::http::StatusCode; +use axum::response::IntoResponse; +use common_telemetry::{TRACE_RELOAD_HANDLE, get_or_init_tracer, info}; + +use crate::error::{InvalidParameterSnafu, Result}; + +#[axum_macros::debug_handler] +pub async fn dyn_trace_handler(enable_str: String) -> Result { + let enable = enable_str.parse::().map_err(|e| { + InvalidParameterSnafu { + reason: format!("Invalid parameter \"enable\": {e:?}"), + } + .build() + })?; + + let Some(trace_reload_handle) = TRACE_RELOAD_HANDLE.get() else { + return Ok(( + StatusCode::SERVICE_UNAVAILABLE, + "trace reload handle is not initialized".to_string(), + )); + }; + + if enable { + let tracer = match get_or_init_tracer() { + Ok(tracer) => tracer, + Err(reason) => { + return Ok((StatusCode::SERVICE_UNAVAILABLE, reason.to_string())); + } + }; + + let trace_layer = tracing_opentelemetry::layer().with_tracer(tracer); + trace_reload_handle.reload(Some(trace_layer)); + info!("trace enabled"); + Ok((StatusCode::OK, "trace enabled".to_string())) + } else { + trace_reload_handle.reload(None); + info!("trace disabled"); + Ok((StatusCode::OK, "trace disabled".to_string())) + } +} diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 24bb844dc7..2390e374a1 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -31,7 +31,7 @@ use axum_extra::TypedHeader; use common_catalog::consts::default_engine; use common_error::ext::{BoxedError, ErrorExt}; use common_query::{Output, OutputData}; -use common_telemetry::{error, warn}; +use common_telemetry::{debug, error, warn}; use headers::ContentType; use lazy_static::lazy_static; use mime_guess::mime; @@ -738,6 +738,11 @@ pub async fn log_ingester( let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?; + debug!( + "receiving logs: {:?}", + serde_json::to_string(&value).unwrap() + ); + query_ctx.set_channel(Channel::Log); let query_ctx = Arc::new(query_ctx); diff --git a/src/servers/src/http/jaeger.rs b/src/servers/src/http/jaeger.rs index 77b598ad1a..148e2ac77a 100644 --- a/src/servers/src/http/jaeger.rs +++ b/src/servers/src/http/jaeger.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::fmt; use std::str::FromStr; use std::sync::Arc; @@ -21,13 +21,14 @@ use axum::Extension; use axum::extract::{Path, Query, State}; use axum::http::{HeaderMap, StatusCode as HttpStatusCode}; use axum::response::IntoResponse; -use chrono::Utc; +use axum_extra::TypedHeader; use common_catalog::consts::{PARENT_SPAN_ID_COLUMN, TRACE_TABLE_NAME}; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; use common_recordbatch::util; use common_telemetry::{debug, error, tracing, warn}; +use headers::UserAgent; use serde::{Deserialize, Deserializer, Serialize, de}; use serde_json::Value as JsonValue; use session::context::{Channel, QueryContext}; @@ -41,10 +42,12 @@ use crate::http::extractor::TraceTableName; use crate::metrics::METRIC_JAEGER_QUERY_ELAPSED; use crate::otlp::trace::{ DURATION_NANO_COLUMN, KEY_OTEL_SCOPE_NAME, KEY_OTEL_SCOPE_VERSION, KEY_OTEL_STATUS_CODE, - KEY_SERVICE_NAME, KEY_SPAN_KIND, RESOURCE_ATTRIBUTES_COLUMN, SCOPE_NAME_COLUMN, - SCOPE_VERSION_COLUMN, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_EVENTS_COLUMN, - SPAN_ID_COLUMN, SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, - SPAN_STATUS_PREFIX, SPAN_STATUS_UNSET, TIMESTAMP_COLUMN, TRACE_ID_COLUMN, + KEY_OTEL_STATUS_ERROR_KEY, KEY_OTEL_STATUS_MESSAGE, KEY_OTEL_TRACE_STATE, KEY_SERVICE_NAME, + KEY_SPAN_KIND, RESOURCE_ATTRIBUTES_COLUMN, SCOPE_NAME_COLUMN, SCOPE_VERSION_COLUMN, + SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_EVENTS_COLUMN, SPAN_ID_COLUMN, + SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_ERROR, + SPAN_STATUS_MESSAGE_COLUMN, SPAN_STATUS_PREFIX, SPAN_STATUS_UNSET, TIMESTAMP_COLUMN, + TRACE_ID_COLUMN, TRACE_STATE_COLUMN, }; use crate::query_handler::JaegerQueryHandlerRef; @@ -52,7 +55,9 @@ pub const JAEGER_QUERY_TABLE_NAME_KEY: &str = "jaeger_query_table_name"; const REF_TYPE_CHILD_OF: &str = "CHILD_OF"; const SPAN_KIND_TIME_FMTS: [&str; 2] = ["%Y-%m-%d %H:%M:%S%.6f%z", "%Y-%m-%d %H:%M:%S%.9f%z"]; -pub const JAEGER_TIME_RANGE_FOR_OPERATIONS_HEADER: &str = "x-greptime-jaeger-query-time-range"; + +const TRACE_NOT_FOUND_ERROR_CODE: i32 = 404; +const TRACE_NOT_FOUND_ERROR_MSG: &str = "trace not found"; /// JaegerAPIResponse is the response of Jaeger HTTP API. /// The original version is `structuredResponse` which is defined in https://github.com/jaegertracing/jaeger/blob/main/cmd/query/app/http_handler.go. @@ -65,6 +70,22 @@ pub struct JaegerAPIResponse { pub errors: Vec, } +impl JaegerAPIResponse { + pub fn trace_not_found() -> Self { + Self { + data: None, + total: 0, + limit: 0, + offset: 0, + errors: vec![JaegerAPIError { + code: TRACE_NOT_FOUND_ERROR_CODE, + msg: TRACE_NOT_FOUND_ERROR_MSG.to_string(), + trace_id: None, + }], + } + } +} + /// JaegerData is the query result of Jaeger HTTP API. 
#[derive(Debug, Serialize, Deserialize, PartialEq)] #[serde(untagged)] @@ -340,6 +361,30 @@ pub struct QueryTraceParams { pub end_time: Option, pub min_duration: Option, pub max_duration: Option, + + // The user agent of the trace query, mainly find traces + pub user_agent: TraceUserAgent, +} + +#[derive(Debug, Default, PartialEq, Eq)] +pub enum TraceUserAgent { + Grafana, + // Jaeger-UI does not actually send user agent + // But it's a jaeger API, so let's treat it as jaeger + #[default] + Jaeger, +} + +impl From for TraceUserAgent { + fn from(value: UserAgent) -> Self { + let ua_str = value.as_str().to_lowercase(); + debug!("received user agent: {}", ua_str); + if ua_str.contains("grafana") { + Self::Grafana + } else { + Self::Jaeger + } + } } /// Handle the GET `/api/services` request. @@ -427,7 +472,13 @@ pub async fn handle_get_trace( let end_time_ns = query_params.end.map(|end_us| end_us * 1000); let output = match handler - .get_trace(query_ctx, &trace_id, start_time_ns, end_time_ns) + .get_trace( + query_ctx, + &trace_id, + start_time_ns, + end_time_ns, + query_params.limit, + ) .await { Ok(output) => output, @@ -442,6 +493,10 @@ pub async fn handle_get_trace( match covert_to_records(output).await { Ok(Some(records)) => match traces_from_records(records) { + Ok(traces) if traces.is_empty() => ( + HttpStatusCode::NOT_FOUND, + axum::Json(JaegerAPIResponse::trace_not_found()), + ), Ok(traces) => ( HttpStatusCode::OK, axum::Json(JaegerAPIResponse { @@ -454,7 +509,10 @@ pub async fn handle_get_trace( error_response(err) } }, - Ok(None) => (HttpStatusCode::OK, axum::Json(JaegerAPIResponse::default())), + Ok(None) => ( + HttpStatusCode::NOT_FOUND, + axum::Json(JaegerAPIResponse::trace_not_found()), + ), Err(err) => { error!("Failed to get trace '{}': {:?}", trace_id, err); error_response(err) @@ -470,6 +528,7 @@ pub async fn handle_find_traces( Query(query_params): Query, Extension(mut query_ctx): Extension, TraceTableName(table_name): TraceTableName, + optional_user_agent: Option>, ) -> impl IntoResponse { debug!( "Received Jaeger '/api/traces' request, query_params: {:?}, query_ctx: {:?}", @@ -486,7 +545,10 @@ pub async fn handle_find_traces( .start_timer(); match QueryTraceParams::from_jaeger_query_params(query_params) { - Ok(query_params) => { + Ok(mut query_params) => { + if let Some(TypedHeader(user_agent)) = optional_user_agent { + query_params.user_agent = user_agent.into(); + } let output = handler.find_traces(query_ctx, query_params).await; match output { Ok(output) => match covert_to_records(output).await { @@ -528,13 +590,6 @@ pub async fn handle_get_operations( query_params, query_ctx, headers ); - let (start, end) = match parse_jaeger_time_range_for_operations(&headers, &query_params) { - Ok((start, end)) => (start, end), - Err(e) => return error_response(e), - }; - - debug!("Get operations with start: {:?}, end: {:?}", start, end); - if let Some(service_name) = &query_params.service_name { update_query_context(&mut query_ctx, table_name); let query_ctx = Arc::new(query_ctx); @@ -546,13 +601,7 @@ pub async fn handle_get_operations( .start_timer(); match handler - .get_operations( - query_ctx, - service_name, - query_params.span_kind.as_deref(), - start, - end, - ) + .get_operations(query_ctx, service_name, query_params.span_kind.as_deref()) .await { Ok(output) => match covert_to_records(output).await { @@ -625,15 +674,7 @@ pub async fn handle_get_operations_by_service( .with_label_values(&[&db, "/api/services"]) .start_timer(); - let (start, end) = match 
parse_jaeger_time_range_for_operations(&headers, &query_params) { - Ok((start, end)) => (start, end), - Err(e) => return error_response(e), - }; - - match handler - .get_operations(query_ctx, &service_name, None, start, end) - .await - { + match handler.get_operations(query_ctx, &service_name, None).await { Ok(output) => match covert_to_records(output).await { Ok(Some(records)) => match operations_from_records(records, false) { Ok(operations) => { @@ -677,7 +718,10 @@ async fn covert_to_records(output: Output) -> Result> .await .context(CollectRecordbatchSnafu)?, )?; - debug!("The query records: {:?}", records); + debug!( + "The query records: {}", + serde_json::to_string(&records).unwrap() + ); Ok(Some(records)) } // It's unlikely to happen. However, if the output is not a stream, return None. @@ -721,7 +765,8 @@ fn traces_from_records(records: HttpRecordsOutput) -> Result> { // maintain the mapping: trace_id -> (process_id -> service_name). let mut trace_id_to_processes: HashMap> = HashMap::new(); // maintain the mapping: trace_id -> spans. - let mut trace_id_to_spans: HashMap> = HashMap::new(); + // use BTreeMap to retain order + let mut trace_id_to_spans: BTreeMap> = BTreeMap::new(); // maintain the mapping: service.name -> resource.attributes. let mut service_to_resource_attributes: HashMap> = HashMap::new(); @@ -882,6 +927,38 @@ fn traces_from_records(records: HttpRecordsOutput) -> Result> { value_type: ValueType::String, value: Value::String(normalize_status_code(&span_status)), }); + // set error to comply with the Jaeger API + if span_status == SPAN_STATUS_ERROR { + span.tags.push(KeyValue { + key: KEY_OTEL_STATUS_ERROR_KEY.to_string(), + value_type: ValueType::Boolean, + value: Value::Boolean(true), + }); + } + } + } + + SPAN_STATUS_MESSAGE_COLUMN => { + if let JsonValue::String(span_status_message) = cell + && !span_status_message.is_empty() + { + span.tags.push(KeyValue { + key: KEY_OTEL_STATUS_MESSAGE.to_string(), + value_type: ValueType::String, + value: Value::String(span_status_message), + }); + } + } + + TRACE_STATE_COLUMN => { + if let JsonValue::String(trace_state) = cell + && !trace_state.is_empty() + { + span.tags.push(KeyValue { + key: KEY_OTEL_TRACE_STATE.to_string(), + value_type: ValueType::String, + value: Value::String(trace_state), + }); } } @@ -1117,42 +1194,6 @@ fn convert_string_to_boolean(input: &serde_json::Value) -> Option Result<(Option, Option)> { - if let Some(time_range) = headers.get(JAEGER_TIME_RANGE_FOR_OPERATIONS_HEADER) { - match time_range.to_str() { - Ok(time_range) => match humantime::parse_duration(time_range) { - Ok(duration) => { - debug!( - "Get operations with time range: {:?}, duration: {:?}", - time_range, duration - ); - let now = Utc::now().timestamp_micros(); - Ok((Some(now - duration.as_micros() as i64), Some(now))) - } - Err(e) => { - error!("Failed to parse time range header: {:?}", e); - Err(InvalidJaegerQuerySnafu { - reason: format!("invalid time range header: {:?}", time_range), - } - .build()) - } - }, - Err(e) => { - error!("Failed to convert time range header to string: {:?}", e); - Err(InvalidJaegerQuerySnafu { - reason: format!("invalid time range header: {:?}", time_range), - } - .build()) - } - } - } else { - Ok((query_params.start, query_params.end)) - } -} - #[cfg(test)] mod tests { use serde_json::{Number, Value as JsonValue, json}; @@ -1586,6 +1627,7 @@ mod tests { ("http.method".to_string(), JsonValue::String("GET".to_string())), ("http.path".to_string(), JsonValue::String("/api/v1/users".to_string())), ])), + 
user_agent: TraceUserAgent::Jaeger, }, ), ]; diff --git a/src/servers/src/http/loki.rs b/src/servers/src/http/loki.rs index 45d6eadadd..e6f1b064a3 100644 --- a/src/servers/src/http/loki.rs +++ b/src/servers/src/http/loki.rs @@ -26,7 +26,7 @@ use axum::extract::State; use axum_extra::TypedHeader; use bytes::Bytes; use chrono::DateTime; -use common_query::prelude::GREPTIME_TIMESTAMP; +use common_query::prelude::greptime_timestamp; use common_query::{Output, OutputData}; use common_telemetry::{error, warn}; use headers::ContentType; @@ -43,7 +43,7 @@ use snafu::{OptionExt, ResultExt, ensure}; use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - DecodeOtlpRequestSnafu, InvalidLokiLabelsSnafu, InvalidLokiPayloadSnafu, ParseJsonSnafu, + DecodeLokiRequestSnafu, InvalidLokiLabelsSnafu, InvalidLokiPayloadSnafu, ParseJsonSnafu, PipelineSnafu, Result, UnsupportedContentTypeSnafu, }; use crate::http::HttpResponse; @@ -73,7 +73,7 @@ const LINES_KEY: &str = "values"; lazy_static! { static ref LOKI_INIT_SCHEMAS: Vec = vec![ ColumnSchema { - column_name: GREPTIME_TIMESTAMP.to_string(), + column_name: greptime_timestamp().to_string(), datatype: ColumnDataType::TimestampNanosecond.into(), semantic_type: SemanticType::Timestamp.into(), datatype_extension: None, @@ -453,7 +453,7 @@ impl From> for LokiPipeline { let mut map = BTreeMap::new(); map.insert( - KeyString::from(GREPTIME_TIMESTAMP), + KeyString::from(greptime_timestamp()), VrlValue::Timestamp(DateTime::from_timestamp_nanos(ts)), ); map.insert( @@ -492,7 +492,7 @@ impl LokiPbParser { pub fn from_bytes(bytes: Bytes) -> Result { let decompressed = prom_store::snappy_decompress(&bytes).unwrap(); let req = loki_proto::logproto::PushRequest::decode(&decompressed[..]) - .context(DecodeOtlpRequestSnafu)?; + .context(DecodeLokiRequestSnafu)?; Ok(Self { streams: req.streams.into(), @@ -586,7 +586,7 @@ impl From>> for LokiPipeline { let mut map = BTreeMap::new(); map.insert( - KeyString::from(GREPTIME_TIMESTAMP), + KeyString::from(greptime_timestamp()), VrlValue::Timestamp(DateTime::from_timestamp_nanos(ts)), ); map.insert( diff --git a/src/servers/src/http/mem_prof.rs b/src/servers/src/http/mem_prof.rs index 92995fd2de..e6362aef3f 100644 --- a/src/servers/src/http/mem_prof.rs +++ b/src/servers/src/http/mem_prof.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#[cfg(feature = "mem-prof")] +use axum::Form; #[cfg(feature = "mem-prof")] use axum::extract::Query; use axum::http::StatusCode; @@ -127,3 +129,57 @@ pub async fn heap_prof_status_handler() -> crate::error::Result, +) -> crate::error::Result { + use snafu::ResultExt; + + use crate::error::DumpProfileDataSnafu; + + common_mem_prof::set_gdump_active(form.activate).context(DumpProfileDataSnafu)?; + + let msg = if form.activate { + "gdump activated" + } else { + "gdump deactivated" + }; + Ok((StatusCode::OK, msg)) +} + +#[cfg(not(feature = "mem-prof"))] +#[axum_macros::debug_handler] +pub async fn gdump_toggle_handler() -> crate::error::Result { + Ok(( + StatusCode::NOT_IMPLEMENTED, + "The 'mem-prof' feature is disabled", + )) +} + +#[cfg(feature = "mem-prof")] +#[axum_macros::debug_handler] +pub async fn gdump_status_handler() -> crate::error::Result { + use snafu::ResultExt; + + use crate::error::DumpProfileDataSnafu; + + let is_active = common_mem_prof::is_gdump_active().context(DumpProfileDataSnafu)?; + Ok((StatusCode::OK, format!("{{\"active\": {}}}", is_active))) +} + +#[cfg(not(feature = "mem-prof"))] +#[axum_macros::debug_handler] +pub async fn gdump_status_handler() -> crate::error::Result { + Ok(( + StatusCode::NOT_IMPLEMENTED, + "The 'mem-prof' feature is disabled", + )) +} diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index fc0656cf0e..4fd2d42122 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -18,9 +18,12 @@ use axum::Extension; use axum::extract::State; use axum::http::header; use axum::response::IntoResponse; +use axum_extra::TypedHeader; use bytes::Bytes; use common_catalog::consts::{TRACE_TABLE_NAME, TRACE_TABLE_NAME_SESSION_KEY}; use common_telemetry::tracing; +use headers::ContentType; +use mime_guess::mime; use opentelemetry_proto::tonic::collector::logs::v1::{ ExportLogsServiceRequest, ExportLogsServiceResponse, }; @@ -39,11 +42,26 @@ use crate::error::{self, PipelineSnafu, Result}; use crate::http::extractor::{ LogTableName, OtlpMetricOptions, PipelineInfo, SelectInfoWrapper, TraceTableName, }; -// use crate::http::header::constants::GREPTIME_METRICS_LEGACY_MODE_HEADER_NAME; use crate::http::header::{CONTENT_TYPE_PROTOBUF, write_cost_header_map}; use crate::metrics::METRIC_HTTP_OPENTELEMETRY_LOGS_ELAPSED; use crate::query_handler::{OpenTelemetryProtocolHandlerRef, PipelineHandler}; +fn is_json_content_type(content_type: Option<&ContentType>) -> bool { + match content_type { + None => false, + Some(ct) => { + let mime: mime::Mime = ct.clone().into(); + mime.subtype() == mime::JSON + } + } +} + +fn content_type_to_string(content_type: Option<&TypedHeader>) -> String { + content_type + .map(|h| h.0.to_string()) + .unwrap_or_else(|| "not specified".to_string()) +} + #[derive(Clone)] pub struct OtlpState { pub with_metric_engine: bool, @@ -56,16 +74,24 @@ pub async fn metrics( State(state): State, Extension(mut query_ctx): Extension, http_opts: OtlpMetricOptions, + content_type: Option>, bytes: Bytes, ) -> Result> { + if is_json_content_type(content_type.as_ref().map(|h| &h.0)) { + return error::UnsupportedJsonContentTypeSnafu {}.fail(); + } + let db = query_ctx.get_db_string(); query_ctx.set_channel(Channel::Otlp); let _timer = crate::metrics::METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED .with_label_values(&[db.as_str()]) .start_timer(); - let request = - ExportMetricsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; + let request = ExportMetricsServiceRequest::decode(bytes).with_context(|_| { + 
error::DecodeOtlpRequestSnafu { + content_type: content_type_to_string(content_type.as_ref()), + } + })?; let OtlpState { with_metric_engine, @@ -101,8 +127,13 @@ pub async fn traces( TraceTableName(table_name): TraceTableName, pipeline_info: PipelineInfo, Extension(mut query_ctx): Extension, + content_type: Option>, bytes: Bytes, ) -> Result> { + if is_json_content_type(content_type.as_ref().map(|h| &h.0)) { + return error::UnsupportedJsonContentTypeSnafu {}.fail(); + } + let db = query_ctx.get_db_string(); let table_name = table_name.unwrap_or_else(|| TRACE_TABLE_NAME.to_string()); @@ -113,8 +144,11 @@ pub async fn traces( let _timer = crate::metrics::METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED .with_label_values(&[db.as_str()]) .start_timer(); - let request = - ExportTraceServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; + let request = ExportTraceServiceRequest::decode(bytes).with_context(|_| { + error::DecodeOtlpRequestSnafu { + content_type: content_type_to_string(content_type.as_ref()), + } + })?; let pipeline = PipelineWay::from_name_and_default( pipeline_info.pipeline_name.as_deref(), @@ -157,8 +191,13 @@ pub async fn logs( pipeline_info: PipelineInfo, LogTableName(tablename): LogTableName, SelectInfoWrapper(select_info): SelectInfoWrapper, + content_type: Option>, bytes: Bytes, ) -> Result> { + if is_json_content_type(content_type.as_ref().map(|h| &h.0)) { + return error::UnsupportedJsonContentTypeSnafu {}.fail(); + } + let tablename = tablename.unwrap_or_else(|| "opentelemetry_logs".to_string()); let db = query_ctx.get_db_string(); query_ctx.set_channel(Channel::Otlp); @@ -166,7 +205,11 @@ pub async fn logs( let _timer = METRIC_HTTP_OPENTELEMETRY_LOGS_ELAPSED .with_label_values(&[db.as_str()]) .start_timer(); - let request = ExportLogsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; + let request = ExportLogsServiceRequest::decode(bytes).with_context(|_| { + error::DecodeOtlpRequestSnafu { + content_type: content_type_to_string(content_type.as_ref()), + } + })?; let pipeline = PipelineWay::from_name_and_default( pipeline_info.pipeline_name.as_deref(), diff --git a/src/servers/src/http/prometheus.rs b/src/servers/src/http/prometheus.rs index e4c9677a4c..26a91d51fa 100644 --- a/src/servers/src/http/prometheus.rs +++ b/src/servers/src/http/prometheus.rs @@ -13,23 +13,36 @@ // limitations under the License. //! 
prom supply the prometheus HTTP API Server compliance + +use std::borrow::Borrow; use std::collections::{BTreeMap, HashMap, HashSet}; +use std::hash::{Hash, Hasher}; use std::sync::Arc; +use arrow::array::{Array, AsArray}; +use arrow::datatypes::{ + Date32Type, Date64Type, Decimal128Type, Float32Type, Float64Type, Int8Type, Int16Type, + Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, + UInt8Type, UInt16Type, UInt32Type, UInt64Type, +}; +use arrow_schema::{DataType, IntervalUnit}; use axum::extract::{Path, Query, State}; use axum::{Extension, Form}; use catalog::CatalogManagerRef; use common_catalog::parse_catalog_and_schema_from_db_string; +use common_decimal::Decimal128; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; -use common_recordbatch::RecordBatches; +use common_recordbatch::{RecordBatch, RecordBatches}; use common_telemetry::{debug, tracing}; use common_time::util::{current_time_rfc3339, yesterday_rfc3339}; +use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth}; use common_version::OwnedBuildInfo; +use datafusion_common::ScalarValue; use datatypes::prelude::ConcreteDataType; -use datatypes::scalars::ScalarVector; -use datatypes::vectors::Float64Vector; +use datatypes::schema::{ColumnSchema, SchemaRef}; +use datatypes::types::jsonb_to_string; use futures::StreamExt; use futures::future::join_all; use itertools::Itertools; @@ -53,8 +66,9 @@ use store_api::metric_engine_consts::{ pub use super::result::prometheus_resp::PrometheusJsonResponse; use crate::error::{ - CatalogSnafu, CollectRecordbatchSnafu, Error, InvalidQuerySnafu, ParseTimestampSnafu, Result, - TableNotFoundSnafu, UnexpectedResultSnafu, + CatalogSnafu, CollectRecordbatchSnafu, ConvertScalarValueSnafu, DataFusionSnafu, Error, + InvalidQuerySnafu, NotSupportedSnafu, ParseTimestampSnafu, Result, TableNotFoundSnafu, + UnexpectedResultSnafu, }; use crate::http::header::collect_plan_metrics; use crate::prom_store::{DATABASE_LABEL, FIELD_NAME_LABEL, METRIC_NAME_LABEL, SCHEMA_LABEL}; @@ -98,12 +112,23 @@ pub struct PromData { pub result: PromQueryResult, } +/// A "holder" for the reference([Arc]) to a column name, +/// to help avoiding cloning [String]s when used as a [HashMap] key. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct Column(Arc); + +impl From<&str> for Column { + fn from(s: &str) -> Self { + Self(Arc::new(s.to_string())) + } +} + #[derive(Debug, Default, Serialize, Deserialize, PartialEq)] #[serde(untagged)] pub enum PrometheusResponse { PromData(PromData), Labels(Vec), - Series(Vec>), + Series(Vec>), LabelValues(Vec), FormatQuery(String), BuildInfo(OwnedBuildInfo), @@ -622,7 +647,7 @@ async fn get_all_column_names( async fn retrieve_series_from_query_result( result: Result, - series: &mut Vec>, + series: &mut Vec>, query_ctx: &QueryContext, table_name: &str, manager: &CatalogManagerRef, @@ -700,7 +725,7 @@ async fn retrieve_labels_name_from_query_result( fn record_batches_to_series( batches: RecordBatches, - series: &mut Vec>, + series: &mut Vec>, table_name: &str, tag_columns: &HashSet, ) -> Result<()> { @@ -723,22 +748,302 @@ fn record_batches_to_series( .try_project(&projection) .context(CollectRecordbatchSnafu)?; - for row in batch.rows() { - let mut element: HashMap = row - .iter() - .enumerate() - .map(|(idx, column)| { - let column_name = batch.schema.column_name_by_index(idx); - (column_name.to_string(), column.to_string()) - }) - .collect(); - let _ = element.insert("__name__".to_string(), table_name.to_string()); - series.push(element); - } + let mut writer = RowWriter::new(&batch.schema, table_name); + writer.write(batch, series)?; } Ok(()) } +/// Writer from a row in the record batch to a Prometheus time series: +/// +/// `{__name__="",