diff --git a/.github/actions/release-cn-artifacts/action.yaml b/.github/actions/release-cn-artifacts/action.yaml index 2825d3f5d0..fe78d5a760 100644 --- a/.github/actions/release-cn-artifacts/action.yaml +++ b/.github/actions/release-cn-artifacts/action.yaml @@ -37,17 +37,14 @@ inputs: description: Whether to push the latest tag of the image required: false default: 'true' - aws-cn-s3-bucket: - description: S3 bucket to store released artifacts in CN region + proxy-url: + description: The url of the S3 proxy server required: true - aws-cn-access-key-id: - description: AWS access key id in CN region + proxy-username: + description: The username of the S3 proxy required: true - aws-cn-secret-access-key: - description: AWS secret access key in CN region - required: true - aws-cn-region: - description: AWS region in CN + proxy-password: + description: The password of the S3 proxy required: true upload-to-s3: description: Upload to S3 @@ -77,21 +74,13 @@ runs: with: path: ${{ inputs.artifacts-dir }} - - name: Install s5cmd - shell: bash - run: | - wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz - tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz - sudo mv s5cmd /usr/local/bin/ - sudo chmod +x /usr/local/bin/s5cmd - - name: Release artifacts to cn region uses: nick-invision/retry@v2 if: ${{ inputs.upload-to-s3 == 'true' }} env: - AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }} - AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }} - AWS_REGION: ${{ inputs.aws-cn-region }} + PROXY_URL: ${{ inputs.proxy-url }} + PROXY_USERNAME: ${{ inputs.proxy-username }} + PROXY_PASSWORD: ${{ inputs.proxy-password }} UPDATE_VERSION_INFO: ${{ inputs.update-version-info }} with: max_attempts: ${{ inputs.upload-max-retry-times }} @@ -99,8 +88,7 @@ runs: command: | ./.github/scripts/upload-artifacts-to-s3.sh \ ${{ inputs.artifacts-dir }} \ - ${{ inputs.version }} \ - ${{ inputs.aws-cn-s3-bucket }} + ${{ inputs.version }} - name: Push greptimedb 
image from Dockerhub to ACR shell: bash diff --git a/.github/scripts/upload-artifacts-to-s3.sh b/.github/scripts/upload-artifacts-to-s3.sh index 75c8f8d932..1ddf32044b 100755 --- a/.github/scripts/upload-artifacts-to-s3.sh +++ b/.github/scripts/upload-artifacts-to-s3.sh @@ -5,16 +5,15 @@ set -o pipefail ARTIFACTS_DIR=$1 VERSION=$2 -AWS_S3_BUCKET=$3 RELEASE_DIRS="releases/greptimedb" GREPTIMEDB_REPO="GreptimeTeam/greptimedb" # Check if necessary variables are set. function check_vars() { - for var in AWS_S3_BUCKET VERSION ARTIFACTS_DIR; do + for var in VERSION ARTIFACTS_DIR; do if [ -z "${!var}" ]; then echo "$var is not set or empty." - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi done @@ -33,8 +32,13 @@ function upload_artifacts() { # ├── greptime-darwin-amd64-v0.2.0.sha256sum # └── greptime-darwin-amd64-v0.2.0.tar.gz find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do - s5cmd cp \ - "$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")" + filename=$(basename "$file") + TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION" + + curl --fail -X PUT \ + -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ + -F "file=@$file" \ + "$TARGET_URL" done } @@ -45,16 +49,24 @@ function update_version_info() { if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "Updating latest-version.txt" echo "$VERSION" > latest-version.txt - s5cmd cp \ - latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt" + TARGET_URL="$PROXY_URL/$RELEASE_DIRS" + + curl --fail -X PUT \ + -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ + -F "file=@latest-version.txt" \ + "$TARGET_URL" fi # If it's the nightly release, update latest-nightly-version.txt. 
if [[ "$VERSION" == *"nightly"* ]]; then echo "Updating latest-nightly-version.txt" echo "$VERSION" > latest-nightly-version.txt - s5cmd cp \ - latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt" + + TARGET_URL="$PROXY_URL/$RELEASE_DIRS" + curl --fail -X PUT \ + -u "$PROXY_USERNAME:$PROXY_PASSWORD" \ + -F "file=@latest-nightly-version.txt" \ + "$TARGET_URL" fi fi } @@ -93,10 +105,10 @@ function main() { } # Usage example: -# AWS_ACCESS_KEY_ID= \ -# AWS_SECRET_ACCESS_KEY= \ -# AWS_DEFAULT_REGION= \ +# PROXY_URL= \ +# PROXY_USERNAME= \ +# PROXY_PASSWORD= \ # UPDATE_VERSION_INFO=true \ # DOWNLOAD_ARTIFACTS_FROM_GITHUB=false \ -# ./upload-artifacts-to-s3.sh +# ./upload-artifacts-to-s3.sh main diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml index 021867e4ed..d03fbeff14 100644 --- a/.github/workflows/dev-build.yml +++ b/.github/workflows/dev-build.yml @@ -285,10 +285,9 @@ jobs: dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }} dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }} version: ${{ needs.allocate-runners.outputs.version }} - aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }} - aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }} - aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }} - aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }} + proxy-url: ${{ secrets.PROXY_URL }} + proxy-username: ${{ secrets.PROXY_USERNAME }} + proxy-password: ${{ secrets.PROXY_PASSWORD }} upload-to-s3: ${{ inputs.upload_artifacts_to_s3 }} dev-mode: true # Only build the standard images(exclude centos images). push-latest-tag: false # Don't push the latest tag to registry. 
diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 0238e92c8d..b6ab0f8926 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -319,7 +319,13 @@ jobs: include: - target: "fuzz_repartition_table" mode: - name: "Local WAL Repartition GC" + name: "Local WAL mito table repartition" + minio: true + kafka: false + values: "with-minio-repartition-gc.yaml" + - target: "fuzz_repartition_metric_table" + mode: + name: "Local WAL metric table repartition" minio: true kafka: false values: "with-minio-repartition-gc.yaml" @@ -455,6 +461,14 @@ jobs: path: /tmp/fuzz-monitor-dumps if-no-files-found: warn retention-days: 3 + - name: Upload CSV dumps + if: failure() + uses: actions/upload-artifact@v4 + with: + name: fuzz-tests-csv-dumps-${{ matrix.mode.name }}-${{ matrix.target }} + path: /tmp/greptime-fuzz-dumps + if-no-files-found: warn + retention-days: 3 - name: Delete cluster if: success() shell: bash diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 9eaa38c789..14ebb6e715 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -236,10 +236,9 @@ jobs: dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }} dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }} version: ${{ needs.allocate-runners.outputs.version }} - aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }} - aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }} - aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }} - aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }} + proxy-url: ${{ secrets.PROXY_URL }} + proxy-username: ${{ secrets.PROXY_USERNAME }} + proxy-password: ${{ secrets.PROXY_PASSWORD }} upload-to-s3: false dev-mode: false update-version-info: false # Don't update version info in S3. 
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3b0eb2d68c..9f8f2d9703 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -358,10 +358,9 @@ jobs: dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }} dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }} version: ${{ needs.allocate-runners.outputs.version }} - aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }} - aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }} - aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }} - aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }} + proxy-url: ${{ secrets.PROXY_URL }} + proxy-username: ${{ secrets.PROXY_USERNAME }} + proxy-password: ${{ secrets.PROXY_PASSWORD }} dev-mode: false upload-to-s3: true update-version-info: true diff --git a/.gitignore b/.gitignore index 862eb8c5b4..87412d570c 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,6 @@ CLAUDE.md # AGENTS.md AGENTS.md + +# local design docs +docs/specs/ diff --git a/Cargo.lock b/Cargo.lock index 85c2b1ed2d..32f9aa27d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1946,6 +1946,7 @@ dependencies = [ "tokio", "tracing-appender", "url", + "uuid", ] [[package]] @@ -2488,7 +2489,6 @@ version = "1.0.0-rc.2" dependencies = [ "common-error", "common-macro", - "common-telemetry", "humantime", "serde", "snafu 0.8.6", @@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -7887,6 +7887,7 @@ dependencies = [ "common-base", "common-error", "common-function", + "common-grpc", "common-macro", "common-meta", "common-query", @@ -9619,9 +9620,9 @@ dependencies = [ [[package]] name = "pgwire" -version = "0.38.0" +version = "0.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"89d5e5a60d3f6e40c91f6a2a7f8d09665e636272bd5611977253559b6651aabb" +checksum = "f2a798d130b8975a566c2cf6d8955746e1f09a9ee2c3ff2e6020a2c6528c5bd1" dependencies = [ "async-trait", "base64 0.22.1", @@ -10771,9 +10772,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.12" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "bytes", "getrandom 0.3.3", @@ -11634,9 +11635,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ "ring", "rustls-pki-types", @@ -13403,9 +13404,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" dependencies = [ "filetime", "libc", diff --git a/docs/rfcs/2025-12-30-export-import-v2.md b/docs/rfcs/2025-12-30-export-import-v2.md index 197eb7cc9d..6bc8428300 100644 --- a/docs/rfcs/2025-12-30-export-import-v2.md +++ b/docs/rfcs/2025-12-30-export-import-v2.md @@ -67,6 +67,7 @@ snapshot-20250101/ - Self-contained (all information needed for restore) - Immutable (content never changes after creation) - Verifiable (checksums at file, chunk, and snapshot levels) +- Schema-only snapshots contain only `manifest.json` and `schema/`; `data/` is absent, `chunks` is empty, and later data append is rejected (use `--force` to recreate) ### 
Chunk @@ -116,6 +117,8 @@ greptime export create \ --schema-only \ --to s3://my-bucket/snapshots/prod-schema-only +Schema-only snapshots cannot be resumed with data; use `--force` to recreate. + # Export with specific format (default: parquet) greptime export create \ --format csv \ @@ -173,7 +176,9 @@ The manifest is a JSON file containing snapshot metadata and chunk index: - `snapshot_id`: Unique identifier (UUID) - `catalog`, `schemas`: Catalog and schema list - `time_range`: Overall time range covered +- `schema_only`: Whether the snapshot contains schema only - `chunks[]`: Array of chunk metadata +- `format`: Data format for exported files - `checksum`: Snapshot-level SHA256 checksum **Chunk metadata structure**: @@ -182,7 +187,7 @@ Each chunk entry in the manifest contains: - `id`: Chunk identifier (sequential number) - `time_range`: Start and end timestamps -- `status`: Export status (Pending, Completed, Failed) +- `status`: Export status (Pending, InProgress, Completed, Failed) - `files`: List of data files in the chunk directory - `checksum`: Chunk-level checksum for integrity verification @@ -292,9 +297,9 @@ Checksums are verified during import before data is written to the database. 
**Resume capability**: -- Manifest tracks chunk status (Pending, Completed, Failed) +- Manifest tracks chunk status (Pending, InProgress, Completed, Failed) - Export/import automatically resumes when executed on existing snapshot -- Skips completed chunks, retries failed chunks, processes pending chunks +- Skips completed chunks, retries failed/in-progress chunks, processes pending chunks - Works across process restarts - Use `--force` (export only) to delete existing snapshot and start over diff --git a/docs/rfcs/2026-03-16-flow-inc-query.md b/docs/rfcs/2026-03-16-flow-inc-query.md new file mode 100644 index 0000000000..8041d37d2b --- /dev/null +++ b/docs/rfcs/2026-03-16-flow-inc-query.md @@ -0,0 +1,190 @@ +--- +Feature Name: Flow Batching Sequence-Based Incremental Query Plan (Lite) +Tracking Issue: TBD +Date: 2026-03-16 +Author: @discord9 +--- + +# Summary + +This RFC proposes a correctness-first incremental query mode for Flow batching. +Flow queries can read only `seq > checkpoint` and advance checkpoints using per-region correctness watermarks. +When incremental reads are stale or correctness cannot be proven, Flow falls back to full recomputation. + +# Motivation + +Flow batching currently recomputes data it has already processed whenever a time window is re-evaluated; incremental queries avoid that repeated work and improve Flow performance. + +# Goals + +1. Add opt-in incremental reads (`seq > given_seq`) for Flow. +2. Return per-region correctness watermarks for checkpoint advancement. +3. Keep existing query behavior unchanged unless explicitly enabled. +4. Define deterministic fallback for stale or unprovable incremental reads. + +# Non-Goals + +1. No business-schema changes (no synthetic watermark columns in result rows). +2. No global throughput optimization in v1 (correctness first). +3. No observational watermark output when correctness is unprovable. 
+ +# Proposal + +## 1) Query options + +Introduce three `QueryContext` extension keys: + +- `flow.incremental_after_seqs` +- `flow.incremental_mode` +- `flow.return_region_seq` + +These options are opt-in and only affect Flow incremental execution paths. + +## 2) Scan mapping + +When incremental mode is enabled: + +- map `after_seq` to `memtable_min_sequence` (exclusive lower bound) +- keep existing snapshot upper-bound behavior (`memtable_max_sequence`) + +Important limitation in v1: + +- incremental filtering is correctness-proven only for memtable rows +- SST files do not preserve detailed row-level sequence metadata; they only expose coarser file-level sequence information +- therefore `seq > checkpoint` must not assume precise incremental pruning across memtable->SST flush boundaries + +If required incremental parameters are missing or invalid, return argument error. + +## 3) Stale protection + +Add dedicated stale error: + +- `IncrementalQueryStale { region_id, given_seq, min_readable_seq }` + +Behavior: + +- if `given_seq < min_readable_seq`, return stale error +- if `given_seq == min_readable_seq`, query is valid and reads `seq > given_seq` +- if `given_seq > min_readable_seq`, query is also valid and reads `seq > given_seq` + +`IncrementalQueryStale` also covers the case where rows newer than the checkpoint have crossed a memtable->SST flush boundary and sequence-precise incremental exclusion can no longer be proven. +In other words, the flush-boundary case is not a separate fallback category in v1; it is one concrete way an incremental cursor becomes stale. 
+ +## 4) Watermark return + +Extend query metrics with optional per-region watermark map: + +- `region_latest_sequences: Vec<(region_id: u64, latest_sequence: u64)>` + +Rules: + +- only terminal metrics of successful query can advance checkpoints +- for multi-region query, watermark must be complete map or absent +- if correctness is unprovable, business rows may return but watermark is absent + +## 5) Flow state machine + +Checkpoint and watermark state are kept only in flownode memory in v1; they are not persisted as durable flow metadata. +Cold start or flownode restart therefore always re-enters through a full snapshot read. +Only after that full query succeeds with a complete correctness watermark may Flow switch back to incremental mode. + +Flow starts in full mode, then transitions: + +1. Full query succeeds with correctness watermark -> enter incremental mode +2. Incremental query succeeds with correctness watermark -> advance checkpoint +3. Incremental stale/failure -> fallback to full mode +4. Full query without correctness watermark -> remain in full mode + +```mermaid +stateDiagram-v2 + [*] --> FullSnapshot: Flow starts + + state FullSnapshot { + [*] --> RunFull + RunFull --> RunFull: Full query succeeds but watermark is unprovable<br>no region_latest_sequences returned + } + + FullSnapshot --> Incremental: Full query succeeds and correctness watermark is returned<br>(checkpoint updated) + + state Incremental { + [*] --> RunInc + RunInc --> RunInc: Incremental succeeds<br>(checkpoint advances) + } + + Incremental --> FullSnapshot: IncrementalQueryStale<br>(cursor too old, fallback required) + Incremental --> FullSnapshot: Incremental fails<br>and fallback policy is triggered + + FullSnapshot --> [*]: Flow stops + Incremental --> [*]: Flow stops +``` + +### Fallback Policy + +Fallback to full mode is deterministic and is triggered by any of the following: + +1. `IncrementalQueryStale` is returned. +2. Incremental query fails with execution errors. +3. Incremental query succeeds but watermark is absent or incomplete for participating regions. + +Policy behavior: + +1. Do not advance any checkpoint in the failed/incomplete round. +2. Switch to full mode for the affected flow/window in the next round. +3. Return to incremental mode only after a full query succeeds with a complete correctness watermark map. + +### Persistence and recovery model + +The v1 design is intentionally correctness-first and keeps the progress cursor lightweight: + +1. Watermarks/checkpoints live only in flownode memory; v1 does not persist them separately. +2. On cold start, the flow re-establishes progress by running a successful full-query snapshot read, then resumes incremental mode only after that round returns a complete correctness watermark map. +3. Sequence-precise incremental correctness is currently limited to rows still visible in memtables. +4. Once relevant rows have been flushed into SST, the system cannot use `seq > checkpoint` alone to prove precise incremental exclusion, because SST lacks detailed row-level sequence metadata. +5. In that case the correct behavior is to fall back to full recomputation, not to continue a best-effort incremental scan. + +# Distributed and Compatibility Requirements + +1. Distributed path must preserve region-level snapshot/read-bound semantics end-to-end. +2. `snapshot_seqs` transport and `flow.*` options must both be carried correctly. + - `snapshot_seqs` means the per-region snapshot upper-bound map: `region_id -> sequence`. +3. New metrics fields must be backward-compatible (old clients ignore unknown fields). + +# Rollout Plan + +## Phase 1 (MVP, correctness first) + +1. 
Add extension constants and parsing. +2. Add incremental scan mapping and stale detection. +3. Add watermark metrics field and terminal-watermark checkpoint update path. +4. Complete standalone and distributed passthrough. + +## Phase 2 (performance and observability) + +1. Improve batching key strategy with sequence/watermark context. +2. Optimize watermark serialization overhead. +3. Add metrics: incremental hit rate, fallback rate, fallback window size. + +# Testing Plan + +1. Unit tests for incremental bounds and stale detection. +2. Query-path tests for extension mapping and watermark semantics. +3. Flow integration tests for full->incremental->fallback transitions. +4. Distributed tests for end-to-end snapshot/watermark propagation. +5. Compatibility tests for old/new client-server combinations. + +# Risks + +1. Boundary semantic mismatch (`<` vs `<=`) may cause correctness bugs. +2. Incomplete distributed propagation can silently invalidate watermark safety. +3. Frequent fallback can reduce throughput before phase-2 optimizations. +4. Memtable->SST flushes may force more full recomputation than expected until finer-grained SST sequence tracking exists. + +# Alternatives + +1. Put watermark into business rows (rejected: schema pollution). +2. Add new dedicated Flight message type in v1 (deferred to reduce scope). + +# Conclusion + +This plan enables a practical, correctness-first incremental path for Flow batching. +It reuses existing sequence scan capability, adds strict stale handling, and advances checkpoints only from correctness-proven per-region watermarks. 
diff --git a/src/catalog/src/kvbackend/table_cache.rs b/src/catalog/src/kvbackend/table_cache.rs index ea328c3e17..42b3fbc74b 100644 --- a/src/catalog/src/kvbackend/table_cache.rs +++ b/src/catalog/src/kvbackend/table_cache.rs @@ -65,11 +65,13 @@ fn init_factory( fn invalidator<'a>( cache: &'a Cache, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, MetaResult<()>> { Box::pin(async move { - if let CacheIdent::TableName(table_name) = ident { - cache.invalidate(table_name).await + for ident in idents { + if let CacheIdent::TableName(table_name) = ident { + cache.invalidate(table_name).await + } } Ok(()) }) diff --git a/src/catalog/src/system_schema/information_schema/region_peers.rs b/src/catalog/src/system_schema/information_schema/region_peers.rs index 5bc91d207e..b1438ef53d 100644 --- a/src/catalog/src/system_schema/information_schema/region_peers.rs +++ b/src/catalog/src/system_schema/information_schema/region_peers.rs @@ -267,7 +267,7 @@ impl InformationSchemaRegionPeersBuilder { ]; if !predicates.eval(&row) { - return; + continue; } self.table_catalogs.push(Some(table_catalog)); diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs index 132e02fe14..8aabf64e99 100644 --- a/src/catalog/src/table_source.rs +++ b/src/catalog/src/table_source.rs @@ -151,7 +151,11 @@ impl DfTableSourceProvider { let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone())); let logical_plan = self .plan_decoder - .decode(Bytes::from(view_info.view_info.clone()), catalog_list, true) + .decode( + Bytes::from(view_info.view_info.clone()), + catalog_list, + false, + ) .await .context(DecodePlanSnafu { name: &table.table_info().name, diff --git a/src/cli/Cargo.toml b/src/cli/Cargo.toml index 46e79efd00..1eb2736007 100644 --- a/src/cli/Cargo.toml +++ b/src/cli/Cargo.toml @@ -65,6 +65,8 @@ store-api.workspace = true table.workspace = true tokio.workspace = true tracing-appender.workspace = true +url.workspace = true 
+uuid.workspace = true [dev-dependencies] common-meta = { workspace = true, features = ["testing"] } @@ -72,4 +74,3 @@ common-test-util.workspace = true common-version.workspace = true serde.workspace = true tempfile.workspace = true -url.workspace = true diff --git a/src/cli/src/data.rs b/src/cli/src/data.rs index 5966040a3b..114886542e 100644 --- a/src/cli/src/data.rs +++ b/src/cli/src/data.rs @@ -13,7 +13,12 @@ // limitations under the License. mod export; +pub mod export_v2; mod import; +pub mod import_v2; +pub(crate) mod path; +pub mod snapshot_storage; +pub(crate) mod sql; mod storage_export; use clap::Subcommand; @@ -22,15 +27,24 @@ use common_error::ext::BoxedError; use crate::Tool; use crate::data::export::ExportCommand; +use crate::data::export_v2::ExportV2Command; use crate::data::import::ImportCommand; +use crate::data::import_v2::ImportV2Command; pub(crate) const COPY_PATH_PLACEHOLDER: &str = ""; /// Command for data operations including exporting data from and importing data into GreptimeDB. #[derive(Subcommand)] pub enum DataCommand { + /// Export data (V1 - legacy). Export(ExportCommand), + /// Import data (V1 - legacy). Import(ImportCommand), + /// Export V2 - JSON-based schema export with manifest support. + #[clap(subcommand)] + ExportV2(ExportV2Command), + /// Import V2 - Import from V2 snapshot. 
+ ImportV2(ImportV2Command), } impl DataCommand { @@ -38,6 +52,8 @@ impl DataCommand { match self { DataCommand::Export(cmd) => cmd.build().await, DataCommand::Import(cmd) => cmd.build().await, + DataCommand::ExportV2(cmd) => cmd.build().await, + DataCommand::ImportV2(cmd) => cmd.build().await, } } } diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs index 1cdb159336..b5d547d4f3 100644 --- a/src/cli/src/data/export.rs +++ b/src/cli/src/data/export.rs @@ -107,13 +107,16 @@ pub struct ExportCommand { #[clap(long, value_parser = humantime::parse_duration)] timeout: Option, - /// The proxy server address to connect, if set, will override the system proxy. + /// The proxy server address to connect. /// - /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set. + /// If set, it overrides the system proxy unless `--no-proxy` is specified. + /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used. #[clap(long)] proxy: Option, - /// Disable proxy server, if set, will not use any proxy. + /// Disable all proxy usage (ignores `--proxy` and system proxy). + /// + /// When set and `--proxy` is not provided, this explicitly disables system proxy. #[clap(long)] no_proxy: bool, @@ -173,6 +176,7 @@ impl ExportCommand { // Treats `None` as `0s` to disable server-side default timeout. self.timeout.unwrap_or_default(), proxy, + self.no_proxy, ); Ok(Box::new(Export { diff --git a/src/cli/src/data/export_v2.rs b/src/cli/src/data/export_v2.rs new file mode 100644 index 0000000000..91020d2f2e --- /dev/null +++ b/src/cli/src/data/export_v2.rs @@ -0,0 +1,49 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Export V2 module. +//! +//! This module provides the V2 implementation of database export functionality, +//! featuring: +//! - JSON-based schema export (version-agnostic) +//! - Manifest-based snapshot management +//! - Support for multiple storage backends (S3, OSS, GCS, Azure Blob, local FS) +//! - Resume capability for interrupted exports +//! +//! # Example +//! +//! ```bash +//! # Export schema only +//! greptime cli data export-v2 create \ +//! --addr 127.0.0.1:4000 \ +//! --to file:///tmp/snapshot \ +//! --schema-only +//! +//! # Export with time range (M2) +//! greptime cli data export-v2 create \ +//! --addr 127.0.0.1:4000 \ +//! --to s3://bucket/snapshots/prod-20250101 \ +//! --start-time 2025-01-01T00:00:00Z \ +//! --end-time 2025-01-31T23:59:59Z +//! ``` + +mod command; +pub mod error; +pub mod extractor; +pub mod manifest; +pub mod schema; +pub use command::ExportV2Command; + +#[cfg(test)] +mod tests; diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs new file mode 100644 index 0000000000..341436fe0f --- /dev/null +++ b/src/cli/src/data/export_v2/command.rs @@ -0,0 +1,496 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Export V2 CLI commands. + +use std::collections::HashSet; +use std::time::Duration; + +use async_trait::async_trait; +use clap::{Parser, Subcommand}; +use common_error::ext::BoxedError; +use common_telemetry::info; +use serde_json::Value; +use snafu::{OptionExt, ResultExt}; + +use crate::Tool; +use crate::common::ObjectStoreConfig; +use crate::data::export_v2::error::{ + CannotResumeSchemaOnlySnafu, DataExportNotImplementedSnafu, DatabaseSnafu, EmptyResultSnafu, + ManifestVersionMismatchSnafu, Result, UnexpectedValueTypeSnafu, +}; +use crate::data::export_v2::extractor::SchemaExtractor; +use crate::data::export_v2::manifest::{DataFormat, MANIFEST_VERSION, Manifest}; +use crate::data::path::ddl_path_for_schema; +use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri}; +use crate::data::sql::{escape_sql_identifier, escape_sql_literal}; +use crate::database::{DatabaseClient, parse_proxy_opts}; + +/// Export V2 commands. +#[derive(Debug, Subcommand)] +pub enum ExportV2Command { + /// Create a new snapshot. + Create(ExportCreateCommand), +} + +impl ExportV2Command { + pub async fn build(&self) -> std::result::Result, BoxedError> { + match self { + ExportV2Command::Create(cmd) => cmd.build().await, + } + } +} + +/// Create a new snapshot. +#[derive(Debug, Parser)] +pub struct ExportCreateCommand { + /// Server address to connect (e.g., 127.0.0.1:4000). + #[clap(long)] + addr: String, + + /// Target storage location (e.g., s3://bucket/path, file:///tmp/backup). + #[clap(long)] + to: String, + + /// Catalog name. 
+ #[clap(long, default_value = "greptime")] + catalog: String, + + /// Schema list to export (default: all non-system schemas). + /// Can be specified multiple times or comma-separated. + #[clap(long, value_delimiter = ',')] + schemas: Vec, + + /// Export schema only, no data. + #[clap(long)] + schema_only: bool, + + /// Time range start (ISO 8601 format, e.g., 2024-01-01T00:00:00Z). + #[clap(long)] + start_time: Option, + + /// Time range end (ISO 8601 format, e.g., 2024-12-31T23:59:59Z). + #[clap(long)] + end_time: Option, + + /// Data format: parquet, csv, json. + #[clap(long, value_enum, default_value = "parquet")] + format: DataFormat, + + /// Delete existing snapshot and recreate. + #[clap(long)] + force: bool, + + /// Concurrency level (for future use). + #[clap(long, default_value = "1")] + parallelism: usize, + + /// Basic authentication (user:password). + #[clap(long)] + auth_basic: Option, + + /// Request timeout. + #[clap(long, value_parser = humantime::parse_duration)] + timeout: Option, + + /// Proxy server address. + /// + /// If set, it overrides the system proxy unless `--no-proxy` is specified. + /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used. + #[clap(long)] + proxy: Option, + + /// Disable all proxy usage (ignores `--proxy` and system proxy). + /// + /// When set and `--proxy` is not provided, this explicitly disables system proxy. + #[clap(long)] + no_proxy: bool, + + /// Object store configuration for remote storage backends. 
+ #[clap(flatten)] + storage: ObjectStoreConfig, +} + +impl ExportCreateCommand { + pub async fn build(&self) -> std::result::Result, BoxedError> { + // Validate URI format + validate_uri(&self.to).map_err(BoxedError::new)?; + + if !self.schema_only { + return DataExportNotImplementedSnafu + .fail() + .map_err(BoxedError::new); + } + + // Parse schemas (empty vec means all schemas) + let schemas = if self.schemas.is_empty() { + None + } else { + Some(self.schemas.clone()) + }; + + // Build storage + let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?; + + // Build database client + let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?; + let database_client = DatabaseClient::new( + self.addr.clone(), + self.catalog.clone(), + self.auth_basic.clone(), + self.timeout.unwrap_or(Duration::from_secs(60)), + proxy, + self.no_proxy, + ); + + Ok(Box::new(ExportCreate { + catalog: self.catalog.clone(), + schemas, + schema_only: self.schema_only, + _format: self.format, + force: self.force, + _parallelism: self.parallelism, + storage: Box::new(storage), + database_client, + })) + } +} + +/// Export tool implementation. +pub struct ExportCreate { + catalog: String, + schemas: Option>, + schema_only: bool, + _format: DataFormat, + force: bool, + _parallelism: usize, + storage: Box, + database_client: DatabaseClient, +} + +#[async_trait] +impl Tool for ExportCreate { + async fn do_work(&self) -> std::result::Result<(), BoxedError> { + self.run().await.map_err(BoxedError::new) + } +} + +impl ExportCreate { + async fn run(&self) -> Result<()> { + // 1. 
Check if snapshot exists + let exists = self.storage.exists().await?; + + if exists { + if self.force { + info!("Deleting existing snapshot (--force)"); + self.storage.delete_snapshot().await?; + } else { + // Resume mode - read existing manifest + let manifest = self.storage.read_manifest().await?; + + // Check version compatibility + if manifest.version != MANIFEST_VERSION { + return ManifestVersionMismatchSnafu { + expected: MANIFEST_VERSION, + found: manifest.version, + } + .fail(); + } + + // Cannot resume schema-only with data export + if manifest.schema_only && !self.schema_only { + return CannotResumeSchemaOnlySnafu.fail(); + } + + info!( + "Resuming existing snapshot: {} (completed: {}/{} chunks)", + manifest.snapshot_id, + manifest.completed_count(), + manifest.chunks.len() + ); + + // For M1, we only handle schema-only exports + // M2 will add chunk resume logic + if manifest.is_complete() { + info!("Snapshot is already complete"); + return Ok(()); + } + + // TODO: Resume data export in M2 + info!("Data export resume not yet implemented (M2)"); + return Ok(()); + } + } + + // 2. Get schema list + let extractor = SchemaExtractor::new(&self.database_client, &self.catalog); + let schema_snapshot = extractor.extract(self.schemas.as_deref()).await?; + + let schema_names: Vec = schema_snapshot + .schemas + .iter() + .map(|s| s.name.clone()) + .collect(); + info!("Exporting schemas: {:?}", schema_names); + + // 3. Create manifest + let manifest = Manifest::new_schema_only(self.catalog.clone(), schema_names.clone()); + + // 4. Write schema files + self.storage.write_schema(&schema_snapshot).await?; + info!("Exported {} schemas", schema_snapshot.schemas.len()); + + // 5. Export DDL files for import recovery. 
+ let ddl_by_schema = self.build_ddl_by_schema(&schema_names).await?; + for (schema, ddl) in ddl_by_schema { + let ddl_path = ddl_path_for_schema(&schema); + self.storage.write_text(&ddl_path, &ddl).await?; + info!("Exported DDL for schema {} to {}", schema, ddl_path); + } + + // 6. Write manifest last. + // + // The manifest is the snapshot commit point: only write it after the schema + // index and all DDL files are durable, so a crash cannot leave a "valid" + // snapshot that is missing required schema artifacts. + self.storage.write_manifest(&manifest).await?; + info!("Snapshot created: {}", manifest.snapshot_id); + + Ok(()) + } + + async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result> { + let mut schemas = schema_names.to_vec(); + schemas.sort(); + + let mut ddl_by_schema = Vec::with_capacity(schemas.len()); + for schema in schemas { + let create_database = self.show_create("DATABASE", &schema, None).await?; + + let (mut physical_tables, mut tables, mut views) = + self.get_schema_objects(&schema).await?; + physical_tables.sort(); + let mut physical_ddls = Vec::with_capacity(physical_tables.len()); + for table in physical_tables { + physical_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?); + } + + tables.sort(); + let mut table_ddls = Vec::with_capacity(tables.len()); + for table in tables { + table_ddls.push(self.show_create("TABLE", &schema, Some(&table)).await?); + } + + views.sort(); + let mut view_ddls = Vec::with_capacity(views.len()); + for view in views { + view_ddls.push(self.show_create("VIEW", &schema, Some(&view)).await?); + } + + let ddl = build_schema_ddl( + &schema, + create_database, + physical_ddls, + table_ddls, + view_ddls, + ); + ddl_by_schema.push((schema, ddl)); + } + + Ok(ddl_by_schema) + } + + async fn get_schema_objects( + &self, + schema: &str, + ) -> Result<(Vec, Vec, Vec)> { + let physical_tables = self.get_metric_physical_tables(schema).await?; + let physical_set: HashSet<&str> = 
physical_tables.iter().map(String::as_str).collect(); + let sql = format!( + "SELECT table_name, table_type FROM information_schema.tables \ + WHERE table_catalog = '{}' AND table_schema = '{}' \ + AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')", + escape_sql_literal(&self.catalog), + escape_sql_literal(schema) + ); + let records: Option>> = self + .database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + + let mut tables = Vec::new(); + let mut views = Vec::new(); + if let Some(rows) = records { + for row in rows { + let name = match row.first() { + Some(Value::String(name)) => name.clone(), + _ => return UnexpectedValueTypeSnafu.fail(), + }; + let table_type = match row.get(1) { + Some(Value::String(table_type)) => table_type.as_str(), + _ => return UnexpectedValueTypeSnafu.fail(), + }; + if !physical_set.contains(name.as_str()) { + if table_type == "VIEW" { + views.push(name); + } else { + tables.push(name); + } + } + } + } + + Ok((physical_tables, tables, views)) + } + + async fn get_metric_physical_tables(&self, schema: &str) -> Result> { + let sql = format!( + "SELECT DISTINCT table_name FROM information_schema.columns \ + WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'", + escape_sql_literal(&self.catalog), + escape_sql_literal(schema) + ); + let records: Option>> = self + .database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + + let mut tables = HashSet::new(); + if let Some(rows) = records { + for row in rows { + let name = match row.first() { + Some(Value::String(name)) => name.clone(), + _ => return UnexpectedValueTypeSnafu.fail(), + }; + tables.insert(name); + } + } + + Ok(tables.into_iter().collect()) + } + + async fn show_create( + &self, + show_type: &str, + schema: &str, + table: Option<&str>, + ) -> Result { + let sql = match table { + Some(table) => format!( + r#"SHOW CREATE {} "{}"."{}"."{}""#, + show_type, + escape_sql_identifier(&self.catalog), + 
escape_sql_identifier(schema), + escape_sql_identifier(table) + ), + None => format!( + r#"SHOW CREATE {} "{}"."{}""#, + show_type, + escape_sql_identifier(&self.catalog), + escape_sql_identifier(schema) + ), + }; + + let records: Option>> = self + .database_client + .sql_in_public(&sql) + .await + .context(DatabaseSnafu)?; + let rows = records.context(EmptyResultSnafu)?; + let row = rows.first().context(EmptyResultSnafu)?; + let Some(Value::String(create)) = row.get(1) else { + return UnexpectedValueTypeSnafu.fail(); + }; + + Ok(format!("{};\n", create)) + } +} + +fn build_schema_ddl( + schema: &str, + create_database: String, + physical_tables: Vec, + tables: Vec, + views: Vec, +) -> String { + let mut ddl = String::new(); + ddl.push_str(&format!("-- Schema: {}\n", schema)); + ddl.push_str(&create_database); + for stmt in physical_tables { + ddl.push_str(&stmt); + } + for stmt in tables { + ddl.push_str(&stmt); + } + for stmt in views { + ddl.push_str(&stmt); + } + ddl.push('\n'); + ddl +} + +#[cfg(test)] +mod tests { + use clap::Parser; + + use super::*; + use crate::data::path::ddl_path_for_schema; + + #[test] + fn test_ddl_path_for_schema() { + assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql"); + assert_eq!( + ddl_path_for_schema("../evil"), + "schema/ddl/%2E%2E%2Fevil.sql" + ); + } + + #[test] + fn test_build_schema_ddl_order() { + let ddl = build_schema_ddl( + "public", + "CREATE DATABASE public;\n".to_string(), + vec!["PHYSICAL;\n".to_string()], + vec!["TABLE;\n".to_string()], + vec!["VIEW;\n".to_string()], + ); + + let db_pos = ddl.find("CREATE DATABASE").unwrap(); + let physical_pos = ddl.find("PHYSICAL;").unwrap(); + let table_pos = ddl.find("TABLE;").unwrap(); + let view_pos = ddl.find("VIEW;").unwrap(); + assert!(db_pos < physical_pos); + assert!(physical_pos < table_pos); + assert!(table_pos < view_pos); + } + + #[tokio::test] + async fn test_build_rejects_non_schema_only_export() { + let cmd = ExportCreateCommand::parse_from([ + 
"export-v2-create", + "--addr", + "127.0.0.1:4000", + "--to", + "file:///tmp/export-v2-test", + ]); + + let result = cmd.build().await; + assert!(result.is_err()); + let error = result.err().unwrap().to_string(); + + assert!(error.contains("Data export is not implemented yet")); + } +} diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs new file mode 100644 index 0000000000..2db71d5326 --- /dev/null +++ b/src/cli/src/data/export_v2/error.rs @@ -0,0 +1,181 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; + +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use snafu::{Location, Snafu}; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display("Invalid URI '{}': {}", uri, reason))] + InvalidUri { + uri: String, + reason: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unsupported storage scheme: {}", scheme))] + UnsupportedScheme { + scheme: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Storage operation '{}' failed", operation))] + StorageOperation { + operation: String, + #[snafu(source)] + error: object_store::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse manifest"))] + ManifestParse { + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to serialize manifest"))] + ManifestSerialize { + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to decode text file as UTF-8"))] + TextDecode { + #[snafu(source)] + error: std::string::FromUtf8Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Cannot resume schema-only snapshot with data export. Use --force to recreate." + ))] + CannotResumeSchemaOnly { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Data export is not implemented yet. Use --schema-only to create a schema snapshot." 
+ ))] + DataExportNotImplemented { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Empty result from query"))] + EmptyResult { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unexpected value type in query result"))] + UnexpectedValueType { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Database error"))] + Database { + #[snafu(source)] + error: crate::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Snapshot not found at '{}'", uri))] + SnapshotNotFound { + uri: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Schema '{}' not found in catalog '{}'", schema, catalog))] + SchemaNotFound { + catalog: String, + schema: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to parse URL"))] + UrlParse { + #[snafu(source)] + error: url::ParseError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to build object store"))] + BuildObjectStore { + #[snafu(source)] + error: object_store::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))] + ManifestVersionMismatch { + expected: u32, + found: u32, + #[snafu(implicit)] + location: Location, + }, +} + +pub type Result = std::result::Result; + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + match self { + Error::InvalidUri { .. } + | Error::UnsupportedScheme { .. } + | Error::CannotResumeSchemaOnly { .. } + | Error::DataExportNotImplemented { .. } + | Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments, + + Error::StorageOperation { .. } + | Error::ManifestParse { .. } + | Error::ManifestSerialize { .. } + | Error::TextDecode { .. } + | Error::BuildObjectStore { .. } => StatusCode::StorageUnavailable, + + Error::EmptyResult { .. } + | Error::UnexpectedValueType { .. } + | Error::UrlParse { .. 
} => StatusCode::Internal, + + Error::Database { error, .. } => error.status_code(), + + Error::SnapshotNotFound { .. } => StatusCode::InvalidArguments, + Error::SchemaNotFound { .. } => StatusCode::DatabaseNotFound, + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/cli/src/data/export_v2/extractor.rs b/src/cli/src/data/export_v2/extractor.rs new file mode 100644 index 0000000000..ae15b199af --- /dev/null +++ b/src/cli/src/data/export_v2/extractor.rs @@ -0,0 +1,254 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Schema extraction from information_schema. +//! +//! For V2 DDL-only snapshots, extractor only persists the schema index. + +use std::collections::{HashMap, HashSet}; + +use serde_json::Value; +use snafu::ResultExt; + +use crate::data::export_v2::error::{ + DatabaseSnafu, EmptyResultSnafu, Result, SchemaNotFoundSnafu, UnexpectedValueTypeSnafu, +}; +use crate::data::export_v2::schema::{SchemaDefinition, SchemaSnapshot}; +use crate::data::sql::escape_sql_literal; +use crate::database::DatabaseClient; + +/// System schemas that should be excluded from export. +const SYSTEM_SCHEMAS: &[&str] = &["information_schema", "pg_catalog"]; + +/// Extracts schema definitions from information_schema. +pub struct SchemaExtractor<'a> { + client: &'a DatabaseClient, + catalog: &'a str, +} + +impl<'a> SchemaExtractor<'a> { + /// Creates a new schema extractor. 
+ pub fn new(client: &'a DatabaseClient, catalog: &'a str) -> Self { + Self { client, catalog } + } + + /// Extracts the schema index for the given schemas. + /// + /// If `schemas` is None, extracts all non-system schemas. + pub async fn extract(&self, schemas: Option<&[String]>) -> Result { + let mut snapshot = SchemaSnapshot::new(); + + let schema_names = match schemas { + Some(names) => self.validate_schemas(names).await?, + None => self.get_all_schemas().await?, + }; + + for schema_name in &schema_names { + let schema_def = self.extract_schema_definition(schema_name).await?; + snapshot.add_schema(schema_def); + } + + Ok(snapshot) + } + + /// Gets all non-system schemas in the catalog. + async fn get_all_schemas(&self) -> Result> { + let sql = format!( + "SELECT schema_name FROM information_schema.schemata \ + WHERE catalog_name = '{}'", + escape_sql_literal(self.catalog) + ); + + let records = self.query(&sql).await?; + let mut schemas = Vec::new(); + + for row in records { + let name = extract_string(&row, 0)?; + if !SYSTEM_SCHEMAS.contains(&name.as_str()) { + schemas.push(name); + } + } + + Ok(schemas) + } + + /// Validates that all specified schemas exist. + async fn validate_schemas(&self, schemas: &[String]) -> Result> { + let all_schemas = self.get_all_schemas().await?; + dedupe_canonicalized_schemas(schemas, &all_schemas, self.catalog) + } + + /// Extracts schema (database) definition. 
+ async fn extract_schema_definition(&self, schema: &str) -> Result { + let sql = format!( + "SELECT schema_name, options FROM information_schema.schemata \ + WHERE catalog_name = '{}' AND schema_name = '{}'", + escape_sql_literal(self.catalog), + escape_sql_literal(schema) + ); + + let records = self.query(&sql).await?; + if records.is_empty() { + return SchemaNotFoundSnafu { + catalog: self.catalog, + schema, + } + .fail(); + } + + let name = extract_string(&records[0], 0)?; + let options = extract_optional_string(&records[0], 1) + .map(|opts| parse_options(&opts)) + .unwrap_or_default(); + + Ok(SchemaDefinition { + catalog: self.catalog.to_string(), + name, + options, + }) + } + + /// Executes a SQL query and returns the results. + async fn query(&self, sql: &str) -> Result>> { + self.client + .sql_in_public(sql) + .await + .context(DatabaseSnafu)? + .ok_or_else(|| EmptyResultSnafu.build()) + } +} + +/// Extracts a string value from a row. +fn extract_string(row: &[Value], index: usize) -> Result { + match row.get(index) { + Some(Value::String(s)) => Ok(s.clone()), + Some(Value::Null) => UnexpectedValueTypeSnafu.fail(), + _ => UnexpectedValueTypeSnafu.fail(), + } +} + +/// Extracts an optional string value from a row. +fn extract_optional_string(row: &[Value], index: usize) -> Option { + match row.get(index) { + Some(Value::String(s)) if !s.is_empty() => Some(s.clone()), + _ => None, + } +} + +/// Parses options string into a HashMap. 
+fn parse_options(options_str: &str) -> HashMap { + if let Ok(map) = serde_json::from_str::>(options_str) { + return map; + } + + let mut options = HashMap::new(); + for line in options_str.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + + if let Some((key, value)) = parse_quoted_option_line(trimmed) { + options.insert(key, value); + continue; + } + + for part in trimmed.split_whitespace() { + if let Some((key, value)) = part.split_once('=') { + options.insert(key.to_string(), value.to_string()); + } + } + } + options +} + +fn parse_quoted_option_line(line: &str) -> Option<(String, String)> { + let key = line.strip_prefix('\'')?; + let (key, rest) = key.split_once("'='")?; + let value = rest.strip_suffix('\'')?; + Some((key.to_string(), value.to_string())) +} + +fn dedupe_canonicalized_schemas( + requested: &[String], + available: &[String], + catalog: &str, +) -> Result> { + let mut canonicalized = Vec::new(); + let mut seen = HashSet::new(); + + for schema in requested { + let Some(canonical) = available.iter().find(|s| s.eq_ignore_ascii_case(schema)) else { + return SchemaNotFoundSnafu { catalog, schema }.fail(); + }; + + if seen.insert(canonical.to_ascii_lowercase()) { + canonicalized.push(canonical.clone()); + } + } + + Ok(canonicalized) +} + +#[cfg(test)] +mod tests { + use serde_json::Value; + + use super::*; + + #[test] + fn test_parse_options_json() { + let opts = r#"{"ttl": "30d", "custom": "value"}"#; + let parsed = parse_options(opts); + assert_eq!(parsed.get("ttl"), Some(&"30d".to_string())); + assert_eq!(parsed.get("custom"), Some(&"value".to_string())); + } + + #[test] + fn test_parse_options_key_value() { + let opts = "ttl=30d custom=value"; + let parsed = parse_options(opts); + assert_eq!(parsed.get("ttl"), Some(&"30d".to_string())); + assert_eq!(parsed.get("custom"), Some(&"value".to_string())); + } + + #[test] + fn test_parse_options_schema_display_format() { + let opts = "'ttl'='30d'\n'custom'='value with 
spaces'\n"; + let parsed = parse_options(opts); + assert_eq!(parsed.get("ttl"), Some(&"30d".to_string())); + assert_eq!(parsed.get("custom"), Some(&"value with spaces".to_string())); + } + + #[test] + fn test_extract_string_rejects_null() { + let row = vec![Value::Null]; + assert!(extract_string(&row, 0).is_err()); + } + + #[test] + fn test_dedupe_canonicalized_schemas() { + let available = vec!["public".to_string(), "test_db".to_string()]; + let requested = vec![ + "PUBLIC".to_string(), + "public".to_string(), + "Test_Db".to_string(), + ]; + + let canonicalized = dedupe_canonicalized_schemas(&requested, &available, "greptime") + .expect("schemas should be canonicalized"); + + assert_eq!(canonicalized, vec!["public", "test_db"]); + } +} diff --git a/src/cli/src/data/export_v2/manifest.rs b/src/cli/src/data/export_v2/manifest.rs new file mode 100644 index 0000000000..0ebf753fa4 --- /dev/null +++ b/src/cli/src/data/export_v2/manifest.rs @@ -0,0 +1,381 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Manifest data structures for Export/Import V2. + +use std::{fmt, str}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Current manifest format version. +pub const MANIFEST_VERSION: u32 = 1; + +/// Manifest file name within snapshot directory. +pub const MANIFEST_FILE: &str = "manifest.json"; + +/// Time range for data export (half-open interval: [start, end)). 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct TimeRange { + /// Start time (inclusive). None means earliest available data. + #[serde(skip_serializing_if = "Option::is_none")] + pub start: Option>, + /// End time (exclusive). None means current time. + #[serde(skip_serializing_if = "Option::is_none")] + pub end: Option>, +} + +impl TimeRange { + /// Creates a new time range with specified bounds. + pub fn new(start: Option>, end: Option>) -> Self { + Self { start, end } + } + + /// Creates an unbounded time range (all data). + pub fn unbounded() -> Self { + Self { + start: None, + end: None, + } + } + + /// Returns true if this time range is unbounded. + pub fn is_unbounded(&self) -> bool { + self.start.is_none() && self.end.is_none() + } +} + +impl Default for TimeRange { + fn default() -> Self { + Self::unbounded() + } +} + +/// Status of a chunk during export/import. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum ChunkStatus { + /// Chunk is pending export. + #[default] + Pending, + /// Chunk export is in progress. + InProgress, + /// Chunk export completed successfully. + Completed, + /// Chunk export failed. + Failed, +} + +/// Metadata for a single chunk of exported data. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChunkMeta { + /// Chunk identifier (sequential number starting from 1). + pub id: u32, + /// Time range covered by this chunk. + pub time_range: TimeRange, + /// Export status. + pub status: ChunkStatus, + /// List of data files in this chunk (relative paths from snapshot root). + #[serde(default)] + pub files: Vec, + /// SHA256 checksum of all files in this chunk (aggregated). + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum: Option, + /// Error message if status is Failed. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +impl ChunkMeta { + /// Creates a new pending chunk with the given id and time range. + pub fn new(id: u32, time_range: TimeRange) -> Self { + Self { + id, + time_range, + status: ChunkStatus::Pending, + files: vec![], + checksum: None, + error: None, + } + } + + /// Marks this chunk as in progress. + pub fn mark_in_progress(&mut self) { + self.status = ChunkStatus::InProgress; + self.error = None; + } + + /// Marks this chunk as completed with the given files and checksum. + pub fn mark_completed(&mut self, files: Vec, checksum: Option) { + self.status = ChunkStatus::Completed; + self.files = files; + self.checksum = checksum; + self.error = None; + } + + /// Marks this chunk as failed with the given error message. + pub fn mark_failed(&mut self, error: String) { + self.status = ChunkStatus::Failed; + self.error = Some(error); + } +} + +/// Supported data formats for export. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default, clap::ValueEnum)] +#[serde(rename_all = "lowercase")] +#[value(rename_all = "lowercase")] +pub enum DataFormat { + /// Apache Parquet format (default, recommended for production). + #[default] + Parquet, + /// CSV format (human-readable). + Csv, + /// JSON format (structured text). + Json, +} + +impl fmt::Display for DataFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DataFormat::Parquet => write!(f, "parquet"), + DataFormat::Csv => write!(f, "csv"), + DataFormat::Json => write!(f, "json"), + } + } +} + +impl str::FromStr for DataFormat { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "parquet" => Ok(DataFormat::Parquet), + "csv" => Ok(DataFormat::Csv), + "json" => Ok(DataFormat::Json), + _ => Err(format!( + "invalid format '{}': expected one of parquet, csv, json", + s + )), + } + } +} + +/// Snapshot manifest containing all metadata. 
+/// +/// The manifest is stored as `manifest.json` in the snapshot root directory. +/// It contains: +/// - Snapshot identification (UUID, timestamps) +/// - Scope (catalog, schemas, time range) +/// - Export configuration (format, schema_only) +/// - Chunk metadata for resume support +/// - Integrity checksums +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Manifest { + /// Manifest format version for compatibility checking. + pub version: u32, + /// Unique snapshot identifier. + pub snapshot_id: Uuid, + /// Catalog name. + pub catalog: String, + /// List of schemas included in this snapshot. + pub schemas: Vec, + /// Overall time range covered by this snapshot. + pub time_range: TimeRange, + /// Whether this is a schema-only snapshot (no data). + pub schema_only: bool, + /// Data format used for export. + pub format: DataFormat, + /// Chunk metadata (empty for schema-only snapshots). + #[serde(default)] + pub chunks: Vec, + /// Snapshot-level SHA256 checksum (aggregated from all chunks). + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum: Option, + /// Creation timestamp. + pub created_at: DateTime, + /// Last updated timestamp. + pub updated_at: DateTime, +} + +impl Manifest { + /// Creates a new manifest for schema-only export. + pub fn new_schema_only(catalog: String, schemas: Vec) -> Self { + let now = Utc::now(); + Self { + version: MANIFEST_VERSION, + snapshot_id: Uuid::new_v4(), + catalog, + schemas, + time_range: TimeRange::unbounded(), + schema_only: true, + format: DataFormat::Parquet, + chunks: vec![], + checksum: None, + created_at: now, + updated_at: now, + } + } + + /// Creates a new manifest for full export with time range and format. 
+ pub fn new_full( + catalog: String, + schemas: Vec, + time_range: TimeRange, + format: DataFormat, + ) -> Self { + let now = Utc::now(); + Self { + version: MANIFEST_VERSION, + snapshot_id: Uuid::new_v4(), + catalog, + schemas, + time_range, + schema_only: false, + format, + chunks: vec![], + checksum: None, + created_at: now, + updated_at: now, + } + } + + /// Returns true if all chunks are completed (or if schema-only). + pub fn is_complete(&self) -> bool { + self.schema_only + || (!self.chunks.is_empty() + && self + .chunks + .iter() + .all(|c| c.status == ChunkStatus::Completed)) + } + + /// Returns the number of pending chunks. + pub fn pending_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Pending) + .count() + } + + /// Returns the number of in-progress chunks. + pub fn in_progress_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::InProgress) + .count() + } + + /// Returns the number of completed chunks. + pub fn completed_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Completed) + .count() + } + + /// Returns the number of failed chunks. + pub fn failed_count(&self) -> usize { + self.chunks + .iter() + .filter(|c| c.status == ChunkStatus::Failed) + .count() + } + + /// Updates the `updated_at` timestamp to now. + pub fn touch(&mut self) { + self.updated_at = Utc::now(); + } + + /// Adds a chunk to the manifest. + pub fn add_chunk(&mut self, chunk: ChunkMeta) { + self.chunks.push(chunk); + self.touch(); + } + + /// Updates a chunk by id. 
+ pub fn update_chunk(&mut self, id: u32, updater: impl FnOnce(&mut ChunkMeta)) { + if let Some(chunk) = self.chunks.iter_mut().find(|c| c.id == id) { + updater(chunk); + self.touch(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_time_range_serialization() { + let range = TimeRange::unbounded(); + let json = serde_json::to_string(&range).unwrap(); + assert_eq!(json, "{}"); + + let range: TimeRange = serde_json::from_str("{}").unwrap(); + assert!(range.is_unbounded()); + } + + #[test] + fn test_manifest_schema_only() { + let manifest = + Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]); + + assert_eq!(manifest.version, MANIFEST_VERSION); + assert!(manifest.schema_only); + assert!(manifest.chunks.is_empty()); + assert!(manifest.is_complete()); + } + + #[test] + fn test_manifest_full() { + let manifest = Manifest::new_full( + "greptime".to_string(), + vec!["public".to_string()], + TimeRange::unbounded(), + DataFormat::Parquet, + ); + + assert!(!manifest.schema_only); + assert!(manifest.chunks.is_empty()); + assert!(!manifest.is_complete()); + } + + #[test] + fn test_data_format_parsing() { + assert_eq!( + "parquet".parse::().unwrap(), + DataFormat::Parquet + ); + assert_eq!("CSV".parse::().unwrap(), DataFormat::Csv); + assert_eq!("JSON".parse::().unwrap(), DataFormat::Json); + assert!("invalid".parse::().is_err()); + } + + #[test] + fn test_chunk_status_transitions() { + let mut chunk = ChunkMeta::new(1, TimeRange::unbounded()); + assert_eq!(chunk.status, ChunkStatus::Pending); + + chunk.mark_in_progress(); + assert_eq!(chunk.status, ChunkStatus::InProgress); + + chunk.mark_completed( + vec!["file1.parquet".to_string()], + Some("abc123".to_string()), + ); + assert_eq!(chunk.status, ChunkStatus::Completed); + assert_eq!(chunk.files.len(), 1); + } +} diff --git a/src/cli/src/data/export_v2/schema.rs b/src/cli/src/data/export_v2/schema.rs new file mode 100644 index 0000000000..1aab6ac900 --- /dev/null +++ 
b/src/cli/src/data/export_v2/schema.rs
@@ -0,0 +1,99 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Minimal schema index structures for Export/Import V2.
+//!
+//! The canonical schema representation is the per-schema DDL file under
+//! `schema/ddl/`. `schemas.json` only records which schemas exist in a snapshot.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// Schema directory name within snapshot.
+pub const SCHEMA_DIR: &str = "schema";
+
+/// DDL directory name within schema directory.
+pub const DDL_DIR: &str = "ddl";
+
+/// Schema definition file name.
+pub const SCHEMAS_FILE: &str = "schemas.json";
+
+/// Schema (database) definition.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SchemaDefinition {
+    /// Catalog name.
+    pub catalog: String,
+    /// Schema (database) name.
+    pub name: String,
+    /// Schema options (if any).
+    // NOTE(review): key/value types presumed to be `String` - confirm against callers.
+    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
+    pub options: HashMap<String, String>,
+}
+
+/// Minimal schema index stored in a snapshot.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SchemaSnapshot {
+    /// Schema (database) definitions.
+    pub schemas: Vec<SchemaDefinition>,
+}
+
+impl SchemaSnapshot {
+    /// Creates an empty schema snapshot.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Adds a schema definition.
+    pub fn add_schema(&mut self, schema: SchemaDefinition) {
+        self.schemas.push(schema);
+    }
+
+    /// Filters the snapshot to only include specified schemas.
+    pub fn filter_schemas(&self, schemas: &[String]) -> Self {
+        Self {
+            schemas: self
+                .schemas
+                .iter()
+                .filter(|s| schemas.contains(&s.name))
+                .cloned()
+                .collect(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_schema_snapshot_filter() {
+        let mut snapshot = SchemaSnapshot::new();
+        snapshot.add_schema(SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "public".to_string(),
+            options: HashMap::new(),
+        });
+        snapshot.add_schema(SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "private".to_string(),
+            options: HashMap::new(),
+        });
+
+        let filtered = snapshot.filter_schemas(&["public".to_string()]);
+        assert_eq!(filtered.schemas.len(), 1);
+        assert_eq!(filtered.schemas[0].name, "public");
+    }
+}
diff --git a/src/cli/src/data/export_v2/tests.rs b/src/cli/src/data/export_v2/tests.rs
new file mode 100644
index 0000000000..bd28801a0d
--- /dev/null
+++ b/src/cli/src/data/export_v2/tests.rs
@@ -0,0 +1,341 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::env;
+use std::time::Duration;
+
+use clap::Parser;
+use common_error::ext::BoxedError;
+use snafu::ResultExt;
+use tempfile::tempdir;
+use url::Url;
+
+use super::command::ExportCreateCommand;
+use crate::common::ObjectStoreConfig;
+use crate::data::import_v2::ImportV2Command;
+use crate::data::snapshot_storage::OpenDalStorage;
+use crate::database::DatabaseClient;
+use crate::error::{FileIoSnafu, InvalidArgumentsSnafu, OtherSnafu, Result};
+
+#[tokio::test]
+#[ignore]
+async fn export_import_v2_schema_parity_e2e() -> Result<()> {
+    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
+    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
+    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
+    let schema = "test_db_schema_parity";
+
+    let database_client = DatabaseClient::new(
+        addr.clone(),
+        catalog.clone(),
+        auth_basic.clone(),
+        Duration::from_secs(60),
+        None,
+        false,
+    );
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+    database_client
+        .sql_in_public(&format!("CREATE DATABASE {schema}"))
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING PRIMARY KEY, \
+                cpu DOUBLE DEFAULT 0.0, \
+                region_name STRING \
+            ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE logs (\
+                ts TIMESTAMP TIME INDEX, \
+                app STRING PRIMARY KEY, \
+                msg STRING NOT NULL COMMENT 'log message' \
+            ) ENGINE = mito",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics_physical (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING, \
+                region_name STRING, \
+                cpu DOUBLE DEFAULT 0.0, \
+                PRIMARY KEY (host, region_name) \
+            ) ENGINE = metric WITH (physical_metric_table='true')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics_logical (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING, \
+                region_name STRING, \
+                cpu DOUBLE DEFAULT 0.0, \
+                PRIMARY KEY (host, region_name) \
+            ) ENGINE = metric WITH (on_physical_table='metrics_physical')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE VIEW metrics_view AS SELECT * FROM metrics WHERE cpu > 0.5",
+            schema,
+        )
+        .await?;
+
+    let src_dir = tempdir().context(FileIoSnafu)?;
+    let src_uri = Url::from_directory_path(src_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE {schema}"))
+        .await?;
+
+    let mut import_args = vec![
+        "import-v2",
+        "--addr",
+        &addr,
+        "--from",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+    ];
+    if let Some(auth) = &auth_basic {
+        import_args.push("--auth-basic");
+        import_args.push(auth);
+    }
+    let import_cmd = ImportV2Command::parse_from(import_args);
+    import_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    let dst_dir = tempdir().context(FileIoSnafu)?;
+    let dst_uri = Url::from_directory_path(dst_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &dst_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    let storage_config = ObjectStoreConfig::default();
+    let src_storage = OpenDalStorage::from_uri(&src_uri, &storage_config)
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    let dst_storage = OpenDalStorage::from_uri(&dst_uri, &storage_config)
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+
+    let src_schema_snapshot = src_storage
+        .read_schema()
+        .await
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    let dst_schema_snapshot = dst_storage
+        .read_schema()
+        .await
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    assert_eq!(src_schema_snapshot, dst_schema_snapshot);
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore]
+async fn import_v2_ddl_dry_run_e2e() -> Result<()> {
+    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
+    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
+    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
+    let schema = "test_db_ddl_dry_run";
+
+    let database_client = DatabaseClient::new(
+        addr.clone(),
+        catalog.clone(),
+        auth_basic.clone(),
+        Duration::from_secs(60),
+        None,
+        false,
+    );
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+    database_client
+        .sql_in_public(&format!("CREATE DATABASE {schema}"))
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING PRIMARY KEY, \
+                cpu DOUBLE DEFAULT 0.0, \
+                region_name STRING \
+            ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE logs (\
+                ts TIMESTAMP TIME INDEX, \
+                app STRING PRIMARY KEY, \
+                msg STRING NOT NULL COMMENT 'log message' \
+            ) ENGINE = mito",
+            schema,
+        )
+        .await?;
+
+    let src_dir = tempdir().context(FileIoSnafu)?;
+    let src_uri = Url::from_directory_path(src_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    let mut import_args = vec![
+        "import-v2",
+        "--addr",
+        &addr,
+        "--from",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--dry-run",
+    ];
+    if let Some(auth) = &auth_basic {
+        import_args.push("--auth-basic");
+        import_args.push(auth);
+    }
+    let import_cmd = ImportV2Command::parse_from(import_args);
+    import_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+
+    Ok(())
+}
diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs
index ffe8b62c7e..f5c234f1a7 100644
--- a/src/cli/src/data/import.rs
+++ b/src/cli/src/data/import.rs
@@ -81,13 +81,16 @@ pub struct ImportCommand {
     #[clap(long, value_parser = humantime::parse_duration)]
     timeout: Option<Duration>,
 
-    /// The proxy server address to connect, if set, will override the system proxy.
+    /// The proxy server address to connect.
     ///
-    /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
     #[clap(long)]
     proxy: Option<String>,
 
-    /// Disable proxy server, if set, will not use any proxy.
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
     #[clap(long, default_value = "false")]
     no_proxy: bool,
 }
@@ -104,6 +107,7 @@ impl ImportCommand {
             // Treats `None` as `0s` to disable server-side default timeout.
             self.timeout.unwrap_or_default(),
             proxy,
+            self.no_proxy,
         );
 
         Ok(Box::new(Import {
@@ -314,6 +318,7 @@ mod tests {
                 None,
                 Duration::from_secs(0),
                 None,
+                false,
             ),
             input_dir: input_dir.to_string(),
             parallelism: 1,
diff --git a/src/cli/src/data/import_v2.rs b/src/cli/src/data/import_v2.rs
new file mode 100644
index 0000000000..772e18cc93
--- /dev/null
+++ b/src/cli/src/data/import_v2.rs
@@ -0,0 +1,41 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Import V2 module.
+//!
+//! This module provides the V2 implementation of database import functionality,
+//! featuring:
+//! - DDL-based schema import
+//! - Dry-run mode for verification
+//!
+//! # Example
+//!
+//! ```bash
+//! # Dry-run import (verify without executing)
+//! greptime cli data import-v2 \
+//!
--addr 127.0.0.1:4000 \
+//!     --from file:///tmp/snapshot \
+//!     --dry-run
+//!
+//! # Actual import
+//! greptime cli data import-v2 \
+//!     --addr 127.0.0.1:4000 \
+//!     --from s3://bucket/snapshots/prod-20250101
+//! ```
+
+mod command;
+pub mod error;
+pub mod executor;
+
+pub use command::ImportV2Command;
diff --git a/src/cli/src/data/import_v2/command.rs b/src/cli/src/data/import_v2/command.rs
new file mode 100644
index 0000000000..544763d92b
--- /dev/null
+++ b/src/cli/src/data/import_v2/command.rs
@@ -0,0 +1,556 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Import V2 CLI command.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use clap::Parser;
+use common_error::ext::BoxedError;
+use common_telemetry::info;
+use snafu::ResultExt;
+
+use crate::Tool;
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::manifest::MANIFEST_VERSION;
+use crate::data::import_v2::error::{
+    ManifestVersionMismatchSnafu, Result, SchemaNotInSnapshotSnafu, SnapshotStorageSnafu,
+};
+use crate::data::import_v2::executor::{DdlExecutor, DdlStatement};
+use crate::data::path::ddl_path_for_schema;
+use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::database::{DatabaseClient, parse_proxy_opts};
+
+/// Import from a snapshot.
+#[derive(Debug, Parser)]
+pub struct ImportV2Command {
+    /// Server address to connect (e.g., 127.0.0.1:4000).
+    #[clap(long)]
+    addr: String,
+
+    /// Source snapshot location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    from: String,
+
+    /// Target catalog name.
+    #[clap(long, default_value = "greptime")]
+    catalog: String,
+
+    /// Schema list to import (default: all in snapshot).
+    /// Can be specified multiple times or comma-separated.
+    #[clap(long, value_delimiter = ',')]
+    schemas: Vec<String>,
+
+    /// Verify without importing (dry-run).
+    #[clap(long)]
+    dry_run: bool,
+
+    /// Concurrency level (for future use).
+    #[clap(long, default_value = "1")]
+    parallelism: usize,
+
+    /// Basic authentication (user:password).
+    #[clap(long)]
+    auth_basic: Option<String>,
+
+    /// Request timeout.
+    #[clap(long, value_parser = humantime::parse_duration)]
+    timeout: Option<Duration>,
+
+    /// Proxy server address.
+    ///
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
+    #[clap(long)]
+    proxy: Option<String>,
+
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
+    #[clap(long)]
+    no_proxy: bool,
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ImportV2Command {
+    /// Builds the import tool from the parsed arguments: validates the source URI,
+    /// opens the snapshot storage and creates the database client.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        // Validate URI format
+        validate_uri(&self.from)
+            .context(SnapshotStorageSnafu)
+            .map_err(BoxedError::new)?;
+
+        // Parse schemas (empty vec means all schemas)
+        let schemas = if self.schemas.is_empty() {
+            None
+        } else {
+            Some(self.schemas.clone())
+        };
+
+        // Build storage
+        let storage = OpenDalStorage::from_uri(&self.from, &self.storage)
+            .context(SnapshotStorageSnafu)
+            .map_err(BoxedError::new)?;
+
+        // Build database client
+        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
+        let database_client = DatabaseClient::new(
+            self.addr.clone(),
+            self.catalog.clone(),
+            self.auth_basic.clone(),
+            self.timeout.unwrap_or(Duration::from_secs(60)),
+            proxy,
+            self.no_proxy,
+        );
+
+        Ok(Box::new(Import {
+            schemas,
+            dry_run: self.dry_run,
+            _parallelism: self.parallelism,
+            storage: Box::new(storage),
+            database_client,
+        }))
+    }
+}
+
+/// Import tool implementation.
+pub struct Import {
+    schemas: Option<Vec<String>>,
+    dry_run: bool,
+    _parallelism: usize,
+    storage: Box<dyn SnapshotStorage>,
+    database_client: DatabaseClient,
+}
+
+#[async_trait]
+impl Tool for Import {
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        self.run().await.map_err(BoxedError::new)
+    }
+}
+
+impl Import {
+    /// Reads the manifest, checks its version, selects the schemas to import, then
+    /// prints (dry-run) or executes their DDL statements.
+    async fn run(&self) -> Result<()> {
+        // 1. Read manifest
+        let manifest = self
+            .storage
+            .read_manifest()
+            .await
+            .context(SnapshotStorageSnafu)?;
+
+        info!(
+            "Loading snapshot: {} (version: {}, schema_only: {})",
+            manifest.snapshot_id, manifest.version, manifest.schema_only
+        );
+
+        // Check version compatibility
+        if manifest.version != MANIFEST_VERSION {
+            return ManifestVersionMismatchSnafu {
+                expected: MANIFEST_VERSION,
+                found: manifest.version,
+            }
+            .fail();
+        }
+
+        info!("Snapshot contains {} schema(s)", manifest.schemas.len());
+
+        // 2. Determine schemas to import
+        let schemas_to_import = match &self.schemas {
+            Some(filter) => canonicalize_schema_filter(filter, &manifest.schemas)?,
+            None => manifest.schemas.clone(),
+        };
+
+        info!("Importing schemas: {:?}", schemas_to_import);
+
+        // 3. Read DDL statements
+        let ddl_statements = self.read_ddl_statements(&schemas_to_import).await?;
+
+        info!("Generated {} DDL statements", ddl_statements.len());
+
+        // 4. Dry-run mode: print DDL and exit
+        if self.dry_run {
+            info!("Dry-run mode - DDL statements to execute:");
+            println!();
+            for (i, stmt) in ddl_statements.iter().enumerate() {
+                println!("-- Statement {}", i + 1);
+                println!("{};", stmt.sql);
+                println!();
+            }
+            return Ok(());
+        }
+
+        // 5. Execute DDL
+        let executor = DdlExecutor::new(&self.database_client);
+        executor.execute_strict(&ddl_statements).await?;
+
+        info!(
+            "Import completed: {} DDL statements executed",
+            ddl_statements.len()
+        );
+
+        // 6. Data import would happen here for non-schema-only snapshots (M2/M3)
+        if !manifest.schema_only && !manifest.chunks.is_empty() {
+            info!(
+                "Data import not yet implemented (M3). {} chunks pending.",
+                manifest.chunks.len()
+            );
+        }
+
+        Ok(())
+    }
+
+    /// Loads each schema's DDL file from the snapshot and splits it into statements.
+    async fn read_ddl_statements(&self, schemas: &[String]) -> Result<Vec<DdlStatement>> {
+        let mut statements = Vec::new();
+        for schema in schemas {
+            let path = ddl_path_for_schema(schema);
+            let content = self
+                .storage
+                .read_text(&path)
+                .await
+                .context(SnapshotStorageSnafu)?;
+            statements.extend(
+                parse_ddl_statements(&content)
+                    .into_iter()
+                    .map(|sql| ddl_statement_for_schema(schema, sql)),
+            );
+        }
+
+        Ok(statements)
+    }
+}
+
+/// Splits DDL text into trimmed, non-empty statements on top-level `;`.
+///
+/// `--` line comments and `/* */` block comments are dropped; semicolons inside
+/// single- or double-quoted text do not split, and a doubled quote stays inside it.
+fn parse_ddl_statements(content: &str) -> Vec<String> {
+    let mut statements = Vec::new();
+    let mut current = String::new();
+    let mut chars = content.chars().peekable();
+    let mut in_single_quote = false;
+    let mut in_double_quote = false;
+    let mut in_line_comment = false;
+    let mut in_block_comment = false;
+
+    while let Some(ch) = chars.next() {
+        if in_line_comment {
+            if ch == '\n' {
+                in_line_comment = false;
+                current.push('\n');
+            }
+            continue;
+        }
+
+        if in_block_comment {
+            if ch == '*' && chars.peek() == Some(&'/') {
+                chars.next();
+                in_block_comment = false;
+            }
+            continue;
+        }
+
+        if in_single_quote {
+            current.push(ch);
+            if ch == '\'' {
+                if chars.peek() == Some(&'\'') {
+                    current.push(chars.next().expect("peeked quote must exist"));
+                } else {
+                    in_single_quote = false;
+                }
+            }
+            continue;
+        }
+
+        if in_double_quote {
+            current.push(ch);
+            if ch == '"' {
+                if chars.peek() == Some(&'"') {
+                    current.push(chars.next().expect("peeked quote must exist"));
+                } else {
+                    in_double_quote = false;
+                }
+            }
+            continue;
+        }
+
+        match ch {
+            '-' if chars.peek() == Some(&'-') => {
+                chars.next();
+                in_line_comment = true;
+            }
+            '/' if chars.peek() == Some(&'*') => {
+                chars.next();
+                in_block_comment = true;
+            }
+            '\'' => {
+                in_single_quote = true;
+                current.push(ch);
+            }
+            '"' => {
+                in_double_quote = true;
+                current.push(ch);
+            }
+            ';' => {
+                let statement = current.trim();
+                if !statement.is_empty() {
+                    statements.push(statement.to_string());
+                }
+                current.clear();
+            }
+            _ => current.push(ch),
+        }
+    }
+
+    let statement = current.trim();
+    if !statement.is_empty() {
+        statements.push(statement.to_string());
+    }
+
+    statements
+}
+
+/// Attaches `schema` as the execution schema for schema-scoped statements only.
+fn ddl_statement_for_schema(schema: &str, sql: String) -> DdlStatement {
+    if is_schema_scoped_statement(&sql) {
+        DdlStatement::with_execution_schema(sql, schema.to_string())
+    } else {
+        DdlStatement::new(sql)
+    }
+}
+
+/// Returns true for `CREATE [OR REPLACE] [EXTERNAL] TABLE|VIEW` statements.
+fn is_schema_scoped_statement(sql: &str) -> bool {
+    let trimmed = sql.trim_start();
+    if !starts_with_keyword(trimmed, "CREATE") {
+        return false;
+    }
+
+    let Some(rest) = trimmed.get("CREATE".len()..) else {
+        return false;
+    };
+    let mut rest = rest.trim_start();
+    if starts_with_keyword(rest, "OR") {
+        let Some(next) = rest.get("OR".len()..) else {
+            return false;
+        };
+        rest = next.trim_start();
+        if !starts_with_keyword(rest, "REPLACE") {
+            return false;
+        }
+        let Some(next) = rest.get("REPLACE".len()..) else {
+            return false;
+        };
+        rest = next.trim_start();
+    }
+
+    if starts_with_keyword(rest, "EXTERNAL") {
+        let Some(next) = rest.get("EXTERNAL".len()..) else {
+            return false;
+        };
+        rest = next.trim_start();
+    }
+
+    starts_with_keyword(rest, "TABLE") || starts_with_keyword(rest, "VIEW")
+}
+
+/// Case-insensitive prefix match that requires a word boundary after `keyword`.
+fn starts_with_keyword(input: &str, keyword: &str) -> bool {
+    input
+        .get(0..keyword.len())
+        .map(|s| s.eq_ignore_ascii_case(keyword))
+        .unwrap_or(false)
+        && input
+            .as_bytes()
+            .get(keyword.len())
+            .map(|b| !b.is_ascii_alphanumeric() && *b != b'_')
+            .unwrap_or(true)
+}
+
+/// Maps each requested schema to its manifest spelling (case-insensitive match),
+/// dropping duplicates; fails if a requested schema is not in the manifest.
+fn canonicalize_schema_filter(
+    filter: &[String],
+    manifest_schemas: &[String],
+) -> Result<Vec<String>> {
+    let mut canonicalized = Vec::new();
+    let mut seen = HashSet::new();
+
+    for schema in filter {
+        let canonical = manifest_schemas
+            .iter()
+            .find(|candidate| candidate.eq_ignore_ascii_case(schema))
+            .cloned()
+            .ok_or_else(|| {
+                SchemaNotInSnapshotSnafu {
+                    schema: schema.clone(),
+                }
+                .build()
+            })?;
+
+        if seen.insert(canonical.to_ascii_lowercase()) {
+            canonicalized.push(canonical);
+        }
+    }
+
+    Ok(canonicalized)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_ddl_statements() {
+        let content = r#"
+-- Schema: public
+CREATE DATABASE public;
+CREATE TABLE t (ts TIMESTAMP TIME INDEX, host STRING, PRIMARY KEY (host)) ENGINE=mito;
+
+-- comment
+CREATE VIEW v AS SELECT * FROM t;
+"#;
+        let statements = parse_ddl_statements(content);
+        assert_eq!(statements.len(), 3);
+        assert!(statements[0].starts_with("CREATE DATABASE public"));
+        assert!(statements[1].starts_with("CREATE TABLE t"));
+        assert!(statements[2].starts_with("CREATE VIEW v"));
+    }
+
+    #[test]
+    fn test_parse_ddl_statements_preserves_semicolons_in_string_literals() {
+        let content = r#"
+CREATE TABLE t (
+    host STRING DEFAULT 'a;b'
+);
+CREATE VIEW v AS SELECT ';' AS marker;
+"#;
+
+        let statements = parse_ddl_statements(content);
+
+        assert_eq!(statements.len(), 2);
+        assert!(statements[0].contains("'a;b'"));
+        assert!(statements[1].contains("';' AS marker"));
+    }
+
+    #[test]
+    fn test_parse_ddl_statements_handles_comments_without_splitting() {
+        let content = r#"
+-- leading comment
+CREATE TABLE t (ts TIMESTAMP TIME INDEX); /* block; comment */
+CREATE VIEW v AS SELECT 1;
+"#;
+
+        let statements = parse_ddl_statements(content);
+
+        assert_eq!(statements.len(), 2);
+        assert!(statements[0].starts_with("CREATE TABLE t"));
+        assert!(statements[1].starts_with("CREATE VIEW v"));
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_uses_manifest_casing() {
+        let filter = vec!["TEST_DB".to_string(), "PUBLIC".to_string()];
+        let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
+
+        let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
+
+        assert_eq!(canonicalized, vec!["test_db", "public"]);
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_dedupes_case_insensitive_matches() {
+        let filter = vec![
+            "TEST_DB".to_string(),
+            "test_db".to_string(),
+            "PUBLIC".to_string(),
+            "public".to_string(),
+        ];
+        let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
+
+        let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
+
+        assert_eq!(canonicalized, vec!["test_db", "public"]);
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_rejects_missing_schema() {
+        let filter = vec!["missing".to_string()];
+        let manifest_schemas = vec!["test_db".to_string()];
+
+        let error = canonicalize_schema_filter(&filter, &manifest_schemas)
+            .expect_err("missing schema should fail")
+            .to_string();
+
+        assert!(error.contains("missing"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_table_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX) ENGINE=mito".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_view_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_or_replace_view_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE OR REPLACE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_external_table_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE EXTERNAL TABLE IF NOT EXISTS ext_metrics (ts TIMESTAMP TIME INDEX) ENGINE=file"
+                .to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_database_uses_public_context() {
+        let stmt = ddl_statement_for_schema("test_db", "CREATE DATABASE test_db".to_string());
+        assert_eq!(stmt.execution_schema, None);
+    }
+
+    #[test]
+    fn test_starts_with_keyword_requires_word_boundary() {
+        assert!(starts_with_keyword("CREATE TABLE t", "CREATE"));
+        assert!(!starts_with_keyword("CREATED TABLE t", "CREATE"));
+        assert!(!starts_with_keyword("TABLESPACE foo", "TABLE"));
+    }
+}
diff --git a/src/cli/src/data/import_v2/error.rs b/src/cli/src/data/import_v2/error.rs
new file mode 100644
index 0000000000..5ae3db1583
--- /dev/null
+++ b/src/cli/src/data/import_v2/error.rs
@@ -0,0 +1,82 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::{Location, Snafu};
+
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    #[snafu(display("Snapshot not found at '{}'", uri))]
+    SnapshotNotFound {
+        uri: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
+    ManifestVersionMismatch {
+        expected: u32,
+        found: u32,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Schema '{}' not found in snapshot", schema))]
+    SchemaNotInSnapshot {
+        schema: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Snapshot storage error"))]
+    SnapshotStorage {
+        #[snafu(source)]
+        error: crate::data::export_v2::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Database error"))]
+    Database {
+        #[snafu(source)]
+        error: crate::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl ErrorExt for Error {
+    fn status_code(&self) -> StatusCode {
+        match self {
+            Error::SnapshotNotFound { .. } | Error::SchemaNotInSnapshot { .. } => {
+                StatusCode::InvalidArguments
+            }
+            Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments,
+            Error::Database { error, .. } => error.status_code(),
+            Error::SnapshotStorage { error, .. } => error.status_code(),
+        }
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
diff --git a/src/cli/src/data/import_v2/executor.rs b/src/cli/src/data/import_v2/executor.rs
new file mode 100644
index 0000000000..3f2bf66ae6
--- /dev/null
+++ b/src/cli/src/data/import_v2/executor.rs
@@ -0,0 +1,125 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! DDL execution for import.
+
+use common_telemetry::info;
+use snafu::ResultExt;
+
+use crate::data::import_v2::error::{DatabaseSnafu, Result};
+use crate::database::DatabaseClient;
+
+/// A DDL statement with an explicit execution schema context.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DdlStatement {
+    pub sql: String,
+    pub execution_schema: Option<String>,
+}
+
+impl DdlStatement {
+    /// Creates a statement with no execution schema.
+    pub fn new(sql: String) -> Self {
+        Self {
+            sql,
+            execution_schema: None,
+        }
+    }
+
+    /// Creates a statement that is executed within `schema`.
+    pub fn with_execution_schema(sql: String, schema: String) -> Self {
+        Self {
+            sql,
+            execution_schema: Some(schema),
+        }
+    }
+}
+
+/// Executes DDL statements against the database.
+pub struct DdlExecutor<'a> {
+    client: &'a DatabaseClient,
+}
+
+impl<'a> DdlExecutor<'a> {
+    /// Creates a new DDL executor.
+    pub fn new(client: &'a DatabaseClient) -> Self {
+        Self { client }
+    }
+
+    /// Executes a list of DDL statements, stopping on first error.
+    pub async fn execute_strict(&self, statements: &[DdlStatement]) -> Result<()> {
+        let total = statements.len();
+
+        for (i, stmt) in statements.iter().enumerate() {
+            let preview = preview_sql(&stmt.sql);
+
+            info!("Executing DDL ({}/{}): {}", i + 1, total, preview);
+
+            if let Some(schema) = stmt.execution_schema.as_deref() {
+                self.client
+                    .sql(&stmt.sql, schema)
+                    .await
+                    .context(DatabaseSnafu)?;
+            } else {
+                self.client
+                    .sql_in_public(&stmt.sql)
+                    .await
+                    .context(DatabaseSnafu)?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Returns at most the first 80 characters of `sql`, with `...` appended when truncated.
+fn preview_sql(sql: &str) -> String {
+    let mut chars = sql.chars();
+    let preview: String = chars.by_ref().take(80).collect();
+    if chars.next().is_some() {
+        format!("{preview}...")
+    } else {
+        preview
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_statement_without_execution_schema_uses_public() {
+        let stmt = DdlStatement::new("CREATE DATABASE IF NOT EXISTS test_db".to_string());
+        assert_eq!(stmt.execution_schema, None);
+    }
+
+    #[test]
+    fn test_statement_with_execution_schema_preserves_context() {
+        let stmt = DdlStatement::with_execution_schema(
+            r#"CREATE TABLE IF NOT EXISTS "my""schema"."metrics" (ts TIMESTAMP TIME INDEX)"#
+                .to_string(),
+            r#"my"schema"#.to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some(r#"my"schema"#));
+    }
+
+    #[test]
+    fn test_preview_sql_truncates_at_char_boundary() {
+        let sql = format!(
+            "CREATE TABLE {} (ts TIMESTAMP TIME INDEX)",
+            "测".repeat(100)
+        );
+        let preview = preview_sql(&sql);
+        assert!(preview.ends_with("..."));
+        assert!(preview.is_char_boundary(preview.len()));
+    }
+}
diff --git a/src/cli/src/data/path.rs b/src/cli/src/data/path.rs
new file mode 100644
index 0000000000..2e0f5d3f1a
--- /dev/null
+++ b/src/cli/src/data/path.rs
@@ -0,0 +1,79 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared path helpers for export/import data files.
+
+use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR};
+
+/// Builds `<SCHEMA_DIR>/<DDL_DIR>/<encoded schema>.sql` for `schema`.
+pub(crate) fn ddl_path_for_schema(schema: &str) -> String {
+    format!(
+        "{}/{}/{}.sql",
+        SCHEMA_DIR,
+        DDL_DIR,
+        encode_path_segment(schema)
+    )
+}
+
+/// Percent-encodes every byte of `value` except ASCII letters, digits, `-` and `_`.
+pub(crate) fn encode_path_segment(value: &str) -> String {
+    let mut encoded = String::with_capacity(value.len());
+    for byte in value.bytes() {
+        match byte {
+            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' => {
+                encoded.push(byte as char);
+            }
+            _ => {
+                encoded.push('%');
+                encoded.push(hex_char(byte >> 4));
+                encoded.push(hex_char(byte & 0x0F));
+            }
+        }
+    }
+    encoded
+}
+
+/// Maps a nibble (0..=15) to its uppercase hexadecimal digit.
+fn hex_char(nibble: u8) -> char {
+    match nibble {
+        0..=9 => (b'0' + nibble) as char,
+        10..=15 => (b'A' + (nibble - 10)) as char,
+        _ => unreachable!("nibble must be in 0..=15"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_encode_path_segment_preserves_safe_ascii() {
+        assert_eq!(encode_path_segment("test_db"), "test_db");
+    }
+
+    #[test]
+    fn test_encode_path_segment_escapes_path_traversal_chars() {
+        assert_eq!(encode_path_segment("../evil"), "%2E%2E%2Fevil");
+        assert_eq!(encode_path_segment(r"..\\evil"), "%2E%2E%5C%5Cevil");
+    }
+
+    #[test]
+    fn test_ddl_path_for_schema_encodes_schema_segment() {
+        assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql");
+        assert_eq!(
+            ddl_path_for_schema("../evil"),
+            "schema/ddl/%2E%2E%2Fevil.sql"
+        );
+    }
+}
diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs
new file mode
100644
index 0000000000..50c8734a67
--- /dev/null
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -0,0 +1,683 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Storage abstraction for Export/Import V2.
+//!
+//! This module provides a unified interface for reading and writing snapshot data
+//! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).
+
+use async_trait::async_trait;
+use object_store::services::{Azblob, Fs, Gcs, Oss, S3};
+use object_store::util::{with_instrument_layers, with_retry_layers};
+use object_store::{AzblobConnection, GcsConnection, ObjectStore, OssConnection, S3Connection};
+use snafu::ResultExt;
+use url::Url;
+
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::error::{
+    BuildObjectStoreSnafu, InvalidUriSnafu, ManifestParseSnafu, ManifestSerializeSnafu, Result,
+    SnapshotNotFoundSnafu, StorageOperationSnafu, TextDecodeSnafu, UnsupportedSchemeSnafu,
+    UrlParseSnafu,
+};
+use crate::data::export_v2::manifest::{MANIFEST_FILE, Manifest};
+#[cfg(test)]
+use crate::data::export_v2::schema::SchemaDefinition;
+use crate::data::export_v2::schema::{SCHEMA_DIR, SCHEMAS_FILE, SchemaSnapshot};
+
+/// Bucket (or container) name and root path parsed from a remote snapshot URI.
+struct RemoteLocation {
+    bucket_or_container: String,
+    root: String,
+}
+
+/// URI schemes supported for snapshot storage.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum StorageScheme {
+    /// Amazon S3.
+    S3,
+    /// Alibaba Cloud OSS.
+    Oss,
+    /// Google Cloud Storage.
+    Gcs,
+    /// Azure Blob Storage.
+    Azblob,
+    /// Local filesystem (file://).
+    File,
+}
+
+impl StorageScheme {
+    /// Parses storage scheme from URI.
+    pub fn from_uri(uri: &str) -> Result<Self> {
+        let url = Url::parse(uri).context(UrlParseSnafu)?;
+
+        match url.scheme() {
+            "s3" => Ok(Self::S3),
+            "oss" => Ok(Self::Oss),
+            "gs" | "gcs" => Ok(Self::Gcs),
+            "azblob" => Ok(Self::Azblob),
+            "file" => Ok(Self::File),
+            scheme => UnsupportedSchemeSnafu { scheme }.fail(),
+        }
+    }
+}
+
+/// Extracts bucket/container and root path from a URI.
+fn extract_remote_location(uri: &str) -> Result<RemoteLocation> {
+    let url = Url::parse(uri).context(UrlParseSnafu)?;
+    let bucket_or_container = url.host_str().unwrap_or("").to_string();
+    if bucket_or_container.is_empty() {
+        return InvalidUriSnafu {
+            uri,
+            reason: "URI must include bucket/container in host",
+        }
+        .fail();
+    }
+
+    let root = url.path().trim_start_matches('/').to_string();
+    if root.is_empty() {
+        return InvalidUriSnafu {
+            uri,
+            reason: "snapshot URI must include a non-empty path after the bucket/container",
+        }
+        .fail();
+    }
+
+    Ok(RemoteLocation {
+        bucket_or_container,
+        root,
+    })
+}
+
+/// Validates that a URI has a proper scheme.
+///
+/// Rejects bare paths (e.g., `/tmp/backup`, `./backup`) because:
+/// - Schema export (CLI) and data export (server) run in different processes
+/// - Using bare paths would split the snapshot across machines
+///
+/// Supported URI schemes:
+/// - `s3://bucket/path` - Amazon S3
+/// - `oss://bucket/path` - Alibaba Cloud OSS
+/// - `gs://bucket/path` - Google Cloud Storage
+/// - `azblob://container/path` - Azure Blob Storage
+/// - `file:///absolute/path` - Local filesystem
+pub fn validate_uri(uri: &str) -> Result<StorageScheme> {
+    // Must have a scheme
+    if !uri.contains("://") {
+        return InvalidUriSnafu {
+            uri,
+            reason: "URI must have a scheme (e.g., s3://, file://). Bare paths are not supported.",
+        }
+        .fail();
+    }
+
+    StorageScheme::from_uri(uri)
+}
+
+/// Returns the snapshot-relative path of the schema index file.
+fn schema_index_path() -> String {
+    format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE)
+}
+
+/// Extracts the absolute filesystem path from a file:// URI.
+fn extract_file_path_from_uri(uri: &str) -> Result<String> {
+    let url = Url::parse(uri).context(UrlParseSnafu)?;
+
+    match url.host_str() {
+        Some(host) if !host.is_empty() && host != "localhost" => InvalidUriSnafu {
+            uri,
+            reason: "file:// URI must use an absolute path like file:///tmp/backup",
+        }
+        .fail(),
+        _ => url
+            .to_file_path()
+            .map(|path| path.to_string_lossy().into_owned())
+            .map_err(|_| {
+                InvalidUriSnafu {
+                    uri,
+                    reason: "file:// URI must use a valid absolute filesystem path",
+                }
+                .build()
+            }),
+    }
+}
+
+/// Succeeds if the snapshot manifest exists; otherwise fails with the storage's target URI.
+async fn ensure_snapshot_exists(storage: &OpenDalStorage) -> Result<()> {
+    if storage.exists().await? {
+        Ok(())
+    } else {
+        SnapshotNotFoundSnafu {
+            uri: storage.target_uri.as_str(),
+        }
+        .fail()
+    }
+}
+
+/// Snapshot storage abstraction.
+///
+/// Provides operations for reading and writing snapshot data to various storage backends.
+#[async_trait]
+pub trait SnapshotStorage: Send + Sync {
+    /// Checks if a snapshot exists at this location (manifest.json exists).
+    async fn exists(&self) -> Result<bool>;
+
+    /// Reads the manifest file.
+    async fn read_manifest(&self) -> Result<Manifest>;
+
+    /// Writes the manifest file.
+    async fn write_manifest(&self, manifest: &Manifest) -> Result<()>;
+
+    /// Writes the schema index to schema/schemas.json.
+    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()>;
+
+    /// Writes a text file to a relative path under the snapshot root.
+    async fn write_text(&self, path: &str, content: &str) -> Result<()>;
+
+    /// Reads a text file from a relative path under the snapshot root.
+    async fn read_text(&self, path: &str) -> Result<String>;
+
+    /// Deletes the entire snapshot (for --force).
+    async fn delete_snapshot(&self) -> Result<()>;
+}
+
+/// OpenDAL-based implementation of SnapshotStorage.
+pub struct OpenDalStorage {
+    object_store: ObjectStore,
+    target_uri: String,
+}
+
+impl OpenDalStorage {
+    /// Pairs an object store with the URI it was opened from; the URI is reported in errors.
+    fn new_operator_rooted(object_store: ObjectStore, target_uri: &str) -> Self {
+        Self {
+            object_store,
+            target_uri: target_uri.to_string(),
+        }
+    }
+
+    /// Applies `with_instrument_layers` only (no retry layers) to a local store.
+    fn finish_local_store(object_store: ObjectStore) -> ObjectStore {
+        with_instrument_layers(object_store, false)
+    }
+
+    /// Applies retry layers, then `with_instrument_layers`, to a remote store.
+    fn finish_remote_store(object_store: ObjectStore) -> ObjectStore {
+        with_instrument_layers(with_retry_layers(object_store), false)
+    }
+
+    /// Fails with an invalid-URI error carrying `reason` unless `enabled` is true.
+    fn ensure_backend_enabled(uri: &str, enabled: bool, reason: &'static str) -> Result<()> {
+        if enabled {
+            Ok(())
+        } else {
+            InvalidUriSnafu { uri, reason }.fail()
+        }
+    }
+
+    /// Maps a backend config validation failure to an invalid-URI error naming `backend`.
+    fn validate_remote_config<E: std::fmt::Display>(
+        uri: &str,
+        backend: &str,
+        result: std::result::Result<(), E>,
+    ) -> Result<()> {
+        result.map_err(|error| {
+            InvalidUriSnafu {
+                uri,
+                reason: format!("invalid {} config: {}", backend, error),
+            }
+            .build()
+        })
+    }
+
+    /// Creates a new storage from a file:// URI.
+    pub fn from_file_uri(uri: &str) -> Result<Self> {
+        let path = extract_file_path_from_uri(uri)?;
+
+        let builder = Fs::default().root(&path);
+        let object_store = ObjectStore::new(builder)
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_local_store(object_store),
+            uri,
+        ))
+    }
+
+    /// Like [`Self::from_file_uri`], but rejects configs that enable any remote backend.
+    fn from_file_uri_with_config(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        if storage.enable_s3 || storage.enable_oss || storage.enable_gcs || storage.enable_azblob {
+            return InvalidUriSnafu {
+                uri,
+                reason: "file:// cannot be used with remote storage flags",
+            }
+            .fail();
+        }
+
+        Self::from_file_uri(uri)
+    }
+
+    /// Creates S3 storage: bucket and root from the URI, other settings from `storage.s3`.
+    fn from_s3_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_s3,
+            "s3:// requires --s3 and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.s3.clone();
+        config.s3_bucket = location.bucket_or_container;
+        config.s3_root = location.root;
+        Self::validate_remote_config(uri, "s3", config.validate())?;
+
+        let conn: S3Connection = config.into();
+        let object_store = ObjectStore::new(S3::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    /// Creates OSS storage: bucket and root from the URI, other settings from `storage.oss`.
+    fn from_oss_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_oss,
+            "oss:// requires --oss and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.oss.clone();
+        config.oss_bucket = location.bucket_or_container;
+        config.oss_root = location.root;
+        Self::validate_remote_config(uri, "oss", config.validate())?;
+
+        let conn: OssConnection = config.into();
+        let object_store = ObjectStore::new(Oss::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    /// Creates GCS storage: bucket and root from the URI, other settings from `storage.gcs`.
+    fn from_gcs_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_gcs,
+            "gs:// or gcs:// requires --gcs and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.gcs.clone();
+        config.gcs_bucket = location.bucket_or_container;
+        config.gcs_root = location.root;
+        Self::validate_remote_config(uri, "gcs", config.validate())?;
+
+        let conn: GcsConnection = config.into();
+        let object_store = ObjectStore::new(Gcs::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    /// Creates Azblob storage: container and root from the URI, the rest from `storage.azblob`.
+    fn from_azblob_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_azblob,
+            "azblob:// requires --azblob and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.azblob.clone();
+        config.azblob_container = location.bucket_or_container;
+        config.azblob_root = location.root;
+        Self::validate_remote_config(uri, "azblob", config.validate())?;
+
+        let conn: AzblobConnection = config.into();
+        let object_store = ObjectStore::new(Azblob::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    /// Creates a new storage from a URI and object store config.
+    pub fn from_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        match StorageScheme::from_uri(uri)? {
+            StorageScheme::File => Self::from_file_uri_with_config(uri, storage),
+            StorageScheme::S3 => Self::from_s3_uri(uri, storage),
+            StorageScheme::Oss => Self::from_oss_uri(uri, storage),
+            StorageScheme::Gcs => Self::from_gcs_uri(uri, storage),
+            StorageScheme::Azblob => Self::from_azblob_uri(uri, storage),
+        }
+    }
+
+    /// Reads a file as bytes.
+    async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
+        let data = self
+            .object_store
+            .read(path)
+            .await
+            .context(StorageOperationSnafu {
+                operation: format!("read {}", path),
+            })?;
+        Ok(data.to_vec())
+    }
+
+    /// Writes bytes to a file.
+    async fn write_file(&self, path: &str, data: Vec<u8>) -> Result<()> {
+        self.object_store
+            .write(path, data)
+            .await
+            .map(|_| ())
+            .context(StorageOperationSnafu {
+                operation: format!("write {}", path),
+            })
+    }
+
+    /// Checks if a file exists using stat.
+    async fn file_exists(&self, path: &str) -> Result<bool> {
+        match self.object_store.stat(path).await {
+            Ok(_) => Ok(true),
+            Err(e) if e.kind() == object_store::ErrorKind::NotFound => Ok(false),
+            Err(e) => Err(e).context(StorageOperationSnafu {
+                operation: format!("check exists {}", path),
+            }),
+        }
+    }
+
+    /// Reads the schema index; a missing index file yields an empty snapshot.
+    #[cfg(test)]
+    pub async fn read_schema(&self) -> Result<SchemaSnapshot> {
+        let schemas_path = schema_index_path();
+        let schemas: Vec<SchemaDefinition> = if self.file_exists(&schemas_path).await? {
+            let data = self.read_file(&schemas_path).await?;
+            serde_json::from_slice(&data).context(ManifestParseSnafu)?
+        } else {
+            vec![]
+        };
+
+        Ok(SchemaSnapshot { schemas })
+    }
+}
+
+#[async_trait]
+impl SnapshotStorage for OpenDalStorage {
+    async fn exists(&self) -> Result<bool> {
+        self.file_exists(MANIFEST_FILE).await
+    }
+
+    async fn read_manifest(&self) -> Result<Manifest> {
+        ensure_snapshot_exists(self).await?;
+
+        let data = self.read_file(MANIFEST_FILE).await?;
+        serde_json::from_slice(&data).context(ManifestParseSnafu)
+    }
+
+    async fn write_manifest(&self, manifest: &Manifest) -> Result<()> {
+        let data = serde_json::to_vec_pretty(manifest).context(ManifestSerializeSnafu)?;
+        self.write_file(MANIFEST_FILE, data).await
+    }
+
+    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()> {
+        let schemas_path = schema_index_path();
+        let schemas_data =
+            serde_json::to_vec_pretty(&schema.schemas).context(ManifestSerializeSnafu)?;
+        self.write_file(&schemas_path, schemas_data).await
+    }
+
+    async fn write_text(&self, path: &str, content: &str) -> Result<()> {
+        self.write_file(path, content.as_bytes().to_vec()).await
+    }
+
+    async fn read_text(&self, path: &str) -> Result<String> {
+        let data = self.read_file(path).await?;
+        String::from_utf8(data).context(TextDecodeSnafu)
+    }
+
+    async fn delete_snapshot(&self) -> Result<()> {
+        self.object_store
+            .remove_all("/")
+            .await
+            .context(StorageOperationSnafu {
+                operation: "delete snapshot",
+            })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::path::Path;
+
+    use object_store::ObjectStore;
+    use object_store::services::Fs;
+    use tempfile::tempdir;
+    use url::Url;
+
+    use super::*;
+    use crate::data::export_v2::manifest::{DataFormat, TimeRange};
+    use crate::data::export_v2::schema::SchemaDefinition;
+
+    fn make_storage_with_rooted_fs(dir: &std::path::Path) -> OpenDalStorage {
+        let object_store = ObjectStore::new(Fs::default().root(dir.to_str().unwrap()))
+            .unwrap()
+            .finish();
+        OpenDalStorage::new_operator_rooted(
+            OpenDalStorage::finish_local_store(object_store),
+            Url::from_directory_path(dir).unwrap().as_ref(),
+        )
+    }
+
+    #[test]
+    fn test_validate_uri_valid() {
+        assert_eq!(validate_uri("s3://bucket/path").unwrap(), StorageScheme::S3);
+        assert_eq!(
+            validate_uri("oss://bucket/path").unwrap(),
+            StorageScheme::Oss
+        );
+        assert_eq!(
+            validate_uri("gs://bucket/path").unwrap(),
+            StorageScheme::Gcs
+        );
+        assert_eq!(
+            validate_uri("gcs://bucket/path").unwrap(),
+            StorageScheme::Gcs
+        );
+        assert_eq!(
+            validate_uri("azblob://container/path").unwrap(),
+            StorageScheme::Azblob
+        );
+        assert_eq!(
+            validate_uri("file:///tmp/backup").unwrap(),
+            StorageScheme::File
+        );
+    }
+
+    #[test]
+    fn test_validate_uri_invalid() {
+        // Bare paths should be rejected
+        assert!(validate_uri("/tmp/backup").is_err());
+        assert!(validate_uri("./backup").is_err());
+        assert!(validate_uri("backup").is_err());
+
+        // Unknown schemes
+        assert!(validate_uri("ftp://server/path").is_err());
+    }
+
+    #[test]
+    fn test_extract_remote_location_requires_non_empty_root() {
+        assert!(extract_remote_location("s3://bucket").is_err());
+        assert!(extract_remote_location("s3://bucket/").is_err());
+        assert!(extract_remote_location("oss://bucket").is_err());
+        assert!(extract_remote_location("gs://bucket").is_err());
+        assert!(extract_remote_location("azblob://container").is_err());
+    }
+
+    #[cfg(not(windows))]
+    #[test]
+    fn test_extract_path_from_uri_unix_examples() {
+        assert_eq!(
+            extract_file_path_from_uri("file:///tmp/backup").unwrap(),
+            "/tmp/backup"
+        );
+        assert_eq!(
+            extract_file_path_from_uri("file://localhost/tmp/backup").unwrap(),
+            "/tmp/backup"
+        );
+    }
+
+    #[test]
+    fn test_extract_file_path_from_uri_rejects_file_host() {
+        assert!(extract_file_path_from_uri("file://tmp/backup").is_err());
+    }
+
+    #[test]
+    fn test_extract_file_path_from_uri_round_trips_directory_url() {
+        let dir = tempdir().unwrap();
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let path = extract_file_path_from_uri(&uri).unwrap();
+
+        assert_eq!(Path::new(&path), dir.path());
+    }
+
+    #[tokio::test]
+    async fn test_read_manifest_reports_requested_uri() {
+        let dir = tempdir().unwrap();
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let storage = OpenDalStorage::from_file_uri(&uri).unwrap();
+
+        let error = storage.read_manifest().await.unwrap_err().to_string();
+
+        assert!(error.contains(uri.as_str()));
+    }
+
+    #[tokio::test]
+    async fn test_manifest_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        let manifest = Manifest::new_full(
+            "greptime".to_string(),
+            vec!["public".to_string()],
+            TimeRange::unbounded(),
+            DataFormat::Parquet,
+        );
+
+        storage.write_manifest(&manifest).await.unwrap();
+        let loaded = storage.read_manifest().await.unwrap();
+
+        assert_eq!(loaded.catalog, manifest.catalog);
+        assert_eq!(loaded.schemas, manifest.schemas);
+        assert_eq!(loaded.schema_only, manifest.schema_only);
+        assert_eq!(loaded.format, manifest.format);
+        assert_eq!(loaded.snapshot_id, manifest.snapshot_id);
+    }
+
+    #[tokio::test]
+    async fn test_schema_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        let mut snapshot = SchemaSnapshot::new();
+        snapshot.add_schema(SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "test_db".to_string(),
+            options: HashMap::from([("ttl".to_string(), "7d".to_string())]),
+        });
+
+        storage.write_schema(&snapshot).await.unwrap();
+        let loaded = storage.read_schema().await.unwrap();
+
+        assert_eq!(loaded, snapshot);
+    }
+
+    #[tokio::test]
+    async fn test_text_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+        let content = "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX);";
+
+        storage
+            .write_text("schema/ddl/public.sql", content)
+            .await
+            .unwrap();
+        let loaded = storage.read_text("schema/ddl/public.sql").await.unwrap();
+
+        assert_eq!(loaded, content);
+    }
+
+    #[tokio::test]
+    async fn test_read_text_rejects_invalid_utf8() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        storage
+            .write_file("schema/ddl/public.sql", vec![0xff, 0xfe, 0xfd])
+            .await
+            .unwrap();
+
+        let error = storage
+            .read_text("schema/ddl/public.sql")
+            .await
+            .unwrap_err();
+        assert!(error.to_string().contains("UTF-8"));
+    }
+
+    #[tokio::test]
+    async fn test_exists_follows_manifest_presence() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        assert!(!storage.exists().await.unwrap());
+
+        storage
+            .write_manifest(&Manifest::new_schema_only(
+                "greptime".to_string(),
+                vec!["public".to_string()],
+            ))
+            .await
+            .unwrap();
+
+        assert!(storage.exists().await.unwrap());
+    }
+
+    #[tokio::test]
+    async fn test_delete_snapshot_only_removes_rooted_contents() {
+        let parent = tempdir().unwrap();
+        let snapshot_root = parent.path().join("snapshot");
+        let sibling = parent.path().join("sibling");
+        std::fs::create_dir_all(&snapshot_root).unwrap();
+        std::fs::create_dir_all(&sibling).unwrap();
+        std::fs::write(snapshot_root.join("manifest.json"), b"{}").unwrap();
+        std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
+
+        let storage = make_storage_with_rooted_fs(&snapshot_root);
+        storage.delete_snapshot().await.unwrap();
+
+        assert!(!snapshot_root.join("manifest.json").exists());
+        assert!(sibling.join("keep.txt").exists());
+    }
+}
diff --git a/src/cli/src/data/sql.rs b/src/cli/src/data/sql.rs
new file mode 100644
index 0000000000..7de4206b26
--- /dev/null
+++ b/src/cli/src/data/sql.rs
@@ -0,0 +1,40 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared SQL escaping helpers for CLI-generated statements. + +pub(crate) fn escape_sql_literal(value: &str) -> String { + value.replace('\'', "''") +} + +pub(crate) fn escape_sql_identifier(value: &str) -> String { + value.replace('"', "\"\"") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_escape_sql_literal_escapes_single_quotes() { + assert_eq!(escape_sql_literal("test_db"), "test_db"); + assert_eq!(escape_sql_literal("te'st"), "te''st"); + } + + #[test] + fn test_escape_sql_identifier_escapes_double_quotes() { + assert_eq!(escape_sql_identifier("test_db"), "test_db"); + assert_eq!(escape_sql_identifier(r#"te"st"#), r#"te""st"#); + } +} diff --git a/src/cli/src/database.rs b/src/cli/src/database.rs index db98c38e38..fa3f6faefb 100644 --- a/src/cli/src/database.rs +++ b/src/cli/src/database.rs @@ -36,6 +36,7 @@ pub struct DatabaseClient { auth_header: Option, timeout: Duration, proxy: Option, + no_proxy: bool, } pub fn parse_proxy_opts( @@ -61,6 +62,7 @@ impl DatabaseClient { auth_basic: Option, timeout: Duration, proxy: Option, + no_proxy: bool, ) -> Self { let auth_header = if let Some(basic) = auth_basic { let encoded = general_purpose::STANDARD.encode(basic); @@ -69,7 +71,9 @@ impl DatabaseClient { None }; - if let Some(ref proxy) = proxy { + if no_proxy { + common_telemetry::info!("Proxy disabled"); + } else if let Some(ref proxy) = proxy { common_telemetry::info!("Using proxy: {:?}", proxy); } else { common_telemetry::info!("Using system proxy(if any)"); @@ -81,6 +85,7 @@ impl DatabaseClient { auth_header, 
timeout, proxy, + no_proxy, } } @@ -95,12 +100,14 @@ impl DatabaseClient { ("db", format!("{}-{}", self.catalog, schema)), ("sql", sql.to_string()), ]; - let client = self - .proxy - .clone() - .map(|proxy| reqwest::Client::builder().proxy(proxy).build()) - .unwrap_or_else(|| Ok(reqwest::Client::new())) - .context(BuildClientSnafu)?; + let mut builder = reqwest::Client::builder(); + if let Some(proxy) = self.proxy.clone() { + builder = builder.proxy(proxy); + } + if self.no_proxy { + builder = builder.no_proxy(); + } + let client = builder.build().context(BuildClientSnafu)?; let mut request = client .post(&url) .form(¶ms) diff --git a/src/cli/src/lib.rs b/src/cli/src/lib.rs index acf5df4086..4305da9c8f 100644 --- a/src/cli/src/lib.rs +++ b/src/cli/src/lib.rs @@ -29,7 +29,7 @@ pub use database::DatabaseClient; use error::Result; pub use crate::bench::BenchTableMetadataCommand; -pub use crate::data::DataCommand; +pub use crate::data::{DataCommand, export_v2, import_v2}; pub use crate::metadata::MetadataCommand; #[async_trait] diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs index d8f53b9d71..f6d8674d4c 100644 --- a/src/cmd/src/datanode/objbench.rs +++ b/src/cmd/src/datanode/objbench.rs @@ -20,13 +20,14 @@ use clap::Parser; use colored::Colorize; use datanode::config::RegionEngineConfig; use datanode::store; -use either::Either; +use futures::stream; use mito2::access_layer::{ AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType, }; use mito2::cache::{CacheManager, CacheManagerRef}; use mito2::config::{FulltextIndexConfig, MitoConfig, Mode}; -use mito2::read::Source; +use mito2::read::FlatSource; +use mito2::sst::FormatType; use mito2::sst::file::{FileHandle, FileMeta}; use mito2::sst::file_purger::{FilePurger, FilePurgerRef}; use mito2::sst::index::intermediate::IntermediateManager; @@ -210,6 +211,7 @@ impl ObjbenchCommand { object_store.clone(), ) .expected_metadata(Some(region_meta.clone())) + 
.flat_format(true) .build() .await .map_err(|e| { @@ -231,6 +233,10 @@ impl ObjbenchCommand { let reader_build_elapsed = reader_build_start.elapsed(); let total_rows = reader.parquet_metadata().file_metadata().num_rows(); println!("{} Reader built in {:?}", "✓".green(), reader_build_elapsed); + let reader_stream = Box::pin(stream::try_unfold(reader, |mut reader| async move { + let batch = reader.next_record_batch().await?; + Ok(batch.map(|batch| (batch, reader))) + })); // Build write request let fulltext_index_config = FulltextIndexConfig { @@ -241,10 +247,11 @@ impl ObjbenchCommand { let write_req = SstWriteRequest { op_type: OperationType::Flush, metadata: region_meta, - source: Either::Left(Source::Reader(Box::new(reader))), + source: FlatSource::Stream(reader_stream), cache_manager, storage: None, max_sequence: None, + sst_write_format: FormatType::PrimaryKey, index_options: Default::default(), index_config: mito_engine_config.index.clone(), inverted_index_config: MitoConfig::default().inverted_index, diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 92638d3c4a..215bea0ec5 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -32,14 +32,15 @@ use common_meta::cache::LayeredCacheRegistryBuilder; use common_meta::ddl::flow_meta::FlowMetadataAllocator; use common_meta::ddl::table_meta::TableMetadataAllocator; use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl}; -use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef}; +use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef, DdlManagerRef}; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; -use common_meta::procedure_executor::LocalProcedureExecutor; +use common_meta::node_manager::{FlownodeRef, NodeManagerRef}; +use common_meta::procedure_executor::{LocalProcedureExecutor, ProcedureExecutorRef}; use 
common_meta::region_keeper::MemoryRegionKeeper; use common_meta::region_registry::LeaderRegionRegistry; -use common_meta::sequence::SequenceBuilder; +use common_meta::sequence::{Sequence, SequenceBuilder}; use common_meta::wal_provider::{WalProviderRef, build_wal_provider}; use common_procedure::ProcedureManagerRef; use common_query::prelude::set_default_prefix; @@ -49,6 +50,7 @@ use common_time::timezone::set_default_timezone; use common_version::{short_version, verbose_version}; use datanode::config::DatanodeOptions; use datanode::datanode::{Datanode, DatanodeBuilder}; +use datanode::region_server::RegionServer; use flow::{ FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker, GrpcQueryHandlerWithBoxedError, @@ -58,6 +60,7 @@ use frontend::instance::StandaloneDatanodeManager; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ}; +use plugins::PluginOptions; use plugins::frontend::context::{ CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext, }; @@ -130,6 +133,18 @@ impl Instance { pub fn server_addr(&self, name: &str) -> Option { self.frontend.server_handlers().addr(name) } + + /// Get the mutable Frontend component of this Standalone instance for externally modification + /// by others (might not be in this code base, so don't delete this function). + pub fn mut_frontend(&mut self) -> &mut Frontend { + &mut self.frontend + } + + /// Get the Datanode component of this Standalone instance for externally usage + /// by others (might not be in this code base, so don't delete this function). 
+ pub fn datanode(&self) -> &Datanode { + &self.datanode + } } #[async_trait] @@ -342,9 +357,18 @@ impl StartCommand { info!("Standalone start command: {:#?}", self); info!("Standalone options: {opts:#?}"); + let (mut instance, _) = + Self::build_with(opts.component, opts.plugins, InstanceCreator::default()).await?; + instance._guard.extend(guard); + Ok(instance) + } + + pub async fn build_with( + mut opts: StandaloneOptions, + plugin_opts: Vec, + creator: InstanceCreator, + ) -> Result<(Instance, InstanceCreatorResult)> { let mut plugins = Plugins::new(); - let plugin_opts = opts.plugins; - let mut opts = opts.component; set_default_prefix(opts.default_column_prefix.as_deref()) .map_err(BoxedError::new) .context(error::BuildCliSnafu)?; @@ -462,17 +486,16 @@ impl StartCommand { .await; } - let node_manager = Arc::new(StandaloneDatanodeManager { - region_server: datanode.region_server(), - flow_server: flownode.flow_engine(), - }); + let node_manager = creator + .node_manager_creator + .create( + &kv_backend, + datanode.region_server(), + flownode.flow_engine(), + ) + .await?; - let table_id_allocator = Arc::new( - SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone()) - .initial(MIN_USER_TABLE_ID as u64) - .step(10) - .build(), - ); + let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend); let flow_id_sequence = Arc::new( SequenceBuilder::new(FLOW_ID_SEQ, kv_backend.clone()) .initial(MIN_USER_FLOW_ID as u64) @@ -489,7 +512,7 @@ impl StartCommand { .context(error::BuildWalProviderSnafu)?; let wal_provider = Arc::new(wal_provider); let table_metadata_allocator = Arc::new(TableMetadataAllocator::new( - table_id_allocator, + table_id_allocator.clone(), wal_provider.clone(), )); let flow_metadata_allocator = Arc::new(FlowMetadataAllocator::with_noop_peer_allocator( @@ -532,10 +555,10 @@ impl StartCommand { ddl_manager }; - let procedure_executor = Arc::new(LocalProcedureExecutor::new( - Arc::new(ddl_manager), - procedure_manager.clone(), - )); + 
let procedure_executor = creator + .procedure_executor_creator + .create(Arc::new(ddl_manager), procedure_manager.clone()) + .await?; let fe_instance = FrontendBuilder::new( fe_opts.clone(), @@ -568,7 +591,7 @@ impl StartCommand { kv_backend.clone(), layered_cache_registry.clone(), procedure_executor, - node_manager, + node_manager.clone(), ) .await .context(StartFlownodeSnafu)?; @@ -584,14 +607,20 @@ impl StartCommand { heartbeat_task: None, }; - Ok(Instance { + let instance = Instance { datanode, frontend, flownode, procedure_manager, wal_provider, - _guard: guard, - }) + _guard: vec![], + }; + let result = InstanceCreatorResult { + kv_backend, + node_manager, + table_id_allocator, + }; + Ok((instance, result)) } pub async fn create_table_metadata_manager( @@ -608,6 +637,115 @@ impl StartCommand { } } +#[async_trait] +pub trait NodeManagerCreator { + async fn create( + &self, + kv_backend: &KvBackendRef, + region_server: RegionServer, + flow_server: FlownodeRef, + ) -> Result; +} + +pub struct DefaultNodeManagerCreator; + +#[async_trait] +impl NodeManagerCreator for DefaultNodeManagerCreator { + async fn create( + &self, + _: &KvBackendRef, + region_server: RegionServer, + flow_server: FlownodeRef, + ) -> Result { + Ok(Arc::new(StandaloneDatanodeManager { + region_server, + flow_server, + })) + } +} + +pub trait TableIdAllocatorCreator { + fn create(&self, kv_backend: &KvBackendRef) -> Arc; +} + +struct DefaultTableIdAllocatorCreator; + +impl TableIdAllocatorCreator for DefaultTableIdAllocatorCreator { + fn create(&self, kv_backend: &KvBackendRef) -> Arc { + Arc::new( + SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone()) + .initial(MIN_USER_TABLE_ID as u64) + .step(10) + .build(), + ) + } +} + +#[async_trait] +pub trait ProcedureExecutorCreator { + async fn create( + &self, + ddl_manager: DdlManagerRef, + procedure_manager: ProcedureManagerRef, + ) -> Result; +} + +pub struct DefaultProcedureExecutorCreator; + +#[async_trait] +impl ProcedureExecutorCreator for 
DefaultProcedureExecutorCreator { + async fn create( + &self, + ddl_manager: DdlManagerRef, + procedure_manager: ProcedureManagerRef, + ) -> Result { + Ok(Arc::new(LocalProcedureExecutor::new( + ddl_manager, + procedure_manager, + ))) + } +} + +/// `InstanceCreator` is used for grouping various component creators for building the +/// Standalone instance, suitable for customizing how the instance can be built. +pub struct InstanceCreator { + node_manager_creator: Box, + table_id_allocator_creator: Box, + procedure_executor_creator: Box, +} + +impl InstanceCreator { + pub fn new( + node_manager_creator: Box, + table_id_allocator_creator: Box, + procedure_executor_creator: Box, + ) -> Self { + Self { + node_manager_creator, + table_id_allocator_creator, + procedure_executor_creator, + } + } +} + +impl Default for InstanceCreator { + fn default() -> Self { + Self { + node_manager_creator: Box::new(DefaultNodeManagerCreator), + table_id_allocator_creator: Box::new(DefaultTableIdAllocatorCreator), + procedure_executor_creator: Box::new(DefaultProcedureExecutorCreator), + } + } +} + +/// `InstanceCreatorResult` is expected to be used paired with [InstanceCreator]. +/// It stores the created and other important components for further reusing. 
+pub struct InstanceCreatorResult { + pub kv_backend: KvBackendRef, + pub node_manager: NodeManagerRef, + pub table_id_allocator: Arc, +} + #[cfg(test)] mod tests { use std::default::Default; diff --git a/src/common/config/src/config.rs b/src/common/config/src/config.rs index e25c46a0c0..85ce3d206f 100644 --- a/src/common/config/src/config.rs +++ b/src/common/config/src/config.rs @@ -53,7 +53,7 @@ pub trait Configurable: Serialize + DeserializeOwned + Default + Sized { env.try_parsing(true) .separator(ENV_VAR_SEP) - .ignore_empty(true) + .ignore_empty(false) }; // Workaround: Replacement for `Config::try_from(&default_opts)` due to @@ -237,4 +237,31 @@ mod tests { }, ); } + + #[derive(Debug, Serialize, Deserialize, Default)] + struct SimpleConfig { + name: Option, + prefix: Option, + } + + impl Configurable for SimpleConfig {} + + #[test] + fn test_empty_env_var_is_not_ignored() { + let env_prefix = "SIMPLE_CFG_UT"; + temp_env::with_vars( + [( + [env_prefix.to_string(), "PREFIX".to_string()].join(ENV_VAR_SEP), + Some(""), + )], + || { + let opts = SimpleConfig::load_layered_options(None, env_prefix).unwrap(); + // With ignore_empty(false), an empty env var should yield Some("") + // rather than None (which was the previous behavior with ignore_empty(true)). + assert_eq!(opts.prefix, Some("".to_string())); + // Unset env var should remain None. 
+ assert_eq!(opts.name, None); + }, + ); + } } diff --git a/src/common/function/src/aggrs/aggr_wrapper.rs b/src/common/function/src/aggrs/aggr_wrapper.rs index 3780d39582..6242ab9454 100644 --- a/src/common/function/src/aggrs/aggr_wrapper.rs +++ b/src/common/function/src/aggrs/aggr_wrapper.rs @@ -25,7 +25,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; -use arrow::array::StructArray; +use arrow::array::{ArrayRef, BooleanArray, StructArray}; use arrow_schema::{FieldRef, Fields}; use common_telemetry::debug; use datafusion::functions_aggregate::all_default_aggregate_functions; @@ -38,8 +38,8 @@ use datafusion_common::{Column, ScalarValue}; use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams}; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ - Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, Expr, ExprSchemable, LogicalPlan, - Signature, + Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, EmitTo, Expr, ExprSchemable, + GroupsAccumulator, LogicalPlan, Signature, }; use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use datatypes::arrow::datatypes::{DataType, Field}; @@ -322,6 +322,14 @@ impl StateWrapper { ); }) } + + fn fix_inner_acc_args<'b>( + &self, + mut acc_args: datafusion_expr::function::AccumulatorArgs<'b>, + ) -> datafusion_common::Result> { + acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?; + Ok(acc_args) + } } impl AggregateUDFImpl for StateWrapper { @@ -331,15 +339,32 @@ impl AggregateUDFImpl for StateWrapper { ) -> datafusion_common::Result> { // fix and recover proper acc args for the original aggregate function. let state_type = acc_args.return_type().clone(); - let inner = { - let mut new_acc_args = acc_args.clone(); - new_acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?; - self.inner.accumulator(new_acc_args)? 
- }; + let inner = self.inner.accumulator(self.fix_inner_acc_args(acc_args)?)?; Ok(Box::new(StateAccum::new(inner, state_type)?)) } + fn groups_accumulator_supported( + &self, + acc_args: datafusion_expr::function::AccumulatorArgs, + ) -> bool { + self.fix_inner_acc_args(acc_args) + .map(|args| self.inner.inner().groups_accumulator_supported(args)) + .unwrap_or(false) + } + + fn create_groups_accumulator( + &self, + acc_args: datafusion_expr::function::AccumulatorArgs, + ) -> datafusion_common::Result> { + let state_type = acc_args.return_type().clone(); + let inner = self + .inner + .inner() + .create_groups_accumulator(self.fix_inner_acc_args(acc_args)?)?; + Ok(Box::new(StateGroupsAccum::new(inner, state_type)?)) + } + fn as_any(&self) -> &dyn std::any::Any { self } @@ -462,6 +487,118 @@ pub struct StateAccum { state_fields: Fields, } +pub struct StateGroupsAccum { + inner: Box, + state_fields: Fields, +} + +impl StateGroupsAccum { + fn new( + inner: Box, + state_type: DataType, + ) -> datafusion_common::Result { + let DataType::Struct(fields) = state_type else { + return Err(datafusion_common::DataFusionError::Internal(format!( + "Expected a struct type for state, got: {:?}", + state_type + ))); + }; + Ok(Self { + inner, + state_fields: fields, + }) + } + + fn wrap_state_arrays(&self, arrays: Vec) -> datafusion_common::Result { + let array_type = arrays + .iter() + .map(|array| array.data_type().clone()) + .collect::>(); + let expected_type = self + .state_fields + .iter() + .map(|field| field.data_type().clone()) + .collect::>(); + if array_type != expected_type { + debug!( + "State mismatch, expected: {}, got: {} for expected fields: {:?} and given array types: {:?}", + self.state_fields.len(), + arrays.len(), + self.state_fields, + array_type, + ); + let guess_schema = arrays + .iter() + .enumerate() + .map(|(index, array)| { + Field::new( + format!("col_{index}[mismatch_state]").as_str(), + array.data_type().clone(), + true, + ) + }) + .collect::(); + let 
array = StructArray::try_new(guess_schema, arrays, None)?; + return Ok(Arc::new(array)); + } + + Ok(Arc::new(StructArray::try_new( + self.state_fields.clone(), + arrays, + None, + )?)) + } +} + +impl GroupsAccumulator for StateGroupsAccum { + fn update_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> datafusion_common::Result<()> { + self.inner + .update_batch(values, group_indices, opt_filter, total_num_groups) + } + + fn merge_batch( + &mut self, + values: &[ArrayRef], + group_indices: &[usize], + opt_filter: Option<&BooleanArray>, + total_num_groups: usize, + ) -> datafusion_common::Result<()> { + self.inner + .merge_batch(values, group_indices, opt_filter, total_num_groups) + } + + fn evaluate(&mut self, emit_to: EmitTo) -> datafusion_common::Result { + let state = self.inner.state(emit_to)?; + self.wrap_state_arrays(state) + } + + fn state(&mut self, emit_to: EmitTo) -> datafusion_common::Result> { + self.inner.state(emit_to) + } + + fn convert_to_state( + &self, + values: &[ArrayRef], + opt_filter: Option<&BooleanArray>, + ) -> datafusion_common::Result> { + self.inner.convert_to_state(values, opt_filter) + } + + fn supports_convert_to_state(&self) -> bool { + self.inner.supports_convert_to_state() + } + + fn size(&self) -> usize { + self.inner.size() + } +} + impl StateAccum { pub fn new( inner: Box, diff --git a/src/common/function/src/aggrs/aggr_wrapper/tests.rs b/src/common/function/src/aggrs/aggr_wrapper/tests.rs index 8821b9fd24..de3a77df6b 100644 --- a/src/common/function/src/aggrs/aggr_wrapper/tests.rs +++ b/src/common/function/src/aggrs/aggr_wrapper/tests.rs @@ -40,10 +40,13 @@ use datafusion_common::arrow::array::AsArray; use datafusion_common::arrow::datatypes::{Float64Type, UInt64Type}; use datafusion_common::{Column, TableReference}; use datafusion_expr::expr::{AggregateFunction, NullTreatment}; +use datafusion_expr::function::AccumulatorArgs; use 
datafusion_expr::{ - Aggregate, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, TableScan, lit, + Aggregate, AggregateUDFImpl, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, + TableScan, lit, }; use datafusion_physical_expr::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use datatypes::arrow_array::StringArray; use futures::{Stream, StreamExt as _}; @@ -256,6 +259,38 @@ fn dummy_table_scan_with_ts() -> LogicalPlan { ) } +fn create_avg_state_groups_accumulator() -> Box { + let state_wrapper = StateWrapper::new((*avg_udaf()).clone()).unwrap(); + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "number", + DataType::Float64, + true, + )])); + let expr = col("number", &schema).unwrap(); + let expr_field = expr.return_field(&schema).unwrap(); + let return_field = Arc::new(Field::new( + "__avg_state(number)", + state_wrapper.return_type(&[DataType::Float64]).unwrap(), + true, + )); + let exprs = [expr]; + let expr_fields = [expr_field]; + let acc_args = AccumulatorArgs { + return_field, + schema: &schema, + ignore_nulls: false, + order_bys: &[], + is_reversed: false, + name: "__avg_state(number)", + is_distinct: false, + exprs: &exprs, + expr_fields: &expr_fields, + }; + + assert!(state_wrapper.groups_accumulator_supported(acc_args.clone())); + state_wrapper.create_groups_accumulator(acc_args).unwrap() +} + #[tokio::test] async fn test_sum_udaf() { let ctx = SessionContext::new(); @@ -796,6 +831,95 @@ async fn test_last_value_order_by_udaf() { assert_eq!(merge_eval_res, ScalarValue::Int64(Some(4))); } +#[test] +fn test_avg_state_groups_accumulator_evaluate() { + let mut state_accum = create_avg_state_groups_accumulator(); + let values = vec![Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + Some(3.0), + Some(4.0), + Some(5.0), + ])) as ArrayRef]; + let group_indices = vec![0, 1, 0, 0, 1, 2]; + + 
state_accum + .update_batch(&values, &group_indices, None, 3) + .unwrap(); + + let result = state_accum.evaluate(EmitTo::All).unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert_eq!( + result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(), + &UInt64Array::from(vec![2, 2, 1]) + ); + assert_eq!( + result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(), + &Float64Array::from(vec![4.0, 6.0, 5.0]) + ); +} + +#[test] +fn test_avg_state_groups_accumulator_state_merge_evaluate() { + let mut source_accum = create_avg_state_groups_accumulator(); + let source_values = vec![Arc::new(Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + Some(3.0), + Some(4.0), + Some(5.0), + ])) as ArrayRef]; + let source_group_indices = vec![0, 1, 0, 0, 1, 2]; + + source_accum + .update_batch(&source_values, &source_group_indices, None, 3) + .unwrap(); + let source_state = source_accum.state(EmitTo::All).unwrap(); + + let mut merged_accum = create_avg_state_groups_accumulator(); + let merged_values = + vec![Arc::new(Float64Array::from(vec![Some(10.0), Some(20.0), Some(30.0)])) as ArrayRef]; + let merged_group_indices = vec![0, 1, 2]; + + merged_accum + .update_batch(&merged_values, &merged_group_indices, None, 3) + .unwrap(); + merged_accum + .merge_batch(&source_state, &[1, 2, 0], None, 3) + .unwrap(); + + let result = merged_accum.evaluate(EmitTo::All).unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + + assert_eq!( + result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(), + &UInt64Array::from(vec![2, 3, 3]) + ); + assert_eq!( + result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(), + &Float64Array::from(vec![15.0, 24.0, 36.0]) + ); +} + /// For testing whether the UDAF state fields are correctly implemented. /// esp. for our own custom UDAF's state fields. /// By compare eval results before and after split to state/merge functions. 
diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs index 6c0cc260b2..6364dff4de 100644 --- a/src/common/function/src/scalars/json/json_to_string.rs +++ b/src/common/function/src/scalars/json/json_to_string.rs @@ -19,6 +19,7 @@ use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder}; use datafusion_common::arrow::datatypes::DataType; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datatypes::types::jsonb_to_string; use crate::function::{Function, extract_args}; @@ -74,7 +75,7 @@ impl Function for JsonToStringFunction { for i in 0..size { let json = jsons.is_valid(i).then(|| jsons.value(i)); let result = json - .map(|json| jsonb::from_slice(json).map(|x| x.to_string())) + .map(jsonb_to_string) .transpose() .map_err(|e| DataFusionError::Execution(format!("invalid json binary: {e}")))?; diff --git a/src/common/memory-manager/Cargo.toml b/src/common/memory-manager/Cargo.toml index a6be50f774..6686c98167 100644 --- a/src/common/memory-manager/Cargo.toml +++ b/src/common/memory-manager/Cargo.toml @@ -10,7 +10,6 @@ workspace = true [dependencies] common-error = { workspace = true } common-macro = { workspace = true } -common-telemetry = { workspace = true } humantime = { workspace = true } serde = { workspace = true } snafu = { workspace = true } diff --git a/src/common/memory-manager/src/guard.rs b/src/common/memory-manager/src/guard.rs index 770b6dec24..ad3111581b 100644 --- a/src/common/memory-manager/src/guard.rs +++ b/src/common/memory-manager/src/guard.rs @@ -14,14 +14,13 @@ use std::{fmt, mem}; -use common_telemetry::debug; use snafu::ensure; use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use crate::error::{ MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result, }; -use crate::manager::{MemoryMetrics, MemoryQuota}; +use crate::manager::{MemoryMetrics, 
MemoryQuota, UnlimitedMemoryQuota}; use crate::policy::OnExhaustedPolicy; /// Guard representing a slice of reserved memory. @@ -30,31 +29,57 @@ pub struct MemoryGuard { } pub(crate) enum GuardState { - Unlimited, + Released, + Unlimited { + quota: UnlimitedMemoryQuota, + granted_bytes: u64, + }, Limited { - permit: OwnedSemaphorePermit, quota: MemoryQuota, + permit: OwnedSemaphorePermit, }, } +impl GuardState { + fn release(self) { + match self { + GuardState::Released => {} + GuardState::Unlimited { + quota, + granted_bytes, + } => { + quota.sub_in_use(granted_bytes); + } + GuardState::Limited { quota, permit } => { + quota.release_permit(permit); + } + } + } +} + impl MemoryGuard { - pub(crate) fn unlimited() -> Self { + pub(crate) fn unlimited(quota: UnlimitedMemoryQuota, bytes: u64) -> Self { + quota.add_in_use(bytes); Self { - state: GuardState::Unlimited, + state: GuardState::Unlimited { + quota, + granted_bytes: bytes, + }, } } - pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota) -> Self { + pub(crate) fn limited(quota: MemoryQuota, permit: OwnedSemaphorePermit) -> Self { Self { - state: GuardState::Limited { permit, quota }, + state: GuardState::Limited { quota, permit }, } } /// Returns granted quota in bytes. pub fn granted_bytes(&self) -> u64 { match &self.state { - GuardState::Unlimited => 0, - GuardState::Limited { permit, quota } => { + GuardState::Released => 0, + GuardState::Unlimited { granted_bytes, .. 
} => *granted_bytes, + GuardState::Limited { quota, permit } => { quota.permits_to_bytes(permit.num_permits() as u32) } } @@ -68,13 +93,24 @@ impl MemoryGuard { /// - Returns error if requested bytes would exceed the manager's total limit /// - Returns error if the semaphore is unexpectedly closed pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> { - match &mut self.state { - GuardState::Unlimited => Ok(()), - GuardState::Limited { permit, quota } => { - if bytes == 0 { - return Ok(()); - } + if bytes == 0 { + return Ok(()); + } + match &mut self.state { + GuardState::Released => { + debug_assert!(false, "released memory guard state should not be reused"); + Ok(()) + } + GuardState::Unlimited { + quota, + granted_bytes, + } => { + quota.add_in_use(bytes); + *granted_bytes = granted_bytes.saturating_add(bytes); + Ok(()) + } + GuardState::Limited { quota, permit } => { let additional_permits = quota.bytes_to_permits(bytes); let current_permits = permit.num_permits() as u32; @@ -95,7 +131,6 @@ impl MemoryGuard { permit.merge(additional_permit); quota.update_in_use_metric(); - debug!("Acquired additional {} bytes", bytes); Ok(()) } } @@ -106,13 +141,24 @@ impl MemoryGuard { /// On success, merges the new memory into this guard and returns true. /// On failure, returns false and leaves this guard unchanged. 
pub fn try_acquire_additional(&mut self, bytes: u64) -> bool { - match &mut self.state { - GuardState::Unlimited => true, - GuardState::Limited { permit, quota } => { - if bytes == 0 { - return true; - } + if bytes == 0 { + return true; + } + match &mut self.state { + GuardState::Released => { + debug_assert!(false, "released memory guard state should not be reused"); + false + } + GuardState::Unlimited { + quota, + granted_bytes, + } => { + quota.add_in_use(bytes); + *granted_bytes = granted_bytes.saturating_add(bytes); + true + } + GuardState::Limited { quota, permit } => { let additional_permits = quota.bytes_to_permits(bytes); match quota @@ -123,7 +169,6 @@ impl MemoryGuard { Ok(additional_permit) => { permit.merge(additional_permit); quota.update_in_use_metric(); - debug!("Acquired additional {} bytes", bytes); true } Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { @@ -168,7 +213,8 @@ impl MemoryGuard { MemoryLimitExceededSnafu { requested_bytes: bytes, limit_bytes: match &self.state { - GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds + GuardState::Released => 0, + GuardState::Unlimited { .. } => 0, // unreachable: unlimited mode always succeeds GuardState::Limited { quota, .. } => { quota.permits_to_bytes(quota.limit_permits) } @@ -184,22 +230,30 @@ impl MemoryGuard { /// /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted. 
pub fn release_partial(&mut self, bytes: u64) -> bool { + if bytes == 0 { + return true; + } + match &mut self.state { - GuardState::Unlimited => true, - GuardState::Limited { permit, quota } => { - if bytes == 0 { - return true; + GuardState::Released => true, + GuardState::Unlimited { + quota, + granted_bytes, + } => { + if bytes > *granted_bytes { + return false; } + quota.sub_in_use(bytes); + *granted_bytes = granted_bytes.saturating_sub(bytes); + true + } + GuardState::Limited { quota, permit } => { let release_permits = quota.bytes_to_permits(bytes); match permit.split(release_permits as usize) { Some(released_permit) => { - let released_bytes = - quota.permits_to_bytes(released_permit.num_permits() as u32); - drop(released_permit); - quota.update_in_use_metric(); - debug!("Released {} bytes from memory guard", released_bytes); + quota.release_permit(released_permit); true } None => false, @@ -211,14 +265,7 @@ impl MemoryGuard { impl Drop for MemoryGuard { fn drop(&mut self) { - if let GuardState::Limited { permit, quota } = - mem::replace(&mut self.state, GuardState::Unlimited) - { - let bytes = quota.permits_to_bytes(permit.num_permits() as u32); - drop(permit); - quota.update_in_use_metric(); - debug!("Released memory: {} bytes", bytes); - } + mem::replace(&mut self.state, GuardState::Released).release(); } } diff --git a/src/common/memory-manager/src/manager.rs b/src/common/memory-manager/src/manager.rs index 50360d2a31..8cca5f220c 100644 --- a/src/common/memory-manager/src/manager.rs +++ b/src/common/memory-manager/src/manager.rs @@ -13,9 +13,10 @@ // limitations under the License. 
use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use snafu::ensure; -use tokio::sync::{Semaphore, TryAcquireError}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError}; use crate::error::{ MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result, @@ -34,7 +35,7 @@ pub trait MemoryMetrics: Clone + Send + Sync + 'static { /// Generic memory manager for quota-controlled operations. #[derive(Clone)] pub struct MemoryManager { - quota: Option>, + quota: MemoryQuotaState, } impl Default for MemoryManager { @@ -51,6 +52,18 @@ pub(crate) struct MemoryQuota { pub(crate) metrics: M, } +#[derive(Clone)] +pub(crate) struct UnlimitedMemoryQuota { + pub(crate) current_bytes: Arc, + pub(crate) metrics: M, +} + +#[derive(Clone)] +pub(crate) enum MemoryQuotaState { + Unlimited(UnlimitedMemoryQuota), + Limited(MemoryQuota), +} + impl MemoryManager { /// Creates a new memory manager with the given limit in bytes. /// `limit_bytes = 0` disables the limit. @@ -62,7 +75,12 @@ impl MemoryManager { pub fn with_granularity(limit_bytes: u64, granularity: PermitGranularity, metrics: M) -> Self { if limit_bytes == 0 { metrics.set_limit(0); - return Self { quota: None }; + return Self { + quota: MemoryQuotaState::Unlimited(UnlimitedMemoryQuota { + current_bytes: Arc::new(AtomicU64::new(0)), + metrics, + }), + }; } let limit_permits = granularity.bytes_to_permits(limit_bytes); @@ -70,7 +88,7 @@ impl MemoryManager { metrics.set_limit(limit_aligned_bytes as i64); Self { - quota: Some(MemoryQuota { + quota: MemoryQuotaState::Limited(MemoryQuota { semaphore: Arc::new(Semaphore::new(limit_permits as usize)), limit_permits, granularity, @@ -81,26 +99,30 @@ impl MemoryManager { /// Returns the configured limit in bytes (0 if unlimited). 
pub fn limit_bytes(&self) -> u64 { - self.quota - .as_ref() - .map(|quota| quota.permits_to_bytes(quota.limit_permits)) - .unwrap_or(0) + match &self.quota { + MemoryQuotaState::Unlimited(_) => 0, + MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.limit_permits), + } } /// Returns currently used bytes. pub fn used_bytes(&self) -> u64 { - self.quota - .as_ref() - .map(|quota| quota.permits_to_bytes(quota.used_permits())) - .unwrap_or(0) + match &self.quota { + MemoryQuotaState::Unlimited(quota) => quota.current_bytes.load(Ordering::Acquire), + MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.used_permits()), + } } /// Returns available bytes. + /// + /// Unlimited managers report `u64::MAX`. pub fn available_bytes(&self) -> u64 { - self.quota - .as_ref() - .map(|quota| quota.permits_to_bytes(quota.available_permits_clamped())) - .unwrap_or(0) + match &self.quota { + MemoryQuotaState::Unlimited(_) => u64::MAX, + MemoryQuotaState::Limited(quota) => { + quota.permits_to_bytes(quota.available_permits_clamped()) + } + } } /// Acquires memory, waiting if necessary until enough is available. @@ -110,8 +132,8 @@ impl MemoryManager { /// - Returns error if the semaphore is unexpectedly closed pub async fn acquire(&self, bytes: u64) -> Result> { match &self.quota { - None => Ok(MemoryGuard::unlimited()), - Some(quota) => { + MemoryQuotaState::Unlimited(quota) => Ok(MemoryGuard::unlimited(quota.clone(), bytes)), + MemoryQuotaState::Limited(quota) => { let permits = quota.bytes_to_permits(bytes); ensure!( @@ -129,7 +151,7 @@ impl MemoryManager { .await .map_err(|_| MemorySemaphoreClosedSnafu.build())?; quota.update_in_use_metric(); - Ok(MemoryGuard::limited(permit, quota.clone())) + Ok(MemoryGuard::limited(quota.clone(), permit)) } } } @@ -137,14 +159,16 @@ impl MemoryManager { /// Tries to acquire memory. Returns Some(guard) on success, None if insufficient. 
pub fn try_acquire(&self, bytes: u64) -> Option> { match &self.quota { - None => Some(MemoryGuard::unlimited()), - Some(quota) => { + MemoryQuotaState::Unlimited(quota) => { + Some(MemoryGuard::unlimited(quota.clone(), bytes)) + } + MemoryQuotaState::Limited(quota) => { let permits = quota.bytes_to_permits(bytes); match quota.semaphore.clone().try_acquire_many_owned(permits) { Ok(permit) => { quota.update_in_use_metric(); - Some(MemoryGuard::limited(permit, quota.clone())) + Some(MemoryGuard::limited(quota.clone(), permit)) } Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { quota.metrics.inc_rejected("try_acquire"); @@ -219,4 +243,49 @@ impl MemoryQuota { let bytes = self.permits_to_bytes(self.used_permits()); self.metrics.set_in_use(bytes as i64); } + + pub(crate) fn release_permit(&self, permit: OwnedSemaphorePermit) { + drop(permit); + self.update_in_use_metric(); + } +} + +impl UnlimitedMemoryQuota { + pub(crate) fn add_in_use(&self, bytes: u64) { + if bytes == 0 { + return; + } + + let previous = self + .current_bytes + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { + Some(current.saturating_add(bytes)) + }) + .unwrap(); + let new_total = previous.saturating_add(bytes); + debug_assert!( + new_total >= previous, + "unlimited memory usage counter overflowed" + ); + self.metrics.set_in_use(new_total as i64); + } + + pub(crate) fn sub_in_use(&self, bytes: u64) { + if bytes == 0 { + return; + } + + let previous = self + .current_bytes + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { + Some(current.saturating_sub(bytes)) + }) + .unwrap(); + debug_assert!( + previous >= bytes, + "unlimited memory usage counter underflowed: current={previous}, release={bytes}" + ); + let new_total = previous.saturating_sub(bytes); + self.metrics.set_in_use(new_total as i64); + } } diff --git a/src/common/memory-manager/src/tests.rs b/src/common/memory-manager/src/tests.rs index 886eef9dac..fe02703f0b 100644 --- 
a/src/common/memory-manager/src/tests.rs +++ b/src/common/memory-manager/src/tests.rs @@ -24,7 +24,9 @@ fn test_try_acquire_unlimited() { let manager = MemoryManager::new(0, NoOpMetrics); let guard = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap(); assert_eq!(manager.limit_bytes(), 0); - assert_eq!(guard.granted_bytes(), 0); + assert_eq!(manager.available_bytes(), u64::MAX); + assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES); } #[test] @@ -136,7 +138,10 @@ fn test_request_additional_unlimited() { // Should always succeed with unlimited manager assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES)); - assert_eq!(guard.granted_bytes(), 0); + assert_eq!(guard.granted_bytes(), 105 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 105 * PERMIT_GRANULARITY_BYTES); + + drop(guard); assert_eq!(manager.used_bytes(), 0); } @@ -187,9 +192,10 @@ fn test_early_release_partial_unlimited() { let manager = MemoryManager::new(0, NoOpMetrics); let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap(); - // Unlimited guard - release should succeed (no-op) + // Unlimited guard should track and release exact bytes. 
assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES)); - assert_eq!(guard.granted_bytes(), 0); + assert_eq!(guard.granted_bytes(), 50 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 50 * PERMIT_GRANULARITY_BYTES); } #[test] @@ -406,6 +412,6 @@ async fn test_acquire_additional_unlimited() { .acquire_additional(1000 * PERMIT_GRANULARITY_BYTES) .await .unwrap(); - assert_eq!(guard.granted_bytes(), 0); - assert_eq!(manager.used_bytes(), 0); + assert_eq!(guard.granted_bytes(), 1000 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 1000 * PERMIT_GRANULARITY_BYTES); } diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index ec000c710d..f5ca9d2c09 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true testing = [] pg_kvbackend = [ "dep:tokio-postgres", - "dep:backon", "dep:deadpool-postgres", "dep:deadpool", "dep:tokio-postgres-rustls", @@ -16,7 +15,7 @@ pg_kvbackend = [ "dep:rustls-native-certs", "dep:rustls", ] -mysql_kvbackend = ["dep:sqlx", "dep:backon"] +mysql_kvbackend = ["dep:sqlx"] enterprise = ["prost-types"] [lints] @@ -28,7 +27,7 @@ api.workspace = true async-recursion = "1.0" async-stream.workspace = true async-trait.workspace = true -backon = { workspace = true, optional = true } +backon.workspace = true base64.workspace = true bytes.workspace = true chrono.workspace = true diff --git a/src/common/meta/src/cache/container.rs b/src/common/meta/src/cache/container.rs index 0510476d15..e3a3e13a76 100644 --- a/src/common/meta/src/cache/container.rs +++ b/src/common/meta/src/cache/container.rs @@ -15,10 +15,14 @@ use std::borrow::Borrow; use std::hash::Hash; use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::Duration; -use futures::future::{BoxFuture, join_all}; +use backon::{BackoffBuilder, ExponentialBuilder}; +use futures::future::BoxFuture; use moka::future::Cache; use snafu::{OptionExt, ResultExt}; +use 
tokio::time::sleep; use crate::cache_invalidator::{CacheInvalidator, Context}; use crate::error::{self, Error, Result}; @@ -29,12 +33,29 @@ use crate::metrics; pub type TokenFilter = Box bool + Send + Sync>; /// Invalidates cached values by [CacheToken]s. -pub type Invalidator = - Box Fn(&'a Cache, &'a CacheToken) -> BoxFuture<'a, Result<()>> + Send + Sync>; +pub type Invalidator = Box< + dyn for<'a> Fn(&'a Cache, &'a [&CacheToken]) -> BoxFuture<'a, Result<()>> + Send + Sync, +>; /// Initializes value (i.e., fetches from remote). pub type Initializer = Arc BoxFuture<'_, Result>> + Send + Sync>; +#[derive(Debug, Clone, Copy)] +/// Initialization strategy for cache-miss loading. +/// +/// This strategy is selected when building [CacheContainer] and remains immutable +/// for the lifetime of the container instance. +pub enum InitStrategy { + /// Fast path: load once without version conflict retry. + /// + /// Under concurrent invalidation, callers may observe stale/dirty value. + Unchecked, + /// Strict path: retry load when version changes during initialization. + /// + /// This avoids returning dirty value under invalidate/load races. + VersionChecked, +} + /// [CacheContainer] provides ability to: /// - Cache value loaded by [Initializer]. /// - Invalidate caches by [Invalidator]. @@ -44,6 +65,16 @@ pub struct CacheContainer { invalidator: Invalidator, initializer: Initializer, token_filter: fn(&CacheToken) -> bool, + version: Arc, + init_strategy: InitStrategy, +} + +fn latest_get_backoff() -> impl Iterator { + ExponentialBuilder::default() + .with_min_delay(Duration::from_millis(10)) + .with_max_delay(Duration::from_millis(100)) + .with_max_times(3) + .build() } impl CacheContainer @@ -52,13 +83,37 @@ where V: Send + Sync, CacheToken: Send + Sync, { - /// Constructs an [CacheContainer]. + /// Constructs an [CacheContainer] with [InitStrategy::Unchecked]. 
+ /// + /// This keeps the historical behavior and can return stale/dirty value under + /// concurrent invalidation. pub fn new( name: String, cache: Cache, invalidator: Invalidator, initializer: Initializer, token_filter: fn(&CacheToken) -> bool, + ) -> Self { + Self::with_strategy( + name, + cache, + invalidator, + initializer, + token_filter, + InitStrategy::Unchecked, + ) + } + + /// Constructs an [CacheContainer] with explicit [InitStrategy]. + /// + /// The strategy is fixed at construction time and cannot be changed later. + pub fn with_strategy( + name: String, + cache: Cache, + invalidator: Invalidator, + initializer: Initializer, + token_filter: fn(&CacheToken) -> bool, + init_strategy: InitStrategy, ) -> Self { Self { name, @@ -66,6 +121,8 @@ where invalidator, initializer, token_filter, + version: Arc::new(AtomicUsize::new(0)), + init_strategy, } } @@ -75,6 +132,67 @@ where } } +impl CacheContainer { + fn inc_version(&self) { + self.version.fetch_add(1, Ordering::Relaxed); + } +} + +async fn init<'a, K, V>(init: Initializer, key: K, cache_name: &'a str) -> Result +where + K: Send + Sync + 'a, + V: Send + 'a, +{ + metrics::CACHE_CONTAINER_CACHE_MISS + .with_label_values(&[cache_name]) + .inc(); + let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE + .with_label_values(&[cache_name]) + .start_timer(); + init(&key) + .await + .transpose() + .context(error::ValueNotExistSnafu)? 
+} + +async fn init_with_retry<'a, K, V>( + init: Initializer, + key: K, + mut backoff: impl Iterator + 'a, + version: Arc, + cache_name: &'a str, +) -> Result +where + K: Send + Sync + 'a, + V: Send + 'a, +{ + let mut attempts = 1usize; + loop { + let pre_version = version.load(Ordering::Relaxed); + metrics::CACHE_CONTAINER_CACHE_MISS + .with_label_values(&[cache_name]) + .inc(); + let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE + .with_label_values(&[cache_name]) + .start_timer(); + let value = init(&key) + .await + .transpose() + .context(error::ValueNotExistSnafu)??; + + if pre_version == version.load(Ordering::Relaxed) { + return Ok(value); + } + + if let Some(duration) = backoff.next() { + sleep(duration).await; + attempts += 1; + } else { + return error::GetLatestCacheRetryExceededSnafu { attempts }.fail(); + } + } +} + #[async_trait::async_trait] impl CacheInvalidator for CacheContainer where @@ -82,14 +200,15 @@ where V: Send + Sync, { async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> { - let tasks = caches + let idents = caches .iter() .filter(|token| (self.token_filter)(token)) - .map(|token| (self.invalidator)(&self.cache, token)); - join_all(tasks) - .await - .into_iter() - .collect::>>()?; + .collect::>(); + if !idents.is_empty() { + self.inc_version(); + (self.invalidator)(&self.cache, &idents).await?; + } + Ok(()) } } @@ -99,27 +218,39 @@ where K: Copy + Hash + Eq + Send + Sync + 'static, V: Clone + Send + Sync + 'static, { - /// Returns a _clone_ of the value corresponding to the key. + /// Returns a value from cache for copyable keys. + /// + /// With [InitStrategy::Unchecked], this method prioritizes latency and may + /// return stale/dirty value. With [InitStrategy::VersionChecked], this method + /// retries initialization on version change and avoids dirty returns. 
pub async fn get(&self, key: K) -> Result> { metrics::CACHE_CONTAINER_CACHE_GET .with_label_values(&[&self.name]) .inc(); - let moved_init = self.initializer.clone(); - let moved_key = key; - let init = async move { - metrics::CACHE_CONTAINER_CACHE_MISS - .with_label_values(&[&self.name]) - .inc(); - let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE - .with_label_values(&[&self.name]) - .start_timer(); - moved_init(&moved_key) - .await - .transpose() - .context(error::ValueNotExistSnafu)? + + let result = match self.init_strategy { + InitStrategy::Unchecked => { + self.cache + .try_get_with(key, init(self.initializer.clone(), key, &self.name)) + .await + } + InitStrategy::VersionChecked => { + self.cache + .try_get_with( + key, + init_with_retry( + self.initializer.clone(), + key, + latest_get_backoff(), + self.version.clone(), + &self.name, + ), + ) + .await + } }; - match self.cache.try_get_with(key, init).await { + match result { Ok(value) => Ok(Some(value)), Err(err) => match err.as_ref() { Error::ValueNotExist { .. } => Ok(None), @@ -136,14 +267,15 @@ where { /// Invalidates cache by [CacheToken]. pub async fn invalidate(&self, caches: &[CacheToken]) -> Result<()> { - let tasks = caches + let idents = caches .iter() .filter(|token| (self.token_filter)(token)) - .map(|token| (self.invalidator)(&self.cache, token)); - join_all(tasks) - .await - .into_iter() - .collect::>>()?; + .collect::>(); + if !idents.is_empty() { + self.inc_version(); + (self.invalidator)(&self.cache, &idents).await?; + } + Ok(()) } @@ -156,7 +288,11 @@ where self.cache.contains_key(key) } - /// Returns a _clone_ of the value corresponding to the key. + /// Returns a value from cache by key reference. + /// + /// With [InitStrategy::Unchecked], this method prioritizes latency and may + /// return stale/dirty value. With [InitStrategy::VersionChecked], this method + /// retries initialization on version change and avoids dirty returns. 
pub async fn get_by_ref(&self, key: &Q) -> Result> where K: Borrow, @@ -165,24 +301,32 @@ where metrics::CACHE_CONTAINER_CACHE_GET .with_label_values(&[&self.name]) .inc(); - let moved_init = self.initializer.clone(); - let moved_key = key.to_owned(); - - let init = async move { - metrics::CACHE_CONTAINER_CACHE_MISS - .with_label_values(&[&self.name]) - .inc(); - let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE - .with_label_values(&[&self.name]) - .start_timer(); - - moved_init(&moved_key) - .await - .transpose() - .context(error::ValueNotExistSnafu)? + let result = match self.init_strategy { + InitStrategy::Unchecked => { + self.cache + .try_get_with_by_ref( + key, + init(self.initializer.clone(), key.to_owned(), &self.name), + ) + .await + } + InitStrategy::VersionChecked => { + self.cache + .try_get_with_by_ref( + key, + init_with_retry( + self.initializer.clone(), + key.to_owned(), + latest_get_backoff(), + self.version.clone(), + &self.name, + ), + ) + .await + } }; - match self.cache.try_get_with_by_ref(key, init).await { + match result { Ok(value) => Ok(Some(value)), Err(err) => match err.as_ref() { Error::ValueNotExist { .. 
} => Ok(None), @@ -296,9 +440,11 @@ mod tests { moved_counter.fetch_add(1, Ordering::Relaxed); Box::pin(async { Ok(Some("hi".to_string())) }) }); - let invalidator: Invalidator = Box::new(|cache, key| { + let invalidator: Invalidator = Box::new(|cache, keys| { Box::pin(async move { - cache.invalidate(key).await; + for key in keys { + cache.invalidate(*key).await; + } Ok(()) }) }); @@ -323,4 +469,46 @@ mod tests { assert_eq!(value, "hi"); assert_eq!(counter.load(Ordering::Relaxed), 2); } + + #[tokio::test(flavor = "multi_thread")] + async fn test_get_by_ref_returns_fresh_value_after_invalidate() { + let cache: Cache = CacheBuilder::new(128).build(); + let counter = Arc::new(AtomicI32::new(0)); + let moved_counter = counter.clone(); + let init: Initializer = Arc::new(move |_| { + let counter = moved_counter.clone(); + Box::pin(async move { + let n = counter.fetch_add(1, Ordering::Relaxed) + 1; + sleep(Duration::from_millis(100)).await; + Ok(Some(format!("v{n}"))) + }) + }); + let invalidator: Invalidator = Box::new(|cache, keys| { + Box::pin(async move { + for key in keys { + cache.invalidate(*key).await; + } + Ok(()) + }) + }); + + let adv_cache = Arc::new(CacheContainer::with_strategy( + "test".to_string(), + cache, + invalidator, + init, + always_true_filter, + InitStrategy::VersionChecked, + )); + + let moved_cache = adv_cache.clone(); + let get_task = tokio::spawn(async move { moved_cache.get_by_ref("foo").await }); + + sleep(Duration::from_millis(50)).await; + adv_cache.invalidate(&["foo".to_string()]).await.unwrap(); + + let value = get_task.await.unwrap().unwrap().unwrap(); + assert_eq!(value, "v2"); + assert_eq!(counter.load(Ordering::Relaxed), 2); + } } diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs index a7777f3361..ebe3664202 100644 --- a/src/common/meta/src/cache/flow/table_flownode.rs +++ b/src/common/meta/src/cache/flow/table_flownode.rs @@ -170,20 +170,22 @@ async fn handle_drop_flow( 
fn invalidator<'a>( cache: &'a Cache, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - match ident { - CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await, - CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await, - CacheIdent::FlowNodeAddressChange(node_id) => { - info!( - "Invalidate flow node cache for node_id in table_flownode: {}", - node_id - ); - cache.invalidate_all(); + for ident in idents { + match ident { + CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await, + CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await, + CacheIdent::FlowNodeAddressChange(node_id) => { + info!( + "Invalidate flow node cache for node_id in table_flownode: {}", + node_id + ); + cache.invalidate_all(); + } + _ => {} } - _ => {} } Ok(()) }) diff --git a/src/common/meta/src/cache/table/schema.rs b/src/common/meta/src/cache/table/schema.rs index bcf81d4fe6..bd9e8e6dc1 100644 --- a/src/common/meta/src/cache/table/schema.rs +++ b/src/common/meta/src/cache/table/schema.rs @@ -58,11 +58,13 @@ fn init_factory(schema_manager: SchemaManager) -> Initializer( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, crate::error::Result<()>> { Box::pin(async move { - if let CacheIdent::SchemaName(schema_name) = ident { - cache.invalidate(schema_name).await + for ident in idents { + if let CacheIdent::SchemaName(schema_name) = ident { + cache.invalidate(schema_name).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_info.rs b/src/common/meta/src/cache/table/table_info.rs index b853d908e8..97af5bcdb7 100644 --- a/src/common/meta/src/cache/table/table_info.rs +++ b/src/common/meta/src/cache/table/table_info.rs @@ -61,11 +61,13 @@ fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], 
) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - if let CacheIdent::TableId(table_id) = ident { - cache.invalidate(table_id).await + for ident in idents { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_name.rs b/src/common/meta/src/cache/table/table_name.rs index 540da5e5f4..927a5b3480 100644 --- a/src/common/meta/src/cache/table/table_name.rs +++ b/src/common/meta/src/cache/table/table_name.rs @@ -71,11 +71,13 @@ fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer( cache: &'a Cache, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - if let CacheIdent::TableName(table_name) = ident { - cache.invalidate(table_name).await + for ident in idents { + if let CacheIdent::TableName(table_name) = ident { + cache.invalidate(table_name).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_route.rs b/src/common/meta/src/cache/table/table_route.rs index 47abdaa728..be820b0c52 100644 --- a/src/common/meta/src/cache/table/table_route.rs +++ b/src/common/meta/src/cache/table/table_route.rs @@ -19,6 +19,7 @@ use moka::future::Cache; use snafu::OptionExt; use store_api::storage::TableId; +use crate::cache::container::InitStrategy; use crate::cache::{CacheContainer, Initializer}; use crate::error; use crate::error::Result; @@ -65,7 +66,14 @@ pub fn new_table_route_cache( let table_info_manager = Arc::new(TableRouteManager::new(kv_backend)); let init = init_factory(table_info_manager); - CacheContainer::new(name, cache, Box::new(invalidator), init, filter) + CacheContainer::with_strategy( + name, + cache, + Box::new(invalidator), + init, + filter, + InitStrategy::VersionChecked, + ) } fn init_factory( @@ -92,11 +100,13 @@ fn init_factory( fn invalidator<'a>( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async 
move { - if let CacheIdent::TableId(table_id) = ident { - cache.invalidate(table_id).await + for ident in idents { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } } Ok(()) }) diff --git a/src/common/meta/src/cache/table/table_schema.rs b/src/common/meta/src/cache/table/table_schema.rs index 99ece65683..33b1773f45 100644 --- a/src/common/meta/src/cache/table/table_schema.rs +++ b/src/common/meta/src/cache/table/table_schema.rs @@ -65,7 +65,7 @@ fn init_factory(table_info_manager: TableInfoManager) -> Initializer( _cache: &'a Cache>, - _ident: &'a CacheIdent, + _idents: &'a [&CacheIdent], ) -> BoxFuture<'a, error::Result<()>> { Box::pin(std::future::ready(Ok(()))) } diff --git a/src/common/meta/src/cache/table/view_info.rs b/src/common/meta/src/cache/table/view_info.rs index 6a85493d42..d0e1058a7e 100644 --- a/src/common/meta/src/cache/table/view_info.rs +++ b/src/common/meta/src/cache/table/view_info.rs @@ -60,11 +60,13 @@ fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer( cache: &'a Cache>, - ident: &'a CacheIdent, + idents: &'a [&CacheIdent], ) -> BoxFuture<'a, Result<()>> { Box::pin(async move { - if let CacheIdent::TableId(view_id) = ident { - cache.invalidate(view_id).await + for ident in idents { + if let CacheIdent::TableId(view_id) = ident { + cache.invalidate(view_id).await + } } Ok(()) }) diff --git a/src/meta-srv/src/election.rs b/src/common/meta/src/election.rs similarity index 67% rename from src/meta-srv/src/election.rs rename to src/common/meta/src/election.rs index 2d2826b286..12173beda8 100644 --- a/src/meta-srv/src/election.rs +++ b/src/common/meta/src/election.rs @@ -21,15 +21,85 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use common_telemetry::{error, info, warn}; +use serde::{Deserialize, Serialize}; use tokio::sync::broadcast::error::RecvError; use tokio::sync::broadcast::{self, Receiver, Sender}; use crate::error::Result; -use crate::metasrv::MetasrvNodeInfo; 
-pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600; +pub const CANDIDATE_LEASE_SECS: u64 = 600; const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2; +/// The value of the leader. It is used to store the leader's address. +pub struct LeaderValue(pub String); + +impl> From for LeaderValue { + fn from(value: T) -> Self { + let string = String::from_utf8_lossy(value.as_ref()); + Self(string.to_string()) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetasrvNodeInfo { + // The metasrv's address + pub addr: String, + // The node build version + pub version: String, + // The node build git commit hash + pub git_commit: String, + // The node start timestamp in milliseconds + pub start_time_ms: u64, + // The node total cpu millicores + #[serde(default)] + pub total_cpu_millicores: i64, + // The node total memory bytes + #[serde(default)] + pub total_memory_bytes: i64, + /// The node build cpu usage millicores + #[serde(default)] + pub cpu_usage_millicores: i64, + /// The node build memory usage bytes + #[serde(default)] + pub memory_usage_bytes: i64, + // The node hostname + #[serde(default)] + pub hostname: String, +} + +// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto. +#[allow(deprecated)] +impl From for api::v1::meta::MetasrvNodeInfo { + fn from(node_info: MetasrvNodeInfo) -> Self { + Self { + peer: Some(api::v1::meta::Peer { + addr: node_info.addr, + ..Default::default() + }), + // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version. + // New code should use the fields in `info.NodeInfo` instead. 
+ version: node_info.version.clone(), + git_commit: node_info.git_commit.clone(), + start_time_ms: node_info.start_time_ms, + cpus: node_info.total_cpu_millicores as u32, + memory_bytes: node_info.total_memory_bytes as u64, + // The canonical location for node information. + info: Some(api::v1::meta::NodeInfo { + version: node_info.version, + git_commit: node_info.git_commit, + start_time_ms: node_info.start_time_ms, + total_cpu_millicores: node_info.total_cpu_millicores, + total_memory_bytes: node_info.total_memory_bytes, + cpu_usage_millicores: node_info.cpu_usage_millicores, + memory_usage_bytes: node_info.memory_usage_bytes, + cpus: node_info.total_cpu_millicores as u32, + memory_bytes: node_info.total_memory_bytes as u64, + hostname: node_info.hostname, + }), + } + } +} + /// Messages sent when the leader changes. #[derive(Debug, Clone)] pub enum LeaderChangeMessage { @@ -168,3 +238,5 @@ pub trait Election: Send + Sync { fn subscribe_leader_change(&self) -> Receiver; } + +pub type ElectionRef = Arc>; diff --git a/src/meta-srv/src/election/etcd.rs b/src/common/meta/src/election/etcd.rs similarity index 94% rename from src/meta-srv/src/election/etcd.rs rename to src/common/meta/src/election/etcd.rs index 883f723d74..affad31ef4 100644 --- a/src/meta-srv/src/election/etcd.rs +++ b/src/common/meta/src/election/etcd.rs @@ -16,8 +16,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS}; -use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use etcd_client::{ Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions, @@ -27,13 +25,15 @@ use tokio::sync::broadcast; use tokio::sync::broadcast::Receiver; use tokio::time::{MissedTickBehavior, timeout}; +use crate::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS}; use 
crate::election::{ - CANDIDATE_LEASE_SECS, Election, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, LeaderKey, - listen_leader_change, send_leader_change_and_set_flags, + CANDIDATE_LEASE_SECS, Election, ElectionRef, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, + LeaderKey, LeaderValue, MetasrvNodeInfo, listen_leader_change, + send_leader_change_and_set_flags, }; use crate::error; use crate::error::Result; -use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo}; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; impl LeaderKey for EtcdLeaderKey { fn name(&self) -> &[u8] { @@ -253,7 +253,7 @@ impl Election for EtcdElection { .leader(self.election_key()) .await .context(error::EtcdFailedSnafu)?; - let leader_value = res.kv().context(error::NoLeaderSnafu)?.value(); + let leader_value = res.kv().context(error::ElectionNoLeaderSnafu)?.value(); Ok(leader_value.into()) } } @@ -279,7 +279,7 @@ impl EtcdElection { ensure!( res.ttl() > 0, error::UnexpectedSnafu { - violated: "Failed to refresh the lease", + err_msg: "Failed to refresh the lease".to_string(), } ); diff --git a/src/meta-srv/src/election/rds.rs b/src/common/meta/src/election/rds.rs similarity index 96% rename from src/meta-srv/src/election/rds.rs rename to src/common/meta/src/election/rds.rs index 16e113415a..6ee529ee02 100644 --- a/src/meta-srv/src/election/rds.rs +++ b/src/common/meta/src/election/rds.rs @@ -36,7 +36,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> { .split(LEASE_SEP) .collect_tuple() .with_context(|| UnexpectedSnafu { - violated: format!( + err_msg: format!( "Invalid value {}, expect node info || {} || expire time", value, LEASE_SEP ), @@ -45,7 +45,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> { let expire_time = match Timestamp::from_str(expire_time, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", expire_time), + err_msg: format!("Invalid timestamp: {}", expire_time), } 
.fail()?, }; diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/common/meta/src/election/rds/mysql.rs similarity index 97% rename from src/meta-srv/src/election/rds/mysql.rs rename to src/common/meta/src/election/rds/mysql.rs index 20051a2610..80f3d8ca7c 100644 --- a/src/meta-srv/src/election/rds/mysql.rs +++ b/src/common/meta/src/election/rds/mysql.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use common_time::Timestamp; use snafu::{OptionExt, ResultExt, ensure}; @@ -29,14 +28,15 @@ use tokio::time::MissedTickBehavior; use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time}; use crate::election::{ - Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags, + Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change, + send_leader_change_and_set_flags, }; use crate::error::{ AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu, - LeaderLeaseChangedSnafu, LeaderLeaseExpiredSnafu, MySqlExecutionSnafu, NoLeaderSnafu, Result, - SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu, + ElectionLeaderLeaseChangedSnafu, ElectionLeaderLeaseExpiredSnafu, ElectionNoLeaderSnafu, + MySqlExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu, }; -use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo}; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; struct ElectionSqlFactory<'a> { table_name: &'a str, @@ -592,7 +592,7 @@ impl Election for MySqlElection { ensure!( lease.expire_time > lease.current, UnexpectedSnafu { - violated: format!( + err_msg: format!( "Candidate lease expired at {:?} (current time: {:?}), key: {:?}", lease.expire_time, lease.current, @@ -667,10 +667,10 @@ impl Election for MySqlElection { let client = 
self.client.lock().await; let mut executor = Executor::Default(client); if let Some(lease) = self.get_value_with_lease(&key, &mut executor).await? { - ensure!(lease.expire_time > lease.current, NoLeaderSnafu); + ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu); Ok(lease.leader_value.as_bytes().into()) } else { - NoLeaderSnafu.fail() + ElectionNoLeaderSnafu.fail() } } } @@ -705,7 +705,7 @@ impl MySqlElection { let current_time = match Timestamp::from_str(¤t_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -740,7 +740,7 @@ impl MySqlElection { current = match Timestamp::from_str(current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -777,7 +777,7 @@ impl MySqlElection { ensure!( res == 1, UnexpectedSnafu { - violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)), + err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)), } ); @@ -920,9 +920,12 @@ impl MySqlElection { /// will be released. /// - **Case 2**: If all checks pass, the function returns without performing any actions. 
fn lease_check(&self, lease: &Option) -> Result { - let lease = lease.as_ref().context(NoLeaderSnafu)?; + let lease = lease.as_ref().context(ElectionNoLeaderSnafu)?; // Case 1: Lease expired - ensure!(lease.expire_time > lease.current, LeaderLeaseExpiredSnafu); + ensure!( + lease.expire_time > lease.current, + ElectionLeaderLeaseExpiredSnafu + ); // Case 2: Everything is fine Ok(lease.clone()) } @@ -960,7 +963,7 @@ impl MySqlElection { let remote_lease = self.get_value_with_lease(&key, &mut executor).await?; ensure!( expected_lease.map(|lease| lease.origin) == remote_lease.map(|lease| lease.origin), - LeaderLeaseChangedSnafu + ElectionLeaderLeaseChangedSnafu ); self.delete_value(&key, &mut executor).await?; self.put_value_with_lease( @@ -987,12 +990,11 @@ mod tests { use std::assert_matches::assert_matches; use std::env; - use common_meta::maybe_skip_mysql_integration_test; use common_telemetry::init_default_ut_logging; + use sqlx::MySqlPool; use super::*; - use crate::error; - use crate::utils::mysql::create_mysql_pool; + use crate::{error, maybe_skip_mysql_integration_test}; async fn create_mysql_client( table_name: Option<&str>, @@ -1003,11 +1005,11 @@ mod tests { let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default(); if endpoint.is_empty() { return UnexpectedSnafu { - violated: "MySQL endpoint is empty".to_string(), + err_msg: "MySQL endpoint is empty".to_string(), } .fail(); } - let pool = create_mysql_pool(&[endpoint], None).await.unwrap(); + let pool = MySqlPool::connect(&endpoint).await.unwrap(); let mut client = ElectionMysqlClient::new( pool, execution_timeout, @@ -1302,7 +1304,7 @@ mod tests { let err = elected(&leader_mysql_election, table_name, Some(incorrect_lease)) .await .unwrap_err(); - assert_matches!(err, error::Error::LeaderLeaseChanged { .. }); + assert_matches!(err, error::Error::ElectionLeaderLeaseChanged { .. 
}); let lease = get_lease(&leader_mysql_election).await; assert!(lease.is_none()); drop_table(&leader_mysql_election.client, table_name).await; diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/common/meta/src/election/rds/postgres.rs similarity index 97% rename from src/meta-srv/src/election/rds/postgres.rs rename to src/common/meta/src/election/rds/postgres.rs index c21efd780b..01910335a0 100644 --- a/src/meta-srv/src/election/rds/postgres.rs +++ b/src/common/meta/src/election/rds/postgres.rs @@ -16,7 +16,6 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; -use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use common_time::Timestamp; use deadpool_postgres::{Manager, Pool}; @@ -28,13 +27,15 @@ use tokio_postgres::types::ToSql; use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time}; use crate::election::{ - Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags, + Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change, + send_leader_change_and_set_flags, }; use crate::error::{ - DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu, - Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu, + DeserializeFromJsonSnafu, ElectionNoLeaderSnafu, GetPostgresClientSnafu, + PostgresExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, + UnexpectedSnafu, }; -use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo}; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; struct ElectionSqlFactory<'a> { lock_id: u64, @@ -404,13 +405,13 @@ impl Election for PgElection { .get_value_with_lease(&key) .await? 
.context(UnexpectedSnafu { - violated: format!("Failed to get lease for key: {:?}", key), + err_msg: format!("Failed to get lease for key: {:?}", key), })?; ensure!( lease.expire_time > lease.current, UnexpectedSnafu { - violated: format!( + err_msg: format!( "Candidate lease expired at {:?} (current time {:?}), key: {:?}", lease.expire_time, lease.current, key ), @@ -464,11 +465,11 @@ impl Election for PgElection { .query(&self.sql_set.campaign, &[]) .await?; let row = res.first().context(UnexpectedSnafu { - violated: "Failed to get the result of acquiring advisory lock", + err_msg: "Failed to get the result of acquiring advisory lock".to_string(), })?; let is_leader = row.try_get(0).map_err(|_| { UnexpectedSnafu { - violated: "Failed to get the result of get lock", + err_msg: "Failed to get the result of get lock".to_string(), } .build() })?; @@ -500,10 +501,10 @@ impl Election for PgElection { } else { let key = self.election_key(); if let Some(lease) = self.get_value_with_lease(&key).await? 
{ - ensure!(lease.expire_time > lease.current, NoLeaderSnafu); + ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu); Ok(lease.leader_value.as_bytes().into()) } else { - NoLeaderSnafu.fail() + ElectionNoLeaderSnafu.fail() } } } @@ -537,7 +538,7 @@ impl PgElection { let current_time = match Timestamp::from_str(current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -576,7 +577,7 @@ impl PgElection { current = match Timestamp::from_str(current_time_str, None) { Ok(ts) => ts, Err(_) => UnexpectedSnafu { - violated: format!("Invalid timestamp: {}", current_time_str), + err_msg: format!("Invalid timestamp: {}", current_time_str), } .fail()?, }; @@ -613,7 +614,7 @@ impl PgElection { ensure!( res == 1, UnexpectedSnafu { - violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)), + err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)), } ); @@ -742,9 +743,9 @@ impl PgElection { let lease = self .get_value_with_lease(&key) .await? 
- .context(NoLeaderSnafu)?; + .context(ElectionNoLeaderSnafu)?; // Case 2 - ensure!(lease.expire_time > lease.current, NoLeaderSnafu); + ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu); // Case 3 Ok(()) } @@ -831,11 +832,11 @@ mod tests { use std::assert_matches::assert_matches; use std::env; - use common_meta::maybe_skip_postgres_integration_test; + use deadpool_postgres::{Config, Runtime}; + use tokio_postgres::NoTls; use super::*; - use crate::error; - use crate::utils::postgres::create_postgres_pool; + use crate::{error, maybe_skip_postgres_integration_test}; async fn create_postgres_client( table_name: Option<&str>, @@ -846,11 +847,13 @@ mod tests { let endpoint = env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default(); if endpoint.is_empty() { return UnexpectedSnafu { - violated: "Postgres endpoint is empty".to_string(), + err_msg: "Postgres endpoint is empty".to_string(), } .fail(); } - let pool = create_postgres_pool(&[endpoint], None, None).await.unwrap(); + let mut cfg = Config::new(); + cfg.url = Some(endpoint); + let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).unwrap(); let mut pg_client = ElectionPgClient::new( pool, execution_timeout, diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs index c6613af828..05b5af393b 100644 --- a/src/common/meta/src/error.rs +++ b/src/common/meta/src/error.rs @@ -338,6 +338,24 @@ pub enum Error { location: Location, }, + #[snafu(display("Metasrv election has no leader at this moment"))] + ElectionNoLeader { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Metasrv election leader lease expired"))] + ElectionLeaderLeaseExpired { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Metasrv election leader lease changed during election"))] + ElectionLeaderLeaseChanged { + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Table already exists, table: {}", table_name))] TableAlreadyExists { table_name: String, @@ -714,6 +732,16 @@ 
pub enum Error { #[snafu(display("Failed to get cache"))] GetCache { source: Arc }, + #[snafu(display( + "Failed to get latest cache value after {} attempts due to concurrent invalidation", + attempts + ))] + GetLatestCacheRetryExceeded { + attempts: usize, + #[snafu(implicit)] + location: Location, + }, + #[cfg(feature = "pg_kvbackend")] #[snafu(display("Failed to execute via Postgres, sql: {}", sql))] PostgresExecution { @@ -741,6 +769,15 @@ pub enum Error { location: Location, }, + #[cfg(feature = "pg_kvbackend")] + #[snafu(display("Failed to get Postgres client"))] + GetPostgresClient { + #[snafu(source)] + error: deadpool::managed::PoolError, + #[snafu(implicit)] + location: Location, + }, + #[cfg(feature = "pg_kvbackend")] #[snafu(display("Failed to {} Postgres transaction", operation))] PostgresTransaction { @@ -795,6 +832,24 @@ pub enum Error { location: Location, }, + #[cfg(feature = "mysql_kvbackend")] + #[snafu(display("Failed to decode sql value"))] + DecodeSqlValue { + #[snafu(source)] + error: sqlx::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[cfg(feature = "mysql_kvbackend")] + #[snafu(display("Failed to acquire mysql client from pool"))] + AcquireMySqlClient { + #[snafu(source)] + error: sqlx::Error, + #[snafu(implicit)] + location: Location, + }, + #[cfg(feature = "mysql_kvbackend")] #[snafu(display("Failed to {} MySql transaction", operation))] MySqlTransaction { @@ -812,6 +867,15 @@ pub enum Error { location: Location, }, + #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] + #[snafu(display("Sql execution timeout, sql: {}, duration: {:?}", sql, duration))] + SqlExecutionTimeout { + sql: String, + duration: std::time::Duration, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Datanode table info not found, table id: {}, datanode id: {}", table_id, @@ -1063,8 +1127,12 @@ impl ErrorExt for Error { | ConnectEtcd { .. } | MoveValues { .. } | GetCache { .. } + | GetLatestCacheRetryExceeded { .. 
} | SerializeToJson { .. } - | DeserializeFromJson { .. } => StatusCode::Internal, + | DeserializeFromJson { .. } + | ElectionNoLeader { .. } + | ElectionLeaderLeaseExpired { .. } + | ElectionLeaderLeaseChanged { .. } => StatusCode::Internal, NoLeader { .. } => StatusCode::TableUnavailable, ValueNotExist { .. } @@ -1187,15 +1255,18 @@ impl ErrorExt for Error { PostgresExecution { .. } | CreatePostgresPool { .. } | GetPostgresConnection { .. } + | GetPostgresClient { .. } | PostgresTransaction { .. } | PostgresTlsConfig { .. } | InvalidTlsConfig { .. } => StatusCode::Internal, #[cfg(feature = "mysql_kvbackend")] - MySqlExecution { .. } | CreateMySqlPool { .. } | MySqlTransaction { .. } => { - StatusCode::Internal - } + MySqlExecution { .. } + | CreateMySqlPool { .. } + | DecodeSqlValue { .. } + | AcquireMySqlClient { .. } + | MySqlTransaction { .. } => StatusCode::Internal, #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] - RdsTransactionRetryFailed { .. } => StatusCode::Internal, + RdsTransactionRetryFailed { .. } | SqlExecutionTimeout { .. } => StatusCode::Internal, DatanodeTableInfoNotFound { .. } => StatusCode::Internal, } } @@ -1243,7 +1314,10 @@ impl Error { /// Determine whether it is a retry later type through [StatusCode] pub fn is_retry_later(&self) -> bool { - matches!(self, Error::RetryLater { .. }) + matches!( + self, + Error::RetryLater { .. } | Error::GetLatestCacheRetryExceeded { .. } + ) } /// Determine whether it needs to clean poisons. 
diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs index 93cd229b16..36aae1026e 100644 --- a/src/common/meta/src/lib.rs +++ b/src/common/meta/src/lib.rs @@ -22,6 +22,7 @@ pub mod datanode; pub mod ddl; pub mod ddl_manager; pub mod distributed_time_constants; +pub mod election; pub mod error; pub mod flow_name; pub mod heartbeat; diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs index 454afb95b3..46dcef11d4 100644 --- a/src/common/procedure/src/local/runner.rs +++ b/src/common/procedure/src/local/runner.rs @@ -17,6 +17,8 @@ use std::sync::Arc; use std::time::Duration; use backon::{BackoffBuilder, ExponentialBuilder}; +use common_error::ext::PlainError; +use common_error::status_code::StatusCode; use common_event_recorder::EventRecorderRef; use common_telemetry::tracing_context::{FutureExt, TracingContext}; use common_telemetry::{debug, error, info, tracing}; @@ -90,6 +92,45 @@ impl Drop for ProcedureGuard { } } +/// Returns a list of conflicting lock keys between a parent and a child procedure. +/// Evaluates the Read/Write lock compatibility matrix: +/// - Share + Share => Compatible +/// - Exclusive + Any => Conflict +/// - Any + Exclusive => Conflict +fn find_lock_conflicts<'a>( + parent_keys: impl Iterator, + child_keys: impl Iterator, +) -> Vec { + use std::collections::HashMap; + + // Map from key string slice (&str) to a boolean indicating if the parent holds it EXCLUSIVELY. 
+ let mut parent_map = HashMap::new(); + for key in parent_keys { + match key { + StringKey::Exclusive(k) => { + parent_map.insert(k.as_str(), true); + } + StringKey::Share(k) => { + parent_map.entry(k.as_str()).or_insert(false); + } + } + } + + child_keys + .filter_map(|child_key| match child_key { + StringKey::Exclusive(k) | StringKey::Share(k) + if parent_map.get(k.as_str()) == Some(&true) => + { + Some(k.clone()) + } + StringKey::Exclusive(k) if parent_map.get(k.as_str()) == Some(&false) => { + Some(k.clone()) + } + _ => None, + }) + .collect() +} + pub(crate) struct Runner { pub(crate) meta: ProcedureMetaRef, pub(crate) procedure: BoxedProcedure, @@ -512,6 +553,41 @@ impl Runner { async fn on_suspended(&mut self, subprocedures: Vec) { let has_child = !subprocedures.is_empty(); + + // Pre-check: detect potential deadlocks BEFORE submitting any subprocedure. + // If a child shares conflicting lock keys with the parent, submitting it would + // cause a Hold-and-Wait deadlock — the child blocks on lock acquisition while + // the parent holds the lock and waits for the child to finish. + for sub in &subprocedures { + let conflicting = find_lock_conflicts( + self.meta.lock_key.keys_to_lock(), + sub.procedure.lock_key().keys_to_lock(), + ); + if !conflicting.is_empty() { + let err_msg = format!( + "Deadlock prevented: subprocedure {}-{} shares conflicting lock key(s) {:?} \ + with parent {}-{}. 
Parent holds these locks and would wait for child \ + completion, but child cannot acquire them.", + sub.procedure.type_name(), + sub.id, + conflicting, + self.procedure.type_name(), + self.meta.id, + ); + error!("{}", err_msg); + let err = Arc::new(Error::external(PlainError::new( + err_msg, + StatusCode::Internal, + ))); + if self.procedure.rollback_supported() { + self.meta.set_state(ProcedureState::prepare_rollback(err)); + } else { + self.meta.set_state(ProcedureState::failed(err)); + } + return; + } + } + for subprocedure in subprocedures { info!( "Procedure {}-{} submit subprocedure {}-{}", @@ -1939,4 +2015,169 @@ mod tests { join_all(tasks).await; assert_eq!(shared_atomic_value.load(Ordering::Relaxed), 2); } + #[tokio::test] + async fn test_on_suspend_deadlock_detected_no_rollback() { + // Parent holds Exclusive("catalog.schema.table"), child also requests Exclusive("catalog.schema.table"). + // Since parent does NOT support rollback, state should become Failed. + let child_id = ProcedureId::random(); + let exec_fn = move |_| { + async move { + let child_exec_fn = |_| async { Ok(Status::done()) }.boxed(); + let child = ProcedureAdapter { + data: "child".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table"), + poison_keys: PoisonKeys::default(), + exec_fn: child_exec_fn, + rollback_fn: None, + }; + Ok(Status::Suspended { + subprocedures: vec![ProcedureWithId { + id: child_id, + procedure: Box::new(child), + }], + persist: false, + }) + } + .boxed() + }; + let parent = ProcedureAdapter { + data: "parent".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table"), + poison_keys: PoisonKeys::default(), + exec_fn, + rollback_fn: None, // No rollback support + }; + + let dir = create_temp_dir("deadlock_no_rollback"); + let meta = parent.new_meta(ROOT_ID); + let ctx = context_without_provider(meta.id); + let object_store = test_util::new_object_store(&dir); + let procedure_store = 
Arc::new(ProcedureStore::from_object_store(object_store.clone())); + let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store); + runner.manager_ctx.start(); + + runner.execute_once(&ctx).await; + let state = runner.meta.state(); + assert!(state.is_failed(), "Expected Failed, got {state:?}"); + // Verify the error exists + assert!( + state.error().is_some(), + "Failed state should contain an error" + ); + // Child should NOT have been submitted + assert!( + !runner.manager_ctx.contains_procedure(child_id), + "Child procedure should not be submitted when deadlock is detected" + ); + } + + #[tokio::test] + async fn test_on_suspend_deadlock_detected_with_rollback() { + // Parent holds Exclusive("catalog.schema.table"), child also requests Exclusive("catalog.schema.table"). + // Since parent DOES support rollback, state should become PrepareRollback. + let child_id = ProcedureId::random(); + let exec_fn = move |_| { + async move { + let child_exec_fn = |_| async { Ok(Status::done()) }.boxed(); + let child = ProcedureAdapter { + data: "child".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table"), + poison_keys: PoisonKeys::default(), + exec_fn: child_exec_fn, + rollback_fn: None, + }; + Ok(Status::Suspended { + subprocedures: vec![ProcedureWithId { + id: child_id, + procedure: Box::new(child), + }], + persist: false, + }) + } + .boxed() + }; + let rollback_fn = move |_| async move { Ok(()) }.boxed(); + let parent = ProcedureAdapter { + data: "parent".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table"), + poison_keys: PoisonKeys::default(), + exec_fn, + rollback_fn: Some(Box::new(rollback_fn)), // Supports rollback + }; + + let dir = create_temp_dir("deadlock_with_rollback"); + let meta = parent.new_meta(ROOT_ID); + let ctx = context_without_provider(meta.id); + let object_store = test_util::new_object_store(&dir); + let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone())); + 
let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store); + runner.manager_ctx.start(); + + runner.execute_once(&ctx).await; + let state = runner.meta.state(); + assert!( + state.is_prepare_rollback(), + "Expected PrepareRollback, got {state:?}" + ); + // Verify the error exists in PrepareRollback variant + match &state { + ProcedureState::PrepareRollback { error } => { + assert!(!error.to_string().is_empty(), "Error should not be empty"); + } + _ => panic!("Expected PrepareRollback, got {state:?}"), + } + // Child should NOT have been submitted + assert!( + !runner.manager_ctx.contains_procedure(child_id), + "Child procedure should not be submitted when deadlock is detected" + ); + } + + #[test] + fn test_find_lock_conflicts() { + use crate::procedure::StringKey; + + // 1. Share + Share = No conflict (Compatible) + let parent = [StringKey::Share("A".to_string())]; + let child = [StringKey::Share("A".to_string())]; + assert!(super::find_lock_conflicts(parent.iter(), child.iter()).is_empty()); + + // 2. Share + Exclusive = Conflict + let parent = [StringKey::Share("A".to_string())]; + let child = [StringKey::Exclusive("A".to_string())]; + assert_eq!( + super::find_lock_conflicts(parent.iter(), child.iter()), + vec!["A".to_string()] + ); + + // 3. Exclusive + Share = Conflict + let parent = [StringKey::Exclusive("A".to_string())]; + let child = [StringKey::Share("A".to_string())]; + assert_eq!( + super::find_lock_conflicts(parent.iter(), child.iter()), + vec!["A".to_string()] + ); + + // 4. Exclusive + Exclusive = Conflict + let parent = [StringKey::Exclusive("A".to_string())]; + let child = [StringKey::Exclusive("A".to_string())]; + assert_eq!( + super::find_lock_conflicts(parent.iter(), child.iter()), + vec!["A".to_string()] + ); + + // 5. 
Multiple keys, partial overlap + let parent = [ + StringKey::Share("A".to_string()), + StringKey::Exclusive("B".to_string()), + ]; + let child = [ + StringKey::Exclusive("A".to_string()), // Conflict with Share("A") + StringKey::Share("B".to_string()), // Conflict with Exclusive("B") + StringKey::Exclusive("C".to_string()), // No conflict, parent doesn't hold C + ]; + let mut conflicts = super::find_lock_conflicts(parent.iter(), child.iter()); + conflicts.sort(); + assert_eq!(conflicts, vec!["A".to_string(), "B".to_string()]); + } } diff --git a/src/common/query/src/prelude.rs b/src/common/query/src/prelude.rs index c27b94294e..50668bbbb1 100644 --- a/src/common/query/src/prelude.rs +++ b/src/common/query/src/prelude.rs @@ -27,7 +27,16 @@ static GREPTIME_TIMESTAMP_CELL: OnceCell = OnceCell::new(); static GREPTIME_VALUE_CELL: OnceCell = OnceCell::new(); pub fn set_default_prefix(prefix: Option<&str>) -> Result<()> { - match prefix { + // Strip surrounding double quotes as a defensive measure against upstream + // sources (scripts, CI, template engines, incorrect shell escaping) that may + // pass literal `""` as the value instead of an empty string. + let stripped = prefix.map(|s| { + s.strip_prefix('"') + .and_then(|s| s.strip_suffix('"')) + .unwrap_or(s) + }); + + match stripped { None => { // use default greptime prefix GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string()); @@ -70,3 +79,45 @@ const GREPTIME_VALUE: &str = "greptime_value"; pub const GREPTIME_COUNT: &str = "greptime_count"; /// Default physical table name pub const GREPTIME_PHYSICAL_TABLE: &str = "greptime_physical_table"; + +#[cfg(test)] +mod tests { + use super::*; + + // Each test runs in a separate process via `cargo nextest`, so OnceCell + // state does not leak between tests. 
+ + #[test] + fn test_set_default_prefix_none() { + set_default_prefix(None).unwrap(); + assert_eq!(greptime_timestamp(), "greptime_timestamp"); + assert_eq!(greptime_value(), "greptime_value"); + } + + #[test] + fn test_set_default_prefix_empty_string() { + set_default_prefix(Some("")).unwrap(); + assert_eq!(greptime_timestamp(), "timestamp"); + assert_eq!(greptime_value(), "value"); + } + + #[test] + fn test_set_default_prefix_quoted_empty() { + // Handles upstream sources that pass literal `""` instead of an empty string + set_default_prefix(Some("\"\"")).unwrap(); + assert_eq!(greptime_timestamp(), "timestamp"); + assert_eq!(greptime_value(), "value"); + } + + #[test] + fn test_set_default_prefix_custom() { + set_default_prefix(Some("mydb")).unwrap(); + assert_eq!(greptime_timestamp(), "mydb_timestamp"); + assert_eq!(greptime_value(), "mydb_value"); + } + + #[test] + fn test_set_default_prefix_invalid() { + assert!(set_default_prefix(Some("invalid prefix!")).is_err()); + } +} diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index 9070e2babe..50f2dba270 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -16,8 +16,8 @@ mod column_schema; pub mod constraint; use std::collections::HashMap; -use std::fmt; use std::sync::Arc; +use std::{fmt, mem}; use arrow::datatypes::{Field, Schema as ArrowSchema}; use datafusion_common::DFSchemaRef; @@ -177,6 +177,26 @@ impl Schema { &self.arrow_schema.metadata } + /// Returns the estimated memory footprint of this schema. 
+ pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) + + mem::size_of::() * self.column_schemas.capacity() + + self + .column_schemas + .iter() + .map(|column_schema| { + column_schema.estimated_size() - mem::size_of::() + }) + .sum::() + + mem::size_of::<(String, usize)>() * self.name_to_index.capacity() + + self + .name_to_index + .keys() + .map(|name| name.capacity()) + .sum::() + + arrow_schema_size(self.arrow_schema.as_ref()) + } + /// Generate a new projected schema /// /// # Panic @@ -213,6 +233,17 @@ impl Schema { } } +fn arrow_schema_size(schema: &ArrowSchema) -> usize { + mem::size_of_val(schema) + + schema.fields.size() + + mem::size_of::<(String, String)>() * schema.metadata.capacity() + + schema + .metadata + .iter() + .map(|(key, value)| key.capacity() + value.capacity()) + .sum::() +} + #[derive(Default)] pub struct SchemaBuilder { column_schemas: Vec, diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs index 183cf05da8..2479f4fc41 100644 --- a/src/datatypes/src/schema/column_schema.rs +++ b/src/datatypes/src/schema/column_schema.rs @@ -13,8 +13,8 @@ // limitations under the License. use std::collections::HashMap; -use std::fmt; use std::str::FromStr; +use std::{fmt, mem}; use arrow::datatypes::Field; use arrow_schema::extension::{ @@ -178,6 +178,19 @@ impl ColumnSchema { self } + /// Returns the estimated memory footprint of this schema. + pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) - mem::size_of_val(&self.data_type) + + self.data_type.as_arrow_type().size() + + self.name.capacity() + + self + .default_constraint + .as_ref() + .map(column_default_constraint_size) + .unwrap_or_default() + + metadata_size(&self.metadata) + } + /// Set the inverted index for the column. /// Similar to [with_inverted_index] but don't take the ownership. 
/// @@ -493,6 +506,21 @@ impl ColumnSchema { } } +fn metadata_size(metadata: &Metadata) -> usize { + mem::size_of::<(String, String)>() * metadata.capacity() + + metadata + .iter() + .map(|(key, value)| key.capacity() + value.capacity()) + .sum::() +} + +fn column_default_constraint_size(default_constraint: &ColumnDefaultConstraint) -> usize { + match default_constraint { + ColumnDefaultConstraint::Function(expr) => expr.capacity(), + ColumnDefaultConstraint::Value(value) => value.as_value_ref().data_size(), + } +} + /// Column extended type set in column schema's metadata. #[derive(Debug, Clone, PartialEq, Eq)] pub enum ColumnExtType { diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 61586fc460..912bbfca54 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -396,7 +396,7 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { match jsonb::from_slice(val) { Ok(jsonb_value) => { let serialized = jsonb_value.to_string(); - Ok(serialized) + fix_unicode_point(&serialized) } Err(e) => InvalidJsonbSnafu { error: e }.fail(), } @@ -405,18 +405,12 @@ pub fn jsonb_to_string(val: &[u8]) -> Result { /// Converts a json type value to serde_json::Value pub fn jsonb_to_serde_json(val: &[u8]) -> Result { let json_string = jsonb_to_string(val)?; - jsonb_string_to_serde_value(&json_string) + serde_json::Value::from_str(&json_string).context(DeserializeSnafu { json: json_string }) } -/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort -/// fallback for Rust-style Unicode escape sequences. +/// Normalizes a JSON string by converting Rust-style Unicode escape sequences to JSON-compatible format. /// -/// This function is intended to be used on JSON strings produced from the internal -/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls -/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is -/// returned as-is. 
-/// -/// If the initial parse fails, the input is scanned for Rust-style Unicode code +/// The input is scanned for Rust-style Unicode code /// point escapes of the form `\\u{H...}` (a backslash, `u`, an opening brace, /// followed by 1–6 hexadecimal digits, and a closing brace). Each such escape is /// converted into JSON-compatible UTF‑16 escape sequences: @@ -427,59 +421,44 @@ pub fn jsonb_to_serde_json(val: &[u8]) -> Result { /// the code point is encoded as a UTF‑16 surrogate pair and emitted as two consecutive /// `\\uXXXX` sequences (as JSON format required). /// -/// After this normalization, the function retries parsing the resulting string as -/// JSON and returns the deserialized value or a `DeserializeSnafu` error if it -/// still cannot be parsed. -fn jsonb_string_to_serde_value(json: &str) -> Result { - match serde_json::Value::from_str(json) { - Ok(v) => Ok(v), - Err(e) => { - // If above deserialization is failed, the JSON string might contain some Rust chars - // that are somehow incorrectly represented as Unicode code point literal. For example, - // "\u{fe0f}". We have to convert them to JSON compatible format, like "\uFE0F", then - // try to deserialize the JSON string again. - if !e.is_syntax() || !e.to_string().contains("invalid escape") { - return Err(e).context(DeserializeSnafu { json }); - } +/// After this normalization, the function returns the normalized string +fn fix_unicode_point(json: &str) -> Result { + static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { + // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits + // inside braces. + Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) + }); - static UNICODE_CODE_POINT_PATTERN: LazyLock = LazyLock::new(|| { - // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits - // inside braces. 
- Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e)) - }); + let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { + // Extract the hex payload (without braces) and parse to a code point. + let hex = &caps[1]; + let Ok(code) = u32::from_str_radix(hex, 16) else { + // On parse failure, leave the original escape sequence unchanged. + return caps[0].to_string(); + }; - let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| { - // Extract the hex payload (without braces) and parse to a code point. - let hex = &caps[1]; - let Ok(code) = u32::from_str_radix(hex, 16) else { - // On parse failure, leave the original escape sequence unchanged. - return caps[0].to_string(); - }; + if code <= 0xFFFF { + // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. + format!("\\u{:04X}", code) + } else if code > 0x10FFFF { + // Beyond max Unicode code point + caps[0].to_string() + } else { + // Supplementary planes: JSON needs UTF-16 surrogate pairs. + // Convert the code point to a 20-bit value. + let code = code - 0x10000; - if code <= 0xFFFF { - // Basic Multilingual Plane: JSON can represent this directly as \uXXXX. - format!("\\u{:04X}", code) - } else if code > 0x10FFFF { - // Beyond max Unicode code point - caps[0].to_string() - } else { - // Supplementary planes: JSON needs UTF-16 surrogate pairs. - // Convert the code point to a 20-bit value. - let code = code - 0x10000; + // High surrogate: top 10 bits, offset by 0xD800. + let high = 0xD800 + ((code >> 10) & 0x3FF); - // High surrogate: top 10 bits, offset by 0xD800. - let high = 0xD800 + ((code >> 10) & 0x3FF); + // Low surrogate: bottom 10 bits, offset by 0xDC00. + let low = 0xDC00 + (code & 0x3FF); - // Low surrogate: bottom 10 bits, offset by 0xDC00. - let low = 0xDC00 + (code & 0x3FF); - - // Emit two \uXXXX escapes in sequence. 
- format!("\\u{:04X}\\u{:04X}", high, low) - } - }); - serde_json::Value::from_str(&v).context(DeserializeSnafu { json }) + // Emit two \uXXXX escapes in sequence. + format!("\\u{:04X}\\u{:04X}", high, low) } - } + }); + Ok(v.to_string()) } /// Parses a string to a json type value @@ -495,45 +474,54 @@ mod tests { use crate::json::JsonStructureSettings; #[test] - fn test_jsonb_string_to_serde_value() -> Result<()> { + fn test_fix_unicode_point() -> Result<()> { let valid_cases = vec![ - (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#), + (r#"{"data": "simple ascii"}"#, r#"{"data": "simple ascii"}"#), ( - r#"{"data": "Greek sigma: \u{03a3}"}"#, - r#"{"data":"Greek sigma: Σ"}"#, + r#"{"data":"Greek sigma: \u{03a3}"}"#, + r#"{"data":"Greek sigma: \u03A3"}"#, ), ( - r#"{"data": "Joker card: \u{1f0df}"}"#, - r#"{"data":"Joker card: 🃟"}"#, + r#"{"data":"Joker card: \u{1f0df}"}"#, + r#"{"data":"Joker card: \uD83C\uDCDF"}"#, ), ( - r#"{"data": "BMP boundary: \u{ffff}"}"#, - r#"{"data":"BMP boundary: ￿"}"#, + r#"{"data":"BMP boundary: \u{ffff}"}"#, + r#"{"data":"BMP boundary: \uFFFF"}"#, ), ( - r#"{"data": "Supplementary min: \u{10000}"}"#, - r#"{"data":"Supplementary min: 𐀀"}"#, + r#"{"data":"Supplementary min: \u{10000}"}"#, + r#"{"data":"Supplementary min: \uD800\uDC00"}"#, ), ( - r#"{"data": "Supplementary max: \u{10ffff}"}"#, - r#"{"data":"Supplementary max: 􏿿"}"#, + r#"{"data":"Supplementary max: \u{10ffff}"}"#, + r#"{"data":"Supplementary max: \uDBFF\uDFFF"}"#, ), ]; for (input, expect) in valid_cases { - let v = jsonb_string_to_serde_value(input)?; - assert_eq!(v.to_string(), expect); + let v = fix_unicode_point(input)?; + assert_eq!(v, expect); } - let invalid_cases = vec![ - r#"{"data": "Invalid hex: \u{gggg}"}"#, - r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#, - r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit - r#"{"data": "Empty braces: \u{}"}"#, + let invalid_escape_cases = vec![ + ( + r#"{"data": "Invalid hex: \u{gggg}"}"#, + 
r#"{"data": "Invalid hex: \u{gggg}"}"#, + ), + ( + r#"{"data": "Empty braces: \u{}"}"#, + r#"{"data": "Empty braces: \u{}"}"#, + ), + ( + r#"{"data": "Out of range: \u{1100000}"}"#, + r#"{"data": "Out of range: \u{1100000}"}"#, + ), ]; - for input in invalid_cases { - let result = jsonb_string_to_serde_value(input); - assert!(result.is_err()); + for (input, expect) in invalid_escape_cases { + let v = fix_unicode_point(input)?; + assert_eq!(v, expect); } + Ok(()) } diff --git a/src/flow/src/df_optimizer.rs b/src/flow/src/df_optimizer.rs index 1d41d09346..614b79ccf1 100644 --- a/src/flow/src/df_optimizer.rs +++ b/src/flow/src/df_optimizer.rs @@ -16,30 +16,19 @@ #![warn(unused)] -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::sync::Arc; use common_error::ext::BoxedError; use common_telemetry::debug; use datafusion::config::ConfigOptions; use datafusion::error::DataFusionError; -use datafusion::functions_aggregate::count::count_udaf; -use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::optimizer::analyzer::type_coercion::TypeCoercion; use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate; use datafusion::optimizer::optimize_projections::OptimizeProjections; use datafusion::optimizer::simplify_expressions::SimplifyExpressions; -use datafusion::optimizer::utils::NamePreserver; use datafusion::optimizer::{Analyzer, AnalyzerRule, Optimizer, OptimizerContext}; -use datafusion_common::tree_node::{ - Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, -}; -use datafusion_common::{Column, DFSchema, ScalarValue}; -use datafusion_expr::utils::merge_schema; -use datafusion_expr::{ - BinaryExpr, ColumnarValue, Expr, Literal, Operator, Projection, ScalarFunctionArgs, - ScalarUDFImpl, Signature, TypeSignature, Volatility, -}; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use query::QueryEngine; use 
query::optimizer::count_wildcard::CountWildcardToTimeIndexRule; use query::parser::QueryLanguageParser; @@ -52,7 +41,6 @@ use substrait::DFLogicalSubstraitConvertor; use crate::adapter::FlownodeContext; use crate::error::{DatafusionSnafu, Error, ExternalSnafu, UnexpectedSnafu}; -use crate::expr::{TUMBLE_END, TUMBLE_START}; use crate::plan::TypedPlan; // TODO(discord9): use `Analyzer` to manage rules if more `AnalyzerRule` is needed @@ -63,8 +51,6 @@ pub async fn apply_df_optimizer( let cfg = query_ctx.create_config_options(); let analyzer = Analyzer::with_rules(vec![ Arc::new(CountWildcardToTimeIndexRule), - Arc::new(AvgExpandRule), - Arc::new(TumbleExpandRule), Arc::new(CheckGroupByRule::new()), Arc::new(TypeCoercion::new()), ]); @@ -127,390 +113,6 @@ pub async fn sql_to_flow_plan( Ok(flow_plan) } -#[derive(Debug)] -struct AvgExpandRule; - -impl AnalyzerRule for AvgExpandRule { - fn analyze( - &self, - plan: datafusion_expr::LogicalPlan, - _config: &ConfigOptions, - ) -> datafusion_common::Result { - let transformed = plan - .transform_up_with_subqueries(expand_avg_analyzer)? - .data - .transform_down_with_subqueries(put_aggr_to_proj_analyzer)? - .data; - Ok(transformed) - } - - fn name(&self) -> &str { - "avg_expand" - } -} - -/// lift aggr's composite aggr_expr to outer proj, and leave aggr only with simple direct aggr expr -/// i.e. 
-/// ```ignore -/// proj: avg(x) -/// -- aggr: [sum(x)/count(x) as avg(x)] -/// ``` -/// becomes: -/// ```ignore -/// proj: sum(x)/count(x) as avg(x) -/// -- aggr: [sum(x), count(x)] -/// ``` -fn put_aggr_to_proj_analyzer( - plan: datafusion_expr::LogicalPlan, -) -> Result, DataFusionError> { - if let datafusion_expr::LogicalPlan::Projection(proj) = &plan - && let datafusion_expr::LogicalPlan::Aggregate(aggr) = proj.input.as_ref() - { - let mut replace_old_proj_exprs = HashMap::new(); - let mut expanded_aggr_exprs = vec![]; - for aggr_expr in &aggr.aggr_expr { - let mut is_composite = false; - if let Expr::AggregateFunction(_) = &aggr_expr { - expanded_aggr_exprs.push(aggr_expr.clone()); - } else { - let old_name = aggr_expr.name_for_alias()?; - let new_proj_expr = aggr_expr - .clone() - .transform(|ch| { - if let Expr::AggregateFunction(_) = &ch { - is_composite = true; - expanded_aggr_exprs.push(ch.clone()); - Ok(Transformed::yes(Expr::Column(Column::from_qualified_name( - ch.name_for_alias()?, - )))) - } else { - Ok(Transformed::no(ch)) - } - })? - .data; - replace_old_proj_exprs.insert(old_name, new_proj_expr); - } - } - - if expanded_aggr_exprs.len() > aggr.aggr_expr.len() { - let mut aggr = aggr.clone(); - aggr.aggr_expr = expanded_aggr_exprs; - let mut aggr_plan = datafusion_expr::LogicalPlan::Aggregate(aggr); - // important to recompute schema after changing aggr_expr - aggr_plan = aggr_plan.recompute_schema()?; - - // reconstruct proj with new proj_exprs - let mut new_proj_exprs = proj.expr.clone(); - for proj_expr in new_proj_exprs.iter_mut() { - if let Some(new_proj_expr) = - replace_old_proj_exprs.get(&proj_expr.name_for_alias()?) - { - *proj_expr = new_proj_expr.clone(); - } - *proj_expr = proj_expr - .clone() - .transform(|expr| { - if let Some(new_expr) = replace_old_proj_exprs.get(&expr.name_for_alias()?) - { - Ok(Transformed::yes(new_expr.clone())) - } else { - Ok(Transformed::no(expr)) - } - })? 
- .data; - } - let proj = datafusion_expr::LogicalPlan::Projection(Projection::try_new( - new_proj_exprs, - Arc::new(aggr_plan), - )?); - return Ok(Transformed::yes(proj)); - } - } - Ok(Transformed::no(plan)) -} - -/// expand `avg()` function into `cast(sum(() AS f64)/count(()` -fn expand_avg_analyzer( - plan: datafusion_expr::LogicalPlan, -) -> Result, DataFusionError> { - let mut schema = merge_schema(&plan.inputs()); - - if let datafusion_expr::LogicalPlan::TableScan(ts) = &plan { - let source_schema = - DFSchema::try_from_qualified_schema(ts.table_name.clone(), &ts.source.schema())?; - schema.merge(&source_schema); - } - - let mut expr_rewrite = ExpandAvgRewriter::new(&schema); - - let name_preserver = NamePreserver::new(&plan); - // apply coercion rewrite all expressions in the plan individually - plan.map_expressions(|expr| { - let original_name = name_preserver.save(&expr); - Ok(expr - .rewrite(&mut expr_rewrite)? - .update_data(|expr| original_name.restore(expr))) - })? - .map_data(|plan| plan.recompute_schema()) -} - -/// rewrite `avg()` function into `CASE WHEN count() !=0 THEN cast(sum(() AS avg_return_type)/count(() ELSE 0` -/// -/// TODO(discord9): support avg return type decimal128 -/// -/// see impl details at https://github.com/apache/datafusion/blob/4ad4f90d86c57226a4e0fb1f79dfaaf0d404c273/datafusion/expr/src/type_coercion/aggregates.rs#L457-L462 -pub(crate) struct ExpandAvgRewriter<'a> { - /// schema of the plan - #[allow(unused)] - pub(crate) schema: &'a DFSchema, -} - -impl<'a> ExpandAvgRewriter<'a> { - fn new(schema: &'a DFSchema) -> Self { - Self { schema } - } -} - -impl TreeNodeRewriter for ExpandAvgRewriter<'_> { - type Node = Expr; - - fn f_up(&mut self, expr: Expr) -> Result, DataFusionError> { - if let Expr::AggregateFunction(aggr_func) = &expr - && aggr_func.func.name() == "avg" - { - let sum_expr = { - let mut tmp = aggr_func.clone(); - tmp.func = sum_udaf(); - Expr::AggregateFunction(tmp) - }; - let sum_cast = { - let mut tmp = 
sum_expr.clone(); - tmp = Expr::Cast(datafusion_expr::Cast { - expr: Box::new(tmp), - data_type: arrow_schema::DataType::Float64, - }); - tmp - }; - - let count_expr = { - let mut tmp = aggr_func.clone(); - tmp.func = count_udaf(); - - Expr::AggregateFunction(tmp) - }; - let count_expr_ref = - Expr::Column(Column::from_qualified_name(count_expr.name_for_alias()?)); - - let div = BinaryExpr::new(Box::new(sum_cast), Operator::Divide, Box::new(count_expr)); - let div_expr = Box::new(Expr::BinaryExpr(div)); - - let zero = Box::new(0.lit()); - let not_zero = BinaryExpr::new(Box::new(count_expr_ref), Operator::NotEq, zero.clone()); - let not_zero = Box::new(Expr::BinaryExpr(not_zero)); - let null = Box::new(Expr::Literal(ScalarValue::Null, None)); - - let case_when = - datafusion_expr::Case::new(None, vec![(not_zero, div_expr)], Some(null)); - let case_when_expr = Expr::Case(case_when); - - return Ok(Transformed::yes(case_when_expr)); - } - - Ok(Transformed::no(expr)) - } -} - -/// expand tumble in aggr expr to tumble_start and tumble_end with column name like `window_start` -#[derive(Debug)] -struct TumbleExpandRule; - -impl AnalyzerRule for TumbleExpandRule { - fn analyze( - &self, - plan: datafusion_expr::LogicalPlan, - _config: &ConfigOptions, - ) -> datafusion_common::Result { - let transformed = plan - .transform_up_with_subqueries(expand_tumble_analyzer)? 
- .data; - Ok(transformed) - } - - fn name(&self) -> &str { - "tumble_expand" - } -} - -/// expand `tumble` in aggr expr to `tumble_start` and `tumble_end`, also expand related alias and column ref -/// -/// will add `tumble_start` and `tumble_end` to outer projection if not exist before -fn expand_tumble_analyzer( - plan: datafusion_expr::LogicalPlan, -) -> Result, DataFusionError> { - if let datafusion_expr::LogicalPlan::Projection(proj) = &plan - && let datafusion_expr::LogicalPlan::Aggregate(aggr) = proj.input.as_ref() - { - let mut new_group_expr = vec![]; - let mut alias_to_expand = HashMap::new(); - let mut encountered_tumble = false; - for expr in aggr.group_expr.iter() { - match expr { - datafusion_expr::Expr::ScalarFunction(func) if func.name() == "tumble" => { - encountered_tumble = true; - - let tumble_start = TumbleExpand::new(TUMBLE_START); - let tumble_start = datafusion_expr::expr::ScalarFunction::new_udf( - Arc::new(tumble_start.into()), - func.args.clone(), - ); - let tumble_start = datafusion_expr::Expr::ScalarFunction(tumble_start); - let start_col_name = tumble_start.name_for_alias()?; - new_group_expr.push(tumble_start); - - let tumble_end = TumbleExpand::new(TUMBLE_END); - let tumble_end = datafusion_expr::expr::ScalarFunction::new_udf( - Arc::new(tumble_end.into()), - func.args.clone(), - ); - let tumble_end = datafusion_expr::Expr::ScalarFunction(tumble_end); - let end_col_name = tumble_end.name_for_alias()?; - new_group_expr.push(tumble_end); - - alias_to_expand.insert(expr.name_for_alias()?, (start_col_name, end_col_name)); - } - _ => new_group_expr.push(expr.clone()), - } - } - if !encountered_tumble { - return Ok(Transformed::no(plan)); - } - let mut new_aggr = aggr.clone(); - new_aggr.group_expr = new_group_expr; - let new_aggr = datafusion_expr::LogicalPlan::Aggregate(new_aggr).recompute_schema()?; - // replace alias in projection if needed, and add new column ref if necessary - let mut new_proj_expr = vec![]; - let mut have_expanded 
= false; - - for proj_expr in proj.expr.iter() { - if let Some((start_col_name, end_col_name)) = - alias_to_expand.get(&proj_expr.name_for_alias()?) - { - let start_col = Column::from_qualified_name(start_col_name); - let end_col = Column::from_qualified_name(end_col_name); - new_proj_expr.push(datafusion_expr::Expr::Column(start_col)); - new_proj_expr.push(datafusion_expr::Expr::Column(end_col)); - have_expanded = true; - } else { - new_proj_expr.push(proj_expr.clone()); - } - } - - // append to end of projection if not exist - if !have_expanded { - for (start_col_name, end_col_name) in alias_to_expand.values() { - let start_col = Column::from_qualified_name(start_col_name); - let end_col = Column::from_qualified_name(end_col_name); - new_proj_expr.push(datafusion_expr::Expr::Column(start_col).alias("window_start")); - new_proj_expr.push(datafusion_expr::Expr::Column(end_col).alias("window_end")); - } - } - - let new_proj = datafusion_expr::LogicalPlan::Projection(Projection::try_new( - new_proj_expr, - Arc::new(new_aggr), - )?); - return Ok(Transformed::yes(new_proj)); - } - - Ok(Transformed::no(plan)) -} - -/// This is a placeholder for tumble_start and tumble_end function, so that datafusion can -/// recognize them as scalar function -#[derive(Debug, PartialEq, Eq, Hash)] -pub struct TumbleExpand { - signature: Signature, - name: String, -} - -impl TumbleExpand { - pub fn new(name: &str) -> Self { - Self { - signature: Signature::new(TypeSignature::UserDefined, Volatility::Immutable), - name: name.to_string(), - } - } -} - -impl ScalarUDFImpl for TumbleExpand { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn name(&self) -> &str { - &self.name - } - - /// elide the signature for now - fn signature(&self) -> &Signature { - &self.signature - } - - fn coerce_types( - &self, - arg_types: &[arrow_schema::DataType], - ) -> datafusion_common::Result> { - match (arg_types.first(), arg_types.get(1), arg_types.get(2)) { - (Some(ts), Some(window), opt) => { - 
use arrow_schema::DataType::*; - if !matches!(ts, Date32 | Timestamp(_, _)) { - return Err(DataFusionError::Plan( - format!("Expect timestamp column as first arg for tumble_start, found {:?}", ts) - )); - } - if !matches!(window, Utf8 | Interval(_)) { - return Err(DataFusionError::Plan( - format!("Expect second arg for window size's type being interval for tumble_start, found {:?}", window), - )); - } - - if let Some(start_time) = opt - && !matches!(start_time, Utf8 | Date32 | Timestamp(_, _)){ - return Err(DataFusionError::Plan( - format!("Expect start_time to either be date, timestamp or string, found {:?}", start_time) - )); - } - - Ok(arg_types.to_vec()) - } - _ => Err(DataFusionError::Plan( - "Expect tumble function have at least two arg(timestamp column and window size) and a third optional arg for starting time".to_string(), - )), - } - } - - fn return_type( - &self, - arg_types: &[arrow_schema::DataType], - ) -> Result { - arg_types.first().cloned().ok_or_else(|| { - DataFusionError::Plan( - "Expect tumble function have at least two arg(timestamp column and window size)" - .to_string(), - ) - }) - } - - fn invoke_with_args( - &self, - _args: ScalarFunctionArgs, - ) -> datafusion_common::Result { - Err(DataFusionError::Plan( - "This function should not be executed by datafusion".to_string(), - )) - } -} - /// This rule check all group by exprs, and make sure they are also in select clause in a aggr query #[derive(Debug)] struct CheckGroupByRule {} diff --git a/src/flow/src/transform/aggr.rs b/src/flow/src/transform/aggr.rs index 579f0e8ee3..861ca8fe65 100644 --- a/src/flow/src/transform/aggr.rs +++ b/src/flow/src/transform/aggr.rs @@ -382,10 +382,9 @@ impl TypedPlan { #[cfg(test)] mod test { - use std::time::Duration; use bytes::BytesMut; - use common_time::{IntervalMonthDayNano, Timestamp}; + use common_time::IntervalMonthDayNano; use datatypes::data_type::ConcreteDataType as CDT; use datatypes::prelude::ConcreteDataType; use datatypes::value::Value; @@ 
-397,898 +396,6 @@ mod test { use crate::repr::{ColumnType, RelationType}; use crate::transform::test::{create_test_ctx, create_test_query_engine, sql_to_substrait}; - #[tokio::test] - async fn test_df_func_basic() { - let engine = create_test_query_engine(); - let sql = "SELECT sum(abs(number)) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00');"; - let plan = sql_to_substrait(engine.clone(), sql).await; - - let mut ctx = create_test_ctx(); - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan) - .await - .unwrap(); - - let aggr_expr = AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }; - let expected = - TypedPlan { - schema: RelationType::new(vec![ - ColumnType::new(CDT::uint64_datatype(), true), // sum(number) - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ]) - .with_key(vec![2]) - .with_time_index(Some(1)) - .into_named(vec![ - Some("sum(abs(numbers_with_ts.number))".to_string()), - Some("window_start".to_string()), - Some("window_end".to_string()), - ]), - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(1)), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint32_datatype(), false), - ColumnType::new( - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ]) - .into_named(vec![ - Some("number".to_string()), - Some("ts".to_string()), - ]), - ) - .mfp(MapFilterProject::new(2).into_safe()) - .unwrap(), - ), - key_val_plan: KeyValPlan { - key_plan: MapFilterProject::new(2) - .map(vec![ - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowFloor { - window_size: Duration::from_nanos(1_000_000_000), - start_time: Some(Timestamp::new_millisecond( - 1625097600000, - )), - }, - ), - ScalarExpr::Column(1).call_unary( - 
UnaryFunc::TumbleWindowCeiling { - window_size: Duration::from_nanos(1_000_000_000), - start_time: Some(Timestamp::new_millisecond( - 1625097600000, - )), - }, - ), - ]) - .unwrap() - .project(vec![2, 3]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(2) - .map(vec![ScalarExpr::CallDf { - df_scalar_fn: DfScalarFunction::try_from_raw_fn( - RawDfScalarFn { - f: BytesMut::from( - b"\x08\x02\"\x08\x1a\x06\x12\x04\n\x02\x12\0" - .as_ref(), - ), - input_schema: RelationType::new(vec![ColumnType::new( - ConcreteDataType::uint32_datatype(), - false, - )]) - .into_unnamed(), - extensions: FunctionExtensions::from_iter( - [ - (0, "tumble_start".to_string()), - (1, "tumble_end".to_string()), - (2, "abs".to_string()), - (3, "sum".to_string()), - ] - .into_iter(), - ), - }, - ) - .await - .unwrap(), - exprs: vec![ScalarExpr::Column(0)], - } - .cast(CDT::uint64_datatype())]) - .unwrap() - .project(vec![2]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan { - full_aggrs: vec![aggr_expr.clone()], - simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ColumnType::new(CDT::uint64_datatype(), true), //sum(number) - ]) - .with_key(vec![1]) - .with_time_index(Some(0)) - .into_unnamed(), - ), - ), - mfp: MapFilterProject::new(3) - .map(vec![ - ScalarExpr::Column(2), - ScalarExpr::Column(0), - ScalarExpr::Column(1), - ]) - .unwrap() - .project(vec![3, 4, 5]) - .unwrap(), - }, - }; - assert_eq!(flow_plan, expected); - } - - #[tokio::test] - async fn test_df_func_expr_tree() { - let engine = create_test_query_engine(); - let sql = "SELECT abs(sum(number)) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00');"; - let plan = sql_to_substrait(engine.clone(), sql).await; - 
- let mut ctx = create_test_ctx(); - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan) - .await - .unwrap(); - - let aggr_expr = AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }; - let expected = TypedPlan { - schema: RelationType::new(vec![ - ColumnType::new(CDT::uint64_datatype(), true), // sum(number) - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ]) - .with_key(vec![2]) - .with_time_index(Some(1)) - .into_named(vec![ - Some("abs(sum(numbers_with_ts.number))".to_string()), - Some("window_start".to_string()), - Some("window_end".to_string()), - ]), - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(1)), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint32_datatype(), false), - ColumnType::new( - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ]) - .into_named(vec![ - Some("number".to_string()), - Some("ts".to_string()), - ]), - ) - .mfp(MapFilterProject::new(2).into_safe()) - .unwrap(), - ), - key_val_plan: KeyValPlan { - key_plan: MapFilterProject::new(2) - .map(vec![ - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowFloor { - window_size: Duration::from_nanos(1_000_000_000), - start_time: Some(Timestamp::new_millisecond( - 1625097600000, - )), - }, - ), - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowCeiling { - window_size: Duration::from_nanos(1_000_000_000), - start_time: Some(Timestamp::new_millisecond( - 1625097600000, - )), - }, - ), - ]) - .unwrap() - .project(vec![2, 3]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(2) - .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())]) - .unwrap() - .project(vec![2]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan 
{ - full_aggrs: vec![aggr_expr.clone()], - simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ColumnType::new(CDT::uint64_datatype(), true), //sum(number) - ]) - .with_key(vec![1]) - .with_time_index(Some(0)) - .into_named(vec![None, None, None]), - ), - ), - mfp: MapFilterProject::new(3) - .map(vec![ - ScalarExpr::CallDf { - df_scalar_fn: DfScalarFunction::try_from_raw_fn(RawDfScalarFn { - f: BytesMut::from(b"\"\x08\x1a\x06\x12\x04\n\x02\x12\0".as_ref()), - input_schema: RelationType::new(vec![ColumnType::new( - ConcreteDataType::uint64_datatype(), - true, - )]) - .into_unnamed(), - extensions: FunctionExtensions::from_iter( - [ - (0, "abs".to_string()), - (1, "tumble_start".to_string()), - (2, "tumble_end".to_string()), - (3, "sum".to_string()), - ] - .into_iter(), - ), - }) - .await - .unwrap(), - exprs: vec![ScalarExpr::Column(2)], - }, - ScalarExpr::Column(0), - ScalarExpr::Column(1), - ]) - .unwrap() - .project(vec![3, 4, 5]) - .unwrap(), - }, - }; - assert_eq!(flow_plan, expected); - } - - /// TODO(discord9): add more illegal sql tests - #[tokio::test] - async fn test_tumble_composite() { - let engine = create_test_query_engine(); - let sql = - "SELECT number, avg(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour'), number"; - let plan = sql_to_substrait(engine.clone(), sql).await; - - let mut ctx = create_test_ctx(); - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan) - .await - .unwrap(); - - let aggr_exprs = vec![ - AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }, - AggregateExpr { - func: AggregateFunc::Count, - expr: ScalarExpr::Column(1), - distinct: false, - }, - ]; - let avg_expr = ScalarExpr::If { - cond: 
Box::new(ScalarExpr::Column(4).call_binary( - ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()), - BinaryFunc::NotEq, - )), - then: Box::new( - ScalarExpr::Column(3) - .cast(CDT::float64_datatype()) - .call_binary( - ScalarExpr::Column(4).cast(CDT::float64_datatype()), - BinaryFunc::DivFloat64, - ), - ), - els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())), - }; - let expected = TypedPlan { - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(1)), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint32_datatype(), false), - ColumnType::new( - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ]) - .into_named(vec![ - Some("number".to_string()), - Some("ts".to_string()), - ]), - ) - .mfp(MapFilterProject::new(2).into_safe()) - .unwrap(), - ), - key_val_plan: KeyValPlan { - key_plan: MapFilterProject::new(2) - .map(vec![ - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowFloor { - window_size: Duration::from_nanos(3_600_000_000_000), - start_time: None, - }, - ), - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowCeiling { - window_size: Duration::from_nanos(3_600_000_000_000), - start_time: None, - }, - ), - ScalarExpr::Column(0), - ]) - .unwrap() - .project(vec![2, 3, 4]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(2) - .map(vec![ - ScalarExpr::Column(0).cast(CDT::uint64_datatype()), - ScalarExpr::Column(0), - ]) - .unwrap() - .project(vec![2, 3]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan { - full_aggrs: aggr_exprs.clone(), - simple_aggrs: vec![ - AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0), - AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1), - ], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - // keys - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start(time 
index) - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end(pk) - ColumnType::new(CDT::uint32_datatype(), false), // number(pk) - // values - ColumnType::new(CDT::uint64_datatype(), true), // avg.sum(number) - ColumnType::new(CDT::int64_datatype(), true), // avg.count(number) - ]) - .with_key(vec![1, 2]) - .with_time_index(Some(0)) - .into_named(vec![ - None, - None, - Some("number".to_string()), - None, - None, - ]), - ), - ), - mfp: MapFilterProject::new(5) - .map(vec![ - ScalarExpr::Column(2), // number(pk) - avg_expr, - ScalarExpr::Column(0), // window start - ScalarExpr::Column(1), // window end - ]) - .unwrap() - .project(vec![5, 6, 7, 8]) - .unwrap(), - }, - schema: RelationType::new(vec![ - ColumnType::new(CDT::uint32_datatype(), false), // number - ColumnType::new(CDT::float64_datatype(), true), // avg(number) - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ]) - .with_key(vec![0, 3]) - .with_time_index(Some(2)) - .into_named(vec![ - Some("number".to_string()), - Some("avg(numbers_with_ts.number)".to_string()), - Some("window_start".to_string()), - Some("window_end".to_string()), - ]), - }; - assert_eq!(flow_plan, expected); - } - - #[tokio::test] - async fn test_tumble_parse_optional() { - let engine = create_test_query_engine(); - let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour')"; - let plan = sql_to_substrait(engine.clone(), sql).await; - - let mut ctx = create_test_ctx(); - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan) - .await - .unwrap(); - - let aggr_expr = AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }; - let expected = TypedPlan { - schema: RelationType::new(vec![ - ColumnType::new(CDT::uint64_datatype(), true), // sum(number) - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - 
ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ]) - .with_key(vec![2]) - .with_time_index(Some(1)) - .into_named(vec![ - Some("sum(numbers_with_ts.number)".to_string()), - Some("window_start".to_string()), - Some("window_end".to_string()), - ]), - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(1)), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint32_datatype(), false), - ColumnType::new( - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ]) - .into_named(vec![ - Some("number".to_string()), - Some("ts".to_string()), - ]), - ) - .mfp(MapFilterProject::new(2).into_safe()) - .unwrap(), - ), - key_val_plan: KeyValPlan { - key_plan: MapFilterProject::new(2) - .map(vec![ - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowFloor { - window_size: Duration::from_nanos(3_600_000_000_000), - start_time: None, - }, - ), - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowCeiling { - window_size: Duration::from_nanos(3_600_000_000_000), - start_time: None, - }, - ), - ]) - .unwrap() - .project(vec![2, 3]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(2) - .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())]) - .unwrap() - .project(vec![2]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan { - full_aggrs: vec![aggr_expr.clone()], - simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ColumnType::new(CDT::uint64_datatype(), true), //sum(number) - ]) - .with_key(vec![1]) - .with_time_index(Some(0)) - .into_named(vec![None, None, None]), - ), - ), - mfp: MapFilterProject::new(3) - .map(vec![ 
- ScalarExpr::Column(2), - ScalarExpr::Column(0), - ScalarExpr::Column(1), - ]) - .unwrap() - .project(vec![3, 4, 5]) - .unwrap(), - }, - }; - assert_eq!(flow_plan, expected); - } - - #[tokio::test] - async fn test_tumble_parse() { - let engine = create_test_query_engine(); - let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour', '2021-07-01 00:00:00')"; - let plan = sql_to_substrait(engine.clone(), sql).await; - - let mut ctx = create_test_ctx(); - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan) - .await - .unwrap(); - - let aggr_expr = AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }; - let expected = TypedPlan { - schema: RelationType::new(vec![ - ColumnType::new(CDT::uint64_datatype(), true), // sum(number) - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ]) - .with_key(vec![2]) - .with_time_index(Some(1)) - .into_named(vec![ - Some("sum(numbers_with_ts.number)".to_string()), - Some("window_start".to_string()), - Some("window_end".to_string()), - ]), - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(1)), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint32_datatype(), false), - ColumnType::new( - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ]) - .into_named(vec![ - Some("number".to_string()), - Some("ts".to_string()), - ]), - ) - .mfp(MapFilterProject::new(2).into_safe()) - .unwrap(), - ), - key_val_plan: KeyValPlan { - key_plan: MapFilterProject::new(2) - .map(vec![ - ScalarExpr::Column(1).call_unary( - UnaryFunc::TumbleWindowFloor { - window_size: Duration::from_nanos(3_600_000_000_000), - start_time: Some(Timestamp::new_millisecond( - 1625097600000, - )), - }, - ), - ScalarExpr::Column(1).call_unary( - 
UnaryFunc::TumbleWindowCeiling { - window_size: Duration::from_nanos(3_600_000_000_000), - start_time: Some(Timestamp::new_millisecond( - 1625097600000, - )), - }, - ), - ]) - .unwrap() - .project(vec![2, 3]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(2) - .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())]) - .unwrap() - .project(vec![2]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan { - full_aggrs: vec![aggr_expr.clone()], - simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start - ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end - ColumnType::new(CDT::uint64_datatype(), true), //sum(number) - ]) - .with_key(vec![1]) - .with_time_index(Some(0)) - .into_unnamed(), - ), - ), - mfp: MapFilterProject::new(3) - .map(vec![ - ScalarExpr::Column(2), - ScalarExpr::Column(0), - ScalarExpr::Column(1), - ]) - .unwrap() - .project(vec![3, 4, 5]) - .unwrap(), - }, - }; - assert_eq!(flow_plan, expected); - } - - #[tokio::test] - async fn test_avg_group_by() { - let engine = create_test_query_engine(); - let sql = "SELECT avg(number), number FROM numbers GROUP BY number"; - let plan = sql_to_substrait(engine.clone(), sql).await; - - let mut ctx = create_test_ctx(); - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan).await; - - let aggr_exprs = vec![ - AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }, - AggregateExpr { - func: AggregateFunc::Count, - expr: ScalarExpr::Column(1), - distinct: false, - }, - ]; - let avg_expr = ScalarExpr::If { - cond: Box::new(ScalarExpr::Column(2).call_binary( - ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()), - BinaryFunc::NotEq, - )), - then: Box::new( - ScalarExpr::Column(1) - 
.cast(CDT::float64_datatype()) - .call_binary( - ScalarExpr::Column(2).cast(CDT::float64_datatype()), - BinaryFunc::DivFloat64, - ), - ), - els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())), - }; - let expected = TypedPlan { - schema: RelationType::new(vec![ - ColumnType::new(CDT::float64_datatype(), true), // avg(number: u32) -> f64 - ColumnType::new(CDT::uint32_datatype(), false), // number - ]) - .with_key(vec![1]) - .into_named(vec![ - Some("avg(numbers.number)".to_string()), - Some("number".to_string()), - ]), - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(0)), - } - .with_types( - RelationType::new(vec![ColumnType::new( - ConcreteDataType::uint32_datatype(), - false, - )]) - .into_named(vec![Some("number".to_string())]), - ) - .mfp( - MapFilterProject::new(1) - .project(vec![0]) - .unwrap() - .into_safe(), - ) - .unwrap(), - ), - key_val_plan: KeyValPlan { - key_plan: MapFilterProject::new(1) - .map(vec![ScalarExpr::Column(0)]) - .unwrap() - .project(vec![1]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(1) - .map(vec![ - ScalarExpr::Column(0).cast(CDT::uint64_datatype()), - ScalarExpr::Column(0), - ]) - .unwrap() - .project(vec![1, 2]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan { - full_aggrs: aggr_exprs.clone(), - simple_aggrs: vec![ - AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0), - AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1), - ], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint32_datatype(), false), // key: number - ColumnType::new(ConcreteDataType::uint64_datatype(), true), // sum - ColumnType::new(ConcreteDataType::int64_datatype(), true), // count - ]) - .with_key(vec![0]) - .into_named(vec![ - Some("number".to_string()), - None, - None, - ]), - ), - ), - mfp: MapFilterProject::new(3) - .map(vec![ - avg_expr, 
// col 3 - ScalarExpr::Column(0), - // TODO(discord9): optimize mfp so to remove indirect ref - ]) - .unwrap() - .project(vec![3, 4]) - .unwrap(), - }, - }; - assert_eq!(flow_plan.unwrap(), expected); - } - - #[tokio::test] - async fn test_avg() { - let engine = create_test_query_engine(); - let sql = "SELECT avg(number) FROM numbers"; - let plan = sql_to_substrait(engine.clone(), sql).await; - - let mut ctx = create_test_ctx(); - - let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan) - .await - .unwrap(); - - let aggr_exprs = vec![ - AggregateExpr { - func: AggregateFunc::SumUInt64, - expr: ScalarExpr::Column(0), - distinct: false, - }, - AggregateExpr { - func: AggregateFunc::Count, - expr: ScalarExpr::Column(1), - distinct: false, - }, - ]; - let avg_expr = ScalarExpr::If { - cond: Box::new(ScalarExpr::Column(1).call_binary( - ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()), - BinaryFunc::NotEq, - )), - then: Box::new( - ScalarExpr::Column(0) - .cast(CDT::float64_datatype()) - .call_binary( - ScalarExpr::Column(1).cast(CDT::float64_datatype()), - BinaryFunc::DivFloat64, - ), - ), - els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())), - }; - let input = Box::new( - Plan::Get { - id: crate::expr::Id::Global(GlobalId::User(0)), - } - .with_types( - RelationType::new(vec![ColumnType::new( - ConcreteDataType::uint32_datatype(), - false, - )]) - .into_named(vec![Some("number".to_string())]), - ), - ); - let expected = TypedPlan { - schema: RelationType::new(vec![ColumnType::new(CDT::float64_datatype(), true)]) - .into_named(vec![Some("avg(numbers.number)".to_string())]), - plan: Plan::Mfp { - input: Box::new( - Plan::Reduce { - input: Box::new( - Plan::Mfp { - input: input.clone(), - mfp: MapFilterProject::new(1).project(vec![0]).unwrap(), - } - .with_types( - RelationType::new(vec![ColumnType::new( - CDT::uint32_datatype(), - false, - )]) - .into_named(vec![Some("number".to_string())]), - ), - ), - key_val_plan: 
KeyValPlan { - key_plan: MapFilterProject::new(1) - .project(vec![]) - .unwrap() - .into_safe(), - val_plan: MapFilterProject::new(1) - .map(vec![ - ScalarExpr::Column(0).cast(CDT::uint64_datatype()), - ScalarExpr::Column(0), - ]) - .unwrap() - .project(vec![1, 2]) - .unwrap() - .into_safe(), - }, - reduce_plan: ReducePlan::Accumulable(AccumulablePlan { - full_aggrs: aggr_exprs.clone(), - simple_aggrs: vec![ - AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0), - AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1), - ], - distinct_aggrs: vec![], - }), - } - .with_types( - RelationType::new(vec![ - ColumnType::new(ConcreteDataType::uint64_datatype(), true), // sum - ColumnType::new(ConcreteDataType::int64_datatype(), true), // count - ]) - .into_named(vec![None, None]), - ), - ), - mfp: MapFilterProject::new(2) - .map(vec![ - avg_expr, - // TODO(discord9): optimize mfp so to remove indirect ref - ]) - .unwrap() - .project(vec![2]) - .unwrap(), - }, - }; - assert_eq!(flow_plan, expected); - } - #[tokio::test] async fn test_sum() { let engine = create_test_query_engine(); diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index fa8a74cad2..ce589bb677 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod builder; +mod dashboard; mod grpc; mod influxdb; mod jaeger; diff --git a/src/frontend/src/instance/dashboard.rs b/src/frontend/src/instance/dashboard.rs new file mode 100644 index 0000000000..373961dbfa --- /dev/null +++ b/src/frontend/src/instance/dashboard.rs @@ -0,0 +1,405 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use api::v1::value::ValueData; +use api::v1::{ + ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest, + RowInsertRequests, Rows, SemanticType, +}; +use async_trait::async_trait; +use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine}; +use common_error::ext::BoxedError; +use common_query::OutputData; +use common_recordbatch::util as record_util; +use common_telemetry::info; +use common_time::FOREVER; +use datafusion::datasource::DefaultTableSource; +use datafusion::logical_expr::col; +use datafusion::sql::TableReference; +use datafusion_expr::{DmlStatement, LogicalPlan, lit}; +use datatypes::arrow::array::{Array, AsArray}; +use servers::error::{ + CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, ExecuteQuerySnafu, NotSupportedSnafu, + TableNotFoundSnafu, +}; +use servers::query_handler::DashboardDefinition; +use session::context::{QueryContextBuilder, QueryContextRef}; +use snafu::{OptionExt, ResultExt}; +use table::TableRef; +use table::metadata::TableInfo; +use table::requests::TTL_KEY; +use table::table::adapter::DfTableProviderAdapter; + +use crate::instance::Instance; + +pub const DASHBOARD_TABLE_NAME: &str = "dashboard"; +pub const DASHBOARD_TABLE_NAME_COLUMN_NAME: &str = "name"; +pub const DASHBOARD_TABLE_DEFINITION_COLUMN_NAME: &str = "definition"; +pub const DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME: &str = "created_at"; + +impl Instance { + /// Build a schema for dashboard table. 
+ /// Returns the (time index, primary keys, column) definitions. + fn build_dashboard_schema() -> (String, Vec, Vec) { + ( + DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), + vec![DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string()], + vec![ + ColumnDef { + name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(), + data_type: ColumnDataType::String as i32, + is_nullable: false, + default_constraint: vec![], + semantic_type: SemanticType::Tag as i32, + comment: String::new(), + datatype_extension: None, + options: None, + }, + ColumnDef { + name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(), + data_type: ColumnDataType::String as i32, + is_nullable: false, + default_constraint: vec![], + semantic_type: SemanticType::Field as i32, + comment: String::new(), + datatype_extension: None, + options: None, + }, + ColumnDef { + name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), + data_type: ColumnDataType::TimestampNanosecond as i32, + is_nullable: false, + default_constraint: vec![], + semantic_type: SemanticType::Timestamp as i32, + comment: String::new(), + datatype_extension: None, + options: None, + }, + ], + ) + } + + /// Build a column schemas for inserting a row into the dashboard table. 
+ fn build_dashboard_insert_column_schemas() -> Vec { + vec![ + PbColumnSchema { + column_name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(), + datatype: ColumnDataType::String.into(), + semantic_type: SemanticType::Tag.into(), + ..Default::default() + }, + PbColumnSchema { + column_name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(), + datatype: ColumnDataType::String.into(), + semantic_type: SemanticType::Field.into(), + ..Default::default() + }, + PbColumnSchema { + column_name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), + datatype: ColumnDataType::TimestampNanosecond.into(), + semantic_type: SemanticType::Timestamp.into(), + ..Default::default() + }, + ] + } + + fn dashboard_query_ctx(table_info: &TableInfo) -> QueryContextRef { + QueryContextBuilder::default() + .current_catalog(table_info.catalog_name.clone()) + .current_schema(table_info.schema_name.clone()) + .build() + .into() + } + + async fn create_dashboard_table_if_not_exists( + &self, + ctx: QueryContextRef, + ) -> servers::error::Result { + let catalog = ctx.current_catalog(); + + if let Some(table) = self + .catalog_manager + .table( + catalog, + DEFAULT_PRIVATE_SCHEMA_NAME, + DASHBOARD_TABLE_NAME, + Some(&ctx), + ) + .await + .context(CatalogSnafu)? 
+ { + return Ok(table); + } + + let (time_index, primary_keys, column_defs) = Self::build_dashboard_schema(); + + let mut table_options = HashMap::new(); + table_options.insert(TTL_KEY.to_string(), FOREVER.to_string()); + + let mut create_table_expr = api::v1::CreateTableExpr { + catalog_name: catalog.to_string(), + schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(), + table_name: DASHBOARD_TABLE_NAME.to_string(), + desc: "GreptimeDB dashboard table".to_string(), + column_defs, + time_index, + primary_keys, + create_if_not_exists: true, + table_options, + table_id: None, + engine: default_engine().to_string(), + }; + + self.statement_executor + .create_table_inner(&mut create_table_expr, None, ctx.clone()) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let table = self + .catalog_manager + .table( + catalog, + DEFAULT_PRIVATE_SCHEMA_NAME, + DASHBOARD_TABLE_NAME, + Some(&ctx), + ) + .await + .context(CatalogSnafu)? + .context(TableNotFoundSnafu { + catalog: catalog.to_string(), + schema: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(), + table: DASHBOARD_TABLE_NAME.to_string(), + })?; + + Ok(table) + } + + /// Insert a dashboard into the dashboard table. 
+ async fn insert_dashboard( + &self, + name: &str, + definition: &str, + query_ctx: QueryContextRef, + ) -> servers::error::Result<()> { + let table = self + .create_dashboard_table_if_not_exists(query_ctx.clone()) + .await?; + let table_info = table.table_info(); + + let insert = RowInsertRequest { + table_name: DASHBOARD_TABLE_NAME.to_string(), + rows: Some(Rows { + schema: Self::build_dashboard_insert_column_schemas(), + rows: vec![Row { + values: vec![ + ValueData::StringValue(name.to_string()).into(), + ValueData::StringValue(definition.to_string()).into(), + ValueData::TimestampNanosecondValue(0).into(), + ], + }], + }), + }; + + let requests = RowInsertRequests { + inserts: vec![insert], + }; + + let output = self + .inserter + .handle_row_inserts( + requests, + Self::dashboard_query_ctx(&table_info), + &self.statement_executor, + false, + false, + ) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + info!( + "Insert dashboard success, name: {}, table: {}, output: {:?}", + name, + table_info.full_table_name(), + output + ); + + Ok(()) + } + + /// List all dashboards. + async fn list_dashboards( + &self, + query_ctx: QueryContextRef, + ) -> servers::error::Result> { + let table = if let Some(table) = self + .catalog_manager + .table( + query_ctx.current_catalog(), + DEFAULT_PRIVATE_SCHEMA_NAME, + DASHBOARD_TABLE_NAME, + Some(&query_ctx), + ) + .await + .context(CatalogSnafu)? 
+ { + table + } else { + return Ok(vec![]); + }; + + let table_info = table.table_info(); + + let dataframe = self + .query_engine + .read_table(table.clone()) + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let dataframe = dataframe + .select_columns(&[ + DASHBOARD_TABLE_NAME_COLUMN_NAME, + DASHBOARD_TABLE_DEFINITION_COLUMN_NAME, + ]) + .context(DataFusionSnafu)?; + + let plan = dataframe.into_parts().1; + + let output = self + .query_engine + .execute(plan, Self::dashboard_query_ctx(&table_info)) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let stream = match output.data { + OutputData::Stream(stream) => stream, + OutputData::RecordBatches(record_batches) => record_batches.as_stream(), + _ => unreachable!(), + }; + + let records = record_util::collect(stream) + .await + .context(CollectRecordbatchSnafu)?; + + let mut dashboards = Vec::new(); + + for r in &records { + let name_column = r.column(0); + let definition_column = r.column(1); + + let name = name_column + .as_string_opt::() + .context(NotSupportedSnafu { + feat: "Invalid data type for greptime_private.dashboard.name", + })?; + + let definition = + definition_column + .as_string_opt::() + .context(NotSupportedSnafu { + feat: "Invalid data type for greptime_private.dashboard.definition", + })?; + + for i in 0..name.len() { + dashboards.push(DashboardDefinition { + name: name.value(i).to_string(), + definition: definition.value(i).to_string(), + }); + } + } + + Ok(dashboards) + } + + /// Delete a dashboard by name. 
+ async fn delete_dashboard( + &self, + name: &str, + query_ctx: QueryContextRef, + ) -> servers::error::Result<()> { + let table = self + .create_dashboard_table_if_not_exists(query_ctx.clone()) + .await?; + let table_info = table.table_info(); + + let dataframe = self + .query_engine + .read_table(table.clone()) + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + let name_condition = col(DASHBOARD_TABLE_NAME_COLUMN_NAME).eq(lit(name)); + + let dataframe = dataframe.filter(name_condition).context(DataFusionSnafu)?; + + let table_name = TableReference::full( + table_info.catalog_name.clone(), + table_info.schema_name.clone(), + table_info.name.clone(), + ); + + let table_provider = Arc::new(DfTableProviderAdapter::new(table.clone())); + let table_source = Arc::new(DefaultTableSource::new(table_provider)); + + let stmt = DmlStatement::new( + table_name, + table_source, + datafusion_expr::WriteOp::Delete, + Arc::new(dataframe.into_parts().1), + ); + + let plan = LogicalPlan::Dml(stmt); + + let output = self + .query_engine + .execute(plan, Self::dashboard_query_ctx(&table_info)) + .await + .map_err(BoxedError::new) + .context(ExecuteQuerySnafu)?; + + info!( + "Delete dashboard success, name: {}, table: {}, output: {:?}", + name, + table_info.full_table_name(), + output + ); + + Ok(()) + } +} + +#[async_trait] +impl servers::query_handler::DashboardHandler for Instance { + async fn save( + &self, + name: &str, + definition: &str, + ctx: QueryContextRef, + ) -> servers::error::Result<()> { + self.insert_dashboard(name, definition, ctx).await + } + + async fn list(&self, ctx: QueryContextRef) -> servers::error::Result> { + self.list_dashboards(ctx).await + } + + async fn delete(&self, name: &str, ctx: QueryContextRef) -> servers::error::Result<()> { + self.delete_dashboard(name, ctx).await + } +} diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs index c4191145f8..70ff50fadc 100644 --- a/src/frontend/src/instance/grpc.rs +++ 
b/src/frontend/src/instance/grpc.rs @@ -27,7 +27,6 @@ use api::v1::{ use async_stream::try_stream; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; -use common_base::AffectedRows; use common_error::ext::BoxedError; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; @@ -260,62 +259,6 @@ impl GrpcQueryHandler for Instance { .context(server_error::ExecuteGrpcQuerySnafu) } - async fn put_record_batch( - &self, - request: servers::grpc::flight::PutRecordBatchRequest, - table_ref: &mut Option, - ctx: QueryContextRef, - ) -> server_error::Result { - let result: Result = async { - let table = if let Some(table) = table_ref { - table.clone() - } else { - let table = self - .catalog_manager() - .table( - &request.table_name.catalog_name, - &request.table_name.schema_name, - &request.table_name.table_name, - None, - ) - .await - .context(CatalogSnafu)? - .with_context(|| TableNotFoundSnafu { - table_name: request.table_name.to_string(), - })?; - *table_ref = Some(table.clone()); - table - }; - - let interceptor_ref = self.plugins.get::>(); - let interceptor = interceptor_ref.as_ref(); - interceptor.pre_bulk_insert(table.clone(), ctx.clone())?; - - self.plugins - .get::() - .as_ref() - .check_permission(ctx.current_user(), PermissionReq::BulkInsert) - .context(PermissionSnafu)?; - - // do we check limit for bulk insert? 
- - self.inserter - .handle_bulk_insert( - table, - request.flight_data, - request.record_batch, - request.schema_bytes, - ) - .await - .context(TableOperationSnafu) - } - .await; - - result - .map_err(BoxedError::new) - .context(server_error::ExecuteGrpcRequestSnafu) - } - fn handle_put_record_batch_stream( &self, stream: servers::grpc::flight::PutRecordBatchRequestStream, diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs index 45c3ec3649..4b51efbd33 100644 --- a/src/frontend/src/server.rs +++ b/src/frontend/src/server.rs @@ -143,6 +143,8 @@ where builder = builder.with_jaeger_handler(self.instance.clone()); } + builder = builder.with_dashboard_handler(self.instance.clone()); + if let Some(configurator) = self.plugins.get::() { info!("Adding extra router from plugins"); builder = builder.with_extra_router(configurator.router()); diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index 2cfe7d2f7d..eadb7cdc75 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -24,6 +24,8 @@ use common_base::Plugins; use common_config::Configurable; #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] use common_meta::distributed_time_constants::META_LEASE_SECS; +use common_meta::election::CANDIDATE_LEASE_SECS; +use common_meta::election::etcd::EtcdElection; use common_meta::kv_backend::chroot::ChrootKvBackend; use common_meta::kv_backend::etcd::EtcdStore; use common_meta::kv_backend::memory::MemoryKvBackend; @@ -42,9 +44,6 @@ use tonic::codec::CompressionEncoding; use tonic::transport::server::{Router, TcpIncoming}; use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef}; -#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] -use crate::election::CANDIDATE_LEASE_SECS; -use crate::election::etcd::EtcdElection; use crate::error::OtherSnafu; use crate::metasrv::builder::MetasrvBuilder; use crate::metasrv::{ @@ -281,7 +280,8 @@ pub async fn metasrv_builder( etcd_client, 
opts.store_key_prefix.clone(), ) - .await?; + .await + .context(error::KvBackendSnafu)?; (kv_backend, Some(election)) } @@ -290,10 +290,10 @@ pub async fn metasrv_builder( use std::time::Duration; use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS; + use common_meta::election::rds::postgres::{ElectionPgClient, PgElection}; use common_meta::kv_backend::rds::PgStore; use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod}; - use crate::election::rds::postgres::{ElectionPgClient, PgElection}; use crate::utils::postgres::create_postgres_pool; let candidate_lease_ttl = Duration::from_secs(CANDIDATE_LEASE_SECS); @@ -321,7 +321,8 @@ pub async fn metasrv_builder( execution_timeout, idle_session_timeout, statement_timeout, - )?; + ) + .context(error::KvBackendSnafu)?; let election = PgElection::with_pg_client( opts.grpc.server_addr.clone(), election_client, @@ -332,7 +333,8 @@ pub async fn metasrv_builder( &opts.meta_table_name, opts.meta_election_lock_id, ) - .await?; + .await + .context(error::KvBackendSnafu)?; let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone()) .await?; @@ -352,9 +354,9 @@ pub async fn metasrv_builder( (None, BackendImpl::MysqlStore) => { use std::time::Duration; + use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection}; use common_meta::kv_backend::rds::MySqlStore; - use crate::election::rds::mysql::{ElectionMysqlClient, MySqlElection}; use crate::utils::mysql::create_mysql_pool; let pool = create_mysql_pool(&opts.store_addrs, opts.backend_tls.as_ref()).await?; @@ -389,7 +391,8 @@ pub async fn metasrv_builder( meta_lease_ttl, &election_table_name, ) - .await?; + .await + .context(error::KvBackendSnafu)?; (kv_backend, Some(election)) } }; diff --git a/src/meta-srv/src/cluster.rs b/src/meta-srv/src/cluster.rs index 35b15b3b29..ef3ba07702 100644 --- a/src/meta-srv/src/cluster.rs +++ b/src/meta-srv/src/cluster.rs @@ -247,7 +247,7 @@ impl MetaPeerClient { // Safety: when 
self.is_leader() == false, election must not empty. let election = self.election.as_ref().unwrap(); - let leader_addr = election.leader().await?.0; + let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0; let channel = self .channel_manager @@ -279,7 +279,7 @@ impl MetaPeerClient { // Safety: when self.is_leader() == false, election must not empty. let election = self.election.as_ref().unwrap(); - let leader_addr = election.leader().await?.0; + let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0; let channel = self .channel_manager diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs index c67bc32b40..0e87d4421a 100644 --- a/src/meta-srv/src/lib.rs +++ b/src/meta-srv/src/lib.rs @@ -21,7 +21,6 @@ pub mod bootstrap; pub mod cache_invalidator; pub mod cluster; pub mod discovery; -pub mod election; pub mod error; pub mod events; mod failure_detector; diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 165efd0555..a1515d897e 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -32,6 +32,8 @@ use common_meta::ddl_manager::DdlManagerRef; use common_meta::distributed_time_constants::{ self, BASE_HEARTBEAT_INTERVAL, default_distributed_time_constants, frontend_heartbeat_interval, }; +use common_meta::election::LeaderChangeMessage; +pub use common_meta::election::{ElectionRef, MetasrvNodeInfo}; use common_meta::key::TableMetadataManagerRef; use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef}; @@ -64,7 +66,6 @@ use tokio::sync::broadcast::error::RecvError; use crate::cluster::MetaPeerClientRef; use crate::discovery; -use crate::election::{Election, LeaderChangeMessage}; use crate::error::{ self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu, StartTelemetryTaskSnafu, StopProcedureManagerSnafu, @@ -459,76 +460,6 @@ impl Context { } } -/// The value of 
the leader. It is used to store the leader's address. -pub struct LeaderValue(pub String); - -impl> From for LeaderValue { - fn from(value: T) -> Self { - let string = String::from_utf8_lossy(value.as_ref()); - Self(string.to_string()) - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetasrvNodeInfo { - // The metasrv's address - pub addr: String, - // The node build version - pub version: String, - // The node build git commit hash - pub git_commit: String, - // The node start timestamp in milliseconds - pub start_time_ms: u64, - // The node total cpu millicores - #[serde(default)] - pub total_cpu_millicores: i64, - // The node total memory bytes - #[serde(default)] - pub total_memory_bytes: i64, - /// The node build cpu usage millicores - #[serde(default)] - pub cpu_usage_millicores: i64, - /// The node build memory usage bytes - #[serde(default)] - pub memory_usage_bytes: i64, - // The node hostname - #[serde(default)] - pub hostname: String, -} - -// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto. -#[allow(deprecated)] -impl From for api::v1::meta::MetasrvNodeInfo { - fn from(node_info: MetasrvNodeInfo) -> Self { - Self { - peer: Some(api::v1::meta::Peer { - addr: node_info.addr, - ..Default::default() - }), - // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version. - // New code should use the fields in `info.NodeInfo` instead. - version: node_info.version.clone(), - git_commit: node_info.git_commit.clone(), - start_time_ms: node_info.start_time_ms, - cpus: node_info.total_cpu_millicores as u32, - memory_bytes: node_info.total_memory_bytes as u64, - // The canonical location for node information. 
- info: Some(api::v1::meta::NodeInfo { - version: node_info.version, - git_commit: node_info.git_commit, - start_time_ms: node_info.start_time_ms, - total_cpu_millicores: node_info.total_cpu_millicores, - total_memory_bytes: node_info.total_memory_bytes, - cpu_usage_millicores: node_info.cpu_usage_millicores, - memory_usage_bytes: node_info.memory_usage_bytes, - cpus: node_info.total_cpu_millicores as u32, - memory_bytes: node_info.total_memory_bytes as u64, - hostname: node_info.hostname, - }), - } - } -} - #[derive(Clone, Copy)] pub enum SelectTarget { Datanode, @@ -552,7 +483,6 @@ pub struct SelectorContext { pub type SelectorRef = Arc>>; pub type RegionStatAwareSelectorRef = Arc>>; -pub type ElectionRef = Arc>; pub struct MetaStateHandler { subscribe_manager: Option, diff --git a/src/meta-srv/src/service/admin/leader.rs b/src/meta-srv/src/service/admin/leader.rs index 1fadb4a3ef..17329e7b47 100644 --- a/src/meta-srv/src/service/admin/leader.rs +++ b/src/meta-srv/src/service/admin/leader.rs @@ -32,7 +32,7 @@ pub struct LeaderHandler { impl LeaderHandler { async fn get_leader(&self) -> Result> { if let Some(election) = &self.election { - let leader_addr = election.leader().await?.0; + let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0; return Ok(Some(leader_addr)); } Ok(None) diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs index 5c0ae4c71f..366a8aa5fb 100644 --- a/src/meta-srv/src/service/cluster.rs +++ b/src/meta-srv/src/service/cluster.rs @@ -63,7 +63,10 @@ impl cluster_server::Cluster for Metasrv { let leader_addr = &self.options().grpc.server_addr; let (leader, followers) = match self.election() { Some(election) => { - let nodes = election.all_candidates().await?; + let nodes = election + .all_candidates() + .await + .context(error::KvBackendSnafu)?; let followers = nodes .into_iter() .filter(|node_info| &node_info.addr != leader_addr) diff --git a/src/meta-srv/src/service/heartbeat.rs 
b/src/meta-srv/src/service/heartbeat.rs index e09073546a..238ed99df2 100644 --- a/src/meta-srv/src/service/heartbeat.rs +++ b/src/meta-srv/src/service/heartbeat.rs @@ -23,7 +23,7 @@ use api::v1::meta::{ use common_telemetry::{debug, error, info, warn}; use futures::StreamExt; use once_cell::sync::OnceCell; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; use tokio::sync::mpsc; use tokio::sync::mpsc::Sender; use tokio_stream::wrappers::ReceiverStream; @@ -148,7 +148,7 @@ async fn handle_ask_leader(_req: AskLeaderRequest, ctx: Context) -> Result ctx.server_addr, diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml index 567210b952..5b561997ab 100644 --- a/src/metric-engine/Cargo.toml +++ b/src/metric-engine/Cargo.toml @@ -17,6 +17,7 @@ bytes.workspace = true fxhash = "0.2" common-base.workspace = true common-error.workspace = true +common-grpc.workspace = true common-macro.workspace = true common-query.workspace = true common-recordbatch.workspace = true diff --git a/src/metric-engine/src/batch_modifier.rs b/src/metric-engine/src/batch_modifier.rs new file mode 100644 index 0000000000..8a5774889b --- /dev/null +++ b/src/metric-engine/src/batch_modifier.rs @@ -0,0 +1,426 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::hash::Hasher; +use std::sync::Arc; + +use datatypes::arrow::array::{Array, BinaryBuilder, StringArray, UInt64Array}; +use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use datatypes::arrow::record_batch::RecordBatch; +use datatypes::value::ValueRef; +use fxhash::FxHasher; +use mito_codec::row_converter::SparsePrimaryKeyCodec; +use snafu::ResultExt; +use store_api::storage::ColumnId; +use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId}; + +use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu}; + +/// Info about a tag column for TSID computation and sparse primary key encoding. +#[allow(dead_code)] +pub(crate) struct TagColumnInfo { + /// Column name (used for label-name hash). + pub name: String, + /// Column index in the RecordBatch. + pub index: usize, + /// Column ID in the physical region. + pub column_id: ColumnId, +} + +/// Computes `__tsid` values for each row. +#[allow(dead_code)] +pub(crate) fn compute_tsid_array( + batch: &RecordBatch, + sorted_tag_columns: &[TagColumnInfo], + tag_arrays: &[&StringArray], +) -> UInt64Array { + let num_rows = batch.num_rows(); + + let label_name_hash = { + let mut hasher = FxHasher::default(); + for tag_col in sorted_tag_columns { + hasher.write(tag_col.name.as_bytes()); + hasher.write_u8(0xff); + } + hasher.finish() + }; + + let mut tsid_values = Vec::with_capacity(num_rows); + for row in 0..num_rows { + let has_null = tag_arrays.iter().any(|arr| arr.is_null(row)); + + let tsid = if !has_null { + let mut hasher = FxHasher::default(); + hasher.write_u64(label_name_hash); + for arr in tag_arrays { + hasher.write(arr.value(row).as_bytes()); + hasher.write_u8(0xff); + } + hasher.finish() + } else { + let mut name_hasher = FxHasher::default(); + for (tc, arr) in sorted_tag_columns.iter().zip(tag_arrays.iter()) { + if !arr.is_null(row) { + name_hasher.write(tc.name.as_bytes()); + name_hasher.write_u8(0xff); + } + } + let row_label_hash = 
name_hasher.finish(); + + let mut val_hasher = FxHasher::default(); + val_hasher.write_u64(row_label_hash); + for arr in tag_arrays { + if !arr.is_null(row) { + val_hasher.write(arr.value(row).as_bytes()); + val_hasher.write_u8(0xff); + } + } + val_hasher.finish() + }; + + tsid_values.push(tsid); + } + + UInt64Array::from(tsid_values) +} + +fn build_tag_arrays<'a>( + batch: &'a RecordBatch, + sorted_tag_columns: &[TagColumnInfo], +) -> Vec<&'a StringArray> { + sorted_tag_columns + .iter() + .map(|tc| { + batch + .column(tc.index) + .as_any() + .downcast_ref::() + .expect("tag column must be utf8") + }) + .collect() +} + +/// Modifies a RecordBatch for sparse primary key encoding. +#[allow(dead_code)] +pub(crate) fn modify_batch_sparse( + batch: RecordBatch, + table_id: u32, + sorted_tag_columns: &[TagColumnInfo], + non_tag_column_indices: &[usize], +) -> Result { + let num_rows = batch.num_rows(); + let codec = SparsePrimaryKeyCodec::schemaless(); + let tag_arrays: Vec<&StringArray> = build_tag_arrays(&batch, sorted_tag_columns); + let tsid_array = compute_tsid_array(&batch, sorted_tag_columns, &tag_arrays); + + let mut pk_builder = BinaryBuilder::with_capacity(num_rows, 0); + let mut buffer = Vec::new(); + for row in 0..num_rows { + buffer.clear(); + let internal = [ + (ReservedColumnId::table_id(), ValueRef::UInt32(table_id)), + ( + ReservedColumnId::tsid(), + ValueRef::UInt64(tsid_array.value(row)), + ), + ]; + codec + .encode_to_vec(internal.into_iter(), &mut buffer) + .context(EncodePrimaryKeySnafu)?; + + let tags = sorted_tag_columns + .iter() + .zip(tag_arrays.iter()) + .filter(|(_, arr)| !arr.is_null(row)) + .map(|(tc, arr)| (tc.column_id, ValueRef::String(arr.value(row)))); + codec + .encode_to_vec(tags, &mut buffer) + .context(EncodePrimaryKeySnafu)?; + + pk_builder.append_value(&buffer); + } + + let pk_array = pk_builder.finish(); + + let mut fields = vec![Arc::new(Field::new( + PRIMARY_KEY_COLUMN_NAME, + DataType::Binary, + false, + ))]; + let mut 
columns: Vec> = vec![Arc::new(pk_array)]; + + for &idx in non_tag_column_indices { + fields.push(batch.schema().fields()[idx].clone()); + columns.push(batch.column(idx).clone()); + } + + let new_schema = Arc::new(ArrowSchema::new(fields)); + RecordBatch::try_new(new_schema, columns).map_err(|e| { + UnexpectedRequestSnafu { + reason: format!("Failed to build modified sparse RecordBatch: {e}"), + } + .build() + }) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; + use datatypes::arrow::array::{BinaryArray, Int64Array, StringArray}; + use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use datatypes::arrow::record_batch::RecordBatch; + use store_api::codec::PrimaryKeyEncoding; + use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME; + + use super::*; + use crate::row_modifier::{RowModifier, RowsIter, TableIdInput}; + + fn build_sparse_test_batch() -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("greptime_timestamp", DataType::Int64, false), + Field::new("greptime_value", DataType::Float64, true), + Field::new("namespace", DataType::Utf8, true), + Field::new("host", DataType::Utf8, true), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1000])), + Arc::new(datatypes::arrow::array::Float64Array::from(vec![42.0])), + Arc::new(StringArray::from(vec!["greptimedb"])), + Arc::new(StringArray::from(vec!["127.0.0.1"])), + ], + ) + .unwrap() + } + + fn sparse_tag_columns() -> Vec { + vec![ + TagColumnInfo { + name: "host".to_string(), + index: 3, + column_id: 3, + }, + TagColumnInfo { + name: "namespace".to_string(), + index: 2, + column_id: 2, + }, + ] + } + + #[test] + fn test_compute_tsid_basic() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("namespace", DataType::Utf8, true), + Field::new("host", DataType::Utf8, true), + 
])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec!["greptimedb"])), + Arc::new(StringArray::from(vec!["127.0.0.1"])), + ], + ) + .unwrap(); + + let tag_columns: Vec = vec![ + TagColumnInfo { + name: "host".to_string(), + index: 1, + column_id: 2, + }, + TagColumnInfo { + name: "namespace".to_string(), + index: 0, + column_id: 1, + }, + ]; + let tag_arrays = build_tag_arrays(&batch, &tag_columns); + let tsid_array = compute_tsid_array(&batch, &tag_columns, &tag_arrays); + + assert_eq!(tsid_array.value(0), 2721566936019240841); + } + + #[test] + fn test_compute_tsid_with_nulls() { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ])); + let batch_no_null = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["A"])), + Arc::new(StringArray::from(vec!["B"])), + ], + ) + .unwrap(); + let tag_cols_2: Vec = vec![ + TagColumnInfo { + name: "a".to_string(), + index: 0, + column_id: 1, + }, + TagColumnInfo { + name: "b".to_string(), + index: 1, + column_id: 2, + }, + ]; + let tag_arrays_2 = build_tag_arrays(&batch_no_null, &tag_cols_2); + let tsid_no_null = compute_tsid_array(&batch_no_null, &tag_cols_2, &tag_arrays_2); + + let schema3 = Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8, true), + ])); + let batch_with_null = RecordBatch::try_new( + schema3, + vec![ + Arc::new(StringArray::from(vec!["A"])), + Arc::new(StringArray::from(vec!["B"])), + Arc::new(StringArray::from(vec![None as Option<&str>])), + ], + ) + .unwrap(); + let tag_cols_3: Vec = vec![ + TagColumnInfo { + name: "a".to_string(), + index: 0, + column_id: 1, + }, + TagColumnInfo { + name: "b".to_string(), + index: 1, + column_id: 2, + }, + TagColumnInfo { + name: "c".to_string(), + index: 2, + column_id: 3, + }, + ]; + let tag_arrays_3 = 
build_tag_arrays(&batch_with_null, &tag_cols_3); + let tsid_with_null = compute_tsid_array(&batch_with_null, &tag_cols_3, &tag_arrays_3); + + assert_eq!(tsid_no_null.value(0), tsid_with_null.value(0)); + } + + #[test] + fn test_modify_batch_sparse() { + let batch = build_sparse_test_batch(); + let tag_columns = sparse_tag_columns(); + let non_tag_indices = vec![0, 1]; + let table_id: u32 = 1025; + + let modified = + modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap(); + + assert_eq!(modified.num_columns(), 3); + assert_eq!(modified.schema().field(0).name(), PRIMARY_KEY_COLUMN_NAME); + assert_eq!(modified.schema().field(1).name(), "greptime_timestamp"); + assert_eq!(modified.schema().field(2).name(), "greptime_value"); + } + + #[test] + fn test_modify_batch_sparse_matches_row_modifier() { + let batch = build_sparse_test_batch(); + let tag_columns = sparse_tag_columns(); + let non_tag_indices = vec![0, 1]; + let table_id: u32 = 1025; + let modified = + modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap(); + + let name_to_column_id: HashMap = [ + ("greptime_timestamp".to_string(), 0), + ("greptime_value".to_string(), 1), + ("namespace".to_string(), 2), + ("host".to_string(), 3), + ] + .into_iter() + .collect(); + + let rows = Rows { + schema: vec![ + ColumnSchema { + column_name: "greptime_timestamp".to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "greptime_value".to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "namespace".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + ..Default::default() + }, + ColumnSchema { + column_name: "host".to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as 
i32, + ..Default::default() + }, + ], + rows: vec![Row { + values: vec![ + Value { + value_data: Some(ValueData::TimestampMillisecondValue(1000)), + }, + Value { + value_data: Some(ValueData::F64Value(42.0)), + }, + Value { + value_data: Some(ValueData::StringValue("greptimedb".to_string())), + }, + Value { + value_data: Some(ValueData::StringValue("127.0.0.1".to_string())), + }, + ], + }], + }; + + let row_iter = RowsIter::new(rows, &name_to_column_id); + let rows = RowModifier::default() + .modify_rows( + row_iter, + TableIdInput::Single(table_id), + PrimaryKeyEncoding::Sparse, + ) + .unwrap(); + let ValueData::BinaryValue(expected_pk) = + rows.rows[0].values[0].value_data.clone().unwrap() + else { + panic!("expected binary primary key"); + }; + + let actual_array = modified + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(actual_array.value(0), expected_pk.as_slice()); + } +} diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index 7a1efedac4..ba90ca960d 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -13,6 +13,7 @@ // limitations under the License. mod alter; +mod bulk_insert; mod catchup; mod close; mod create; @@ -288,9 +289,8 @@ impl RegionEngine for MetricEngine { debug_assert_eq!(region_id, resp_region_id); return response; } - RegionRequest::BulkInserts(_) => { - // todo(hl): find a way to support bulk inserts in metric engine. 
- UnsupportedRegionRequestSnafu { request }.fail() + RegionRequest::BulkInserts(bulk) => { + self.inner.bulk_insert_region(region_id, bulk).await } }; diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs new file mode 100644 index 0000000000..2a3c26c80c --- /dev/null +++ b/src/metric-engine/src/engine/bulk_insert.rs @@ -0,0 +1,783 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashSet; + +use api::v1::{ArrowIpc, ColumnDataType, SemanticType}; +use bytes::Bytes; +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_grpc::flight::{FlightEncoder, FlightMessage}; +use common_query::prelude::{greptime_timestamp, greptime_value}; +use datatypes::arrow::array::{Array, Float64Array, StringArray, TimestampMillisecondArray}; +use datatypes::arrow::record_batch::RecordBatch; +use snafu::{OptionExt, ensure}; +use store_api::codec::PrimaryKeyEncoding; +use store_api::metadata::RegionMetadataRef; +use store_api::region_request::{ + AffectedRows, RegionBulkInsertsRequest, RegionPutRequest, RegionRequest, +}; +use store_api::storage::RegionId; + +use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse}; +use crate::engine::MetricEngineInner; +use crate::error; +use crate::error::Result; + +impl MetricEngineInner { + /// Bulk-inserts logical rows into a metric region. 
+ /// + /// This method accepts a `RegionBulkInsertsRequest` whose payload is a logical + /// `RecordBatch` (timestamp, value and tag columns) for the given logical `region_id`. + /// + /// The transformed batch is encoded to Arrow IPC and forwarded as a `BulkInserts` + /// request to the data region, along with the original `partition_expr_version`. + /// If the data region reports `StatusCode::Unsupported` for bulk inserts, the request + /// is transparently retried as a `Put` by converting the original logical batch into + /// `api::v1::Rows`, so callers observe the same semantics as `put_region`. + /// + /// Returns the number of affected rows, or `0` if the input batch is empty. + pub async fn bulk_insert_region( + &self, + region_id: RegionId, + request: RegionBulkInsertsRequest, + ) -> Result { + ensure!( + !self.is_physical_region(region_id), + error::UnsupportedRegionRequestSnafu { + request: RegionRequest::BulkInserts(request), + } + ); + + let (physical_region_id, data_region_id, primary_key_encoding) = + self.find_data_region_meta(region_id)?; + + if primary_key_encoding != PrimaryKeyEncoding::Sparse { + return error::UnsupportedRegionRequestSnafu { + request: RegionRequest::BulkInserts(request), + } + .fail(); + } + + let batch = request.payload; + if batch.num_rows() == 0 { + return Ok(0); + } + + let logical_metadata = self + .logical_region_metadata(physical_region_id, region_id) + .await?; + let (tag_columns, non_tag_indices) = self.resolve_tag_columns_from_metadata( + region_id, + data_region_id, + &batch, + &logical_metadata, + )?; + let modified_batch = modify_batch_sparse( + batch.clone(), + region_id.table_id(), + &tag_columns, + &non_tag_indices, + )?; + let (schema, data_header, payload) = record_batch_to_ipc(&modified_batch)?; + + let partition_expr_version = request.partition_expr_version; + let request = RegionBulkInsertsRequest { + region_id: data_region_id, + payload: modified_batch, + raw_data: ArrowIpc { + schema, + data_header, + 
payload, + }, + partition_expr_version, + }; + match self + .data_region + .write_data(data_region_id, RegionRequest::BulkInserts(request)) + .await + { + Ok(affected_rows) => Ok(affected_rows), + Err(err) if err.status_code() == StatusCode::Unsupported => { + // todo(hl): fallback path for PartitionTreeMemtable, remove this once we remove it + let rows = record_batch_to_rows(&batch, region_id)?; + self.put_region( + region_id, + RegionPutRequest { + rows, + hint: None, + partition_expr_version, + }, + ) + .await + } + Err(err) => Err(err), + } + } + + fn resolve_tag_columns_from_metadata( + &self, + logical_region_id: RegionId, + data_region_id: RegionId, + batch: &RecordBatch, + logical_metadata: &RegionMetadataRef, + ) -> Result<(Vec, Vec)> { + let tag_names: HashSet<&str> = logical_metadata + .column_metadatas + .iter() + .filter_map(|column| { + if column.semantic_type == SemanticType::Tag { + Some(column.column_schema.name.as_str()) + } else { + None + } + }) + .collect(); + + let mut tag_columns = Vec::new(); + let mut non_tag_indices = Vec::new(); + { + let state = self.state.read().unwrap(); + let physical_columns = state + .physical_region_states() + .get(&data_region_id) + .context(error::PhysicalRegionNotFoundSnafu { + region_id: data_region_id, + })? 
+ .physical_columns(); + + for (index, field) in batch.schema().fields().iter().enumerate() { + let name = field.name(); + let column_id = + *physical_columns + .get(name) + .with_context(|| error::ColumnNotFoundSnafu { + name: name.clone(), + region_id: logical_region_id, + })?; + if tag_names.contains(name.as_str()) { + tag_columns.push(TagColumnInfo { + name: name.clone(), + index, + column_id, + }); + } else { + non_tag_indices.push(index); + } + } + } + + tag_columns.sort_by(|a, b| a.name.cmp(&b.name)); + Ok((tag_columns, non_tag_indices)) + } +} + +fn record_batch_to_rows(batch: &RecordBatch, logical_region_id: RegionId) -> Result { + let schema_ref = batch.schema(); + let fields = schema_ref.fields(); + + let mut ts_idx = None; + let mut val_idx = None; + let mut tag_indices = Vec::new(); + + for (idx, field) in fields.iter().enumerate() { + if field.name() == greptime_timestamp() { + ts_idx = Some(idx); + if !matches!( + field.data_type(), + datatypes::arrow::datatypes::DataType::Timestamp( + datatypes::arrow::datatypes::TimeUnit::Millisecond, + _ + ) + ) { + return error::UnexpectedRequestSnafu { + reason: format!( + "Timestamp column '{}' in region {:?} has incompatible type: {:?}", + field.name(), + logical_region_id, + field.data_type() + ), + } + .fail(); + } + } else if field.name() == greptime_value() { + val_idx = Some(idx); + if !matches!( + field.data_type(), + datatypes::arrow::datatypes::DataType::Float64 + ) { + return error::UnexpectedRequestSnafu { + reason: format!( + "Value column '{}' in region {:?} has incompatible type: {:?}", + field.name(), + logical_region_id, + field.data_type() + ), + } + .fail(); + } + } else { + if !matches!( + field.data_type(), + datatypes::arrow::datatypes::DataType::Utf8 + ) { + return error::UnexpectedRequestSnafu { + reason: format!( + "Tag column '{}' in region {:?} must be Utf8, found: {:?}", + field.name(), + logical_region_id, + field.data_type() + ), + } + .fail(); + } + tag_indices.push(idx); + } + } + 
+ let ts_idx = ts_idx.with_context(|| error::UnexpectedRequestSnafu { + reason: format!( + "Timestamp column '{}' not found in RecordBatch for region {:?}", + greptime_timestamp(), + logical_region_id + ), + })?; + let val_idx = val_idx.with_context(|| error::UnexpectedRequestSnafu { + reason: format!( + "Value column '{}' not found in RecordBatch for region {:?}", + greptime_value(), + logical_region_id + ), + })?; + + let mut schema = Vec::with_capacity(2 + tag_indices.len()); + schema.push(api::v1::ColumnSchema { + column_name: greptime_timestamp().to_string(), + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Timestamp as i32, + datatype_extension: None, + options: None, + }); + schema.push(api::v1::ColumnSchema { + column_name: greptime_value().to_string(), + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }); + for &idx in &tag_indices { + let field = &fields[idx]; + schema.push(api::v1::ColumnSchema { + column_name: field.name().clone(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as i32, + datatype_extension: None, + options: None, + }); + } + + let ts_array = batch + .column(ts_idx) + .as_any() + .downcast_ref::() + .expect("validated as TimestampMillisecond"); + let val_array = batch + .column(val_idx) + .as_any() + .downcast_ref::() + .expect("validated as Float64"); + let tag_arrays: Vec<&StringArray> = tag_indices + .iter() + .map(|&idx| { + batch + .column(idx) + .as_any() + .downcast_ref::() + .expect("validated as Utf8") + }) + .collect(); + + let num_rows = batch.num_rows(); + let mut rows = Vec::with_capacity(num_rows); + for row_idx in 0..num_rows { + let mut values = Vec::with_capacity(2 + tag_arrays.len()); + + if ts_array.is_null(row_idx) { + values.push(api::v1::Value { value_data: None }); + } else { + values.push(api::v1::Value { + value_data: 
Some(api::v1::value::ValueData::TimestampMillisecondValue( + ts_array.value(row_idx), + )), + }); + } + + if val_array.is_null(row_idx) { + values.push(api::v1::Value { value_data: None }); + } else { + values.push(api::v1::Value { + value_data: Some(api::v1::value::ValueData::F64Value( + val_array.value(row_idx), + )), + }); + } + + for arr in &tag_arrays { + if arr.is_null(row_idx) { + values.push(api::v1::Value { value_data: None }); + } else { + values.push(api::v1::Value { + value_data: Some(api::v1::value::ValueData::StringValue( + arr.value(row_idx).to_string(), + )), + }); + } + } + + rows.push(api::v1::Row { values }); + } + + Ok(api::v1::Rows { schema, rows }) +} + +fn record_batch_to_ipc(record_batch: &RecordBatch) -> Result<(Bytes, Bytes, Bytes)> { + let mut encoder = FlightEncoder::default(); + let schema = encoder.encode_schema(record_batch.schema().as_ref()); + let mut iter = encoder + .encode(FlightMessage::RecordBatch(record_batch.clone())) + .into_iter(); + + let Some(flight_data) = iter.next() else { + return error::UnexpectedRequestSnafu { + reason: "Failed to encode empty flight data", + } + .fail(); + }; + ensure!( + iter.next().is_none(), + error::UnexpectedRequestSnafu { + reason: "Bulk insert RecordBatch with dictionary arrays is unsupported".to_string(), + } + ); + + Ok(( + schema.data_header, + flight_data.data_header, + flight_data.data_body, + )) +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::sync::Arc; + + use api::v1::ArrowIpc; + use common_error::ext::ErrorExt; + use common_query::prelude::{greptime_timestamp, greptime_value}; + use common_recordbatch::RecordBatches; + use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; + use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use datatypes::arrow::record_batch::RecordBatch; + use store_api::metric_engine_consts::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING; + use 
store_api::path_utils::table_dir; + use store_api::region_engine::RegionEngine; + use store_api::region_request::{RegionBulkInsertsRequest, RegionPutRequest, RegionRequest}; + use store_api::storage::{RegionId, ScanRequest}; + + use super::record_batch_to_ipc; + use crate::error::Error; + use crate::test_util::{self, TestEnv}; + + fn build_logical_batch(start: usize, rows: usize) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("job", DataType::Utf8, true), + ])); + + let mut ts = Vec::with_capacity(rows); + let mut values = Vec::with_capacity(rows); + let mut tags = Vec::with_capacity(rows); + for i in start..start + rows { + ts.push(i as i64); + values.push(i as f64); + tags.push("tag_0".to_string()); + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(ts)), + Arc::new(Float64Array::from(values)), + Arc::new(StringArray::from(tags)), + ], + ) + .unwrap() + } + + fn build_bulk_request(logical_region_id: RegionId, batch: RecordBatch) -> RegionRequest { + let (schema, data_header, payload) = record_batch_to_ipc(&batch).unwrap(); + RegionRequest::BulkInserts(RegionBulkInsertsRequest { + region_id: logical_region_id, + payload: batch, + raw_data: ArrowIpc { + schema, + data_header, + payload, + }, + partition_expr_version: None, + }) + } + + async fn init_dense_metric_region(env: &TestEnv) -> RegionId { + let physical_region_id = env.default_physical_region_id(); + env.create_physical_region( + physical_region_id, + &TestEnv::default_table_dir(), + vec![( + MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING.to_string(), + "dense".to_string(), + )], + ) + .await; + + let logical_region_id = env.default_logical_region_id(); + let request = test_util::create_logical_region_request( + &["job"], + physical_region_id, + &table_dir("test", 
logical_region_id.table_id()), + ); + env.metric() + .handle_request(logical_region_id, RegionRequest::Create(request)) + .await + .unwrap(); + logical_region_id + } + + #[tokio::test] + async fn test_bulk_insert_empty_batch_returns_zero() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let batch = build_logical_batch(0, 0); + let request = RegionRequest::BulkInserts(RegionBulkInsertsRequest { + region_id: logical_region_id, + payload: batch, + raw_data: ArrowIpc::default(), + partition_expr_version: None, + }); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 0); + } + + #[tokio::test] + async fn test_bulk_insert_physical_region_rejected() { + let env = TestEnv::new().await; + env.init_metric_region().await; + + let physical_region_id = env.default_physical_region_id(); + let batch = build_logical_batch(0, 2); + let request = build_bulk_request(physical_region_id, batch); + + let err = env + .metric() + .handle_request(physical_region_id, request) + .await + .unwrap_err(); + let Some(err) = err.as_any().downcast_ref::() else { + panic!("unexpected error type"); + }; + assert_matches!(err, Error::UnsupportedRegionRequest { .. 
}); + } + + #[tokio::test] + async fn test_bulk_insert_unknown_column_errors() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("nonexistent_column", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(vec![0i64])), + Arc::new(Float64Array::from(vec![1.0])), + Arc::new(StringArray::from(vec!["val"])), + ], + ) + .unwrap(); + + let request = build_bulk_request(logical_region_id, batch); + let err = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap_err(); + let Some(err) = err.as_any().downcast_ref::() else { + panic!("unexpected error type"); + }; + assert_matches!(err, Error::ColumnNotFound { .. 
}); + } + + #[tokio::test] + async fn test_bulk_insert_multiple_tag_columns() { + let env = TestEnv::new().await; + let physical_region_id = env.default_physical_region_id(); + env.create_physical_region(physical_region_id, &TestEnv::default_table_dir(), vec![]) + .await; + let logical_region_id = env.default_logical_region_id(); + let request = test_util::create_logical_region_request( + &["host", "region"], + physical_region_id, + &table_dir("test", logical_region_id.table_id()), + ); + env.metric() + .handle_request(logical_region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("host", DataType::Utf8, true), + Field::new("region", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(TimestampMillisecondArray::from(vec![0i64, 1, 2])), + Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0])), + Arc::new(StringArray::from(vec!["h1", "h2", "h1"])), + Arc::new(StringArray::from(vec!["us-east", "us-west", "eu-west"])), + ], + ) + .unwrap(); + + let request = build_bulk_request(logical_region_id, batch); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 3); + + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 3); + } + + #[tokio::test] + async fn test_bulk_insert_accumulates_rows() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 3)); + let response = env + .metric() + 
.handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 3); + + let request = build_bulk_request(logical_region_id, build_logical_batch(3, 5)); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 5); + + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 8); + } + + #[tokio::test] + async fn test_bulk_insert_sparse_encoding() { + let env = TestEnv::new().await; + env.init_metric_region().await; + let logical_region_id = env.default_logical_region_id(); + + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 4)); + let response = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + assert_eq!(response.affected_rows, 4); + + let stream = env + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 4); + } + + #[tokio::test] + async fn test_bulk_insert_dense_encoding_rejected() { + let env = TestEnv::new().await; + let logical_region_id = init_dense_metric_region(&env).await; + + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 2)); + let err = env + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap_err(); + let Some(err) = err.as_any().downcast_ref::() else { + panic!("unexpected error type"); + }; + assert_matches!(err, Error::UnsupportedRegionRequest { .. 
}); + } + + #[tokio::test] + async fn test_bulk_insert_matches_put() { + let env_put = TestEnv::new().await; + env_put.init_metric_region().await; + let logical_region_id = env_put.default_logical_region_id(); + let schema = test_util::row_schema_with_tags(&["job"]); + let rows = test_util::build_rows(1, 5); + env_put + .metric() + .handle_request( + logical_region_id, + RegionRequest::Put(RegionPutRequest { + rows: api::v1::Rows { schema, rows }, + hint: None, + partition_expr_version: None, + }), + ) + .await + .unwrap(); + let put_stream = env_put + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let put_batches = RecordBatches::try_collect(put_stream).await.unwrap(); + let put_output = put_batches.pretty_print().unwrap(); + + let env_bulk = TestEnv::new().await; + env_bulk.init_metric_region().await; + let request = build_bulk_request(logical_region_id, build_logical_batch(0, 5)); + env_bulk + .metric() + .handle_request(logical_region_id, request) + .await + .unwrap(); + let bulk_stream = env_bulk + .metric() + .scan_to_stream(logical_region_id, ScanRequest::default()) + .await + .unwrap(); + let bulk_batches = RecordBatches::try_collect(bulk_stream).await.unwrap(); + let bulk_output = bulk_batches.pretty_print().unwrap(); + + assert_eq!(put_output, bulk_output); + } + + #[test] + fn test_record_batch_to_rows_with_null_values() { + use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray}; + use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use datatypes::arrow::record_batch::RecordBatch; + use store_api::storage::RegionId; + + use crate::engine::bulk_insert::record_batch_to_rows; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + greptime_timestamp(), + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new(greptime_value(), DataType::Float64, true), + Field::new("job", DataType::Utf8, true), + Field::new("host", 
DataType::Utf8, true), + ])); + + let ts_array = TimestampMillisecondArray::from(vec![Some(1000), None, Some(3000)]); + let val_array = Float64Array::from(vec![Some(1.0), Some(2.0), None]); + let job_array = StringArray::from(vec![Some("job1"), None, Some("job3")]); + let host_array = StringArray::from(vec![None, Some("host2"), Some("host3")]); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(ts_array), + Arc::new(val_array), + Arc::new(job_array), + Arc::new(host_array), + ], + ) + .unwrap(); + + let region_id = RegionId::new(1, 1); + let rows = record_batch_to_rows(&batch, region_id).unwrap(); + + assert_eq!(rows.rows.len(), 3); + assert_eq!(rows.schema.len(), 4); + + // Row 0: all non-null except host + assert!(rows.rows[0].values[0].value_data.is_some()); + assert!(rows.rows[0].values[1].value_data.is_some()); + assert!(rows.rows[0].values[2].value_data.is_some()); + assert!(rows.rows[0].values[3].value_data.is_none()); + + // Row 1: null timestamp, null job + assert!(rows.rows[1].values[0].value_data.is_none()); + assert!(rows.rows[1].values[1].value_data.is_some()); + assert!(rows.rows[1].values[2].value_data.is_none()); + assert!(rows.rows[1].values[3].value_data.is_some()); + + // Row 2: null value + assert!(rows.rows[2].values[0].value_data.is_some()); + assert!(rows.rows[2].values[1].value_data.is_none()); + assert!(rows.rows[2].values[2].value_data.is_some()); + assert!(rows.rows[2].values[3].value_data.is_some()); + } +} diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs index 9251605aea..edae0d2bb4 100644 --- a/src/metric-engine/src/engine/put.rs +++ b/src/metric-engine/src/engine/put.rs @@ -460,7 +460,7 @@ impl MetricEngineInner { .await } - fn find_data_region_meta( + pub(crate) fn find_data_region_meta( &self, logical_region_id: RegionId, ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> { diff --git a/src/metric-engine/src/lib.rs b/src/metric-engine/src/lib.rs index 30daa80b91..b93029f2f4 
100644 --- a/src/metric-engine/src/lib.rs +++ b/src/metric-engine/src/lib.rs @@ -52,6 +52,7 @@ #![feature(assert_matches)] +mod batch_modifier; pub mod config; mod data_region; pub mod engine; diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 1d7cf7b6d7..a78bf079b0 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -108,6 +108,11 @@ name = "memtable_bench" harness = false required-features = ["test"] +[[bench]] +name = "bench_cache_stream" +harness = false +required-features = ["test"] + [[bench]] name = "bench_filter_time_partition" harness = false diff --git a/src/mito2/benches/bench_cache_stream.rs b/src/mito2/benches/bench_cache_stream.rs new file mode 100644 index 0000000000..f2314f2ccb --- /dev/null +++ b/src/mito2/benches/bench_cache_stream.rs @@ -0,0 +1,126 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Benchmarks for `cache_flat_range_stream` overhead. +//! +//! Compares consuming batches from a plain stream vs through the caching wrapper +//! that clones batches for the range cache. +//! +//! Run with: +//! ```sh +//! cargo bench -p mito2 --features test --bench bench_cache_stream +//! 
``` + +use std::collections::VecDeque; +use std::sync::Arc; + +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::TryStreamExt; +use mito_codec::row_converter::DensePrimaryKeyCodec; +use mito2::memtable::bulk::context::BulkIterContext; +use mito2::memtable::bulk::part::{BulkPartConverter, BulkPartEncoder}; +use mito2::memtable::bulk::part_reader::EncodedBulkPartIter; +use mito2::read::range_cache::bench_cache_flat_range_stream; +use mito2::sst::parquet::DEFAULT_ROW_GROUP_SIZE; +use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; +use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata}; + +fn cache_flat_range_stream_bench(c: &mut Criterion) { + let metadata = Arc::new(cpu_metadata()); + let region_id = metadata.region_id; + let start_sec = 1710043200; + // 2000 hosts × 51 steps = 102,000 rows ≈ DEFAULT_ROW_GROUP_SIZE + let num_hosts = 2000; + let end_sec = start_sec + 510; + let generator = CpuDataGenerator::new(metadata.clone(), num_hosts, start_sec, end_sec); + + // Build a BulkPart from all the generated data + let schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); + let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata)); + + let mut converter = BulkPartConverter::new( + &metadata, + schema, + DEFAULT_ROW_GROUP_SIZE, + codec, + true, // store_pk_columns + ); + for kvs in generator.iter() { + converter.append_key_values(&kvs).unwrap(); + } + let bulk_part = converter.convert().unwrap(); + + // Encode to parquet + let encoder = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE).unwrap(); + let encoded_part = encoder.encode_part(&bulk_part).unwrap().unwrap(); + + // Decode all record batches + let num_row_groups = encoded_part.metadata().parquet_metadata.num_row_groups(); + let context = Arc::new( + BulkIterContext::new( + metadata.clone(), + None, // No projection + None, // No predicate + false, + ) + .unwrap(), + ); + let row_groups: VecDeque = (0..num_row_groups).collect(); + 
+ let rt = tokio::runtime::Runtime::new().unwrap(); + + let mut group = c.benchmark_group("cache_flat_range_stream"); + group.sample_size(10); + + group.bench_function("baseline_iter_stream", |b| { + b.iter(|| { + rt.block_on(async { + let iter = EncodedBulkPartIter::try_new( + &encoded_part, + context.clone(), + row_groups.clone(), + None, + None, + ) + .unwrap(); + let stream: mito2::read::BoxedRecordBatchStream = + Box::pin(futures::stream::iter(iter)); + let mut stream = stream; + while let Some(_batch) = stream.try_next().await.unwrap() {} + }); + }); + }); + + group.bench_function("cache_flat_range_stream", |b| { + b.iter(|| { + rt.block_on(async { + let iter = EncodedBulkPartIter::try_new( + &encoded_part, + context.clone(), + row_groups.clone(), + None, + None, + ) + .unwrap(); + let stream: mito2::read::BoxedRecordBatchStream = + Box::pin(futures::stream::iter(iter)); + let mut stream = bench_cache_flat_range_stream(stream, 64 * 1024 * 1024, region_id); + while let Some(_batch) = stream.try_next().await.unwrap() {} + }); + }); + }); +} + +criterion_group!(benches, cache_flat_range_stream_bench); +criterion_main!(benches); diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs index ebe994f861..8336625e3c 100644 --- a/src/mito2/benches/memtable_bench.rs +++ b/src/mito2/benches/memtable_bench.rs @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. +//! Benchmarks for memtable operations: writes, full scans, filtered scans, +//! bulk part conversion, record batch iteration with filters, and flat merge. +//! +//! Run with: +//! ```sh +//! cargo bench -p mito2 --features test --bench memtable_bench +//! 
``` + use std::sync::Arc; -use api::v1::value::ValueData; -use api::v1::{Row, Rows, SemanticType}; use criterion::{Criterion, criterion_group, criterion_main}; -use datafusion_common::Column; -use datafusion_expr::{Expr, lit}; -use datatypes::data_type::ConcreteDataType; -use datatypes::schema::ColumnSchema; use mito_codec::row_converter::DensePrimaryKeyCodec; use mito2::memtable::bulk::context::BulkIterContext; use mito2::memtable::bulk::part::BulkPartConverter; @@ -28,20 +30,13 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter; use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig}; use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable}; use mito2::memtable::time_series::TimeSeriesMemtable; -use mito2::memtable::{KeyValues, Memtable, RangesOptions}; +use mito2::memtable::{IterBuilder, Memtable, RangesOptions}; use mito2::read::flat_merge::FlatMergeIterator; use mito2::read::scan_region::PredicateGroup; use mito2::region::options::MergeMode; use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; -use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema}; -use rand::Rng; -use rand::rngs::ThreadRng; -use rand::seq::IndexedRandom; -use store_api::metadata::{ - ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, -}; -use store_api::storage::RegionId; -use table::predicate::Predicate; +use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata}; +use mito2::test_util::memtable_util; /// Writes rows. 
fn write_rows(c: &mut Criterion) { @@ -105,7 +100,11 @@ fn full_scan(c: &mut Criterion) { } b.iter(|| { - let iter = memtable.iter(None, None, None).unwrap(); + let iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); for batch in iter { let _batch = batch.unwrap(); } @@ -145,7 +144,17 @@ fn filter_1_host(c: &mut Criterion) { let predicate = generator.random_host_filter(); b.iter(|| { - let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap(); + let iter = memtable + .ranges( + None, + RangesOptions { + predicate: PredicateGroup::new(&metadata, predicate.exprs()).unwrap(), + ..Default::default() + }, + ) + .unwrap() + .build(None) + .unwrap(); for batch in iter { let _batch = batch.unwrap(); } @@ -202,224 +211,6 @@ fn filter_1_host(c: &mut Criterion) { }); } -struct Host { - hostname: String, - region: String, - datacenter: String, - rack: String, - os: String, - arch: String, - team: String, - service: String, - service_version: String, - service_environment: String, -} - -impl Host { - fn random_with_id(id: usize) -> Host { - let mut rng = rand::rng(); - let region = format!("ap-southeast-{}", rng.random_range(0..10)); - let datacenter = format!( - "{}{}", - region, - ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap() - ); - Host { - hostname: format!("host_{id}"), - region, - datacenter, - rack: rng.random_range(0..100).to_string(), - os: "Ubuntu16.04LTS".to_string(), - arch: "x86".to_string(), - team: "CHI".to_string(), - service: rng.random_range(0..100).to_string(), - service_version: rng.random_range(0..10).to_string(), - service_environment: "test".to_string(), - } - } - - fn fill_values(&self, values: &mut Vec) { - let tags = [ - api::v1::Value { - value_data: Some(ValueData::StringValue(self.hostname.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.region.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.datacenter.clone())), - }, - 
api::v1::Value { - value_data: Some(ValueData::StringValue(self.rack.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.os.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.arch.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.team.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.service.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.service_version.clone())), - }, - api::v1::Value { - value_data: Some(ValueData::StringValue(self.service_environment.clone())), - }, - ]; - for tag in tags { - values.push(tag); - } - } -} - -struct CpuDataGenerator { - metadata: RegionMetadataRef, - column_schemas: Vec, - hosts: Vec, - start_sec: i64, - end_sec: i64, -} - -impl CpuDataGenerator { - fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self { - let column_schemas = region_metadata_to_row_schema(&metadata); - Self { - metadata, - column_schemas, - hosts: Self::generate_hosts(num_hosts), - start_sec, - end_sec, - } - } - - fn iter(&self) -> impl Iterator + '_ { - // point per 10s. 
- (self.start_sec..self.end_sec) - .step_by(10) - .enumerate() - .map(|(seq, ts)| self.build_key_values(seq, ts)) - } - - fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues { - let rows = self - .hosts - .iter() - .map(|host| { - let mut rng = rand::rng(); - let mut values = Vec::with_capacity(21); - values.push(api::v1::Value { - value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)), - }); - host.fill_values(&mut values); - for _ in 0..10 { - values.push(api::v1::Value { - value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))), - }); - } - Row { values } - }) - .collect(); - let mutation = api::v1::Mutation { - op_type: api::v1::OpType::Put as i32, - sequence: seq as u64, - rows: Some(Rows { - schema: self.column_schemas.clone(), - rows, - }), - write_hint: None, - }; - - KeyValues::new(&self.metadata, mutation).unwrap() - } - - fn random_host_filter(&self) -> Predicate { - let host = self.random_hostname(); - let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host)); - Predicate::new(vec![expr]) - } - - fn random_host_filter_exprs(&self) -> Vec { - let host = self.random_hostname(); - vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))] - } - - fn random_hostname(&self) -> String { - let mut rng = rand::rng(); - self.hosts.choose(&mut rng).unwrap().hostname.clone() - } - - fn random_f64(rng: &mut ThreadRng) -> f64 { - let base: u32 = rng.random_range(30..95); - base as f64 - } - - fn generate_hosts(num_hosts: usize) -> Vec { - (0..num_hosts).map(Host::random_with_id).collect() - } -} - -/// Creates a metadata for TSBS cpu-like table. 
-fn cpu_metadata() -> RegionMetadata { - let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); - builder.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - semantic_type: SemanticType::Timestamp, - column_id: 0, - }); - let mut column_id = 1; - let tags = [ - "hostname", - "region", - "datacenter", - "rack", - "os", - "arch", - "team", - "service", - "service_version", - "service_environment", - ]; - for tag in tags { - builder.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true), - semantic_type: SemanticType::Tag, - column_id, - }); - column_id += 1; - } - let fields = [ - "usage_user", - "usage_system", - "usage_idle", - "usage_nice", - "usage_iowait", - "usage_irq", - "usage_softirq", - "usage_steal", - "usage_guest", - "usage_guest_nice", - ]; - for field in fields { - builder.push_column_metadata(ColumnMetadata { - column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true), - semantic_type: SemanticType::Field, - column_id, - }); - column_id += 1; - } - builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - builder.build().unwrap() -} - fn bulk_part_converter(c: &mut Criterion) { let metadata = Arc::new(cpu_metadata()); let start_sec = 1710043200; diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs index 0277397768..05035734de 100644 --- a/src/mito2/benches/simple_bulk_memtable.rs +++ b/src/mito2/benches/simple_bulk_memtable.rs @@ -21,7 +21,7 @@ use criterion::{Criterion, criterion_group, criterion_main}; use datatypes::data_type::ConcreteDataType; use datatypes::schema::ColumnSchema; use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable; -use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions}; +use mito2::memtable::{IterBuilder, KeyValues, Memtable, MemtableRanges, 
RangesOptions}; use mito2::read; use mito2::read::Source; use mito2::read::dedup::DedupReader; @@ -156,7 +156,11 @@ async fn flush(mem: &SimpleBulkMemtable) { } async fn flush_original(mem: &SimpleBulkMemtable) { - let iter = mem.iter(None, None, None).unwrap(); + let iter = mem + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); for b in iter { black_box(b.unwrap()); } diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index 92c8a3bc36..33180ebf46 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -17,7 +17,6 @@ use std::time::{Duration, Instant}; use async_stream::try_stream; use common_time::Timestamp; -use either::Either; use futures::{Stream, TryStreamExt}; use object_store::services::Fs; use object_store::util::{join_dir, with_instrument_layers}; @@ -37,7 +36,7 @@ use crate::error::{ CleanDirSnafu, DeleteIndexSnafu, DeleteIndexesSnafu, DeleteSstsSnafu, OpenDalSnafu, Result, }; use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED}; -use crate::read::{FlatSource, Source}; +use crate::read::FlatSource; use crate::region::options::IndexOptions; use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId}; use crate::sst::index::IndexerBuilderImpl; @@ -47,7 +46,7 @@ use crate::sst::location::{self, region_dir_from_table_dir}; use crate::sst::parquet::reader::ParquetReaderBuilder; use crate::sst::parquet::writer::ParquetWriter; use crate::sst::parquet::{SstInfo, WriteOptions}; -use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY}; +use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FormatType}; pub type AccessLayerRef = Arc; /// SST write results. 
@@ -339,6 +338,7 @@ impl AccessLayer { metrics: &mut Metrics, ) -> Result { let region_id = request.metadata.region_id; + let region_metadata = request.metadata.clone(); let cache_manager = request.cache_manager.clone(); let sst_info = if let Some(write_cache) = cache_manager.write_cache() { @@ -391,15 +391,19 @@ impl AccessLayer { ) .await .with_file_cleaner(cleaner); - match request.source { - Either::Left(source) => { + match request.sst_write_format { + FormatType::PrimaryKey => { writer - .write_all(source, request.max_sequence, write_opts) + .write_all_flat_as_primary_key( + request.source, + request.max_sequence, + write_opts, + ) .await? } - Either::Right(flat_source) => { + FormatType::Flat => { writer - .write_all_flat(flat_source, request.max_sequence, write_opts) + .write_all_flat(request.source, request.max_sequence, write_opts) .await? } } @@ -412,6 +416,7 @@ impl AccessLayer { cache_manager.put_parquet_meta_data( RegionFileId::new(region_id, sst.file_id), parquet_metadata.clone(), + Some(region_metadata.clone()), ) } } @@ -520,11 +525,12 @@ pub enum OperationType { pub struct SstWriteRequest { pub op_type: OperationType, pub metadata: RegionMetadataRef, - pub source: Either, + pub source: FlatSource, pub cache_manager: CacheManagerRef, #[allow(dead_code)] pub storage: Option, pub max_sequence: Option, + pub sst_write_format: FormatType, /// Configs for index pub index_options: IndexOptions, diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index 3ad71d2a61..35db74eee6 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -28,6 +28,7 @@ use std::ops::Range; use std::sync::Arc; use bytes::Bytes; +use common_telemetry::warn; use datatypes::arrow::record_batch::RecordBatch; use datatypes::value::Value; use datatypes::vectors::VectorRef; @@ -36,8 +37,10 @@ use index::result_cache::IndexResultCache; use moka::notification::RemovalCause; use moka::sync::Cache; use object_store::ObjectStore; -use 
parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; +use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData}; use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef}; +use snafu::{OptionExt, ResultExt}; +use store_api::metadata::RegionMetadataRef; use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector}; use crate::cache::cache_size::parquet_meta_size; @@ -46,10 +49,13 @@ use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCache #[cfg(feature = "vector_index")] use crate::cache::index::vector_index::{VectorIndexCache, VectorIndexCacheRef}; use crate::cache::write_cache::WriteCacheRef; +use crate::error::{InvalidMetadataSnafu, InvalidParquetSnafu, Result}; use crate::memtable::record_batch_estimated_size; use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS}; use crate::read::Batch; +use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue}; use crate::sst::file::{RegionFileId, RegionIndexId}; +use crate::sst::parquet::PARQUET_METADATA_KEY; use crate::sst::parquet::reader::MetadataCacheMetrics; /// Metrics type key for sst meta. @@ -64,6 +70,108 @@ const FILE_TYPE: &str = "file"; const INDEX_TYPE: &str = "index"; /// Metrics type key for selector result cache. const SELECTOR_RESULT_TYPE: &str = "selector_result"; +/// Metrics type key for range scan result cache. +const RANGE_RESULT_TYPE: &str = "range_result"; + +/// Cached SST metadata combines the parquet footer with the decoded region metadata. +/// +/// The cached parquet footer strips the `greptime:metadata` JSON payload and stores the decoded +/// [RegionMetadata] separately so readers can skip repeated deserialization work. 
+#[derive(Debug)] +pub(crate) struct CachedSstMeta { + parquet_metadata: Arc, + region_metadata: RegionMetadataRef, + region_metadata_weight: usize, +} + +impl CachedSstMeta { + pub(crate) fn try_new(file_path: &str, parquet_metadata: ParquetMetaData) -> Result { + Self::try_new_with_region_metadata(file_path, parquet_metadata, None) + } + + pub(crate) fn try_new_with_region_metadata( + file_path: &str, + parquet_metadata: ParquetMetaData, + region_metadata: Option, + ) -> Result { + let file_metadata = parquet_metadata.file_metadata(); + let key_values = file_metadata + .key_value_metadata() + .context(InvalidParquetSnafu { + file: file_path, + reason: "missing key value meta", + })?; + let meta_value = key_values + .iter() + .find(|kv| kv.key == PARQUET_METADATA_KEY) + .with_context(|| InvalidParquetSnafu { + file: file_path, + reason: format!("key {} not found", PARQUET_METADATA_KEY), + })?; + let json = meta_value + .value + .as_ref() + .with_context(|| InvalidParquetSnafu { + file: file_path, + reason: format!("No value for key {}", PARQUET_METADATA_KEY), + })?; + let region_metadata = match region_metadata { + Some(region_metadata) => region_metadata, + None => Arc::new( + store_api::metadata::RegionMetadata::from_json(json) + .context(InvalidMetadataSnafu)?, + ), + }; + // Keep the previous JSON-byte floor and charge the decoded structures as well. 
+ let region_metadata_weight = region_metadata.estimated_size().max(json.len()); + let parquet_metadata = Arc::new(strip_region_metadata_from_parquet(parquet_metadata)); + + Ok(Self { + parquet_metadata, + region_metadata, + region_metadata_weight, + }) + } + + pub(crate) fn parquet_metadata(&self) -> Arc { + self.parquet_metadata.clone() + } + + pub(crate) fn region_metadata(&self) -> RegionMetadataRef { + self.region_metadata.clone() + } +} + +fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> ParquetMetaData { + let file_metadata = parquet_metadata.file_metadata(); + let filtered_key_values = file_metadata.key_value_metadata().and_then(|key_values| { + let filtered = key_values + .iter() + .filter(|kv| kv.key != PARQUET_METADATA_KEY) + .cloned() + .collect::>(); + (!filtered.is_empty()).then_some(filtered) + }); + let stripped_file_metadata = FileMetaData::new( + file_metadata.version(), + file_metadata.num_rows(), + file_metadata.created_by().map(ToString::to_string), + filtered_key_values, + file_metadata.schema_descr_ptr(), + file_metadata.column_orders().cloned(), + ); + + let mut builder = parquet_metadata.into_builder(); + let row_groups = builder.take_row_groups(); + let column_index = builder.take_column_index(); + let offset_index = builder.take_offset_index(); + + parquet::file::metadata::ParquetMetaDataBuilder::new(stripped_file_metadata) + .set_row_groups(row_groups) + .set_column_index(column_index) + .set_offset_index(offset_index) + .build() +} /// Cache strategies that may only enable a subset of caches. #[derive(Clone)] @@ -81,18 +189,17 @@ pub enum CacheStrategy { } impl CacheStrategy { - /// Gets parquet metadata with cache metrics tracking. - /// Returns the metadata and updates the provided metrics. - pub(crate) async fn get_parquet_meta_data( + /// Gets fused SST metadata with cache metrics tracking. 
+ pub(crate) async fn get_sst_meta_data( &self, file_id: RegionFileId, metrics: &mut MetadataCacheMetrics, page_index_policy: PageIndexPolicy, - ) -> Option> { + ) -> Option> { match self { CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { cache_manager - .get_parquet_meta_data(file_id, metrics, page_index_policy) + .get_sst_meta_data(file_id, metrics, page_index_policy) .await } CacheStrategy::Disabled => { @@ -102,30 +209,48 @@ impl CacheStrategy { } } - /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()]. - pub fn get_parquet_meta_data_from_mem_cache( + /// Calls [CacheManager::get_sst_meta_data_from_mem_cache()]. + pub(crate) fn get_sst_meta_data_from_mem_cache( &self, file_id: RegionFileId, - ) -> Option> { + ) -> Option> { match self { - CacheStrategy::EnableAll(cache_manager) => { - cache_manager.get_parquet_meta_data_from_mem_cache(file_id) - } - CacheStrategy::Compaction(cache_manager) => { - cache_manager.get_parquet_meta_data_from_mem_cache(file_id) + CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { + cache_manager.get_sst_meta_data_from_mem_cache(file_id) } CacheStrategy::Disabled => None, } } - /// Calls [CacheManager::put_parquet_meta_data()]. - pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc) { + /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()]. + pub fn get_parquet_meta_data_from_mem_cache( + &self, + file_id: RegionFileId, + ) -> Option> { + self.get_sst_meta_data_from_mem_cache(file_id) + .map(|metadata| metadata.parquet_metadata()) + } + + /// Calls [CacheManager::put_sst_meta_data()]. 
+ pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc) { match self { - CacheStrategy::EnableAll(cache_manager) => { - cache_manager.put_parquet_meta_data(file_id, metadata); + CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { + cache_manager.put_sst_meta_data(file_id, metadata); } - CacheStrategy::Compaction(cache_manager) => { - cache_manager.put_parquet_meta_data(file_id, metadata); + CacheStrategy::Disabled => {} + } + } + + /// Calls [CacheManager::put_parquet_meta_data()]. + pub fn put_parquet_meta_data( + &self, + file_id: RegionFileId, + metadata: Arc, + region_metadata: Option, + ) { + match self { + CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => { + cache_manager.put_parquet_meta_data(file_id, metadata, region_metadata); } CacheStrategy::Disabled => {} } @@ -223,6 +348,31 @@ impl CacheStrategy { } } + /// Calls [CacheManager::get_range_result()]. + /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled]. + #[allow(dead_code)] + pub(crate) fn get_range_result( + &self, + key: &RangeScanCacheKey, + ) -> Option> { + match self { + CacheStrategy::EnableAll(cache_manager) => cache_manager.get_range_result(key), + CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None, + } + } + + /// Calls [CacheManager::put_range_result()]. + /// It does nothing if the strategy isn't [CacheStrategy::EnableAll]. + pub(crate) fn put_range_result( + &self, + key: RangeScanCacheKey, + result: Arc, + ) { + if let CacheStrategy::EnableAll(cache_manager) = self { + cache_manager.put_range_result(key, result); + } + } + /// Calls [CacheManager::write_cache()]. /// It returns None if the strategy is [CacheStrategy::Disabled]. pub fn write_cache(&self) -> Option<&WriteCacheRef> { @@ -324,6 +474,8 @@ pub struct CacheManager { puffin_metadata_cache: Option, /// Cache for time series selectors. 
selector_result_cache: Option, + /// Cache for range scan outputs in flat format. + range_result_cache: Option, /// Cache for index result. index_result_cache: Option, } @@ -336,6 +488,35 @@ impl CacheManager { CacheManagerBuilder::default() } + /// Gets fused SST metadata with metrics tracking. + /// Tries in-memory cache first, then file cache, updating metrics accordingly. + pub(crate) async fn get_sst_meta_data( + &self, + file_id: RegionFileId, + metrics: &mut MetadataCacheMetrics, + page_index_policy: PageIndexPolicy, + ) -> Option> { + if let Some(metadata) = self.get_sst_meta_data_from_mem_cache(file_id) { + metrics.mem_cache_hit += 1; + return Some(metadata); + } + + let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet); + if let Some(write_cache) = &self.write_cache + && let Some(metadata) = write_cache + .file_cache() + .get_sst_meta_data(key, metrics, page_index_policy) + .await + { + metrics.file_cache_hit += 1; + self.put_sst_meta_data(file_id, metadata.clone()); + return Some(metadata); + } + + metrics.cache_miss += 1; + None + } + /// Gets cached [ParquetMetaData] with metrics tracking. /// Tries in-memory cache first, then file cache, updating metrics accordingly. 
pub(crate) async fn get_parquet_meta_data( @@ -344,29 +525,21 @@ impl CacheManager { metrics: &mut MetadataCacheMetrics, page_index_policy: PageIndexPolicy, ) -> Option> { - // Try to get metadata from sst meta cache - if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) { - metrics.mem_cache_hit += 1; - return Some(metadata); - } + self.get_sst_meta_data(file_id, metrics, page_index_policy) + .await + .map(|metadata| metadata.parquet_metadata()) + } - // Try to get metadata from write cache - let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet); - if let Some(write_cache) = &self.write_cache - && let Some(metadata) = write_cache - .file_cache() - .get_parquet_meta_data(key, metrics, page_index_policy) - .await - { - metrics.file_cache_hit += 1; - let metadata = Arc::new(metadata); - // Put metadata into sst meta cache - self.put_parquet_meta_data(file_id, metadata.clone()); - return Some(metadata); - }; - metrics.cache_miss += 1; - - None + /// Gets cached fused SST metadata from in-memory cache. + /// This method does not perform I/O. + pub(crate) fn get_sst_meta_data_from_mem_cache( + &self, + file_id: RegionFileId, + ) -> Option> { + self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| { + let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id())); + update_hit_miss(value, SST_META_TYPE) + }) } /// Gets cached [ParquetMetaData] from in-memory cache. @@ -375,15 +548,12 @@ impl CacheManager { &self, file_id: RegionFileId, ) -> Option> { - // Try to get metadata from sst meta cache - self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| { - let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id())); - update_hit_miss(value, SST_META_TYPE) - }) + self.get_sst_meta_data_from_mem_cache(file_id) + .map(|metadata| metadata.parquet_metadata()) } - /// Puts [ParquetMetaData] into the cache. 
- pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc) { + /// Puts fused SST metadata into the cache. + pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc) { if let Some(cache) = &self.sst_meta_cache { let key = SstMetaKey(file_id.region_id(), file_id.file_id()); CACHE_BYTES @@ -393,6 +563,34 @@ impl CacheManager { } } + /// Puts [ParquetMetaData] into the cache. + pub fn put_parquet_meta_data( + &self, + file_id: RegionFileId, + metadata: Arc, + region_metadata: Option, + ) { + if self.sst_meta_cache.is_some() { + let file_path = format!( + "region_id={}, file_id={}", + file_id.region_id(), + file_id.file_id() + ); + match CachedSstMeta::try_new_with_region_metadata( + &file_path, + Arc::unwrap_or_clone(metadata), + region_metadata, + ) { + Ok(metadata) => self.put_sst_meta_data(file_id, Arc::new(metadata)), + Err(err) => warn!( + err; "Failed to decode region metadata while caching parquet metadata, region_id: {}, file_id: {}", + file_id.region_id(), + file_id.file_id() + ), + } + } + } + /// Removes [ParquetMetaData] from the cache. pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) { if let Some(cache) = &self.sst_meta_cache { @@ -512,6 +710,31 @@ impl CacheManager { } } + /// Gets cached result for range scan. + #[allow(dead_code)] + pub(crate) fn get_range_result( + &self, + key: &RangeScanCacheKey, + ) -> Option> { + self.range_result_cache + .as_ref() + .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE)) + } + + /// Puts range scan result into cache. + pub(crate) fn put_range_result( + &self, + key: RangeScanCacheKey, + result: Arc, + ) { + if let Some(cache) = &self.range_result_cache { + CACHE_BYTES + .with_label_values(&[RANGE_RESULT_TYPE]) + .add(range_result_cache_weight(&key, &result).into()); + cache.insert(key, result); + } + } + /// Gets the write cache. 
pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> { self.write_cache.as_ref() @@ -562,6 +785,7 @@ pub struct CacheManagerBuilder { puffin_metadata_size: u64, write_cache: Option, selector_result_cache_size: u64, + range_result_cache_size: u64, } impl CacheManagerBuilder { @@ -625,6 +849,12 @@ impl CacheManagerBuilder { self } + /// Sets range result cache size. + pub fn range_result_cache_size(mut self, bytes: u64) -> Self { + self.range_result_cache_size = bytes; + self + } + /// Builds the [CacheManager]. pub fn build(self) -> CacheManager { fn to_str(cause: RemovalCause) -> &'static str { @@ -712,6 +942,21 @@ impl CacheManagerBuilder { }) .build() }); + let range_result_cache = (self.range_result_cache_size != 0).then(|| { + Cache::builder() + .max_capacity(self.range_result_cache_size) + .weigher(range_result_cache_weight) + .eviction_listener(move |k, v, cause| { + let size = range_result_cache_weight(&k, &v); + CACHE_BYTES + .with_label_values(&[RANGE_RESULT_TYPE]) + .sub(size.into()); + CACHE_EVICTION + .with_label_values(&[RANGE_RESULT_TYPE, to_str(cause)]) + .inc(); + }) + .build() + }); CacheManager { sst_meta_cache, vector_cache, @@ -723,14 +968,15 @@ impl CacheManagerBuilder { vector_index_cache, puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)), selector_result_cache, + range_result_cache, index_result_cache, } } } -fn meta_cache_weight(k: &SstMetaKey, v: &Arc) -> u32 { +fn meta_cache_weight(k: &SstMetaKey, v: &Arc) -> u32 { // We ignore the size of `Arc`. - (k.estimated_size() + parquet_meta_size(v)) as u32 + (k.estimated_size() + parquet_meta_size(&v.parquet_metadata) + v.region_metadata_weight) as u32 } fn vector_cache_weight(_k: &(ConcreteDataType, Value), v: &VectorRef) -> u32 { @@ -746,6 +992,10 @@ fn selector_result_cache_weight(k: &SelectorResultKey, v: &Arc) -> u32 { + (k.estimated_size() + v.estimated_size()) as u32 +} + /// Updates cache hit/miss metrics. 
fn update_hit_miss(value: Option, cache_type: &str) -> Option { if value.is_some() { @@ -892,8 +1142,8 @@ impl SelectorResultValue { } } -/// Maps (region id, file id) to [ParquetMetaData]. -type SstMetaCache = Cache>; +/// Maps (region id, file id) to fused SST metadata. +type SstMetaCache = Cache>; /// Maps [Value] to a vector that holds this value repeatedly. /// /// e.g. `"hello" => ["hello", "hello", "hello"]` @@ -902,20 +1152,30 @@ type VectorCache = Cache<(ConcreteDataType, Value), VectorRef>; type PageCache = Cache>; /// Maps (file id, row group id, time series row selector) to [SelectorResultValue]. type SelectorResultCache = Cache>; +/// Maps partition-range scan key to cached flat batches. +type RangeResultCache = Cache>; #[cfg(test)] mod tests { use std::sync::Arc; + use api::v1::SemanticType; use api::v1::index::{BloomFilterMeta, InvertedIndexMetas}; + use datatypes::schema::ColumnSchema; use datatypes::vectors::Int64Vector; use puffin::file_metadata::FileMetadata; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; use store_api::storage::ColumnId; use super::*; use crate::cache::index::bloom_filter_index::Tag; use crate::cache::index::result_cache::PredicateKey; - use crate::cache::test_util::parquet_meta; + use crate::cache::test_util::{ + parquet_meta, sst_parquet_meta, sst_parquet_meta_with_region_metadata, + }; + use crate::read::range_cache::{ + RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder, + }; use crate::sst::parquet::row_selection::RowGroupSelection; #[tokio::test] @@ -929,7 +1189,7 @@ mod tests { let file_id = RegionFileId::new(region_id, FileId::random()); let metadata = parquet_meta(); let mut metrics = MetadataCacheMetrics::default(); - cache.put_parquet_meta_data(file_id, metadata); + cache.put_parquet_meta_data(file_id, metadata, None); assert!( cache .get_parquet_meta_data(file_id, &mut metrics, Default::default()) @@ -966,13 +1226,23 @@ mod tests { .await .is_none() ); - let 
metadata = parquet_meta(); - cache.put_parquet_meta_data(file_id, metadata); + let (metadata, region_metadata) = sst_parquet_meta(); + cache.put_parquet_meta_data(file_id, metadata, None); + let cached = cache + .get_sst_meta_data(file_id, &mut metrics, Default::default()) + .await + .unwrap(); + assert_eq!(region_metadata, cached.region_metadata()); assert!( - cache - .get_parquet_meta_data(file_id, &mut metrics, Default::default()) - .await - .is_some() + cached + .parquet_metadata() + .file_metadata() + .key_value_metadata() + .is_none_or(|key_values| { + key_values + .iter() + .all(|key_value| key_value.key != PARQUET_METADATA_KEY) + }) ); cache.remove_parquet_meta_data(file_id); assert!( @@ -983,6 +1253,42 @@ mod tests { ); } + #[tokio::test] + async fn test_parquet_meta_cache_with_provided_region_metadata() { + let cache = CacheManager::builder().sst_meta_cache_size(2000).build(); + let mut metrics = MetadataCacheMetrics::default(); + let region_id = RegionId::new(1, 1); + let file_id = RegionFileId::new(region_id, FileId::random()); + let (metadata, region_metadata) = sst_parquet_meta(); + + cache.put_parquet_meta_data(file_id, metadata, Some(region_metadata.clone())); + + let cached = cache + .get_sst_meta_data(file_id, &mut metrics, Default::default()) + .await + .unwrap(); + assert!(Arc::ptr_eq(®ion_metadata, &cached.region_metadata())); + } + + #[test] + fn test_meta_cache_weight_accounts_for_decoded_region_metadata() { + let region_metadata = Arc::new(wide_region_metadata(128)); + let json_len = region_metadata.to_json().unwrap().len(); + let metadata = sst_parquet_meta_with_region_metadata(region_metadata.clone()); + let cached = Arc::new( + CachedSstMeta::try_new("test.parquet", Arc::unwrap_or_clone(metadata)).unwrap(), + ); + let key = SstMetaKey(region_metadata.region_id, FileId::random()); + + assert!(cached.region_metadata_weight > json_len); + assert_eq!( + meta_cache_weight(&key, &cached) as usize, + key.estimated_size() + + 
parquet_meta_size(&cached.parquet_metadata) + + cached.region_metadata_weight + ); + } + #[test] fn test_repeated_vector_cache() { let cache = CacheManager::builder().vector_cache_size(4096).build(); @@ -1028,6 +1334,50 @@ mod tests { assert!(cache.get_selector_result(&key).is_some()); } + #[test] + fn test_range_result_cache() { + let cache = Arc::new( + CacheManager::builder() + .range_result_cache_size(1024 * 1024) + .build(), + ); + + let key = RangeScanCacheKey { + region_id: RegionId::new(1, 1), + row_groups: vec![(FileId::random(), 0)], + scan: ScanRequestFingerprintBuilder { + read_column_ids: vec![], + read_column_types: vec![], + filters: vec!["tag_0 = 1".to_string()], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: true, + merge_mode: crate::region::options::MergeMode::LastRow, + partition_expr_version: 0, + } + .build(), + }; + let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0)); + + assert!(cache.get_range_result(&key).is_none()); + cache.put_range_result(key.clone(), value.clone()); + assert!(cache.get_range_result(&key).is_some()); + + let enable_all = CacheStrategy::EnableAll(cache.clone()); + assert!(enable_all.get_range_result(&key).is_some()); + + let compaction = CacheStrategy::Compaction(cache.clone()); + assert!(compaction.get_range_result(&key).is_none()); + compaction.put_range_result(key.clone(), value.clone()); + assert!(cache.get_range_result(&key).is_some()); + + let disabled = CacheStrategy::Disabled; + assert!(disabled.get_range_result(&key).is_none()); + disabled.put_range_result(key.clone(), value); + assert!(cache.get_range_result(&key).is_some()); + } + #[tokio::test] async fn test_evict_puffin_cache_clears_all_entries() { use std::collections::{BTreeMap, HashMap}; @@ -1122,4 +1472,45 @@ mod tests { assert!(result_cache.get(&predicate, index_id.file_id()).is_none()); assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none()); } + + fn wide_region_metadata(column_count: 
u32) -> RegionMetadata { + let region_id = RegionId::new(1024, 7); + let mut builder = RegionMetadataBuilder::new(region_id); + let mut primary_key = Vec::new(); + + for column_id in 0..column_count { + let semantic_type = if column_id < 32 { + primary_key.push(column_id); + SemanticType::Tag + } else { + SemanticType::Field + }; + let mut column_schema = ColumnSchema::new( + format!("wide_column_{column_id}"), + ConcreteDataType::string_datatype(), + true, + ); + column_schema + .mut_metadata() + .insert(format!("cache_key_{column_id}"), "cache_value".repeat(4)); + builder.push_column_metadata(ColumnMetadata { + column_schema, + semantic_type, + column_id, + }); + } + + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: column_count, + }); + builder.primary_key(primary_key); + + builder.build().unwrap() + } } diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs index 32a276d0e4..278838b369 100644 --- a/src/mito2/src/cache/file_cache.rs +++ b/src/mito2/src/cache/file_cache.rs @@ -34,7 +34,7 @@ use store_api::storage::{FileId, RegionId}; use tokio::sync::mpsc::{Sender, UnboundedReceiver}; use crate::access_layer::TempFileCleaner; -use crate::cache::{FILE_TYPE, INDEX_TYPE}; +use crate::cache::{CachedSstMeta, FILE_TYPE, INDEX_TYPE}; use crate::error::{self, OpenDalSnafu, Result}; use crate::metrics::{ CACHE_BYTES, CACHE_HIT, CACHE_MISS, WRITE_CACHE_DOWNLOAD_BYTES_TOTAL, @@ -612,6 +612,34 @@ impl FileCache { } } + /// Get fused SST metadata from the file cache. + /// If the file is not in the cache, or metadata loading/decoding fails, return None. 
+ pub(crate) async fn get_sst_meta_data( + &self, + key: IndexKey, + cache_metrics: &mut MetadataCacheMetrics, + page_index_policy: PageIndexPolicy, + ) -> Option> { + let file_path = self.inner.cache_file_path(key); + self.get_parquet_meta_data(key, cache_metrics, page_index_policy) + .await + .and_then( + |metadata| match CachedSstMeta::try_new(&file_path, metadata) { + Ok(metadata) => Some(Arc::new(metadata)), + Err(err) => { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + warn!( + err; "Failed to decode cached parquet metadata for key {:?}", + key + ); + None + } + }, + ) + } + async fn get_reader(&self, file_path: &str) -> object_store::Result> { if self.inner.local_store.exists(file_path).await? { Ok(Some(self.inner.local_store.reader(file_path).await?)) diff --git a/src/mito2/src/cache/test_util.rs b/src/mito2/src/cache/test_util.rs index 65ad9d87eb..ef3d8e9315 100644 --- a/src/mito2/src/cache/test_util.rs +++ b/src/mito2/src/cache/test_util.rs @@ -23,8 +23,13 @@ use object_store::ObjectStore; use object_store::services::Fs; use parquet::arrow::ArrowWriter; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use parquet::file::metadata::ParquetMetaData; +use parquet::file::metadata::{KeyValue, ParquetMetaData}; +use parquet::file::properties::WriterProperties; use parquet::file::statistics::Statistics; +use store_api::metadata::RegionMetadataRef; + +use crate::sst::parquet::PARQUET_METADATA_KEY; +use crate::test_util::sst_util::sst_region_metadata; /// Returns a parquet meta data. pub(crate) fn parquet_meta() -> Arc { @@ -33,13 +38,43 @@ pub(crate) fn parquet_meta() -> Arc { builder.metadata().clone() } +/// Returns parquet metadata for an SST parquet file and its decoded region metadata. 
+pub(crate) fn sst_parquet_meta() -> (Arc, RegionMetadataRef) { + let region_metadata = Arc::new(sst_region_metadata()); + let file_data = parquet_file_data_with_region_metadata(®ion_metadata); + let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap(); + (builder.metadata().clone(), region_metadata) +} + +/// Returns parquet metadata for an SST parquet file with custom region metadata. +pub(crate) fn sst_parquet_meta_with_region_metadata( + region_metadata: RegionMetadataRef, +) -> Arc { + let file_data = parquet_file_data_with_region_metadata(®ion_metadata); + let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap(); + builder.metadata().clone() +} + /// Write a test parquet file to a buffer fn parquet_file_data() -> Vec { + parquet_file_data_inner(None) +} + +fn parquet_file_data_with_region_metadata(region_metadata: &RegionMetadataRef) -> Vec { + let json = region_metadata.to_json().unwrap(); + let key_value = KeyValue::new(PARQUET_METADATA_KEY.to_string(), json); + parquet_file_data_inner(Some(vec![key_value])) +} + +fn parquet_file_data_inner(key_value_metadata: Option>) -> Vec { let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap(); let mut buffer = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), None).unwrap(); + let props = WriterProperties::builder() + .set_key_value_metadata(key_value_metadata) + .build(); + let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), Some(props)).unwrap(); writer.write(&to_write).unwrap(); writer.close().unwrap(); diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index a28df3f54c..e2483ed4e4 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -244,15 +244,19 @@ impl WriteCache { .await .with_file_cleaner(cleaner); - let sst_info = match write_request.source 
{ - either::Left(source) => { + let sst_info = match write_request.sst_write_format { + crate::sst::FormatType::PrimaryKey => { writer - .write_all(source, write_request.max_sequence, write_opts) + .write_all_flat_as_primary_key( + write_request.source, + write_request.max_sequence, + write_opts, + ) .await? } - either::Right(flat_source) => { + crate::sst::FormatType::Flat => { writer - .write_all_flat(flat_source, write_request.max_sequence, write_opts) + .write_all_flat(write_request.source, write_request.max_sequence, write_opts) .await? } }; @@ -509,12 +513,13 @@ mod tests { use crate::cache::test_util::{assert_parquet_metadata_equal, new_fs_store}; use crate::cache::{CacheManager, CacheStrategy}; use crate::error::InvalidBatchSnafu; - use crate::read::Source; + use crate::read::FlatSource; use crate::region::options::IndexOptions; use crate::sst::parquet::reader::ParquetReaderBuilder; use crate::test_util::TestEnv; use crate::test_util::sst_util::{ - new_batch_by_range, new_source, sst_file_handle_with_file_id, sst_region_metadata, + new_flat_source_from_record_batches, new_record_batch_by_range, + sst_file_handle_with_file_id, sst_region_metadata, }; #[tokio::test] @@ -532,21 +537,22 @@ mod tests { .create_write_cache(local_store.clone(), ReadableSize::mb(10)) .await; - // Create Source + // Create source. 
let metadata = Arc::new(sst_region_metadata()); let region_id = metadata.region_id; - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata, - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: Default::default(), index_options: IndexOptions::default(), index_config: Default::default(), @@ -636,19 +642,20 @@ mod tests { // Create source let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Write to local cache and upload sst to mock remote store let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata, - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: cache_manager.clone(), index_options: IndexOptions::default(), index_config: Default::default(), @@ -686,9 +693,15 @@ mod tests { .cache(CacheStrategy::EnableAll(cache_manager.clone())) .page_index_policy(PageIndexPolicy::Optional); let reader = builder.build().await.unwrap().unwrap(); + let cached_write_parquet_metadata = crate::cache::CachedSstMeta::try_new( + "test.sst", + Arc::unwrap_or_clone(write_parquet_metadata), + ) + .unwrap() + .parquet_metadata(); // Check parquet metadata - 
assert_parquet_metadata_equal(write_parquet_metadata, reader.parquet_metadata()); + assert_parquet_metadata_equal(cached_write_parquet_metadata, reader.parquet_metadata()); } #[tokio::test] @@ -715,9 +728,9 @@ mod tests { let metadata = Arc::new(sst_region_metadata()); // Creates a source that can return an error to abort the writer. - let source = Source::Iter(Box::new( + let source = FlatSource::Iter(Box::new( [ - Ok(new_batch_by_range(&["a", "d"], 0, 60)), + Ok(new_record_batch_by_range(&["a", "d"], 0, 60)), InvalidBatchSnafu { reason: "Abort the writer", } @@ -730,9 +743,10 @@ mod tests { let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata, - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: cache_manager.clone(), index_options: IndexOptions::default(), index_config: Default::default(), diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 6d51d1dd59..ba6957fdae 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -58,10 +58,10 @@ use crate::error::{ TimeRangePredicateOverflowSnafu, TimeoutSnafu, }; use crate::metrics::{COMPACTION_STAGE_ELAPSED, INFLIGHT_COMPACTION_COUNT}; +use crate::read::BoxedRecordBatchStream; use crate::read::projection::ProjectionMapper; use crate::read::scan_region::{PredicateGroup, ScanInput}; use crate::read::seq_scan::SeqScan; -use crate::read::{BoxedBatchReader, BoxedRecordBatchStream}; use crate::region::options::{MergeMode, RegionOptions}; use crate::region::version::VersionControlRef; use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState}; @@ -828,7 +828,7 @@ pub struct SerializedCompactionOutput { output_time_range: Option, } -/// Builders to create [BoxedBatchReader] for compaction. +/// Builders to create [BoxedRecordBatchStream] for compaction. 
struct CompactionSstReaderBuilder<'a> { metadata: RegionMetadataRef, sst_layer: AccessLayerRef, @@ -841,24 +841,17 @@ struct CompactionSstReaderBuilder<'a> { } impl CompactionSstReaderBuilder<'_> { - /// Builds [BoxedBatchReader] that reads all SST files and yields batches in primary key order. - async fn build_sst_reader(self) -> Result { - let scan_input = self.build_scan_input(false)?.with_compaction(true); - - SeqScan::new(scan_input).build_reader_for_compaction().await - } - /// Builds [BoxedRecordBatchStream] that reads all SST files and yields batches in flat format for compaction. async fn build_flat_sst_reader(self) -> Result { - let scan_input = self.build_scan_input(true)?.with_compaction(true); + let scan_input = self.build_scan_input()?.with_compaction(true); SeqScan::new(scan_input) .build_flat_reader_for_compaction() .await } - fn build_scan_input(self, flat_format: bool) -> Result { - let mapper = ProjectionMapper::all(&self.metadata, flat_format)?; + fn build_scan_input(self) -> Result { + let mapper = ProjectionMapper::all(&self.metadata, true)?; let mut scan_input = ScanInput::new(self.sst_layer, mapper) .with_files(self.inputs.to_vec()) .with_append_mode(self.append_mode) @@ -868,7 +861,7 @@ impl CompactionSstReaderBuilder<'_> { // We ignore file not found error during compaction. .with_ignore_file_not_found(true) .with_merge_mode(self.merge_mode) - .with_flat_format(flat_format); + .with_flat_format(true); // This serves as a workaround of https://github.com/GreptimeTeam/greptimedb/issues/3944 // by converting time ranges into predicate. 
diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index 1876972b0d..b03e6415e8 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -43,7 +43,7 @@ use crate::error::{ use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::metrics; -use crate::read::{FlatSource, Source}; +use crate::read::FlatSource; use crate::region::options::RegionOptions; use crate::region::version::VersionRef; use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState}; @@ -356,13 +356,8 @@ impl DefaultCompactor { time_range: output.output_time_range, merge_mode, }; - let source = if flat_format { - let reader = builder.build_flat_sst_reader().await?; - Either::Right(FlatSource::Stream(reader)) - } else { - let reader = builder.build_sst_reader().await?; - Either::Left(Source::Reader(reader)) - }; + let reader = builder.build_flat_sst_reader().await?; + let source = FlatSource::Stream(reader); let mut metrics = Metrics::new(WriteType::Compaction); let region_metadata = compaction_region.region_metadata.clone(); let sst_infos = compaction_region @@ -375,6 +370,11 @@ impl DefaultCompactor { cache_manager: compaction_region.cache_manager.clone(), storage, max_sequence: max_sequence.map(NonZero::get), + sst_write_format: if flat_format { + FormatType::Flat + } else { + FormatType::PrimaryKey + }, index_options, index_config, inverted_index_config, diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 602f5508ba..0eee067ab6 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -116,6 +116,8 @@ pub struct MitoConfig { pub page_cache_size: ReadableSize, /// Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache. pub selector_result_cache_size: ReadableSize, + /// Cache size for flat range scan results. 
Setting it to 0 to disable the cache. + pub range_result_cache_size: ReadableSize, /// Whether to enable the write cache. pub enable_write_cache: bool, /// File system path for write cache dir's root, defaults to `{data_home}`. @@ -200,6 +202,7 @@ impl Default for MitoConfig { vector_cache_size: ReadableSize::mb(512), page_cache_size: ReadableSize::mb(512), selector_result_cache_size: ReadableSize::mb(512), + range_result_cache_size: ReadableSize::mb(512), enable_write_cache: false, write_cache_path: String::new(), write_cache_size: ReadableSize::gb(5), @@ -336,6 +339,7 @@ impl MitoConfig { self.vector_cache_size = mem_cache_size; self.page_cache_size = page_cache_size; self.selector_result_cache_size = mem_cache_size; + self.range_result_cache_size = mem_cache_size; self.index.adjust_buffer_and_cache_size(sys_memory); } diff --git a/src/mito2/src/engine/row_selector_test.rs b/src/mito2/src/engine/row_selector_test.rs index 317ede5a97..d79152e57f 100644 --- a/src/mito2/src/engine/row_selector_test.rs +++ b/src/mito2/src/engine/row_selector_test.rs @@ -24,7 +24,7 @@ use crate::test_util::{ CreateRequestBuilder, TestEnv, build_rows_for_key, flush_region, put_rows, rows_schema, }; -async fn test_last_row(append_mode: bool) { +async fn test_last_row(append_mode: bool, flat_format: bool) { let mut env = TestEnv::new().await; let engine = env.create_engine(MitoConfig::default()).await; let region_id = RegionId::new(1, 1); @@ -39,9 +39,12 @@ async fn test_last_row(append_mode: bool) { env.get_kv_backend(), ) .await; - let request = CreateRequestBuilder::new() - .insert_option("append_mode", &append_mode.to_string()) - .build(); + let mut request_builder = + CreateRequestBuilder::new().insert_option("append_mode", &append_mode.to_string()); + if flat_format { + request_builder = request_builder.insert_option("sst_format", "flat"); + } + let request = request_builder.build(); let column_schemas = rows_schema(&request); engine .handle_request(region_id, 
RegionRequest::Create(request)) @@ -106,10 +109,20 @@ async fn test_last_row(append_mode: bool) { #[tokio::test] async fn test_last_row_append_mode_disabled() { - test_last_row(false).await; + test_last_row(false, false).await; } #[tokio::test] async fn test_last_row_append_mode_enabled() { - test_last_row(true).await; + test_last_row(true, false).await; +} + +#[tokio::test] +async fn test_last_row_flat_format_append_mode_disabled() { + test_last_row(false, true).await; +} + +#[tokio::test] +async fn test_last_row_flat_format_append_mode_enabled() { + test_last_row(true, true).await; } diff --git a/src/mito2/src/engine/skip_wal_test.rs b/src/mito2/src/engine/skip_wal_test.rs index d1b38c47fb..c59be6ba2c 100644 --- a/src/mito2/src/engine/skip_wal_test.rs +++ b/src/mito2/src/engine/skip_wal_test.rs @@ -15,7 +15,9 @@ use api::v1::Rows; use common_wal::options::{WAL_OPTIONS_KEY, WalOptions}; use store_api::region_engine::{RegionEngine, RegionRole}; -use store_api::region_request::{RegionCloseRequest, RegionRequest}; +use store_api::region_request::{ + RegionCloseRequest, RegionOpenRequest, RegionRequest, RegionTruncateRequest, +}; use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; @@ -168,3 +170,76 @@ async fn test_close_follower_region_skip_wal() { let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); assert_eq!(0, total_rows); } + +#[tokio::test] +async fn test_close_region_after_truncate_skip_wal() { + common_telemetry::init_default_ut_logging(); + let mut env = TestEnv::with_prefix("close-truncate-skip-wal").await; + let engine = env.create_engine(MitoConfig::default()).await; + + let region_id = RegionId::new(1, 1); + let mut request = CreateRequestBuilder::new().build(); + let wal_options = WalOptions::Noop; + request.options.insert( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&wal_options).unwrap(), + ); + + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + 
engine + .handle_request( + region_id, + RegionRequest::Truncate(RegionTruncateRequest::All), + ) + .await + .unwrap(); + + let region = engine.get_region(region_id).unwrap(); + let version_data = region.version_control.current(); + assert_eq!( + version_data.version.truncated_entry_id, + Some(version_data.last_entry_id) + ); + + let rows = Rows { + schema: rows_schema(&request), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + let region = engine.get_region(region_id).unwrap(); + assert!(!region.version().memtables.is_empty()); + + engine + .handle_request(region_id, RegionRequest::Close(RegionCloseRequest {})) + .await + .unwrap(); + + engine + .handle_request( + region_id, + RegionRequest::Open(RegionOpenRequest { + engine: String::new(), + table_dir: request.table_dir, + path_type: store_api::region_request::PathType::Bare, + options: request.options, + skip_wal_replay: false, + checkpoint: None, + }), + ) + .await + .unwrap(); + let stream = engine + .scan_to_stream(region_id, ScanRequest::default()) + .await + .unwrap(); + let batches = common_recordbatch::RecordBatches::try_collect(stream) + .await + .unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(3, total_rows); +} diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 923d8a2713..c6b69fe607 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -616,15 +616,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to read arrow record batch from parquet file {}", path))] - ArrowReader { - path: String, - #[snafu(source)] - error: ArrowError, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Column not found, column: {column}"))] ColumnNotFound { column: String, @@ -1349,7 +1340,6 @@ impl ErrorExt for Error { RegionState { .. } | UpdateManifest { .. } => StatusCode::RegionNotReady, JsonOptions { .. } => StatusCode::InvalidArguments, EmptyRegionDir { .. } | EmptyManifestDir { .. 
} => StatusCode::RegionNotFound, - ArrowReader { .. } => StatusCode::StorageUnavailable, ConvertValue { source, .. } => source.status_code(), ApplyBloomFilterIndex { source, .. } => source.status_code(), InvalidPartitionExpr { source, .. } => source.status_code(), diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 0c16544b6e..fedac95d27 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -22,7 +22,6 @@ use std::time::Instant; use common_telemetry::{debug, error, info}; use datatypes::arrow::datatypes::SchemaRef; -use either::Either; use partition::expr::PartitionExpr; use smallvec::{SmallVec, smallvec}; use snafu::ResultExt; @@ -41,18 +40,14 @@ use crate::error::{ }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::memtable::bulk::ENCODE_ROW_THRESHOLD; -use crate::memtable::{ - BoxedRecordBatchIterator, EncodedRange, IterBuilder, MemtableRanges, RangesOptions, -}; +use crate::memtable::{BoxedRecordBatchIterator, EncodedRange, MemtableRanges, RangesOptions}; use crate::metrics::{ FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_FAILURE_TOTAL, FLUSH_FILE_TOTAL, FLUSH_REQUESTS_TOTAL, INFLIGHT_FLUSH_COUNT, }; -use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; +use crate::read::FlatSource; use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeIterator; -use crate::read::merge::MergeReaderBuilder; -use crate::read::{FlatSource, Source}; use crate::region::options::{IndexOptions, MergeMode, RegionOptions}; use crate::region::version::{VersionControlData, VersionControlRef, VersionRef}; use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState, parse_partition_expr}; @@ -62,8 +57,10 @@ use crate::request::{ }; use crate::schedule::scheduler::{Job, SchedulerRef}; use crate::sst::file::FileMeta; -use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions}; -use 
crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; +use crate::sst::parquet::{ + DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions, flat_format, +}; +use crate::sst::{FlatSchemaOptions, FormatType, to_flat_sst_arrow_schema}; use crate::worker::WorkerListener; /// Global write buffer (memtable) manager. @@ -480,78 +477,29 @@ impl RegionFlushTask { // the counter may have more series than the actual series count. series_count += memtable_series_count; - if mem_ranges.is_record_batch() { - let flush_start = Instant::now(); - let FlushFlatMemResult { - num_encoded, - num_sources, - results, - } = self - .flush_flat_mem_ranges(version, &write_opts, mem_ranges) - .await?; - encoded_part_count += num_encoded; - for (source_idx, result) in results.into_iter().enumerate() { - let (max_sequence, ssts_written, metrics) = result?; - if ssts_written.is_empty() { - // No data written. - continue; - } - - common_telemetry::debug!( - "Region {} flush one memtable {} {}/{}, metrics: {:?}", - self.region_id, - memtable_id, - source_idx, - num_sources, - metrics - ); - - flush_metrics = flush_metrics.merge(metrics); - - file_metas.extend(ssts_written.into_iter().map(|sst_info| { - flushed_bytes += sst_info.file_size; - Self::new_file_meta( - self.region_id, - max_sequence, - sst_info, - partition_expr.clone(), - ) - })); - } - - common_telemetry::debug!( - "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}", - self.region_id, - num_sources, - memtable_id, - num_mem_ranges, - num_encoded, - num_mem_rows, - flush_start.elapsed(), - compact_cost, - ); - } else { - let max_sequence = mem_ranges.max_sequence(); - let source = memtable_source(mem_ranges, &version.options).await?; - - // Flush to level 0. 
- let source = Either::Left(source); - let write_request = self.new_write_request(version, max_sequence, source); - - let mut metrics = Metrics::new(WriteType::Flush); - let ssts_written = self - .access_layer - .write_sst(write_request, &write_opts, &mut metrics) - .await?; - FLUSH_FILE_TOTAL.inc_by(ssts_written.len() as u64); + let flush_start = Instant::now(); + let FlushFlatMemResult { + num_encoded, + num_sources, + results, + } = self + .flush_flat_mem_ranges(version, &write_opts, mem_ranges) + .await?; + encoded_part_count += num_encoded; + for (source_idx, result) in results.into_iter().enumerate() { + let (max_sequence, ssts_written, metrics) = result?; if ssts_written.is_empty() { // No data written. continue; } - debug!( - "Region {} flush one memtable, num_mem_ranges: {}, num_rows: {}, metrics: {:?}", - self.region_id, num_mem_ranges, num_mem_rows, metrics + common_telemetry::debug!( + "Region {} flush one memtable {} {}/{}, metrics: {:?}", + self.region_id, + memtable_id, + source_idx, + num_sources, + metrics ); flush_metrics = flush_metrics.merge(metrics); @@ -565,7 +513,19 @@ impl RegionFlushTask { partition_expr.clone(), ) })); - }; + } + + common_telemetry::debug!( + "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}", + self.region_id, + num_sources, + memtable_id, + num_mem_ranges, + num_encoded, + num_mem_rows, + flush_start.elapsed(), + compact_cost, + ); } Ok(DoFlushMemtablesResult { @@ -587,16 +547,17 @@ impl RegionFlushTask { &version.metadata, &FlatSchemaOptions::from_encoding(version.metadata.primary_key_encoding), ); + let field_column_start = + flat_format::field_column_start(&version.metadata, batch_schema.fields().len()); let flat_sources = memtable_flat_sources( batch_schema, mem_ranges, &version.options, - version.metadata.primary_key.len(), + field_column_start, )?; let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len()); 
let num_encoded = flat_sources.encoded.len(); for (source, max_sequence) in flat_sources.sources { - let source = Either::Right(source); let write_request = self.new_write_request(version, max_sequence, source); let access_layer = self.access_layer.clone(); let write_opts = write_opts.clone(); @@ -667,8 +628,13 @@ impl RegionFlushTask { &self, version: &VersionRef, max_sequence: u64, - source: Either, + source: FlatSource, ) -> SstWriteRequest { + let flat_format = version + .options + .sst_format + .map(|f| f == FormatType::Flat) + .unwrap_or(self.engine_config.default_experimental_flat_format); SstWriteRequest { op_type: OperationType::Flush, metadata: version.metadata.clone(), @@ -676,6 +642,11 @@ impl RegionFlushTask { cache_manager: self.cache_manager.clone(), storage: version.options.storage.clone(), max_sequence: Some(max_sequence), + sst_write_format: if flat_format { + FormatType::Flat + } else { + FormatType::PrimaryKey + }, index_options: self.index_options.clone(), index_config: self.engine_config.index.clone(), inverted_index_config: self.engine_config.inverted_index.clone(), @@ -722,41 +693,6 @@ struct DoFlushMemtablesResult { flush_metrics: Metrics, } -/// Returns a [Source] for the given memtable. -async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) -> Result { - let source = if mem_ranges.ranges.len() == 1 { - let only_range = mem_ranges.ranges.into_values().next().unwrap(); - let iter = only_range.build_iter()?; - Source::Iter(iter) - } else { - // todo(hl): a workaround since sync version of MergeReader is wip. 
- let sources = mem_ranges - .ranges - .into_values() - .map(|r| r.build_iter().map(Source::Iter)) - .collect::>>()?; - let merge_reader = MergeReaderBuilder::from_sources(sources).build().await?; - let maybe_dedup = if options.append_mode { - // no dedup in append mode - Box::new(merge_reader) as _ - } else { - // dedup according to merge mode - match options.merge_mode.unwrap_or(MergeMode::LastRow) { - MergeMode::LastRow => { - Box::new(DedupReader::new(merge_reader, LastRow::new(false), None)) as _ - } - MergeMode::LastNonNull => Box::new(DedupReader::new( - merge_reader, - LastNonNull::new(false), - None, - )) as _, - } - }; - Source::Reader(maybe_dedup) - }; - Ok(source) -} - struct FlatSources { sources: SmallVec<[(FlatSource, SequenceNumber); 4]>, encoded: SmallVec<[(EncodedRange, SequenceNumber); 4]>, diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index c39bbfa346..3ebfdd3628 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -28,6 +28,7 @@ use mito_codec::key_values::KeyValue; pub use mito_codec::key_values::KeyValues; use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec}; use serde::{Deserialize, Serialize}; +use snafu::ensure; use store_api::metadata::RegionMetadataRef; use store_api::storage::{ColumnId, SequenceNumber, SequenceRange}; @@ -231,10 +232,17 @@ impl MemtableRanges { impl IterBuilder for MemtableRanges { fn build(&self, _metrics: Option) -> Result { - UnsupportedOperationSnafu { - err_msg: "MemtableRanges does not support build iterator", - } - .fail() + ensure!( + self.ranges.len() == 1, + UnsupportedOperationSnafu { + err_msg: format!( + "Building an iterator from MemtableRanges expects 1 range, but got {}", + self.ranges.len() + ), + } + ); + + self.ranges.values().next().unwrap().build_iter() } fn is_record_batch(&self) -> bool { @@ -256,20 +264,6 @@ pub trait Memtable: Send + Sync + fmt::Debug { /// Writes an encoded batch of into memtable. 
fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>; - /// Scans the memtable. - /// `projection` selects columns to read, `None` means reading all columns. - /// `filters` are the predicates to be pushed down to memtable. - /// - /// # Note - /// This method should only be used for tests. - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - predicate: Option, - sequence: Option, - ) -> Result; - /// Returns the ranges in the memtable. /// /// The returned map contains the range id and the range after applying the predicate. @@ -543,11 +537,15 @@ pub trait IterBuilder: Send + Sync { } /// Returns the record batch iterator to read the range. + /// ## Note + /// Implementations should ensure the iterator yields data within given time range. fn build_record_batch( &self, + time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { let _metrics = metrics; + let _ = time_range; UnsupportedOperationSnafu { err_msg: "Record batch iterator is not supported by this memtable", } @@ -706,7 +704,7 @@ impl MemtableRange { metrics: Option, ) -> Result { if self.context.builder.is_record_batch() { - return self.context.builder.build_record_batch(metrics); + return self.context.builder.build_record_batch(time_range, metrics); } if let Some(context) = self.context.batch_to_record_batch.as_ref() { diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index cf2ced06fe..502b61759d 100644 --- a/src/mito2/src/memtable/bulk.rs +++ b/src/mito2/src/memtable/bulk.rs @@ -14,6 +14,7 @@ //! 
Memtable implementation for bulk load +pub(crate) mod chunk_reader; #[allow(unused)] pub mod context; #[allow(unused)] @@ -34,6 +35,7 @@ fn env_usize(name: &str, default: usize) -> usize { .unwrap_or(default) } +use common_time::Timestamp; use datatypes::arrow::datatypes::SchemaRef; use mito_codec::key_values::KeyValue; use rayon::prelude::*; @@ -57,7 +59,7 @@ use crate::memtable::{ use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeIterator; use crate::region::options::MergeMode; -use crate::sst::parquet::format::FIXED_POS_COLUMN_NUM; +use crate::sst::parquet::flat_format::field_column_start; use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE}; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; @@ -462,16 +464,6 @@ impl Memtable for BulkMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - _projection: Option<&[ColumnId]>, - _predicate: Option, - _sequence: Option, - ) -> Result { - todo!() - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -802,6 +794,7 @@ impl IterBuilder for BulkRangeIterBuilder { fn build_record_batch( &self, + _time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { let series_count = self.part.estimated_series_count(); @@ -835,6 +828,7 @@ impl IterBuilder for MultiBulkRangeIterBuilder { fn build_record_batch( &self, + _time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { self.part @@ -874,6 +868,7 @@ impl IterBuilder for EncodedBulkRangeIterBuilder { fn build_record_batch( &self, + _time_range: Option<(Timestamp, Timestamp)>, metrics: Option, ) -> Result { if let Some(iter) = self @@ -1186,13 +1181,8 @@ impl MemtableCompactor { Box::new(dedup_iter) } MergeMode::LastNonNull => { - // Calculates field column start: total columns - fixed columns - field columns - // Field column count = total metadata columns - time index column - primary key columns - let 
field_column_count = - metadata.column_metadatas.len() - 1 - metadata.primary_key.len(); - let total_columns = arrow_schema.fields().len(); let field_column_start = - total_columns - FIXED_POS_COLUMN_NUM - field_column_count; + field_column_start(metadata, arrow_schema.fields().len()); let dedup_iter = FlatDedupIterator::new( merged_iter, diff --git a/src/mito2/src/memtable/bulk/chunk_reader.rs b/src/mito2/src/memtable/bulk/chunk_reader.rs new file mode 100644 index 0000000000..e632cd1b37 --- /dev/null +++ b/src/mito2/src/memtable/bulk/chunk_reader.rs @@ -0,0 +1,65 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! ChunkReader implementation for in-memory parquet bytes. + +use std::io::Cursor; + +use bytes::Bytes; +use parquet::errors::{ParquetError, Result}; +use parquet::file::reader::{ChunkReader, Length}; + +/// A [ChunkReader] implementation for in-memory parquet bytes. +/// +/// This provides byte access to parquet data stored in memory (Bytes), +/// used for reading parquet data from bulk memtable. +#[derive(Clone)] +pub struct MemtableChunkReader { + /// The in-memory parquet data. + data: Bytes, +} + +impl MemtableChunkReader { + /// Creates a new [MemtableChunkReader] from the given bytes. 
+ pub fn new(data: Bytes) -> Self { + Self { data } + } +} + +impl Length for MemtableChunkReader { + fn len(&self) -> u64 { + self.data.len() as u64 + } +} + +impl ChunkReader for MemtableChunkReader { + type T = Cursor; + + fn get_read(&self, start: u64) -> Result { + let start = start as usize; + if start > self.data.len() { + return Err(ParquetError::IndexOutOfBound(start, self.data.len())); + } + Ok(Cursor::new(self.data.slice(start..))) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + let start = start as usize; + let end = start + length; + if end > self.data.len() { + return Err(ParquetError::IndexOutOfBound(end, self.data.len())); + } + Ok(self.data.slice(start..end)) + } +} diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs index 71e49776c0..bf345c038e 100644 --- a/src/mito2/src/memtable/bulk/part.rs +++ b/src/mito2/src/memtable/bulk/part.rs @@ -967,7 +967,7 @@ impl EncodedBulkPart { Self { data, metadata } } - pub(crate) fn metadata(&self) -> &BulkPartMeta { + pub fn metadata(&self) -> &BulkPartMeta { &self.metadata } @@ -977,7 +977,7 @@ impl EncodedBulkPart { } /// Returns the encoded data. - pub(crate) fn data(&self) -> &Bytes { + pub fn data(&self) -> &Bytes { &self.data } @@ -1121,10 +1121,7 @@ pub struct BulkPartEncoder { } impl BulkPartEncoder { - pub(crate) fn new( - metadata: RegionMetadataRef, - row_group_size: usize, - ) -> Result { + pub fn new(metadata: RegionMetadataRef, row_group_size: usize) -> Result { // TODO(yingwen): Skip arrow schema if needed. let json = metadata.to_json().context(InvalidMetadataSnafu)?; let key_value_meta = @@ -1216,7 +1213,7 @@ impl BulkPartEncoder { } /// Encodes bulk part to a [EncodedBulkPart], returns the encoded data. 
- fn encode_part(&self, part: &BulkPart) -> Result> { + pub fn encode_part(&self, part: &BulkPart) -> Result> { if part.batch.num_rows() == 0 { return Ok(None); } diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs index 1e9d955321..edb9ff52d9 100644 --- a/src/mito2/src/memtable/bulk/part_reader.rs +++ b/src/mito2/src/memtable/bulk/part_reader.rs @@ -30,7 +30,6 @@ use crate::memtable::{MemScanMetrics, MemScanMetricsData}; use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED}; use crate::sst::parquet::file_range::{PreFilterMode, TagDecodeState}; use crate::sst::parquet::flat_format::sequence_column_index; -use crate::sst::parquet::reader::RowGroupReaderContext; /// Iterator for reading data inside a bulk part. pub struct EncodedBulkPartIter { @@ -50,7 +49,7 @@ pub struct EncodedBulkPartIter { impl EncodedBulkPartIter { /// Creates a new [BulkPartIter]. - pub(crate) fn try_new( + pub fn try_new( encoded_part: &EncodedBulkPart, context: BulkIterContextRef, mut row_groups_to_read: VecDeque, diff --git a/src/mito2/src/memtable/bulk/row_group_reader.rs b/src/mito2/src/memtable/bulk/row_group_reader.rs index fccd22db10..40a5b2f85d 100644 --- a/src/mito2/src/memtable/bulk/row_group_reader.rs +++ b/src/mito2/src/memtable/bulk/row_group_reader.rs @@ -12,124 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::ops::Range; use std::sync::Arc; use bytes::Bytes; -use datatypes::arrow::array::RecordBatch; -use datatypes::arrow::error::ArrowError; -use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection}; -use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels}; -use parquet::column::page::{PageIterator, PageReader}; -use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, + ParquetRecordBatchReaderBuilder, RowSelection, +}; +use parquet::file::metadata::ParquetMetaData; use snafu::ResultExt; use crate::error; use crate::error::ReadDataPartSnafu; +use crate::memtable::bulk::chunk_reader::MemtableChunkReader; use crate::memtable::bulk::context::BulkIterContextRef; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; -use crate::sst::parquet::format::ReadFormat; -use crate::sst::parquet::reader::RowGroupReaderContext; -use crate::sst::parquet::row_group::{ColumnChunkIterator, RowGroupBase}; - -/// Helper for reading specific row group inside Memtable Parquet parts. -// This is similar to [mito2::sst::parquet::row_group::InMemoryRowGroup] since -// it's a workaround for lacking of keyword generics. -pub struct MemtableRowGroupPageFetcher<'a> { - /// Shared structs for reading row group. - base: RowGroupBase<'a>, - bytes: Bytes, -} - -impl<'a> MemtableRowGroupPageFetcher<'a> { - pub(crate) fn create( - row_group_idx: usize, - parquet_meta: &'a ParquetMetaData, - bytes: Bytes, - ) -> Self { - Self { - // the cached `column_uncompressed_pages` would never be used in Memtable readers. - base: RowGroupBase::new(parquet_meta, row_group_idx), - bytes, - } - } - - /// Fetches column pages from memory file. 
- pub(crate) fn fetch(&mut self, projection: &ProjectionMask, selection: Option<&RowSelection>) { - if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) { - // Selection provided. - let (fetch_ranges, page_start_offsets) = - self.base - .calc_sparse_read_ranges(projection, offset_index, selection); - if fetch_ranges.is_empty() { - return; - } - let chunk_data = self.fetch_bytes(&fetch_ranges); - - self.base - .assign_sparse_chunk(projection, chunk_data, page_start_offsets); - } else { - let fetch_ranges = self.base.calc_dense_read_ranges(projection); - if fetch_ranges.is_empty() { - // Nothing to fetch. - return; - } - let chunk_data = self.fetch_bytes(&fetch_ranges); - self.base.assign_dense_chunk(projection, chunk_data); - } - } - - fn fetch_bytes(&self, ranges: &[Range]) -> Vec { - ranges - .iter() - .map(|range| self.bytes.slice(range.start as usize..range.end as usize)) - .collect() - } - - /// Creates a page reader to read column at `i`. - fn column_page_reader(&self, i: usize) -> parquet::errors::Result> { - let reader = self.base.column_reader(i)?; - Ok(Box::new(reader)) - } -} - -impl RowGroups for MemtableRowGroupPageFetcher<'_> { - fn num_rows(&self) -> usize { - self.base.row_count - } - - fn column_chunks(&self, i: usize) -> parquet::errors::Result> { - Ok(Box::new(ColumnChunkIterator { - reader: Some(self.column_page_reader(i)), - })) - } - - fn row_groups(&self) -> Box + '_> { - Box::new(std::iter::once(self.base.row_group_metadata())) - } - - fn metadata(&self) -> &ParquetMetaData { - self.base.parquet_metadata() - } -} - -impl RowGroupReaderContext for BulkIterContextRef { - fn map_result( - &self, - result: Result, ArrowError>, - ) -> error::Result> { - result.context(error::DecodeArrowRowGroupSnafu) - } - - fn read_format(&self) -> &ReadFormat { - self.as_ref().read_format() - } -} pub(crate) struct MemtableRowGroupReaderBuilder { projection: ProjectionMask, parquet_metadata: Arc, - field_levels: FieldLevels, + 
arrow_metadata: ArrowReaderMetadata, data: Bytes, } @@ -140,15 +43,16 @@ impl MemtableRowGroupReaderBuilder { parquet_metadata: Arc, data: Bytes, ) -> error::Result { - let parquet_schema_desc = parquet_metadata.file_metadata().schema_descr(); - let hint = Some(context.read_format().arrow_schema().fields()); - let field_levels = - parquet_to_arrow_field_levels(parquet_schema_desc, projection.clone(), hint) + // Create ArrowReaderMetadata for building the reader. + let arrow_reader_options = + ArrowReaderOptions::new().with_schema(context.read_format().arrow_schema().clone()); + let arrow_metadata = + ArrowReaderMetadata::try_new(parquet_metadata.clone(), arrow_reader_options) .context(ReadDataPartSnafu)?; Ok(Self { projection, parquet_metadata, - field_levels, + arrow_metadata, data, }) } @@ -159,23 +63,21 @@ impl MemtableRowGroupReaderBuilder { row_group_idx: usize, row_selection: Option, ) -> error::Result { - let mut row_group = MemtableRowGroupPageFetcher::create( - row_group_idx, - &self.parquet_metadata, - self.data.clone(), - ); - // Fetches data from memory part. Currently, row selection is not supported. - row_group.fetch(&self.projection, row_selection.as_ref()); + let chunk_reader = MemtableChunkReader::new(self.data.clone()); - // Builds the parquet reader. - // Now the row selection is None. - ParquetRecordBatchReader::try_new_with_row_groups( - &self.field_levels, - &row_group, - DEFAULT_READ_BATCH_SIZE, - row_selection, + let mut builder = ParquetRecordBatchReaderBuilder::new_with_metadata( + chunk_reader, + self.arrow_metadata.clone(), ) - .context(ReadDataPartSnafu) + .with_row_groups(vec![row_group_idx]) + .with_projection(self.projection.clone()) + .with_batch_size(DEFAULT_READ_BATCH_SIZE); + + if let Some(selection) = row_selection { + builder = builder.with_row_selection(selection); + } + + builder.build().context(ReadDataPartSnafu) } /// Computes whether to skip field filters for a specific row group based on PreFilterMode. 
diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs index febae46784..662bfd99f6 100644 --- a/src/mito2/src/memtable/partition_tree.rs +++ b/src/mito2/src/memtable/partition_tree.rs @@ -177,16 +177,6 @@ impl Memtable for PartitionTreeMemtable { .fail() } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - predicate: Option, - sequence: Option, - ) -> Result { - self.tree.read(projection, predicate, sequence, None) - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -396,8 +386,6 @@ mod tests { use api::v1::{Mutation, OpType, Rows, SemanticType}; use common_query::prelude::{greptime_timestamp, greptime_value}; use common_time::Timestamp; - use datafusion_common::Column; - use datafusion_expr::{BinaryExpr, Expr, Literal, Operator}; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::Vector; use datatypes::scalars::ScalarVector; @@ -548,7 +536,10 @@ mod tests { let expect = (0..100).collect::>(); let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1); memtable.write(&kvs).unwrap(); - let iter = memtable.iter(Some(&[3]), None, None).unwrap(); + let ranges = memtable + .ranges(Some(&[3]), RangesOptions::default()) + .unwrap(); + let iter = ranges.build(None).unwrap(); let mut v0_all = vec![]; for res in iter { @@ -625,41 +616,6 @@ mod tests { assert_eq!(expect, read); } - #[test] - fn test_memtable_filter() { - let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![0, 1], false)); - // Try to build a memtable via the builder. 
- let memtable = PartitionTreeMemtableBuilder::new( - PartitionTreeConfig { - index_max_keys_per_shard: 40, - ..Default::default() - }, - None, - ) - .build(1, &metadata); - - for i in 0..100 { - let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect(); - let kvs = - memtable_util::build_key_values(&metadata, "hello".to_string(), i, ×tamps, 1); - memtable.write(&kvs).unwrap(); - } - - for i in 0..100 { - let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect(); - let expr = Expr::BinaryExpr(BinaryExpr { - left: Box::new(Expr::Column(Column::from_name("k1"))), - op: Operator::Eq, - right: Box::new((i as u32).lit()), - }); - let iter = memtable - .iter(None, Some(Predicate::new(vec![expr])), None) - .unwrap(); - let read = collect_iter_timestamps(iter); - assert_eq!(timestamps, read); - } - } - #[test] fn test_deserialize_config() { let config = PartitionTreeConfig { @@ -811,7 +767,11 @@ mod tests { )) .unwrap(); - let mut reader = new_memtable.iter(None, None, None).unwrap(); + let mut reader = new_memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = reader.next().unwrap().unwrap(); let pk = codec.decode(batch.primary_key()).unwrap().into_dense(); if let Value::String(s) = &pk[2] { @@ -916,7 +876,14 @@ mod tests { .unwrap(); memtable.freeze().unwrap(); assert_eq!( - collect_kvs(memtable.iter(None, None, None).unwrap(), &metadata), + collect_kvs( + memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(), + &metadata + ), ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect() ); let forked = memtable.fork(2, &metadata); @@ -925,7 +892,14 @@ mod tests { forked.write(&key_values(&metadata, keys.iter())).unwrap(); forked.freeze().unwrap(); assert_eq!( - collect_kvs(forked.iter(None, None, None).unwrap(), &metadata), + collect_kvs( + forked + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(), + &metadata + ), keys.iter() 
.map(|c| (c.to_string(), c.to_string())) .collect() @@ -936,7 +910,14 @@ mod tests { let keys = ["g", "e", "a", "f", "b", "c", "h"]; forked2.write(&key_values(&metadata, keys.iter())).unwrap(); - let kvs = collect_kvs(forked2.iter(None, None, None).unwrap(), &metadata); + let kvs = collect_kvs( + forked2 + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(), + &metadata, + ); let expected = keys .iter() .map(|c| (c.to_string(), c.to_string())) diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs index 4dcaa2bac0..6d91f00361 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable.rs @@ -213,22 +213,6 @@ impl Memtable for SimpleBulkMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - _predicate: Option, - sequence: Option, - ) -> error::Result { - let iter = self.create_iter(projection, sequence)?.build(None)?; - if self.merge_mode == MergeMode::LastNonNull { - let iter = LastNonNullIter::new(iter); - Ok(Box::new(iter)) - } else { - Ok(Box::new(iter)) - } - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -526,7 +510,11 @@ mod tests { )) .unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(2, batch.num_rows()); assert_eq!(2, batch.fields().len()); @@ -551,7 +539,11 @@ mod tests { )) .unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); assert_eq!(2, batch.fields().len()); @@ -565,7 +557,11 @@ mod tests { // Only project column 2 (f1) let projection = vec![2]; - let mut iter = 
memtable.iter(Some(&projection), None, None).unwrap(); + let mut iter = memtable + .ranges(Some(&projection), RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); @@ -592,7 +588,11 @@ mod tests { OpType::Put, )) .unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); // deduped to 1 row @@ -611,7 +611,11 @@ mod tests { let kv = kvs.iter().next().unwrap(); memtable.write_one(kv).unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); } @@ -745,7 +749,11 @@ mod tests { }; memtable.write_bulk(part).unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(2, batch.num_rows()); @@ -764,7 +772,11 @@ mod tests { OpType::Put, ); memtable.write(&kvs).unwrap(); - let mut iter = memtable.iter(None, None, None).unwrap(); + let mut iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(3, batch.num_rows()); assert_eq!( @@ -854,7 +866,15 @@ mod tests { // Filter with sequence 0 should only return first write let mut iter = memtable - .iter(None, None, Some(SequenceRange::LtEq { max: 0 })) + .ranges( + None, + RangesOptions { + sequence: Some(SequenceRange::LtEq { max: 0 }), + ..Default::default() + }, + ) + .unwrap() + .build(None) .unwrap(); let batch = iter.next().unwrap().unwrap(); assert_eq!(1, batch.num_rows()); diff --git 
a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs index b71a86c554..08edebdbb2 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs @@ -12,98 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; -use std::time::Instant; - use store_api::metadata::RegionMetadataRef; -use store_api::storage::{ColumnId, SequenceRange}; -use crate::error; -use crate::memtable::simple_bulk_memtable::{Iter, SimpleBulkMemtable}; -use crate::memtable::time_series::Values; -use crate::memtable::{BoxedBatchIterator, IterBuilder, MemScanMetrics}; -use crate::read::dedup::LastNonNullIter; -use crate::region::options::MergeMode; +use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable; impl SimpleBulkMemtable { pub fn region_metadata(&self) -> RegionMetadataRef { self.region_metadata.clone() } - - pub(crate) fn create_iter( - &self, - projection: Option<&[ColumnId]>, - sequence: Option, - ) -> error::Result { - let mut series = self.series.write().unwrap(); - - let values = if series.is_empty() { - None - } else { - Some(series.compact(&self.region_metadata)?.clone()) - }; - let projection = self.build_projection(projection); - Ok(BatchIterBuilderDeprecated { - region_metadata: self.region_metadata.clone(), - values, - projection, - dedup: self.dedup, - sequence, - merge_mode: self.merge_mode, - }) - } -} - -#[derive(Clone)] -pub(crate) struct BatchIterBuilderDeprecated { - region_metadata: RegionMetadataRef, - values: Option, - projection: HashSet, - sequence: Option, - dedup: bool, - merge_mode: MergeMode, -} - -impl IterBuilder for BatchIterBuilderDeprecated { - fn build(&self, metrics: Option) -> error::Result { - let start_time = Instant::now(); - let Some(values) = self.values.clone() else { - return Ok(Box::new(Iter { batch: None })); - }; - - 
let maybe_batch = values - .to_batch( - &[], - &self.region_metadata, - &self.projection, - self.sequence, - self.dedup, - self.merge_mode, - ) - .map(Some) - .transpose(); - - // Collect metrics from the batch - if let Some(metrics) = metrics { - let (num_rows, num_batches) = match &maybe_batch { - Some(Ok(batch)) => (batch.num_rows(), 1), - _ => (0, 0), - }; - let inner = crate::memtable::MemScanMetricsData { - total_series: 1, - num_rows, - num_batches, - scan_cost: start_time.elapsed(), - }; - metrics.merge_inner(&inner); - } - - let iter = Iter { batch: maybe_batch }; - - if self.merge_mode == MergeMode::LastNonNull { - Ok(Box::new(LastNonNullIter::new(iter))) - } else { - Ok(Box::new(iter)) - } - } } diff --git a/src/mito2/src/memtable/time_partition.rs b/src/mito2/src/memtable/time_partition.rs index 6f11c813cb..ee695aceb8 100644 --- a/src/mito2/src/memtable/time_partition.rs +++ b/src/mito2/src/memtable/time_partition.rs @@ -827,6 +827,7 @@ mod tests { use super::*; use crate::memtable::partition_tree::PartitionTreeMemtableBuilder; use crate::memtable::time_series::TimeSeriesMemtableBuilder; + use crate::memtable::{IterBuilder, RangesOptions}; use crate::test_util::memtable_util::{self, collect_iter_timestamps}; #[test] @@ -852,7 +853,11 @@ mod tests { partitions.list_memtables(&mut memtables); assert_eq!(0, memtables[0].id()); - let iter = memtables[0].iter(None, None, None).unwrap(); + let iter = memtables[0] + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 3000, 5000, 6000, 7000], ×tamps[..]); } @@ -890,7 +895,11 @@ mod tests { let mut memtables = Vec::new(); partitions.list_memtables(&mut memtables); - let iter = memtables[0].iter(None, None, None).unwrap(); + let iter = memtables[0] + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[0, 2000, 3000, 4000, 5000, 
7000], ×tamps[..]); let parts = partitions.list_partitions(); @@ -943,7 +952,12 @@ mod tests { let partitions = new_multi_partitions(&metadata); let parts = partitions.list_partitions(); - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(0, parts[0].memtable.id()); assert_eq!( @@ -955,7 +969,12 @@ mod tests { parts[0].time_range.max_timestamp ); assert_eq!(&[0, 2000, 3000, 4000], ×tamps[..]); - let iter = parts[1].memtable.iter(None, None, None).unwrap(); + let iter = parts[1] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); assert_eq!(1, parts[1].memtable.id()); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[5000, 7000], ×tamps[..]); @@ -1273,7 +1292,12 @@ mod tests { let parts = partitions.list_partitions(); assert_eq!(1, parts.len()); - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 2000, 3000], ×tamps[..]); @@ -1284,11 +1308,21 @@ mod tests { let parts = partitions.list_partitions(); assert_eq!(2, parts.len()); // Check first partition [0, 5000) - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 2000, 3000, 4000], ×tamps[..]); // Check second partition [5000, 10000) - let iter = parts[1].memtable.iter(None, None, None).unwrap(); + let iter = parts[1] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[5000, 6000], ×tamps[..]); @@ 
-1301,7 +1335,12 @@ mod tests { assert_eq!(3, parts.len()); // Check new partition [10000, 15000) - let iter = parts[2].memtable.iter(None, None, None).unwrap(); + let iter = parts[2] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[11000, 12000], ×tamps[..]); @@ -1314,7 +1353,12 @@ mod tests { let parts = partitions.list_partitions(); assert_eq!(1, parts.len()); - let iter = parts[0].memtable.iter(None, None, None).unwrap(); + let iter = parts[0] + .memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let timestamps = collect_iter_timestamps(iter); assert_eq!(&[1000, 5000, 9000], ×tamps[..]); } diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs index 271a9343eb..d3d00d0703 100644 --- a/src/mito2/src/memtable/time_series.rs +++ b/src/mito2/src/memtable/time_series.rs @@ -51,15 +51,18 @@ use crate::memtable::bulk::part::BulkPart; use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable; use crate::memtable::stats::WriteMetrics; use crate::memtable::{ - AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues, - MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext, - MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection, + AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, BoxedRecordBatchIterator, + IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, + MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions, + read_column_ids_from_projection, }; use crate::metrics::{ MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL, READ_STAGE_ELAPSED, }; use crate::read::dedup::LastNonNullIter; +use crate::read::prune::PruneTimeIterator; +use crate::read::scan_region::PredicateGroup; use 
crate::read::{Batch, BatchBuilder, BatchColumn}; use crate::region::options::MergeMode; @@ -267,39 +270,6 @@ impl Memtable for TimeSeriesMemtable { Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - projection: Option<&[ColumnId]>, - filters: Option, - sequence: Option, - ) -> Result { - let projection = if let Some(projection) = projection { - projection.iter().copied().collect() - } else { - self.region_metadata - .field_columns() - .map(|c| c.column_id) - .collect() - }; - - let iter = self.series_set.iter_series( - projection, - filters, - self.dedup, - self.merge_mode, - sequence, - None, - )?; - - if self.merge_mode == MergeMode::LastNonNull { - let iter = LastNonNullIter::new(iter); - Ok(Box::new(iter)) - } else { - Ok(Box::new(iter)) - } - } - fn ranges( &self, projection: Option<&[ColumnId]>, @@ -316,25 +286,20 @@ impl Memtable for TimeSeriesMemtable { .map(|c| c.column_id) .collect() }; - let builder = Box::new(TimeSeriesIterBuilder { - series_set: self.series_set.clone(), - projection, - predicate: predicate.predicate().cloned(), - dedup: self.dedup, - merge_mode: self.merge_mode, - sequence, - }); - let adapter_context = Arc::new(BatchToRecordBatchContext::new( + let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new( self.region_metadata.clone(), read_column_ids, )); - let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch( - self.id, - builder, - predicate, - Some(adapter_context), - )); - + let builder = Box::new(TimeSeriesIterBuilder { + series_set: self.series_set.clone(), + projection, + predicate: predicate.clone(), + dedup: self.dedup, + merge_mode: self.merge_mode, + sequence, + batch_to_record_batch, + }); + let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate)); let range_stats = self.stats(); let range = MemtableRange::new(context, range_stats); Ok(MemtableRanges { @@ -476,7 +441,7 @@ impl SeriesSet { fn iter_series( &self, projection: HashSet, - predicate: Option, + 
predicate: PredicateGroup, dedup: bool, merge_mode: MergeMode, sequence: Option, @@ -493,7 +458,7 @@ impl SeriesSet { self.region_metadata.clone(), self.series.clone(), projection, - predicate, + predicate.predicate().cloned(), primary_key_schema, primary_key_datatypes, self.codec.clone(), @@ -1278,10 +1243,11 @@ impl From for Values { struct TimeSeriesIterBuilder { series_set: SeriesSet, projection: HashSet, - predicate: Option, + predicate: PredicateGroup, dedup: bool, sequence: Option, merge_mode: MergeMode, + batch_to_record_batch: Arc, } impl IterBuilder for TimeSeriesIterBuilder { @@ -1301,6 +1267,25 @@ impl IterBuilder for TimeSeriesIterBuilder { Ok(Box::new(iter)) } } + + fn is_record_batch(&self) -> bool { + true + } + + fn build_record_batch( + &self, + time_range: Option<(Timestamp, Timestamp)>, + metrics: Option, + ) -> Result { + let iter = self.build(metrics)?; + let iter: BoxedBatchIterator = if let Some(time_range) = time_range { + let time_filters = self.predicate.time_filters(); + Box::new(PruneTimeIterator::new(iter, time_range, time_filters)) + } else { + iter + }; + Ok(self.batch_to_record_batch.adapt_iter(iter)) + } } #[cfg(test)] @@ -1798,7 +1783,9 @@ mod tests { *expected_ts.entry(ts).or_default() += if dedup { 1 } else { 2 }; } - let iter = memtable.iter(None, None, None).unwrap(); + let ranges = memtable.ranges(None, RangesOptions::default()).unwrap(); + let range = ranges.ranges.into_values().next().unwrap(); + let iter = range.build_iter().unwrap(); let mut read = HashMap::new(); for ts in iter @@ -1838,7 +1825,11 @@ mod tests { let memtable = TimeSeriesMemtable::new(schema, 42, None, true, MergeMode::LastRow); memtable.write(&kvs).unwrap(); - let iter = memtable.iter(Some(&[3]), None, None).unwrap(); + let iter = memtable + .ranges(Some(&[3]), RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let mut v0_all = vec![]; @@ -1917,7 +1908,11 @@ mod tests { barrier.wait(); for _ in 0..10 { - let iter = memtable.iter(None, 
None, None).unwrap(); + let iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); for batch_result in iter { let _ = batch_result.unwrap(); } @@ -1936,7 +1931,11 @@ mod tests { handle.join().unwrap(); } - let iter = memtable.iter(None, None, None).unwrap(); + let iter = memtable + .ranges(None, RangesOptions::default()) + .unwrap() + .build(None) + .unwrap(); let mut series_count = 0; let mut row_count = 0; @@ -2033,4 +2032,265 @@ mod tests { all_timestamps.sort(); assert_eq!(vec![3, 4, 5, 6, 7], all_timestamps); } + + /// Helper to create a TimeSeriesIterBuilder from a memtable and schema. + fn build_iter_builder( + schema: &RegionMetadataRef, + memtable: &TimeSeriesMemtable, + projection: Option<&[ColumnId]>, + dedup: bool, + merge_mode: MergeMode, + sequence: Option, + ) -> TimeSeriesIterBuilder { + let read_column_ids = read_column_ids_from_projection(schema, projection); + let field_projection = if let Some(projection) = projection { + projection.iter().copied().collect() + } else { + schema.field_columns().map(|c| c.column_id).collect() + }; + let adapter_context = Arc::new(BatchToRecordBatchContext::new( + schema.clone(), + read_column_ids, + )); + TimeSeriesIterBuilder { + series_set: memtable.series_set.clone(), + projection: field_projection, + predicate: PredicateGroup::default(), + dedup, + merge_mode, + sequence, + batch_to_record_batch: adapter_context, + } + } + + #[test] + fn test_iter_builder_build_record_batch_basic() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "hello".to_string(), 42, 10); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let mut iter = builder.build_record_batch(None, None).unwrap(); + let rb = iter.next().transpose().unwrap().unwrap(); + assert_eq!(10, rb.num_rows()); + + let 
rb_schema = rb.schema(); + let col_names: Vec<_> = rb_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + assert_eq!( + col_names, + vec![ + "k0", + "k1", + "v0", + "v1", + "ts", + "__primary_key", + "__sequence", + "__op_type", + ] + ); + + assert!(iter.next().is_none()); + } + + #[test] + fn test_iter_builder_build_record_batch_with_projection() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "test".to_string(), 1, 5); + memtable.write(&kvs).unwrap(); + + // Project only field v0 (column_id=3) and ts (column_id=2). + let projection = vec![2, 3]; + let builder = build_iter_builder( + &schema, + &memtable, + Some(&projection), + true, + MergeMode::LastRow, + None, + ); + + let mut iter = builder.build_record_batch(None, None).unwrap(); + let rb = iter.next().transpose().unwrap().unwrap(); + assert_eq!(5, rb.num_rows()); + + let rb_schema = rb.schema(); + let col_names: Vec<_> = rb_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + // Only projected columns + internal columns. 
+ assert_eq!( + col_names, + vec!["v0", "ts", "__primary_key", "__sequence", "__op_type",] + ); + + assert!(iter.next().is_none()); + } + + #[test] + fn test_iter_builder_build_record_batch_multiple_series() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs_a = build_key_values(&schema, "aaa".to_string(), 1, 3); + let kvs_b = build_key_values(&schema, "bbb".to_string(), 2, 4); + memtable.write(&kvs_a).unwrap(); + memtable.write(&kvs_b).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let iter = builder.build_record_batch(None, None).unwrap(); + let mut total_rows = 0; + for rb in iter { + let rb = rb.unwrap(); + total_rows += rb.num_rows(); + assert_eq!(8, rb.num_columns()); + } + assert_eq!(7, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_dedup() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + // Write same data twice — dedup should keep only one copy per timestamp. 
+ let kvs = build_key_values(&schema, "dup".to_string(), 10, 5); + memtable.write(&kvs).unwrap(); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(5, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_no_dedup() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, false, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "dup".to_string(), 10, 5); + memtable.write(&kvs).unwrap(); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, false, MergeMode::LastRow, None); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(10, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_with_sequence_filter() { + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + // build_key_values creates a mutation with base sequence=0. + // Each row gets sequence = base + row_index, so 5 rows get sequences 0,1,2,3,4. + let kvs = build_key_values(&schema, "seq".to_string(), 1, 5); + memtable.write(&kvs).unwrap(); + + // Filter to sequence > 4 — should yield no rows. + let builder = build_iter_builder( + &schema, + &memtable, + None, + true, + MergeMode::LastRow, + Some(SequenceRange::Gt { min: 4 }), + ); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(0, total_rows); + + // Filter to sequence <= 2 — should yield 3 rows (sequences 0, 1, 2). 
+ let builder = build_iter_builder( + &schema, + &memtable, + None, + true, + MergeMode::LastRow, + Some(SequenceRange::LtEq { max: 2 }), + ); + + let iter = builder.build_record_batch(None, None).unwrap(); + let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum(); + assert_eq!(3, total_rows); + } + + #[test] + fn test_iter_builder_build_record_batch_data_correctness() { + use datatypes::arrow::array::{ + Float64Array, Int64Array, TimestampMillisecondArray, UInt8Array, + }; + + let schema = schema_for_test(); + let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow); + + let kvs = build_key_values(&schema, "check".to_string(), 7, 3); + memtable.write(&kvs).unwrap(); + + let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None); + + let mut iter = builder.build_record_batch(None, None).unwrap(); + let rb = iter.next().transpose().unwrap().unwrap(); + assert_eq!(3, rb.num_rows()); + + // Verify timestamp values. + let ts_col = rb + .column_by_name("ts") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let timestamps: Vec<_> = (0..ts_col.len()).map(|i| ts_col.value(i)).collect(); + assert_eq!(vec![0, 1, 2], timestamps); + + // Verify field v0 values. + let v0_col = rb + .column_by_name("v0") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let v0_values: Vec<_> = (0..v0_col.len()).map(|i| v0_col.value(i)).collect(); + assert_eq!(vec![0, 1, 2], v0_values); + + // Verify field v1 values. + let v1_col = rb + .column_by_name("v1") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let v1_values: Vec<_> = (0..v1_col.len()).map(|i| v1_col.value(i)).collect(); + assert_eq!(vec![0.0, 1.0, 2.0], v1_values); + + // Verify op_type is all Put (1). 
+ let op_col = rb + .column_by_name("__op_type") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..op_col.len() { + assert_eq!(OpType::Put as u8, op_col.value(i)); + } + + assert!(iter.next().is_none()); + } } diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs index 5fbd63ce8b..84931b9f37 100644 --- a/src/mito2/src/read.rs +++ b/src/mito2/src/read.rs @@ -27,6 +27,10 @@ pub mod projection; pub(crate) mod prune; pub(crate) mod pruner; pub mod range; +#[cfg(feature = "test")] +pub mod range_cache; +#[cfg(not(feature = "test"))] +pub(crate) mod range_cache; pub mod scan_region; pub mod scan_util; pub(crate) mod seq_scan; diff --git a/src/mito2/src/read/batch_adapter.rs b/src/mito2/src/read/batch_adapter.rs index 461dbeba69..4698229c5b 100644 --- a/src/mito2/src/read/batch_adapter.rs +++ b/src/mito2/src/read/batch_adapter.rs @@ -59,7 +59,7 @@ impl BatchToRecordBatchAdapter { /// - `metadata`: region metadata describing the schema. /// - `codec`: codec for decoding the encoded primary key bytes. /// - `read_column_ids`: projected column ids to read. 
- pub(crate) fn new( + pub fn new( iter: BoxedBatchIterator, metadata: RegionMetadataRef, codec: Arc, diff --git a/src/mito2/src/read/flat_projection.rs b/src/mito2/src/read/flat_projection.rs index 3e0f1169df..02b4c6b3c1 100644 --- a/src/mito2/src/read/flat_projection.rs +++ b/src/mito2/src/read/flat_projection.rs @@ -18,18 +18,21 @@ use std::sync::Arc; use api::v1::SemanticType; use common_error::ext::BoxedError; -use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu}; +use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu}; use common_recordbatch::{DfRecordBatch, RecordBatch}; -use datatypes::arrow::datatypes::Field; +use datatypes::arrow::array::Array; +use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field}; use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::schema::{Schema, SchemaRef}; +use datatypes::value::Value; use datatypes::vectors::Helper; use snafu::{OptionExt, ResultExt}; use store_api::metadata::{RegionMetadata, RegionMetadataRef}; use store_api::storage::ColumnId; +use crate::cache::CacheStrategy; use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result}; -use crate::read::projection::read_column_ids_from_projection; +use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache}; use crate::sst::parquet::flat_format::sst_column_id_indices; use crate::sst::parquet::format::FormatProjection; use crate::sst::{ @@ -248,12 +251,55 @@ impl FlatProjectionMapper { pub(crate) fn convert( &self, batch: &datatypes::arrow::record_batch::RecordBatch, + cache_strategy: &CacheStrategy, ) -> common_recordbatch::error::Result { if self.is_empty_projection { return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows()); } - let columns = self.project_vectors(batch)?; - RecordBatch::new(self.output_schema.clone(), columns) + // Construct output record batch directly from Arrow arrays to avoid + // Arrow -> Vector -> Arrow roundtrips in 
the hot path. + let mut arrays = Vec::with_capacity(self.output_schema.num_columns()); + for (output_idx, index) in self.batch_indices.iter().enumerate() { + let mut array = batch.column(*index).clone(); + // Cast dictionary values to the target type. + if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() { + // When a string dictionary column contains only a single value, reuse a cached + // repeated vector to avoid repeatedly expanding the dictionary. + if let Some(dict_array) = single_value_string_dictionary( + &array, + &self.output_schema.column_schemas()[output_idx].data_type, + value_type.as_ref(), + ) { + let dict_values = dict_array.values(); + let value = if dict_values.is_null(0) { + Value::Null + } else { + Value::from(datatypes::arrow_array::string_array_value(dict_values, 0)) + }; + + let repeated = repeated_vector_with_cache( + &self.output_schema.column_schemas()[output_idx].data_type, + &value, + batch.num_rows(), + cache_strategy, + )?; + array = repeated.to_arrow_array(); + } else { + let casted = datatypes::arrow::compute::cast(&array, value_type) + .context(ArrowComputeSnafu)?; + array = casted; + } + } + arrays.push(array); + } + + let df_record_batch = + DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays) + .context(NewDfRecordBatchSnafu)?; + Ok(RecordBatch::from_df_record_batch( + self.output_schema.clone(), + df_record_batch, + )) } /// Projects columns from the input batch and converts them into vectors. 
@@ -281,6 +327,28 @@ impl FlatProjectionMapper { } } +fn single_value_string_dictionary<'a>( + array: &'a Arc, + output_type: &ConcreteDataType, + value_type: &ArrowDataType, +) -> Option<&'a datatypes::arrow::array::DictionaryArray> { + if !matches!( + value_type, + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View + ) || !output_type.is_string() + { + return None; + } + + let dict_array = array + .as_any() + .downcast_ref::>()?; + + (dict_array.values().len() == 1 && dict_array.null_count() == 0).then_some(dict_array) +} + /// Returns ids and datatypes of columns of the output batch after applying the `projection`. /// /// It adds the time index column if it doesn't present in the projection. diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs index c2336f218d..1dc4102311 100644 --- a/src/mito2/src/read/last_row.rs +++ b/src/mito2/src/read/last_row.rs @@ -21,6 +21,7 @@ use datatypes::arrow::array::{Array, BinaryArray}; use datatypes::arrow::compute::concat_batches; use datatypes::arrow::record_batch::RecordBatch; use datatypes::vectors::UInt32Vector; +use futures::{Stream, TryStreamExt}; use snafu::ResultExt; use store_api::storage::{FileId, TimeSeriesRowSelector}; @@ -30,7 +31,7 @@ use crate::cache::{ }; use crate::error::{ComputeArrowSnafu, Result}; use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice; -use crate::read::{Batch, BatchReader, BoxedBatchReader}; +use crate::read::{Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream}; use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; use crate::sst::parquet::flat_format::{primary_key_column_index, time_index_column_index}; use crate::sst::parquet::format::{PrimaryKeyArray, primary_key_offsets}; @@ -332,10 +333,10 @@ impl FlatRowGroupLastRowCachedReader { } /// Returns the next RecordBatch. 
- pub(crate) fn next_batch(&mut self) -> Result> { + pub(crate) async fn next_batch(&mut self) -> Result> { match self { FlatRowGroupLastRowCachedReader::Hit(r) => r.next_batch(), - FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch(), + FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch().await, } } @@ -465,12 +466,12 @@ impl FlatRowGroupLastRowReader { Ok(Some(merged)) } - fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { if self.pending.is_full() { return self.flush_pending(); } - while let Some(batch) = self.reader.next_batch()? { + while let Some(batch) = self.reader.next_batch().await? { self.selector.on_next(batch, &mut self.pending)?; if self.pending.is_full() { return self.flush_pending(); @@ -610,6 +611,41 @@ impl FlatLastTimestampSelector { } } +/// Reader that keeps only the last row of each time series from a flat RecordBatch stream. +/// Assumes input is sorted, deduped, and contains no delete operations. +pub(crate) struct FlatLastRowReader { + stream: BoxedRecordBatchStream, + selector: FlatLastTimestampSelector, + pending: BatchBuffer, +} + +impl FlatLastRowReader { + /// Creates a new `FlatLastRowReader`. + pub(crate) fn new(stream: BoxedRecordBatchStream) -> Self { + Self { + stream, + selector: FlatLastTimestampSelector::default(), + pending: BatchBuffer::new(), + } + } + + /// Converts the reader into a stream of RecordBatches. + pub(crate) fn into_stream(mut self) -> impl Stream> { + async_stream::try_stream! { + while let Some(batch) = self.stream.try_next().await? { + self.selector.on_next(batch, &mut self.pending)?; + if self.pending.is_full() { + yield self.pending.concat()?; + } + } + self.selector.finish(&mut self.pending)?; + if !self.pending.is_empty() { + yield self.pending.concat()?; + } + } + } +} + /// Gets the primary key bytes at `index` from the primary key dictionary column. 
fn primary_key_bytes_at(batch: &RecordBatch, pk_col_idx: usize, index: usize) -> &[u8] { let pk_dict = batch diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs index 2c000e7bdc..b5b6904521 100644 --- a/src/mito2/src/read/projection.rs +++ b/src/mito2/src/read/projection.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use api::v1::SemanticType; use common_error::ext::BoxedError; use common_recordbatch::RecordBatch; -use common_recordbatch::error::ExternalSnafu; +use common_recordbatch::error::{DataTypesSnafu, ExternalSnafu}; use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::schema::{Schema, SchemaRef}; use datatypes::value::Value; @@ -37,7 +37,7 @@ use crate::read::Batch; use crate::read::flat_projection::FlatProjectionMapper; /// Only cache vector when its length `<=` this value. -const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384; +pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384; /// Wrapper enum for different projection mapper implementations. pub enum ProjectionMapper { @@ -423,7 +423,7 @@ enum BatchIndex { } /// Gets a vector with repeated values from specific cache or creates a new one. -fn repeated_vector_with_cache( +pub(crate) fn repeated_vector_with_cache( data_type: &ConcreteDataType, value: &Value, num_rows: usize, @@ -450,7 +450,7 @@ fn repeated_vector_with_cache( } /// Returns a vector with repeated values. -fn new_repeated_vector( +pub(crate) fn new_repeated_vector( data_type: &ConcreteDataType, value: &Value, num_rows: usize, @@ -458,8 +458,7 @@ fn new_repeated_vector( let mut mutable_vector = data_type.create_mutable_vector(1); mutable_vector .try_push_value_ref(&value.as_value_ref()) - .map_err(BoxedError::new) - .context(ExternalSnafu)?; + .context(DataTypesSnafu)?; // This requires an additional allocation. 
let base_vector = mutable_vector.to_vector(); Ok(base_vector.replicate(&[num_rows])) @@ -809,6 +808,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; let mapper = ProjectionMapper::all(&metadata, true).unwrap(); assert_eq!([0, 1, 2, 3, 4], mapper.column_ids()); assert_eq!( @@ -823,7 +823,7 @@ mod tests { ); let batch = new_flat_batch(Some(0), &[(1, 1), (2, 2)], &[(3, 3), (4, 4)], 3); - let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap(); + let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap(); let expect = "\ +---------------------+----+----+----+----+ | ts | k0 | k1 | v0 | v1 | @@ -843,6 +843,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; // Columns v1, k0 let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap(); assert_eq!([4, 1], mapper.column_ids()); @@ -856,7 +857,7 @@ mod tests { ); let batch = new_flat_batch(None, &[(1, 1)], &[(4, 4)], 3); - let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap(); + let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap(); let expect = "\ +----+----+ | v1 | k0 | @@ -876,6 +877,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; // Output columns v1, k0. Read also includes v0. 
let mapper = ProjectionMapper::new_with_read_columns( &metadata, @@ -887,7 +889,7 @@ mod tests { assert_eq!([4, 1, 3], mapper.column_ids()); let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3); - let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap(); + let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap(); let expect = "\ +----+----+ | v1 | k0 | @@ -907,6 +909,7 @@ mod tests { .num_fields(2) .build(), ); + let cache = CacheStrategy::Disabled; // Empty projection let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap(); assert_eq!([0], mapper.column_ids()); // Should still read the time index column @@ -918,7 +921,7 @@ mod tests { ); let batch = new_flat_batch(Some(0), &[], &[], 3); - let record_batch = flat_mapper.convert(&batch).unwrap(); + let record_batch = flat_mapper.convert(&batch, &cache).unwrap(); assert_eq!(3, record_batch.num_rows()); assert_eq!(0, record_batch.num_columns()); assert!(record_batch.schema.is_empty()); diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs index 29ded3d49a..6766bf3f38 100644 --- a/src/mito2/src/read/prune.rs +++ b/src/mito2/src/read/prune.rs @@ -80,11 +80,6 @@ impl PruneReader { } } - pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) { - self.source = source; - self.skip_fields = skip_fields; - } - /// Merge metrics with the inner reader and return the merged metrics. 
pub(crate) fn metrics(&self) -> ReaderMetrics { let mut metrics = self.metrics.clone(); @@ -252,10 +247,10 @@ pub enum FlatSource { } impl FlatSource { - fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { match self { - FlatSource::RowGroup(r) => r.next_batch(), - FlatSource::LastRow(r) => r.next_batch(), + FlatSource::RowGroup(r) => r.next_batch().await, + FlatSource::LastRow(r) => r.next_batch().await, } } } @@ -302,13 +297,16 @@ impl FlatPruneReader { self.metrics.clone() } - pub(crate) fn next_batch(&mut self) -> Result> { - while let Some(record_batch) = { + pub(crate) async fn next_batch(&mut self) -> Result> { + loop { let start = std::time::Instant::now(); - let batch = self.source.next_batch()?; + let batch = self.source.next_batch().await?; self.metrics.scan_cost += start.elapsed(); - batch - } { + + let Some(record_batch) = batch else { + return Ok(None); + }; + // Update metrics for the received batch self.metrics.num_rows += record_batch.num_rows(); self.metrics.num_batches += 1; @@ -322,8 +320,6 @@ impl FlatPruneReader { } } } - - Ok(None) } /// Prunes batches by the pushed down predicate and returns RecordBatch. diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs new file mode 100644 index 0000000000..5fc8931691 --- /dev/null +++ b/src/mito2/src/read/range_cache.rs @@ -0,0 +1,856 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Utilities for the partition range scan result cache. + +use std::mem; +use std::sync::Arc; + +use async_stream::try_stream; +use common_time::range::TimestampRange; +use datatypes::arrow::array::{Array, AsArray, DictionaryArray}; +use datatypes::arrow::datatypes::UInt32Type; +use datatypes::arrow::record_batch::RecordBatch; +use datatypes::prelude::ConcreteDataType; +use futures::TryStreamExt; +use store_api::region_engine::PartitionRange; +use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector}; + +use crate::cache::CacheStrategy; +use crate::read::BoxedRecordBatchStream; +use crate::read::scan_region::StreamContext; +use crate::read::scan_util::PartitionMetrics; +use crate::region::options::MergeMode; +use crate::sst::file::FileTimeRange; +use crate::sst::parquet::flat_format::primary_key_column_index; + +/// Fingerprint of the scan request fields that affect partition range cache reuse. +/// +/// It records a normalized view of the projected columns and filters, plus +/// scan options that can change the returned rows. Schema-dependent metadata +/// and the partition expression version are included so cached results are not +/// reused across incompatible schema or partitioning changes. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct ScanRequestFingerprint { + /// Projection and filters without the time index and partition exprs. + inner: Arc, + /// Filters with the time index column. + time_filters: Option>>, + series_row_selector: Option, + append_mode: bool, + filter_deleted: bool, + merge_mode: MergeMode, + /// We keep the partition expr version to ensure we won't reuse the fingerprint after we change the partition expr. + /// We store the version instead of the whole partition expr or partition expr filters. 
+ partition_expr_version: u64, +} + +#[derive(Debug)] +pub(crate) struct ScanRequestFingerprintBuilder { + pub(crate) read_column_ids: Vec, + pub(crate) read_column_types: Vec>, + pub(crate) filters: Vec, + pub(crate) time_filters: Vec, + pub(crate) series_row_selector: Option, + pub(crate) append_mode: bool, + pub(crate) filter_deleted: bool, + pub(crate) merge_mode: MergeMode, + pub(crate) partition_expr_version: u64, +} + +impl ScanRequestFingerprintBuilder { + pub(crate) fn build(self) -> ScanRequestFingerprint { + let Self { + read_column_ids, + read_column_types, + filters, + time_filters, + series_row_selector, + append_mode, + filter_deleted, + merge_mode, + partition_expr_version, + } = self; + + ScanRequestFingerprint { + inner: Arc::new(SharedScanRequestFingerprint { + read_column_ids, + read_column_types, + filters, + }), + time_filters: (!time_filters.is_empty()).then(|| Arc::new(time_filters)), + series_row_selector, + append_mode, + filter_deleted, + merge_mode, + partition_expr_version, + } + } +} + +/// Non-copiable struct of the fingerprint. +#[derive(Debug, PartialEq, Eq, Hash)] +struct SharedScanRequestFingerprint { + /// Column ids of the projection. + read_column_ids: Vec, + /// Column types of the projection. + /// We keep this to ensure we won't reuse the fingerprint after a schema change. + read_column_types: Vec>, + /// Filters without the time index column and region partition exprs. 
+ filters: Vec, +} + +impl ScanRequestFingerprint { + #[cfg(test)] + pub(crate) fn read_column_ids(&self) -> &[ColumnId] { + &self.inner.read_column_ids + } + + #[cfg(test)] + pub(crate) fn read_column_types(&self) -> &[Option] { + &self.inner.read_column_types + } + + #[cfg(test)] + pub(crate) fn filters(&self) -> &[String] { + &self.inner.filters + } + + #[cfg(test)] + pub(crate) fn time_filters(&self) -> &[String] { + self.time_filters + .as_deref() + .map(Vec::as_slice) + .unwrap_or(&[]) + } + + pub(crate) fn without_time_filters(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + time_filters: None, + series_row_selector: self.series_row_selector, + append_mode: self.append_mode, + filter_deleted: self.filter_deleted, + merge_mode: self.merge_mode, + partition_expr_version: self.partition_expr_version, + } + } + + pub(crate) fn estimated_size(&self) -> usize { + mem::size_of::() + + self.inner.read_column_ids.capacity() * mem::size_of::() + + self.inner.read_column_types.capacity() * mem::size_of::>() + + self.inner.filters.capacity() * mem::size_of::() + + self + .inner + .filters + .iter() + .map(|filter| filter.capacity()) + .sum::() + + self.time_filters.as_ref().map_or(0, |filters| { + mem::size_of::>() + + filters.capacity() * mem::size_of::() + + filters + .iter() + .map(|filter| filter.capacity()) + .sum::() + }) + } +} + +/// Cache key for range scan outputs. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct RangeScanCacheKey { + pub(crate) region_id: RegionId, + /// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers. + pub(crate) row_groups: Vec<(FileId, i64)>, + pub(crate) scan: ScanRequestFingerprint, +} + +impl RangeScanCacheKey { + pub(crate) fn estimated_size(&self) -> usize { + mem::size_of::() + + self.row_groups.capacity() * mem::size_of::<(FileId, i64)>() + + self.scan.estimated_size() + } +} + +/// Cached result for one range scan. 
+pub(crate) struct RangeScanCacheValue {
+    /// Record batches produced by the range scan, in the order they were buffered.
+    pub(crate) batches: Vec<RecordBatch>,
+    /// Precomputed size of all batches, accounting for shared dictionary values.
+    estimated_batches_size: usize,
+}
+
+impl RangeScanCacheValue {
+    /// Creates a cache value from the buffered `batches` and their
+    /// precomputed `estimated_batches_size`.
+    pub(crate) fn new(batches: Vec<RecordBatch>, estimated_batches_size: usize) -> Self {
+        Self {
+            batches,
+            estimated_batches_size,
+        }
+    }
+
+    /// Returns the estimated size of this value: the struct itself, the
+    /// allocated capacity of the `batches` vector, and the precomputed size of
+    /// the batches.
+    pub(crate) fn estimated_size(&self) -> usize {
+        mem::size_of::<Self>()
+            + self.batches.capacity() * mem::size_of::<RecordBatch>()
+            + self.estimated_batches_size
+    }
+}
+
+/// Row groups and whether all sources are file-only for a partition range.
+#[allow(dead_code)]
+pub(crate) struct PartitionRangeRowGroups {
+    /// Sorted (file_id, row_group_index) pairs.
+    pub(crate) row_groups: Vec<(FileId, i64)>,
+    /// `false` if at least one row group index of the range is not a file range index.
+    pub(crate) only_file_sources: bool,
+}
+
+/// Collects (file_id, row_group_index) pairs from a partition range's row group indices.
+///
+/// Indices that are not file range indices are skipped and clear
+/// `only_file_sources`. The returned pairs are sorted by file id bytes, then by
+/// row group index.
+#[allow(dead_code)]
+pub(crate) fn collect_partition_range_row_groups(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> PartitionRangeRowGroups {
+    let range_meta = &stream_ctx.ranges[part_range.identifier];
+    let mut row_groups = Vec::new();
+    let mut only_file_sources = true;
+
+    for index in &range_meta.row_group_indices {
+        if stream_ctx.is_file_range_index(*index) {
+            let file_id = stream_ctx.input.file_from_index(*index).file_id().file_id();
+            row_groups.push((file_id, index.row_group_index));
+        } else {
+            only_file_sources = false;
+        }
+    }
+
+    // Sort so the same set of row groups always yields the same key.
+    row_groups.sort_unstable_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()).then(a.1.cmp(&b.1)));
+
+    PartitionRangeRowGroups {
+        row_groups,
+        only_file_sources,
+    }
+}
+
+/// Builds a cache key for the given partition range if it is eligible for caching.
+///
+/// Returns `None` when the context has no scan fingerprint, when dyn filters
+/// are present, when the range reads from any non-file source, or when the
+/// range has no row groups.
+#[allow(dead_code)]
+pub(crate) fn build_range_cache_key(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> Option<RangeScanCacheKey> {
+    let fingerprint = stream_ctx.scan_fingerprint.as_ref()?;
+
+    // Dyn filters can change at runtime, so we can't cache when they're present.
+    let has_dyn_filters = stream_ctx
+        .input
+        .predicate_group()
+        .predicate_without_region()
+        .is_some_and(|p| !p.dyn_filters().is_empty());
+    if has_dyn_filters {
+        return None;
+    }
+
+    let rg = collect_partition_range_row_groups(stream_ctx, part_range);
+    if !rg.only_file_sources || rg.row_groups.is_empty() {
+        return None;
+    }
+
+    let range_meta = &stream_ctx.ranges[part_range.identifier];
+    // When the query time range covers the whole partition range, the time
+    // filters are dropped from the key so such queries can share an entry.
+    let scan = if query_time_range_covers_partition_range(
+        stream_ctx.input.time_range.as_ref(),
+        range_meta.time_range,
+    ) {
+        fingerprint.without_time_filters()
+    } else {
+        fingerprint.clone()
+    };
+
+    Some(RangeScanCacheKey {
+        region_id: stream_ctx.input.region_metadata().region_id,
+        row_groups: rg.row_groups,
+        scan,
+    })
+}
+
+/// Returns true if the query has no time range limit, or if the query time
+/// range contains both the start and the end timestamp of the partition range.
+#[allow(dead_code)]
+fn query_time_range_covers_partition_range(
+    query_time_range: Option<&TimestampRange>,
+    partition_time_range: FileTimeRange,
+) -> bool {
+    let Some(query_time_range) = query_time_range else {
+        return true;
+    };
+
+    let (part_start, part_end) = partition_time_range;
+    query_time_range.contains(&part_start) && query_time_range.contains(&part_end)
+}
+
+/// Returns a stream that replays cached record batches.
+#[allow(dead_code)]
+pub(crate) fn cached_flat_range_stream(value: Arc<RangeScanCacheValue>) -> BoxedRecordBatchStream {
+    Box::pin(futures::stream::iter(
+        value.batches.clone().into_iter().map(Ok),
+    ))
+}
+
+/// Returns true if two primary key dictionary arrays share the same underlying
+/// values buffers by pointer comparison.
+///
+/// The primary key column is always `DictionaryArray<UInt32Type>` with `Binary` values.
+fn pk_values_ptr_eq(a: &DictionaryArray, b: &DictionaryArray) -> bool { + let a = a.values().as_binary::(); + let b = b.values().as_binary::(); + let values_eq = a.values().ptr_eq(b.values()) && a.offsets().ptr_eq(b.offsets()); + match (a.nulls(), b.nulls()) { + (Some(a), Some(b)) => values_eq && a.inner().ptr_eq(b.inner()), + (None, None) => values_eq, + _ => false, + } +} + +/// Buffers record batches for caching, tracking memory size while deduplicating +/// shared dictionary values across batches. +/// +/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK +/// column's dictionary values are pointer-equal across batches, we assume all +/// dictionary columns share their values and deduct the total dictionary values size. +struct CacheBatchBuffer { + batches: Vec, + /// Running total of batch memory. + total_size: usize, + /// The first batch's PK dictionary array, for pointer comparison. + /// `None` if no dictionary PK column exists or no batch has been added yet. + first_pk_dict: Option>, + /// Sum of `get_array_memory_size()` of all dictionary value arrays from the first batch. + total_dict_values_size: usize, + /// Whether the PK dictionary is still shared across all batches seen so far. 
+ shared: bool, +} + +impl CacheBatchBuffer { + fn new() -> Self { + Self { + batches: Vec::new(), + total_size: 0, + first_pk_dict: None, + total_dict_values_size: 0, + shared: true, + } + } + + fn push(&mut self, batch: RecordBatch) { + if self.batches.is_empty() { + self.init_first_batch(&batch); + } else { + self.add_subsequent_batch(&batch); + } + self.batches.push(batch); + } + + fn init_first_batch(&mut self, batch: &RecordBatch) { + self.total_size += batch.get_array_memory_size(); + + let pk_col_idx = primary_key_column_index(batch.num_columns()); + let mut total_dict_values_size = 0; + for col_idx in 0..batch.num_columns() { + let col = batch.column(col_idx); + if let Some(dict) = col.as_any().downcast_ref::>() { + total_dict_values_size += dict.values().get_array_memory_size(); + if col_idx == pk_col_idx { + self.first_pk_dict = Some(dict.clone()); + } + } + } + self.total_dict_values_size = total_dict_values_size; + } + + fn add_subsequent_batch(&mut self, batch: &RecordBatch) { + let batch_size = batch.get_array_memory_size(); + + if self.shared + && let Some(first_pk_dict) = &self.first_pk_dict + { + let pk_col_idx = primary_key_column_index(batch.num_columns()); + let col = batch.column(pk_col_idx); + if let Some(dict) = col.as_any().downcast_ref::>() + && pk_values_ptr_eq(first_pk_dict, dict) + { + // PK dict is shared, deduct all dict values sizes. + self.total_size += batch_size - self.total_dict_values_size; + return; + } + // Dictionary diverged. + self.shared = false; + } + + self.total_size += batch_size; + } + + fn estimated_batches_size(&self) -> usize { + self.total_size + } + + fn into_batches(self) -> Vec { + self.batches + } +} + +/// Wraps a stream to cache its output for future range cache hits. 
+#[allow(dead_code)] +pub(crate) fn cache_flat_range_stream( + mut stream: BoxedRecordBatchStream, + cache_strategy: CacheStrategy, + key: RangeScanCacheKey, + part_metrics: PartitionMetrics, +) -> BoxedRecordBatchStream { + Box::pin(try_stream! { + let mut buffer = CacheBatchBuffer::new(); + while let Some(batch) = stream.try_next().await? { + buffer.push(batch.clone()); + yield batch; + } + + let estimated_size = buffer.estimated_batches_size(); + let batches = buffer.into_batches(); + let value = Arc::new(RangeScanCacheValue::new(batches, estimated_size)); + part_metrics.inc_range_cache_size(key.estimated_size() + value.estimated_size()); + cache_strategy.put_range_result(key, value); + }) +} + +/// Creates a `cache_flat_range_stream` with dummy internals for benchmarking. +/// +/// This avoids exposing `RangeScanCacheKey`, `ScanRequestFingerprint`, and +/// `PartitionMetrics` publicly. +#[cfg(feature = "test")] +pub fn bench_cache_flat_range_stream( + stream: BoxedRecordBatchStream, + cache_size_bytes: u64, + region_id: RegionId, +) -> BoxedRecordBatchStream { + use std::time::Instant; + + use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; + + use crate::region::options::MergeMode; + + let cache_manager = Arc::new( + crate::cache::CacheManager::builder() + .range_result_cache_size(cache_size_bytes) + .build(), + ); + let cache_strategy = CacheStrategy::EnableAll(cache_manager); + + let fingerprint = ScanRequestFingerprintBuilder { + read_column_ids: vec![], + read_column_types: vec![], + filters: vec![], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: false, + merge_mode: MergeMode::LastRow, + partition_expr_version: 0, + } + .build(); + + let key = RangeScanCacheKey { + region_id, + row_groups: vec![], + scan: fingerprint, + }; + + let metrics_set = ExecutionPlanMetricsSet::new(); + let part_metrics = + PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set); + + 
cache_flat_range_stream(stream, cache_strategy, key, part_metrics) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::time::Instant; + + use common_time::Timestamp; + use common_time::range::TimestampRange; + use common_time::timestamp::TimeUnit; + use datafusion_common::ScalarValue; + use datafusion_expr::{Expr, col, lit}; + use smallvec::smallvec; + use store_api::storage::FileId; + + use super::*; + use crate::cache::CacheManager; + use crate::read::projection::ProjectionMapper; + use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex}; + use crate::read::scan_region::{PredicateGroup, ScanInput}; + use crate::test_util::memtable_util::metadata_with_primary_key; + use crate::test_util::scheduler_util::SchedulerEnv; + use crate::test_util::sst_util::sst_file_handle_with_file_id; + + fn test_cache_strategy() -> CacheStrategy { + CacheStrategy::EnableAll(Arc::new( + CacheManager::builder() + .range_result_cache_size(1024) + .build(), + )) + } + + async fn new_stream_context( + filters: Vec, + query_time_range: Option, + partition_time_range: FileTimeRange, + ) -> (StreamContext, PartitionRange) { + let env = SchedulerEnv::new().await; + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); + let file_id = FileId::random(); + let file = sst_file_handle_with_file_id( + file_id, + partition_time_range.0.value(), + partition_time_range.1.value(), + ); + let input = ScanInput::new(env.access_layer.clone(), mapper) + .with_predicate(predicate) + .with_time_range(query_time_range) + .with_files(vec![file]) + .with_cache(test_cache_strategy()) + .with_flat_format(true); + let range_meta = RangeMeta { + time_range: partition_time_range, + indices: smallvec![SourceIndex { + index: 0, + num_row_groups: 1, + }], + row_group_indices: smallvec![RowGroupIndex { + index: 0, + 
row_group_index: 0, + }], + num_rows: 10, + }; + let partition_range = range_meta.new_partition_range(0); + let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input); + let stream_ctx = StreamContext { + input, + ranges: vec![range_meta], + scan_fingerprint, + query_start: Instant::now(), + }; + + (stream_ctx, partition_range) + } + + /// Helper to create a timestamp millisecond literal. + fn ts_lit(val: i64) -> Expr { + lit(ScalarValue::TimestampMillisecond(Some(val), None)) + } + + #[tokio::test] + async fn strips_time_only_filters_when_query_covers_partition_range() { + let (stream_ctx, part_range) = new_stream_context( + vec![ + col("ts").gt_eq(ts_lit(1000)), + col("ts").lt(ts_lit(2001)), + col("ts").is_not_null(), + col("k0").eq(lit("foo")), + ], + TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond), + ( + Timestamp::new_millisecond(1000), + Timestamp::new_millisecond(2000), + ), + ) + .await; + + let key = build_range_cache_key(&stream_ctx, &part_range).unwrap(); + + // Range-reducible time filters should be cleared when query covers partition range. + assert!(key.scan.time_filters().is_empty()); + // Non-range time predicates stay in filters. + let mut expected_filters = [ + col("k0").eq(lit("foo")).to_string(), + col("ts").is_not_null().to_string(), + ]; + expected_filters.sort_unstable(); + assert_eq!(key.scan.filters(), expected_filters.as_slice()); + } + + #[tokio::test] + async fn preserves_time_filters_when_query_does_not_cover_partition_range() { + let (stream_ctx, part_range) = new_stream_context( + vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))], + TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond), + ( + Timestamp::new_millisecond(1000), + Timestamp::new_millisecond(2000), + ), + ) + .await; + + let key = build_range_cache_key(&stream_ctx, &part_range).unwrap(); + + // Time filters should be preserved when query does not cover partition range. 
+ assert_eq!( + key.scan.time_filters(), + [col("ts").gt_eq(ts_lit(1000)).to_string()].as_slice() + ); + assert_eq!( + key.scan.filters(), + [col("k0").eq(lit("foo")).to_string()].as_slice() + ); + } + + #[tokio::test] + async fn strips_time_only_filters_when_query_has_no_time_range_limit() { + let (stream_ctx, part_range) = new_stream_context( + vec![ + col("ts").gt_eq(ts_lit(1000)), + col("ts").is_not_null(), + col("k0").eq(lit("foo")), + ], + None, + ( + Timestamp::new_millisecond(1000), + Timestamp::new_millisecond(2000), + ), + ) + .await; + + let key = build_range_cache_key(&stream_ctx, &part_range).unwrap(); + + // Range-reducible time filters should be cleared when query has no time range limit. + assert!(key.scan.time_filters().is_empty()); + // Non-range time predicates stay in filters. + let mut expected_filters = [ + col("k0").eq(lit("foo")).to_string(), + col("ts").is_not_null().to_string(), + ]; + expected_filters.sort_unstable(); + assert_eq!(key.scan.filters(), expected_filters.as_slice()); + } + + #[test] + fn normalizes_and_clears_time_filters() { + let normalized = ScanRequestFingerprintBuilder { + read_column_ids: vec![1, 2], + read_column_types: vec![None, None], + filters: vec!["k0 = 'foo'".to_string()], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: true, + merge_mode: MergeMode::LastRow, + partition_expr_version: 0, + } + .build(); + + assert!(normalized.time_filters().is_empty()); + + let fingerprint = ScanRequestFingerprintBuilder { + read_column_ids: vec![1, 2], + read_column_types: vec![None, None], + filters: vec!["k0 = 'foo'".to_string()], + time_filters: vec!["ts >= 1000".to_string()], + series_row_selector: Some(TimeSeriesRowSelector::LastRow), + append_mode: false, + filter_deleted: true, + merge_mode: MergeMode::LastRow, + partition_expr_version: 7, + } + .build(); + + let reset = fingerprint.without_time_filters(); + + assert_eq!(reset.read_column_ids(), fingerprint.read_column_ids()); 
+ assert_eq!(reset.read_column_types(), fingerprint.read_column_types()); + assert_eq!(reset.filters(), fingerprint.filters()); + assert!(reset.time_filters().is_empty()); + assert_eq!(reset.series_row_selector, fingerprint.series_row_selector); + assert_eq!(reset.append_mode, fingerprint.append_mode); + assert_eq!(reset.filter_deleted, fingerprint.filter_deleted); + assert_eq!(reset.merge_mode, fingerprint.merge_mode); + assert_eq!( + reset.partition_expr_version, + fingerprint.partition_expr_version + ); + } + + /// Creates a test schema with 5 columns where the primary key dictionary column + /// is at index 2 (`num_columns - 3`), matching the flat format layout. + /// + /// Layout: `[field0: Int64, field1: Int64, pk: Dictionary, ts: Int64, seq: Int64]` + fn dict_test_schema() -> Arc { + use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; + Arc::new(Schema::new(vec![ + Field::new("field0", ArrowDataType::Int64, false), + Field::new("field1", ArrowDataType::Int64, false), + Field::new( + "pk", + ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt32), + Box::new(ArrowDataType::Binary), + ), + false, + ), + Field::new("ts", ArrowDataType::Int64, false), + Field::new("seq", ArrowDataType::Int64, false), + ])) + } + + /// Helper to create a record batch with a dictionary column at the primary key position. 
+ fn make_dict_batch( + schema: Arc, + dict_values: &datatypes::arrow::array::BinaryArray, + keys: &[u32], + int_values: &[i64], + ) -> RecordBatch { + use datatypes::arrow::array::{Int64Array, UInt32Array}; + + let key_array = UInt32Array::from(keys.to_vec()); + let dict_array: DictionaryArray = + DictionaryArray::new(key_array, Arc::new(dict_values.clone())); + let int_array = Int64Array::from(int_values.to_vec()); + let zeros = Int64Array::from(vec![0i64; int_values.len()]); + RecordBatch::try_new( + schema, + vec![ + Arc::new(zeros.clone()), + Arc::new(int_array), + Arc::new(dict_array), + Arc::new(zeros.clone()), + Arc::new(zeros), + ], + ) + .unwrap() + } + + /// Computes the total `get_array_memory_size()` of all dictionary value arrays in a batch. + fn compute_total_dict_values_size(batch: &RecordBatch) -> usize { + batch + .columns() + .iter() + .filter_map(|col| { + col.as_any() + .downcast_ref::>() + .map(|dict| dict.values().get_array_memory_size()) + }) + .sum() + } + + #[test] + fn cache_batch_buffer_empty() { + let buffer = CacheBatchBuffer::new(); + assert_eq!(buffer.estimated_batches_size(), 0); + assert!(buffer.into_batches().is_empty()); + } + + #[test] + fn cache_batch_buffer_single_batch() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]); + let batch = make_dict_batch(schema, &dict_values, &[0, 1, 2], &[10, 20, 30]); + + let full_size = batch.get_array_memory_size(); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch); + assert_eq!(buffer.estimated_batches_size(), full_size); + assert_eq!(buffer.into_batches().len(), 1); + } + + #[test] + fn cache_batch_buffer_shared_dictionary() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]); + + // Two batches sharing the same dictionary values array. 
+ let batch1 = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]); + let batch2 = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]); + + let batch1_full = batch1.get_array_memory_size(); + let batch2_full = batch2.get_array_memory_size(); + + // The total dictionary values size that should be deduplicated for the second batch. + let dict_values_size = compute_total_dict_values_size(&batch2); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch1); + buffer.push(batch2); + + // Second batch's dict values should not be counted again. + assert_eq!( + buffer.estimated_batches_size(), + batch1_full + batch2_full - dict_values_size + ); + assert_eq!(buffer.into_batches().len(), 2); + } + + #[test] + fn cache_batch_buffer_non_shared_dictionary() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let dict_values1 = BinaryArray::from_vec(vec![b"a", b"b"]); + let dict_values2 = BinaryArray::from_vec(vec![b"x", b"y"]); + + let batch1 = make_dict_batch(schema.clone(), &dict_values1, &[0, 1], &[10, 20]); + let batch2 = make_dict_batch(schema, &dict_values2, &[0, 1], &[30, 40]); + + let batch1_full = batch1.get_array_memory_size(); + let batch2_full = batch2.get_array_memory_size(); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch1); + buffer.push(batch2); + + // Different dictionaries: full size for both. 
+ assert_eq!(buffer.estimated_batches_size(), batch1_full + batch2_full); + } + + #[test] + fn cache_batch_buffer_shared_then_diverged() { + use datatypes::arrow::array::BinaryArray; + + let schema = dict_test_schema(); + let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]); + let different_values = BinaryArray::from_vec(vec![b"x", b"y"]); + + let batch1 = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]); + let batch2 = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]); + let batch3 = make_dict_batch(schema, &different_values, &[0], &[3]); + + let size1 = batch1.get_array_memory_size(); + let size2 = batch2.get_array_memory_size(); + let size3 = batch3.get_array_memory_size(); + + let dict_values_size = compute_total_dict_values_size(&batch2); + + let mut buffer = CacheBatchBuffer::new(); + buffer.push(batch1); + buffer.push(batch2); + buffer.push(batch3); + + // batch2 shares dict with batch1 (dedup), batch3 does not (full size). + assert_eq!( + buffer.estimated_batches_size(), + size1 + (size2 - dict_values_size) + size3 + ); + } +} diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 5d934afd2d..e7cae7e7b8 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -40,7 +40,7 @@ use store_api::region_engine::{PartitionRange, RegionScannerRef}; use store_api::storage::{ ColumnId, RegionId, ScanRequest, SequenceRange, TimeSeriesDistribution, TimeSeriesRowSelector, }; -use table::predicate::{Predicate, build_time_range_predicate}; +use table::predicate::{Predicate, build_time_range_predicate, extract_time_range_from_expr}; use tokio::sync::{Semaphore, mpsc}; use tokio_stream::wrappers::ReceiverStream; @@ -55,6 +55,7 @@ use crate::metrics::READ_SST_COUNT; use crate::read::compat::{self, CompatBatch, FlatCompatBatch, PrimaryKeyCompatBatch}; use crate::read::projection::ProjectionMapper; use crate::read::range::{FileRangeBuilder, MemRangeBuilder, RangeMeta, 
RowGroupIndex}; +use crate::read::range_cache::ScanRequestFingerprint; use crate::read::seq_scan::SeqScan; use crate::read::series_scan::SeriesScan; use crate::read::stream::ScanBatchStream; @@ -815,7 +816,7 @@ pub struct ScanInput { /// But this read columns might also include non-projected columns needed for filtering. pub(crate) read_column_ids: Vec, /// Time range filter for time index. - time_range: Option, + pub(crate) time_range: Option, /// Predicate to push down. pub(crate) predicate: PredicateGroup, /// Region partition expr applied at read time. @@ -1417,6 +1418,105 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { } } +/// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible +/// for partition range caching. +pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option { + let eligible = input.flat_format + && !input.compaction + && !input.files.is_empty() + && matches!(input.cache_strategy, CacheStrategy::EnableAll(_)); + + if !eligible { + return None; + } + + let metadata = input.region_metadata(); + let tag_names: HashSet<&str> = metadata + .column_metadatas + .iter() + .filter(|col| col.semantic_type == SemanticType::Tag) + .map(|col| col.column_schema.name.as_str()) + .collect(); + + let time_index = metadata.time_index_column(); + let time_index_name = time_index.column_schema.name.clone(); + let ts_col_unit = time_index + .column_schema + .data_type + .as_timestamp() + .expect("Time index must have timestamp-compatible type") + .unit(); + + let exprs = input + .predicate_group() + .predicate_without_region() + .map(|predicate| predicate.exprs()) + .unwrap_or_default(); + + let mut filters = Vec::new(); + let mut time_filters = Vec::new(); + let mut has_tag_filter = false; + let mut columns = HashSet::new(); + + for expr in exprs { + columns.clear(); + let is_time_only = match expr_to_columns(expr, &mut columns) { + Ok(()) if !columns.is_empty() => { + has_tag_filter |= columns + 
.iter() + .any(|col| tag_names.contains(col.name.as_str())); + columns.iter().all(|col| col.name == time_index_name) + } + _ => false, + }; + + if is_time_only + && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some() + { + // Range-reducible time predicates can be safely dropped from the + // cache key when the query time range covers the partition range. + time_filters.push(expr.to_string()); + } else { + // Non-time filters and non-range time predicates (those that + // extract_time_range_from_expr cannot convert to a TimestampRange) + // always stay in the cache key. + filters.push(expr.to_string()); + } + } + + if !has_tag_filter { + // We only cache requests that have tag filters to avoid caching all series. + return None; + } + + // Ensure the filters are sorted for consistent fingerprinting. + filters.sort_unstable(); + time_filters.sort_unstable(); + + Some( + crate::read::range_cache::ScanRequestFingerprintBuilder { + read_column_ids: input.read_column_ids.clone(), + read_column_types: input + .read_column_ids + .iter() + .map(|id| { + metadata + .column_by_id(*id) + .map(|col| col.column_schema.data_type.clone()) + }) + .collect(), + filters, + time_filters, + series_row_selector: input.series_row_selector, + append_mode: input.append_mode, + filter_deleted: input.filter_deleted, + merge_mode: input.merge_mode, + partition_expr_version: metadata.partition_expr_version, + } + .build(), + ) +} + /// Context shared by different streams from a scanner. /// It contains the input and ranges to scan. pub struct StreamContext { @@ -1424,6 +1524,10 @@ pub struct StreamContext { pub input: ScanInput, /// Metadata for partition ranges. pub(crate) ranges: Vec, + /// Precomputed scan fingerprint for partition range caching. + /// `None` when the scan is not eligible for caching. + #[allow(dead_code)] + pub(crate) scan_fingerprint: Option, // Metrics: /// The start time of the query. 
@@ -1436,10 +1540,12 @@ impl StreamContext { let query_start = input.query_start.unwrap_or_else(Instant::now); let ranges = RangeMeta::seq_scan_ranges(&input); READ_SST_COUNT.observe(input.num_files() as f64); + let scan_fingerprint = build_scan_fingerprint(&input); Self { input, ranges, + scan_fingerprint, query_start, } } @@ -1449,10 +1555,12 @@ impl StreamContext { let query_start = input.query_start.unwrap_or_else(Instant::now); let ranges = RangeMeta::unordered_scan_ranges(&input); READ_SST_COUNT.observe(input.num_files() as f64); + let scan_fingerprint = build_scan_fingerprint(&input); Self { input, ranges, + scan_fingerprint, query_start, } } @@ -1762,11 +1870,17 @@ mod tests { use std::sync::Arc; use datafusion::physical_plan::expressions::lit as physical_lit; + use datafusion_common::ScalarValue; use datafusion_expr::{col, lit}; - use store_api::storage::ScanRequest; + use datatypes::value::Value; + use partition::expr::col as partition_col; + use store_api::metadata::RegionMetadataBuilder; + use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector}; use super::*; + use crate::cache::CacheManager; use crate::memtable::time_partition::TimePartitions; + use crate::read::range_cache::ScanRequestFingerprintBuilder; use crate::region::options::RegionOptions; use crate::region::version::VersionBuilder; use crate::sst::FormatType; @@ -1804,6 +1918,26 @@ mod tests { ) } + async fn new_scan_input(metadata: RegionMetadataRef, filters: Vec) -> ScanInput { + let env = SchedulerEnv::new().await; + let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(); + let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap(); + let file = FileHandle::new( + crate::sst::file::FileMeta::default(), + Arc::new(crate::sst::file_purger::NoopFilePurger), + ); + + ScanInput::new(env.access_layer.clone(), mapper) + .with_predicate(predicate) + .with_cache(CacheStrategy::EnableAll(Arc::new( + CacheManager::builder() + 
.range_result_cache_size(1024) + .build(), + ))) + .with_flat_format(true) + .with_files(vec![file]) + } + #[tokio::test] async fn test_build_read_column_ids_includes_filters() { let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); @@ -1923,6 +2057,138 @@ mod tests { assert!(scan_region.use_flat_format()); } + /// Helper to create a timestamp millisecond literal. + fn ts_lit(val: i64) -> datafusion_expr::Expr { + lit(ScalarValue::TimestampMillisecond(Some(val), None)) + } + + #[tokio::test] + async fn test_build_scan_fingerprint_for_eligible_scan() { + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let input = new_scan_input( + metadata.clone(), + vec![ + col("ts").gt_eq(ts_lit(1000)), + col("k0").eq(lit("foo")), + col("v0").gt(lit(1)), + ], + ) + .await + .with_distribution(Some(TimeSeriesDistribution::PerSeries)) + .with_series_row_selector(Some(TimeSeriesRowSelector::LastRow)) + .with_merge_mode(MergeMode::LastNonNull) + .with_filter_deleted(false); + + let fingerprint = build_scan_fingerprint(&input).unwrap(); + + let expected = ScanRequestFingerprintBuilder { + read_column_ids: input.read_column_ids.clone(), + read_column_types: vec![ + metadata + .column_by_id(0) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(2) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(3) + .map(|col| col.column_schema.data_type.clone()), + ], + filters: vec![ + col("k0").eq(lit("foo")).to_string(), + col("v0").gt(lit(1)).to_string(), + ], + time_filters: vec![col("ts").gt_eq(ts_lit(1000)).to_string()], + series_row_selector: Some(TimeSeriesRowSelector::LastRow), + append_mode: false, + filter_deleted: false, + merge_mode: MergeMode::LastNonNull, + partition_expr_version: 0, + } + .build(); + assert_eq!(expected, fingerprint); + } + + #[tokio::test] + async fn test_build_scan_fingerprint_requires_tag_filter() { + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); 
+ let input = new_scan_input( + metadata, + vec![col("ts").gt_eq(lit(1000)), col("v0").gt(lit(1))], + ) + .await; + + assert!(build_scan_fingerprint(&input).is_none()); + } + + #[tokio::test] + async fn test_build_scan_fingerprint_respects_scan_eligibility() { + let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); + let filters = vec![col("k0").eq(lit("foo"))]; + + let disabled = ScanInput::new( + SchedulerEnv::new().await.access_layer.clone(), + ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(), + ) + .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap()) + .with_flat_format(true); + assert!(build_scan_fingerprint(&disabled).is_none()); + + let non_flat = new_scan_input(metadata.clone(), filters.clone()) + .await + .with_flat_format(false); + assert!(build_scan_fingerprint(&non_flat).is_none()); + + let compaction = new_scan_input(metadata.clone(), filters.clone()) + .await + .with_compaction(true); + assert!(build_scan_fingerprint(&compaction).is_none()); + + // No files to read. 
+ let no_files = new_scan_input(metadata, filters).await.with_files(vec![]); + assert!(build_scan_fingerprint(&no_files).is_none()); + } + + #[tokio::test] + async fn test_build_scan_fingerprint_tracks_schema_and_partition_expr_changes() { + let base = metadata_with_primary_key(vec![0, 1], false); + let mut builder = RegionMetadataBuilder::from_existing(base); + let partition_expr = partition_col("k0") + .gt_eq(Value::String("foo".into())) + .as_json_str() + .unwrap(); + builder.partition_expr_json(Some(partition_expr)); + let metadata = Arc::new(builder.build_without_validation().unwrap()); + + let input = new_scan_input(metadata.clone(), vec![col("k0").eq(lit("foo"))]).await; + let fingerprint = build_scan_fingerprint(&input).unwrap(); + + let expected = ScanRequestFingerprintBuilder { + read_column_ids: input.read_column_ids.clone(), + read_column_types: vec![ + metadata + .column_by_id(0) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(2) + .map(|col| col.column_schema.data_type.clone()), + metadata + .column_by_id(3) + .map(|col| col.column_schema.data_type.clone()), + ], + filters: vec![col("k0").eq(lit("foo")).to_string()], + time_filters: vec![], + series_row_selector: None, + append_mode: false, + filter_deleted: true, + merge_mode: MergeMode::LastRow, + partition_expr_version: metadata.partition_expr_version, + } + .build(); + assert_eq!(expected, fingerprint); + assert_ne!(0, metadata.partition_expr_version); + } + #[test] fn test_update_dyn_filters_with_empty_base_predicates() { let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false)); diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 0ee6a4437d..9bf1c17276 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -247,6 +247,12 @@ pub(crate) struct ScanMetricsSet { num_range_builders: isize, /// Peak number of file range builders. 
num_peak_range_builders: isize, + /// Total bytes added to the range cache during this scan. + range_cache_size: usize, + /// Number of range cache hits during this scan. + range_cache_hit: usize, + /// Number of range cache misses during this scan. + range_cache_miss: usize, } /// Wrapper for file metrics that compares by total cost in reverse order. @@ -345,6 +351,9 @@ impl fmt::Debug for ScanMetricsSet { build_ranges_peak_mem_size, num_range_builders: _, num_peak_range_builders, + range_cache_size, + range_cache_hit, + range_cache_miss, } = self; // Write core metrics @@ -590,6 +599,16 @@ impl fmt::Debug for ScanMetricsSet { write!(f, "}}")?; } + if *range_cache_size > 0 { + write!(f, ", \"range_cache_size\":{range_cache_size}")?; + } + if *range_cache_hit > 0 { + write!(f, ", \"range_cache_hit\":{range_cache_hit}")?; + } + if *range_cache_miss > 0 { + write!(f, ", \"range_cache_miss\":{range_cache_miss}")?; + } + write!( f, ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \ @@ -1097,6 +1116,27 @@ impl PartitionMetrics { pub(crate) fn dedup_metrics_reporter(&self) -> Arc { self.0.clone() } + + /// Increments the total bytes added to the range cache. + #[allow(dead_code)] + pub(crate) fn inc_range_cache_size(&self, size: usize) { + let mut metrics = self.0.metrics.lock().unwrap(); + metrics.range_cache_size += size; + } + + /// Increments the range cache hit counter. + #[allow(dead_code)] + pub(crate) fn inc_range_cache_hit(&self) { + let mut metrics = self.0.metrics.lock().unwrap(); + metrics.range_cache_hit += 1; + } + + /// Increments the range cache miss counter. + #[allow(dead_code)] + pub(crate) fn inc_range_cache_miss(&self) { + let mut metrics = self.0.metrics.lock().unwrap(); + metrics.range_cache_miss += 1; + } } impl fmt::Debug for PartitionMetrics { @@ -1493,7 +1533,7 @@ pub fn build_flat_file_range_scan_stream( .transpose()?; let mapper = range.compaction_projection_mapper(); - while let Some(record_batch) = reader.next_batch()? 
{ + while let Some(record_batch) = reader.next_batch().await? { let record_batch = if let Some(mapper) = mapper { let batch = mapper.project(record_batch)?; batch diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index c13b40d111..a1b3b8f350 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -39,7 +39,7 @@ use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, Un use crate::read::dedup::{DedupReader, LastNonNull, LastRow}; use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow}; use crate::read::flat_merge::FlatMergeReader; -use crate::read::last_row::LastRowReader; +use crate::read::last_row::{FlatLastRowReader, LastRowReader}; use crate::read::merge::MergeReaderBuilder; use crate::read::pruner::{PartitionPruner, Pruner}; use crate::read::range::RangeMeta; @@ -128,28 +128,6 @@ impl SeqScan { Ok(Box::pin(futures::stream::iter(streams).flatten())) } - /// Builds a [BoxedBatchReader] from sequential scan for compaction. - /// - /// # Panics - /// Panics if the compaction flag is not set. - pub async fn build_reader_for_compaction(&self) -> Result { - assert!(self.stream_ctx.input.compaction); - - let metrics_set = ExecutionPlanMetricsSet::new(); - let part_metrics = self.new_partition_metrics(false, &metrics_set, 0); - debug_assert_eq!(1, self.properties.partitions.len()); - let partition_ranges = &self.properties.partitions[0]; - - let reader = Self::merge_all_ranges_for_compaction( - &self.stream_ctx, - partition_ranges, - &part_metrics, - self.pruner.clone(), - ) - .await?; - Ok(Box::new(reader)) - } - /// Builds a [BoxedRecordBatchStream] from sequential scan for flat format compaction. /// /// # Panics @@ -172,40 +150,6 @@ impl SeqScan { Ok(reader) } - /// Builds a merge reader that reads all ranges. - /// Callers MUST not split ranges before calling this method. 
- async fn merge_all_ranges_for_compaction( - stream_ctx: &Arc, - partition_ranges: &[PartitionRange], - part_metrics: &PartitionMetrics, - pruner: Arc, - ) -> Result { - pruner.add_partition_ranges(partition_ranges); - let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges)); - - let mut sources = Vec::new(); - for part_range in partition_ranges { - build_sources( - stream_ctx, - part_range, - true, - part_metrics, - partition_pruner.clone(), - &mut sources, - None, - ) - .await?; - } - - common_telemetry::debug!( - "Build reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}", - stream_ctx.input.mapper.metadata().region_id, - partition_ranges.len(), - sources.len() - ); - Self::build_reader_from_sources(stream_ctx, sources, None, None).await - } - /// Builds a merge reader that reads all flat ranges. /// Callers MUST not split ranges before calling this method. async fn merge_all_flat_ranges_for_compaction( @@ -345,6 +289,13 @@ impl SeqScan { Box::pin(reader.into_stream()) as _ }; + let reader = match &stream_ctx.input.series_row_selector { + Some(TimeSeriesRowSelector::LastRow) => { + Box::pin(FlatLastRowReader::new(reader).into_stream()) as _ + } + None => reader, + }; + Ok(reader) } diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs index dd85616241..80002147ea 100644 --- a/src/mito2/src/read/stream.rs +++ b/src/mito2/src/read/stream.rs @@ -99,7 +99,8 @@ impl ConvertBatchStream { let mapper = self.projection_mapper.as_flat().unwrap(); for batch in flat_batch.batches { - self.pending.push_back(mapper.convert(&batch)?); + self.pending + .push_back(mapper.convert(&batch, &self.cache_strategy)?); } } } @@ -114,7 +115,7 @@ impl ConvertBatchStream { // Safety: Only flat format returns this batch. 
let mapper = self.projection_mapper.as_flat().unwrap(); - mapper.convert(&df_record_batch) + mapper.convert(&df_record_batch, &self.cache_strategy) } } } diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index de8927c4de..3020c9ecf4 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -973,8 +973,23 @@ impl ManifestContext { // This is an edit from flush. if let Some(flushed_entry_id) = edit.flushed_entry_id { + // A flush edit is valid after truncate in two cases: + // 1. `flushed_entry_id` moves past `truncated_entry_id`, meaning it definitely + // flushed data newer than the truncate point. + // 2. `flushed_entry_id` equals `truncated_entry_id`, but `flushed_sequence` + // increases. This happens in skip-WAL tables where entry id can stay at 0, + // while sequence still advances for post-truncate writes. + // + // We still reject stale flushes from before truncate: + // if entry id is equal and sequence does not advance, the flush is outdated. + let is_newer_entry = truncated_entry_id < flushed_entry_id; + let is_same_entry_with_newer_sequence = truncated_entry_id == flushed_entry_id + && edit.flushed_sequence.is_some_and(|flushed_sequence| { + manifest.flushed_sequence < flushed_sequence + }); + ensure!( - truncated_entry_id < flushed_entry_id, + is_newer_entry || is_same_entry_with_newer_sequence, RegionTruncatedSnafu { region_id: manifest.metadata.region_id, } diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 014c50820f..d089493f81 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -1043,7 +1043,7 @@ async fn preload_parquet_meta_cache_for_files( let loader = MetadataLoader::new(object_store.clone(), &file_path, file_size); match loader.load(&mut cache_metrics).await { Ok(metadata) => { - cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata)); + cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata), None); loaded += 1; } Err(err) => { @@ -1153,6 
+1153,8 @@ mod tests { use object_store::ObjectStore; use object_store::services::{Fs, Memory}; use parquet::arrow::ArrowWriter; + use parquet::file::metadata::KeyValue; + use parquet::file::properties::WriterProperties; use store_api::region_request::PathType; use store_api::storage::{FileId, RegionId}; @@ -1161,7 +1163,27 @@ mod tests { use crate::cache::file_cache::{FileType, IndexKey}; use crate::sst::file::{FileHandle, FileMeta}; use crate::sst::file_purger::NoopFilePurger; + use crate::sst::parquet::PARQUET_METADATA_KEY; use crate::test_util::TestEnv; + use crate::test_util::sst_util::sst_region_metadata; + + fn sst_parquet_bytes(batch: &RecordBatch) -> Vec { + let key_value_meta = KeyValue::new( + PARQUET_METADATA_KEY.to_string(), + sst_region_metadata().to_json().unwrap(), + ); + let props = WriterProperties::builder() + .set_key_value_metadata(Some(vec![key_value_meta])) + .build(); + + let mut parquet_bytes = Vec::new(); + let mut writer = + ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), Some(props)).unwrap(); + writer.write(batch).unwrap(); + writer.close().unwrap(); + + parquet_bytes + } #[tokio::test] async fn test_preload_parquet_meta_cache_uses_file_cache() { @@ -1183,10 +1205,7 @@ mod tests { let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; let batch = RecordBatch::try_from_iter([("col", col)]).unwrap(); - let mut parquet_bytes = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); + let parquet_bytes = sst_parquet_bytes(&batch); let file_size = parquet_bytes.len() as u64; let file_meta = FileMeta { @@ -1334,10 +1353,7 @@ mod tests { let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; let batch = RecordBatch::try_from_iter([("col", col)]).unwrap(); - let mut parquet_bytes = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap(); - 
writer.write(&batch).unwrap(); - writer.close().unwrap(); + let parquet_bytes = sst_parquet_bytes(&batch); // file_size is 0 when it's missing/defaulted in manifests; MetadataLoader::load will stat // the local filesystem to retrieve it. diff --git a/src/mito2/src/region/options.rs b/src/mito2/src/region/options.rs index 0fe0a8f12a..fcf68a9216 100644 --- a/src/mito2/src/region/options.rs +++ b/src/mito2/src/region/options.rs @@ -50,7 +50,7 @@ pub(crate) fn parse_wal_options( } /// Mode to handle duplicate rows while merging. -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, EnumString)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, EnumString)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum MergeMode { diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index 78e4c563b1..94bc1feea8 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -31,7 +31,6 @@ use store_api::storage::consts::{ OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME, }; -use crate::read::Batch; use crate::sst::parquet::flat_format::time_index_column_index; pub mod file; @@ -260,33 +259,6 @@ pub(crate) struct SeriesEstimator { } impl SeriesEstimator { - /// Updates the estimator with a new Batch. - /// - /// Since each Batch contains only one series, this increments the series count - /// and updates the last timestamp. 
- pub(crate) fn update(&mut self, batch: &Batch) { - let Some(last_ts) = batch.last_timestamp() else { - return; - }; - - // Checks if there's a boundary between the last batch and this batch - if let Some(prev_last_ts) = self.last_timestamp { - // If the first timestamp of this batch is less than the last timestamp - // we've seen, it indicates a new series - if let Some(first_ts) = batch.first_timestamp() - && first_ts.value() <= prev_last_ts - { - self.series_count += 1; - } - } else { - // First batch, counts as first series - self.series_count = 1; - } - - // Updates the last timestamp - self.last_timestamp = Some(last_ts.value()); - } - /// Updates the estimator with a new record batch in flat format. /// /// This method examines the time index column to detect series boundaries. @@ -340,43 +312,14 @@ impl SeriesEstimator { mod tests { use std::sync::Arc; - use api::v1::OpType; use datatypes::arrow::array::{ - BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder, - UInt32Array, UInt64Array, + BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array, + UInt64Array, }; use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; use datatypes::arrow::record_batch::RecordBatch; use super::*; - use crate::read::{Batch, BatchBuilder}; - - fn new_batch( - primary_key: &[u8], - timestamps: &[i64], - sequences: &[u64], - op_types: &[OpType], - ) -> Batch { - let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())); - let sequences = Arc::new(UInt64Array::from(sequences.to_vec())); - let mut op_type_builder = UInt8Builder::with_capacity(op_types.len()); - for op_type in op_types { - op_type_builder.append_value(*op_type as u8); - } - let op_types = Arc::new(UInt8Array::from( - op_types.iter().map(|op| *op as u8).collect::>(), - )); - - let mut builder = BatchBuilder::new(primary_key.to_vec()); - builder - .timestamps_array(timestamps) - .unwrap() - 
.sequences_array(sequences) - .unwrap() - .op_types_array(op_types) - .unwrap(); - builder.build().unwrap() - } fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch { // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type] @@ -411,128 +354,6 @@ mod tests { RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap() } - #[test] - fn test_series_estimator_empty_batch() { - let mut estimator = SeriesEstimator::default(); - let batch = new_batch(b"test", &[], &[], &[]); - estimator.update(&batch); - assert_eq!(0, estimator.finish()); - } - - #[test] - fn test_series_estimator_single_batch() { - let mut estimator = SeriesEstimator::default(); - let batch = new_batch( - b"test", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch); - assert_eq!(1, estimator.finish()); - } - - #[test] - fn test_series_estimator_multiple_batches_same_series() { - let mut estimator = SeriesEstimator::default(); - - // First batch with timestamps 1, 2, 3 - let batch1 = new_batch( - b"test", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - // Second batch with timestamps 4, 5, 6 (continuation) - let batch2 = new_batch( - b"test", - &[4, 5, 6], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(1, estimator.finish()); - } - - #[test] - fn test_series_estimator_new_series_detected() { - let mut estimator = SeriesEstimator::default(); - - // First batch with timestamps 1, 2, 3 - let batch1 = new_batch( - b"pk0", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - // Second batch with timestamps 2, 3, 4 (timestamp goes back, new series) - let batch2 = new_batch( - b"pk1", - &[2, 3, 4], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(2, estimator.finish()); - 
} - - #[test] - fn test_series_estimator_equal_timestamp_boundary() { - let mut estimator = SeriesEstimator::default(); - - // First batch ending at timestamp 5 - let batch1 = new_batch( - b"test", - &[1, 2, 5], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - // Second batch starting at timestamp 5 (equal, indicates new series) - let batch2 = new_batch( - b"test", - &[5, 6, 7], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(2, estimator.finish()); - } - - #[test] - fn test_series_estimator_finish_resets_state() { - let mut estimator = SeriesEstimator::default(); - - let batch1 = new_batch( - b"test", - &[1, 2, 3], - &[1, 2, 3], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch1); - - assert_eq!(1, estimator.finish()); - - // After finish, state should be reset - let batch2 = new_batch( - b"test", - &[4, 5, 6], - &[4, 5, 6], - &[OpType::Put, OpType::Put, OpType::Put], - ); - estimator.update(&batch2); - - assert_eq!(1, estimator.finish()); - } - #[test] fn test_series_estimator_flat_empty_batch() { let mut estimator = SeriesEstimator::default(); diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 0df3229e9c..88aebfc001 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -58,7 +58,7 @@ use crate::error::{ }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; use crate::metrics::INDEX_CREATE_MEMORY_USAGE; -use crate::read::{Batch, BatchReader}; +use crate::read::Batch; use crate::region::options::IndexOptions; use crate::region::version::VersionControlRef; use crate::region::{ManifestContextRef, RegionLeaderState}; @@ -802,9 +802,9 @@ impl IndexBuildTask { if let Some(mut parquet_reader) = parquet_reader { // TODO(SNC123): optimize index batch loop { - match parquet_reader.next_batch().await { - Ok(Some(mut batch)) => { - indexer.update(&mut batch).await; + 
match parquet_reader.next_record_batch().await { + Ok(Some(batch)) => { + indexer.update_flat(&batch).await; } Ok(None) => break, Err(e) => { @@ -1227,7 +1227,9 @@ mod tests { use crate::sst::parquet::WriteOptions; use crate::test_util::memtable_util::EmptyMemtableBuilder; use crate::test_util::scheduler_util::SchedulerEnv; - use crate::test_util::sst_util::{new_batch_by_range, new_source, sst_region_metadata}; + use crate::test_util::sst_util::{ + new_flat_source_from_record_batches, new_record_batch_by_range, sst_region_metadata, + }; struct MetaConfig { with_inverted: bool, @@ -1358,19 +1360,20 @@ mod tests { env: &SchedulerEnv, build_mode: IndexBuildMode, ) -> SstInfo { - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); let mut index_config = MitoConfig::default().index; index_config.build_mode = build_mode; let write_request = SstWriteRequest { op_type: OperationType::Flush, metadata: metadata.clone(), - source: either::Left(source), + source, storage: None, max_sequence: None, + sst_write_format: Default::default(), cache_manager: Default::default(), index_options: IndexOptions::default(), index_config, diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index aa98b69176..79a08a209d 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -24,11 +24,13 @@ use crate::sst::DEFAULT_WRITE_BUFFER_SIZE; use crate::sst::file::FileTimeRange; use crate::sst::index::IndexOutput; +pub(crate) mod async_reader; pub mod file_range; pub mod flat_format; pub mod format; pub(crate) mod helper; pub(crate) mod metadata; +pub mod prefilter; pub mod reader; pub mod row_group; pub mod row_selection; @@ -110,6 +112,7 @@ mod 
tests { TimestampMillisecondArray, UInt8Array, UInt64Array, }; use datatypes::arrow::datatypes::{DataType, Field, Schema, UInt32Type}; + use datatypes::arrow::util::pretty::pretty_format_batches; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{FulltextAnalyzer, FulltextBackend, FulltextOptions}; use object_store::ObjectStore; @@ -129,7 +132,7 @@ mod tests { use crate::cache::test_util::assert_parquet_metadata_equal; use crate::cache::{CacheManager, CacheStrategy, PageKey}; use crate::config::IndexConfig; - use crate::read::{BatchBuilder, BatchReader, FlatSource}; + use crate::read::FlatSource; use crate::region::options::{IndexOptions, InvertedIndexOptions}; use crate::sst::file::{FileHandle, FileMeta, RegionFileId, RegionIndexId}; use crate::sst::file_purger::NoopFilePurger; @@ -137,19 +140,19 @@ mod tests { use crate::sst::index::fulltext_index::applier::builder::FulltextIndexApplierBuilder; use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder; use crate::sst::index::{IndexBuildType, Indexer, IndexerBuilder, IndexerBuilderImpl}; - use crate::sst::parquet::format::PrimaryKeyWriteFormat; + use crate::sst::parquet::flat_format::FlatWriteFormat; use crate::sst::parquet::reader::{ParquetReader, ParquetReaderBuilder, ReaderMetrics}; use crate::sst::parquet::writer::ParquetWriter; use crate::sst::{ DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, location, to_flat_sst_arrow_schema, }; + use crate::test_util::TestEnv; use crate::test_util::sst_util::{ - build_test_binary_test_region_metadata, new_batch_by_range, new_batch_with_binary, - new_batch_with_custom_sequence, new_primary_key, new_source, new_sparse_primary_key, - sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata, + build_test_binary_test_region_metadata, new_flat_source_from_record_batches, + new_primary_key, new_record_batch_by_range, new_record_batch_with_custom_sequence, + new_sparse_primary_key, sst_file_handle, sst_file_handle_with_file_id, 
sst_region_metadata, sst_region_metadata_with_encoding, }; - use crate::test_util::{TestEnv, check_reader_result}; const FILE_DIR: &str = "/"; const REGION_ID: RegionId = RegionId::new(0, 0); @@ -191,10 +194,10 @@ mod tests { region_file_id: handle.file_id(), }; let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. let write_opts = WriteOptions { @@ -214,7 +217,7 @@ mod tests { .await; let info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -235,14 +238,14 @@ mod tests { object_store, ); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 50), - new_batch_by_range(&["a", "d"], 50, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 150), - new_batch_by_range(&["b", "h"], 150, 200), + new_record_batch_by_range(&["a", "d"], 0, 50), + new_record_batch_by_range(&["a", "d"], 50, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 150), + new_record_batch_by_range(&["b", "h"], 150, 200), ], ) .await; @@ -254,10 +257,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + 
new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. let write_opts = WriteOptions { @@ -279,7 +282,7 @@ mod tests { .await; let sst_info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -299,14 +302,14 @@ mod tests { .cache(cache.clone()); for _ in 0..3 { let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 50), - new_batch_by_range(&["a", "d"], 50, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 150), - new_batch_by_range(&["b", "h"], 150, 200), + new_record_batch_by_range(&["a", "d"], 0, 50), + new_record_batch_by_range(&["a", "d"], 50, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 150), + new_record_batch_by_range(&["b", "h"], 150, 200), ], ) .await; @@ -340,10 +343,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); let write_opts = WriteOptions { row_group_size: 50, @@ -366,7 +369,7 @@ mod tests { .await; let sst_info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -382,8 +385,12 @@ mod tests { .page_index_policy(PageIndexPolicy::Optional); let reader = 
builder.build().await.unwrap().unwrap(); let reader_metadata = reader.parquet_metadata(); + let cached_writer_metadata = + crate::cache::CachedSstMeta::try_new("test.sst", Arc::unwrap_or_clone(writer_metadata)) + .unwrap() + .parquet_metadata(); - assert_parquet_metadata_equal(writer_metadata, reader_metadata); + assert_parquet_metadata_equal(cached_writer_metadata, reader_metadata); } #[tokio::test] @@ -392,10 +399,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. let write_opts = WriteOptions { @@ -416,7 +423,7 @@ mod tests { ) .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -436,11 +443,11 @@ mod tests { ) .predicate(predicate); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 50), - new_batch_by_range(&["a", "d"], 50, 60), + new_record_batch_by_range(&["a", "d"], 0, 50), + new_record_batch_by_range(&["a", "d"], 50, 60), ], ) .await; @@ -452,10 +459,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "z"], 0, 0), - new_batch_by_range(&["a", "z"], 100, 100), - new_batch_by_range(&["a", "z"], 200, 230), + let source = new_flat_source_from_record_batches(vec![ + 
new_record_batch_by_range(&["a", "z"], 0, 0), + new_record_batch_by_range(&["a", "z"], 100, 100), + new_record_batch_by_range(&["a", "z"], 200, 230), ]); // Use a small row group size for test. let write_opts = WriteOptions { @@ -476,7 +483,7 @@ mod tests { ) .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -488,7 +495,11 @@ mod tests { object_store, ); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result(&mut reader, &[new_batch_by_range(&["a", "z"], 200, 230)]).await; + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_by_range(&["a", "z"], 200, 230)], + ) + .await; } #[tokio::test] @@ -497,10 +508,10 @@ mod tests { let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); let metadata = Arc::new(sst_region_metadata()); - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 60), - new_batch_by_range(&["b", "f"], 0, 40), - new_batch_by_range(&["b", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 60), + new_record_batch_by_range(&["b", "f"], 0, 40), + new_record_batch_by_range(&["b", "h"], 100, 200), ]); // Use a small row group size for test. 
let write_opts = WriteOptions { @@ -522,7 +533,7 @@ mod tests { .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -542,7 +553,11 @@ mod tests { ) .predicate(predicate); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result(&mut reader, &[new_batch_by_range(&["b", "h"], 150, 200)]).await; + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_by_range(&["b", "h"], 150, 200)], + ) + .await; } #[tokio::test] @@ -569,7 +584,7 @@ mod tests { let writer_props = props_builder.build(); - let write_format = PrimaryKeyWriteFormat::new(metadata); + let write_format = FlatWriteFormat::new(metadata, &FlatSchemaOptions::default()); let fields: Vec<_> = write_format .arrow_schema() .fields() @@ -603,9 +618,8 @@ mod tests { ) .unwrap(); - let batch = new_batch_with_binary(&["a"], 0, 60); - let arrow_batch = write_format.convert_batch(&batch).unwrap(); - let arrays: Vec<_> = arrow_batch + let batch = new_record_batch_with_binary(&["a"], 0, 60); + let arrays: Vec<_> = batch .columns() .iter() .map(|array| { @@ -629,11 +643,11 @@ mod tests { object_store, ); let mut reader = builder.build().await.unwrap().unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_with_binary(&["a"], 0, 50), - new_batch_with_binary(&["a"], 50, 60), + new_record_batch_with_binary(&["a"], 0, 50), + new_record_batch_with_binary(&["a"], 50, 60), ], ) .await; @@ -646,17 +660,17 @@ mod tests { let mut env = TestEnv::new().await; let object_store = env.init_object_store_manager(); let metadata = Arc::new(sst_region_metadata()); - let batches = &[ - new_batch_by_range(&["a", "d"], 0, 1000), - new_batch_by_range(&["b", "f"], 0, 1000), - new_batch_by_range(&["c", "g"], 0, 1000), - new_batch_by_range(&["b", "h"], 100, 200), - new_batch_by_range(&["b", "h"], 200, 300), - new_batch_by_range(&["b", "h"], 300, 1000), + let batches = vec![ + 
new_record_batch_by_range(&["a", "d"], 0, 1000), + new_record_batch_by_range(&["b", "f"], 0, 1000), + new_record_batch_by_range(&["c", "g"], 0, 1000), + new_record_batch_by_range(&["b", "h"], 100, 200), + new_record_batch_by_range(&["b", "h"], 200, 300), + new_record_batch_by_range(&["b", "h"], 300, 1000), ]; let total_rows: usize = batches.iter().map(|batch| batch.num_rows()).sum(); - let source = new_source(batches); + let source = new_flat_source_from_record_batches(batches); let write_opts = WriteOptions { row_group_size: 50, max_file_size: Some(1024 * 16), @@ -678,7 +692,10 @@ mod tests { ) .await; - let files = writer.write_all(source, None, &write_opts).await.unwrap(); + let files = writer + .write_all_flat_as_primary_key(source, None, &write_opts) + .await + .unwrap(); assert_eq!(2, files.len()); let mut rows_read = 0; @@ -695,7 +712,7 @@ mod tests { object_store.clone(), ); let mut reader = builder.build().await.unwrap().unwrap(); - while let Some(batch) = reader.next_batch().await.unwrap() { + while let Some(batch) = reader.next_record_batch().await.unwrap() { rows_read += batch.num_rows(); } } @@ -710,12 +727,12 @@ mod tests { let metadata = Arc::new(sst_region_metadata()); let row_group_size = 50; - let source = new_source(&[ - new_batch_by_range(&["a", "d"], 0, 20), - new_batch_by_range(&["b", "d"], 0, 20), - new_batch_by_range(&["c", "d"], 0, 20), - new_batch_by_range(&["c", "f"], 0, 40), - new_batch_by_range(&["c", "h"], 100, 200), + let source = new_flat_source_from_record_batches(vec![ + new_record_batch_by_range(&["a", "d"], 0, 20), + new_record_batch_by_range(&["b", "d"], 0, 20), + new_record_batch_by_range(&["c", "d"], 0, 20), + new_record_batch_by_range(&["c", "f"], 0, 40), + new_record_batch_by_range(&["c", "h"], 100, 200), ]); // Use a small row group size for test. 
let write_opts = WriteOptions { @@ -760,7 +777,7 @@ mod tests { .await; let info = writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -877,6 +894,7 @@ mod tests { handle.clone(), object_store.clone(), ) + .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -891,7 +909,11 @@ mod tests { let mut reader = ParquetReader::new(Arc::new(context), selection) .await .unwrap(); - check_reader_result(&mut reader, &[new_batch_by_range(&["b", "d"], 0, 20)]).await; + check_record_batch_reader_result( + &mut reader, + &[new_record_batch_by_range(&["b", "d"], 0, 20)], + ) + .await; assert_eq!(metrics.filter_metrics.rg_total, 4); assert_eq!(metrics.filter_metrics.rg_minmax_filtered, 3); @@ -937,6 +959,7 @@ mod tests { handle.clone(), object_store.clone(), ) + .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -991,6 +1014,7 @@ mod tests { handle.clone(), object_store.clone(), ) + .flat_format(true) .predicate(Some(Predicate::new(preds))) .inverted_index_appliers([inverted_index_applier.clone(), None]) .bloom_filter_index_appliers([bloom_filter_applier.clone(), None]) @@ -1005,13 +1029,13 @@ mod tests { let mut reader = ParquetReader::new(Arc::new(context), selection) .await .unwrap(); - check_reader_result( + check_record_batch_reader_result( &mut reader, &[ - new_batch_by_range(&["a", "d"], 0, 20), - new_batch_by_range(&["b", "d"], 0, 20), - new_batch_by_range(&["c", "d"], 0, 10), - new_batch_by_range(&["c", "d"], 10, 20), + new_record_batch_by_range(&["a", "d"], 0, 20), + new_record_batch_by_range(&["b", "d"], 0, 20), + new_record_batch_by_range(&["c", "d"], 0, 10), + new_record_batch_by_range(&["c", "d"], 10, 
20), ], ) .await; @@ -1032,37 +1056,32 @@ mod tests { assert!(cached.contains_row_group(3)); } - /// Creates a flat format RecordBatch for testing. - /// Similar to `new_batch_by_range` but returns a RecordBatch in flat format. - fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch { + fn new_record_batch_with_binary(tags: &[&str], start: usize, end: usize) -> RecordBatch { assert!(end >= start); - let metadata = Arc::new(sst_region_metadata()); + let metadata = build_test_binary_test_region_metadata(); let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); let num_rows = end - start; let mut columns = Vec::new(); - // Add primary key columns (tag_0, tag_1) as dictionary arrays let mut tag_0_builder = StringDictionaryBuilder::::new(); - let mut tag_1_builder = StringDictionaryBuilder::::new(); - for _ in 0..num_rows { tag_0_builder.append_value(tags[0]); - tag_1_builder.append_value(tags[1]); } - columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef); - columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef); - // Add field column (field_0) - let field_values: Vec = (start..end).map(|v| v as u64).collect(); - columns.push(Arc::new(UInt64Array::from(field_values))); + let values = (0..num_rows) + .map(|_| "some data".as_bytes()) + .collect::>(); + columns.push( + Arc::new(datatypes::arrow::array::BinaryArray::from_iter_values( + values, + )) as ArrayRef, + ); - // Add time index column (ts) let timestamps: Vec = (start..end).map(|v| v as i64).collect(); columns.push(Arc::new(TimestampMillisecondArray::from(timestamps))); - // Add encoded primary key column let pk = new_primary_key(tags); let mut pk_builder = BinaryDictionaryBuilder::::new(); for _ in 0..num_rows { @@ -1070,10 +1089,7 @@ mod tests { } columns.push(Arc::new(pk_builder.finish())); - // Add sequence column columns.push(Arc::new(UInt64Array::from_value(1000, num_rows))); - - // Add op_type column 
columns.push(Arc::new(UInt8Array::from_value( OpType::Put as u8, num_rows, @@ -1082,9 +1098,19 @@ mod tests { RecordBatch::try_new(flat_schema, columns).unwrap() } - /// Creates a FlatSource from flat format RecordBatches. - fn new_flat_source_from_record_batches(batches: Vec) -> FlatSource { - FlatSource::Iter(Box::new(batches.into_iter().map(Ok))) + async fn check_record_batch_reader_result( + reader: &mut ParquetReader, + expected: &[RecordBatch], + ) { + let mut actual = Vec::new(); + while let Some(batch) = reader.next_record_batch().await.unwrap() { + actual.push(batch); + } + assert_eq!( + pretty_format_batches(expected).unwrap().to_string(), + pretty_format_batches(&actual).unwrap().to_string() + ); + assert!(reader.next_record_batch().await.unwrap().is_none()); } /// Creates a flat format RecordBatch for testing with sparse primary key encoding. @@ -1333,10 +1359,11 @@ mod tests { }; let metadata = Arc::new(sst_region_metadata()); - // Create batches with sequence 0 to trigger override functionality - let batch1 = new_batch_with_custom_sequence(&["a", "d"], 0, 60, 0); - let batch2 = new_batch_with_custom_sequence(&["b", "f"], 0, 40, 0); - let source = new_source(&[batch1, batch2]); + // Create batches with sequence 0 to trigger override functionality. 
+ let source = new_flat_source_from_record_batches(vec![ + new_record_batch_with_custom_sequence(&["a", "d"], 0, 60, 0), + new_record_batch_with_custom_sequence(&["b", "f"], 0, 40, 0), + ]); let write_opts = WriteOptions { row_group_size: 50, @@ -1355,7 +1382,7 @@ mod tests { .await; writer - .write_all(source, None, &write_opts) + .write_all_flat_as_primary_key(source, None, &write_opts) .await .unwrap() .remove(0); @@ -1369,7 +1396,7 @@ mod tests { ); let mut reader = builder.build().await.unwrap().unwrap(); let mut normal_batches = Vec::new(); - while let Some(batch) = reader.next_batch().await.unwrap() { + while let Some(batch) = reader.next_record_batch().await.unwrap() { normal_batches.push(batch); } @@ -1391,22 +1418,19 @@ mod tests { ); let mut reader = builder.build().await.unwrap().unwrap(); let mut override_batches = Vec::new(); - while let Some(batch) = reader.next_batch().await.unwrap() { + while let Some(batch) = reader.next_record_batch().await.unwrap() { override_batches.push(batch); } // Compare the results assert_eq!(normal_batches.len(), override_batches.len()); for (normal, override_batch) in normal_batches.into_iter().zip(override_batches.iter()) { - // Create expected batch with override sequence let expected_batch = { - let num_rows = normal.num_rows(); - let mut builder = BatchBuilder::from(normal); - builder - .sequences_array(Arc::new(UInt64Array::from_value(custom_sequence, num_rows))) - .unwrap(); - - builder.build().unwrap() + let mut columns = normal.columns().to_vec(); + let num_cols = columns.len(); + columns[num_cols - 2] = + Arc::new(UInt64Array::from_value(custom_sequence, normal.num_rows())); + RecordBatch::try_new(normal.schema(), columns).unwrap() }; // Override batch should match expected batch diff --git a/src/mito2/src/sst/parquet/async_reader.rs b/src/mito2/src/sst/parquet/async_reader.rs new file mode 100644 index 0000000000..a060fd367d --- /dev/null +++ b/src/mito2/src/sst/parquet/async_reader.rs @@ -0,0 +1,221 @@ +// 
Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Async file reader implementation for SST parquet files. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use futures::FutureExt; +use futures::future::BoxFuture; +use object_store::ObjectStore; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::errors::{ParquetError, Result as ParquetResult}; +use parquet::file::metadata::ParquetMetaData; + +use crate::cache::file_cache::{FileType, IndexKey}; +use crate::cache::{CacheStrategy, PageKey, PageValue}; +use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES}; +use crate::sst::file::RegionFileId; +use crate::sst::parquet::helper::fetch_byte_ranges; +use crate::sst::parquet::row_group::{ParquetFetchMetrics, compute_total_range_size}; + +/// An [AsyncFileReader] implementation for SST parquet files. +/// +/// This reader provides async byte access to parquet data in object storage, +/// with caching support (page cache and write cache). +pub struct SstAsyncFileReader { + /// Region file ID for cache key. + region_file_id: RegionFileId, + /// Path to the parquet file in object storage. + file_path: String, + /// Object store for reading data. + object_store: ObjectStore, + /// Cache strategy for reading pages. + cache_strategy: CacheStrategy, + /// Cached parquet metadata. + metadata: Arc, + /// Row group index for cache key. 
+ row_group_idx: usize, + /// Optional metrics for tracking fetch operations. + fetch_metrics: Option, +} + +impl SstAsyncFileReader { + /// Creates a new [SstAsyncFileReader]. + pub fn new( + region_file_id: RegionFileId, + file_path: String, + object_store: ObjectStore, + cache_strategy: CacheStrategy, + metadata: Arc, + row_group_idx: usize, + ) -> Self { + Self { + region_file_id, + file_path, + object_store, + cache_strategy, + metadata, + row_group_idx, + fetch_metrics: None, + } + } + + /// Sets the fetch metrics. + pub fn with_fetch_metrics(mut self, metrics: Option) -> Self { + self.fetch_metrics = metrics; + self + } + + /// Fetches byte ranges from page cache, write cache, or object store. + async fn fetch_bytes_with_cache(&self, ranges: Vec>) -> ParquetResult> { + let fetch_start = self + .fetch_metrics + .as_ref() + .map(|_| std::time::Instant::now()); + let _timer = READ_STAGE_FETCH_PAGES.start_timer(); + + let page_key = PageKey::new( + self.region_file_id.file_id(), + self.row_group_idx, + ranges.clone(), + ); + + // Check page cache first. + if let Some(pages) = self.cache_strategy.get_pages(&page_key) { + if let Some(metrics) = &self.fetch_metrics { + let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.page_cache_hit += 1; + metrics_data.pages_to_fetch_mem += ranges.len(); + metrics_data.page_size_to_fetch_mem += total_size; + metrics_data.page_size_needed += total_size; + if let Some(start) = fetch_start { + metrics_data.total_fetch_elapsed += start.elapsed(); + } + } + return Ok(pages.compressed.clone()); + } + + // Calculate total range size for metrics. + let (total_range_size, unaligned_size) = compute_total_range_size(&ranges); + + // Check write cache. 
+ let key = IndexKey::new( + self.region_file_id.region_id(), + self.region_file_id.file_id(), + FileType::Parquet, + ); + let fetch_write_cache_start = self + .fetch_metrics + .as_ref() + .map(|_| std::time::Instant::now()); + let write_cache_result = self.fetch_ranges_from_write_cache(key, &ranges).await; + + let pages = match write_cache_result { + Some(data) => { + if let Some(metrics) = &self.fetch_metrics { + let elapsed = fetch_write_cache_start + .map(|start| start.elapsed()) + .unwrap_or_default(); + let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.write_cache_fetch_elapsed += elapsed; + metrics_data.write_cache_hit += 1; + metrics_data.pages_to_fetch_write_cache += ranges.len(); + metrics_data.page_size_to_fetch_write_cache += unaligned_size; + metrics_data.page_size_needed += range_size_needed; + } + data + } + None => { + // Fetch data from object store. + let _timer = READ_STAGE_ELAPSED + .with_label_values(&["cache_miss_read"]) + .start_timer(); + + let start = self + .fetch_metrics + .as_ref() + .map(|_| std::time::Instant::now()); + let data = fetch_byte_ranges(&self.file_path, self.object_store.clone(), &ranges) + .await + .map_err(|e| ParquetError::External(Box::new(e)))?; + + if let Some(metrics) = &self.fetch_metrics { + let elapsed = start.map(|start| start.elapsed()).unwrap_or_default(); + let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.store_fetch_elapsed += elapsed; + metrics_data.cache_miss += 1; + metrics_data.pages_to_fetch_store += ranges.len(); + metrics_data.page_size_to_fetch_store += unaligned_size; + metrics_data.page_size_needed += range_size_needed; + } + data + } + }; + + // Put pages back to the cache. 
+ let page_value = PageValue::new(pages.clone(), total_range_size); + self.cache_strategy + .put_pages(page_key, Arc::new(page_value)); + + if let (Some(metrics), Some(start)) = (&self.fetch_metrics, fetch_start) { + metrics.data.lock().unwrap().total_fetch_elapsed += start.elapsed(); + } + + Ok(pages) + } + + /// Fetches data from write cache. + /// Returns `None` if the data is not in the cache. + async fn fetch_ranges_from_write_cache( + &self, + key: IndexKey, + ranges: &[Range], + ) -> Option> { + if let Some(cache) = self.cache_strategy.write_cache() { + return cache.file_cache().read_ranges(key, ranges).await; + } + None + } +} + +impl AsyncFileReader for SstAsyncFileReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, ParquetResult> { + async move { + let mut result = self.fetch_bytes_with_cache(vec![range]).await?; + Ok(result.pop().unwrap_or_default()) + } + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, ParquetResult>> { + async move { self.fetch_bytes_with_cache(ranges).await }.boxed() + } + + fn get_metadata( + &mut self, + _options: Option<&parquet::arrow::arrow_reader::ArrowReaderOptions>, + ) -> BoxFuture<'_, ParquetResult>> { + // Metadata is already cached, return it immediately. 
+ std::future::ready(Ok(self.metadata.clone())).boxed() + } +} diff --git a/src/mito2/src/sst/parquet/flat_format.rs b/src/mito2/src/sst/parquet/flat_format.rs index d6b061e468..8a59e9a97d 100644 --- a/src/mito2/src/sst/parquet/flat_format.rs +++ b/src/mito2/src/sst/parquet/flat_format.rs @@ -52,8 +52,8 @@ use crate::error::{ NewRecordBatchSnafu, Result, }; use crate::sst::parquet::format::{ - FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray, PrimaryKeyReadFormat, ReadFormat, - StatValues, + FIXED_POS_COLUMN_NUM, FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray, + PrimaryKeyReadFormat, ReadFormat, StatValues, }; use crate::sst::{ FlatSchemaOptions, flat_sst_arrow_schema_column_num, tag_maybe_to_dictionary_field, @@ -127,6 +127,21 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize { num_columns - 1 } +/// Returns the start index of field columns in a flat batch. +/// +/// `num_columns` is the total number of columns in the flat batch schema, +/// including tag columns (if present), field columns, and fixed position columns +/// (time index, primary key, sequence, op type). +/// +/// For Dense encoding (raw PK columns included): field_column_start = primary_key.len() +/// For Sparse encoding (no raw PK columns): field_column_start = 0 +pub(crate) fn field_column_start(metadata: &RegionMetadata, num_columns: usize) -> usize { + // Calculates field column start: total columns - fixed columns - field columns + // Field column count = total metadata columns - time index column - primary key columns + let field_column_count = metadata.column_metadatas.len() - 1 - metadata.primary_key.len(); + num_columns - FIXED_POS_COLUMN_NUM - field_column_count +} + // TODO(yingwen): Add an option to skip reading internal columns if the region is // append only and doesn't use sparse encoding (We need to check the table id under // sparse encoding). 
@@ -765,3 +780,89 @@ impl FlatReadFormat { .unwrap() } } + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + use store_api::codec::PrimaryKeyEncoding; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; + use store_api::storage::RegionId; + + use super::field_column_start; + use crate::sst::{FlatSchemaOptions, flat_sst_arrow_schema_column_num}; + + /// Builds a `RegionMetadata` with the given number of tags and fields. + fn build_metadata( + num_tags: usize, + num_fields: usize, + encoding: PrimaryKeyEncoding, + ) -> RegionMetadata { + let mut builder = RegionMetadataBuilder::new(RegionId::new(0, 0)); + let mut col_id = 0u32; + + for i in 0..num_tags { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + format!("tag_{i}"), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: col_id, + }); + col_id += 1; + } + + for i in 0..num_fields { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + format!("field_{i}"), + ConcreteDataType::uint64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: col_id, + }); + col_id += 1; + } + + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts".to_string(), + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: col_id, + }); + + let primary_key: Vec = (0..num_tags as u32).collect(); + builder.primary_key(primary_key); + builder.primary_key_encoding(encoding); + builder.build().unwrap() + } + + #[test] + fn test_field_column_start() { + // (num_tags, num_fields, encoding, expected) + let cases = [ + (1, 1, PrimaryKeyEncoding::Dense, 1), + (2, 2, PrimaryKeyEncoding::Dense, 2), + (0, 2, PrimaryKeyEncoding::Dense, 0), + (2, 2, PrimaryKeyEncoding::Sparse, 0), + ]; + + for 
(num_tags, num_fields, encoding, expected) in cases { + let metadata = build_metadata(num_tags, num_fields, encoding); + let options = FlatSchemaOptions::from_encoding(encoding); + let num_columns = flat_sst_arrow_schema_column_num(&metadata, &options); + let result = field_column_start(&metadata, num_columns); + assert_eq!( + result, expected, + "num_tags={num_tags}, num_fields={num_fields}, encoding={encoding:?}" + ); + } + } +} diff --git a/src/mito2/src/sst/parquet/format.rs b/src/mito2/src/sst/parquet/format.rs index 70d026e6db..ba64eac78b 100644 --- a/src/mito2/src/sst/parquet/format.rs +++ b/src/mito2/src/sst/parquet/format.rs @@ -34,12 +34,12 @@ use api::v1::SemanticType; use common_time::Timestamp; use datafusion_common::ScalarValue; use datatypes::arrow::array::{ - ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt32Array, UInt64Array, + ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt64Array, }; use datatypes::arrow::datatypes::{SchemaRef, UInt32Type}; use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::DataType; -use datatypes::vectors::{Helper, Vector}; +use datatypes::vectors::Helper; use mito_codec::row_converter::{ CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec, build_primary_key_codec_with_fields, @@ -51,8 +51,7 @@ use store_api::metadata::{ColumnMetadata, RegionMetadataRef}; use store_api::storage::{ColumnId, SequenceNumber}; use crate::error::{ - ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu, - NewRecordBatchSnafu, Result, + ConvertVectorSnafu, DecodeSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result, }; use crate::read::{Batch, BatchBuilder, BatchColumn}; use crate::sst::file::{FileMeta, FileTimeRange}; @@ -73,7 +72,6 @@ pub(crate) const INTERNAL_COLUMN_NUM: usize = 3; /// Helper for writing the SST format with primary key. pub(crate) struct PrimaryKeyWriteFormat { - metadata: RegionMetadataRef, /// SST file schema. 
arrow_schema: SchemaRef, override_sequence: Option, @@ -84,7 +82,6 @@ impl PrimaryKeyWriteFormat { pub(crate) fn new(metadata: RegionMetadataRef) -> PrimaryKeyWriteFormat { let arrow_schema = to_sst_arrow_schema(&metadata); PrimaryKeyWriteFormat { - metadata, arrow_schema, override_sequence: None, } @@ -104,40 +101,25 @@ impl PrimaryKeyWriteFormat { &self.arrow_schema } - /// Convert `batch` to a arrow record batch to store in parquet. - pub(crate) fn convert_batch(&self, batch: &Batch) -> Result { - debug_assert_eq!( - batch.fields().len() + FIXED_POS_COLUMN_NUM, - self.arrow_schema.fields().len() - ); - let mut columns = Vec::with_capacity(batch.fields().len() + FIXED_POS_COLUMN_NUM); - // Store all fields first. - for (column, column_metadata) in batch.fields().iter().zip(self.metadata.field_columns()) { - ensure!( - column.column_id == column_metadata.column_id, - InvalidBatchSnafu { - reason: format!( - "Batch has column {} but metadata has column {}", - column.column_id, column_metadata.column_id - ), - } - ); - - columns.push(column.data.to_arrow_array()); - } - // Add time index column. - columns.push(batch.timestamps().to_arrow_array()); - // Add internal columns: primary key, sequences, op types. - columns.push(new_primary_key_array(batch.primary_key(), batch.num_rows())); + /// Convert a flat `RecordBatch` to primary-key format, retaining only + /// field columns, time index, and internal columns. + /// + /// `num_fields` is the number of field columns. The method strips + /// leading tag columns: `num_tag_columns = batch.num_columns() - num_fields - FIXED_POS_COLUMN_NUM`. 
+ pub(crate) fn convert_flat_batch( + &self, + batch: &RecordBatch, + num_fields: usize, + ) -> Result { + let num_tag_columns = batch.num_columns() - num_fields - FIXED_POS_COLUMN_NUM; + let mut columns: Vec = batch.columns()[num_tag_columns..].to_vec(); if let Some(override_sequence) = self.override_sequence { - let sequence_array = + let num_cols = columns.len(); + // sequence is at num_cols - 2 (before op_type) + columns[num_cols - 2] = Arc::new(UInt64Array::from(vec![override_sequence; batch.num_rows()])); - columns.push(sequence_array); - } else { - columns.push(batch.sequences().to_arrow_array()); } - columns.push(batch.op_types().to_arrow_array()); RecordBatch::try_new(self.arrow_schema.clone(), columns).context(NewRecordBatchSnafu) } @@ -926,15 +908,6 @@ pub(crate) fn primary_key_offsets(pk_dict_array: &PrimaryKeyArray) -> Result ArrayRef { - let values = Arc::new(BinaryArray::from_iter_values([primary_key])); - let keys = UInt32Array::from_value(0, num_rows); - - // Safety: The key index is valid. - Arc::new(DictionaryArray::new(keys, values)) -} - /// Gets the min/max time index of the row group from the parquet meta. /// It assumes the parquet is created by the mito engine. 
pub(crate) fn parquet_row_group_time_range( @@ -1017,7 +990,7 @@ mod tests { use api::v1::OpType; use datatypes::arrow::array::{ - Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt64Array, + Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array, }; use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}; use datatypes::prelude::ConcreteDataType; @@ -1145,13 +1118,6 @@ mod tests { assert_eq!(&build_test_arrow_schema(), write_format.arrow_schema()); } - #[test] - fn test_new_primary_key_array() { - let array = new_primary_key_array(b"test", 3); - let expect = build_test_pk_array(&[(b"test".to_vec(), 3)]) as ArrayRef; - assert_eq!(&expect, &array); - } - fn build_test_pk_array(pk_row_nums: &[(Vec, usize)]) -> Arc { let values = Arc::new(BinaryArray::from_iter_values( pk_row_nums.iter().map(|v| &v.0), @@ -1164,49 +1130,6 @@ mod tests { Arc::new(DictionaryArray::new(keys, values)) } - #[test] - fn test_convert_batch() { - let metadata = build_test_region_metadata(); - let write_format = PrimaryKeyWriteFormat::new(metadata); - - let num_rows = 4; - let batch = new_batch(b"test", 1, 2, num_rows); - let columns: Vec = vec![ - Arc::new(Int64Array::from(vec![2; num_rows])), // field1 - Arc::new(Int64Array::from(vec![3; num_rows])), // field0 - Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts - build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key - Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // sequence - Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type - ]; - let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap(); - - let actual = write_format.convert_batch(&batch).unwrap(); - assert_eq!(expect_record, actual); - } - - #[test] - fn test_convert_batch_with_override_sequence() { - let metadata = build_test_region_metadata(); - let write_format = - 
PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(415411)); - - let num_rows = 4; - let batch = new_batch(b"test", 1, 2, num_rows); - let columns: Vec = vec![ - Arc::new(Int64Array::from(vec![2; num_rows])), // field1 - Arc::new(Int64Array::from(vec![3; num_rows])), // field0 - Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts - build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key - Arc::new(UInt64Array::from(vec![415411; num_rows])), // sequence - Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type - ]; - let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap(); - - let actual = write_format.convert_batch(&batch).unwrap(); - assert_eq!(expect_record, actual); - } - #[test] fn test_projection_indices() { let metadata = build_test_region_metadata(); @@ -1867,4 +1790,100 @@ mod tests { let result = format.convert_batch(record_batch.clone(), None).unwrap(); assert_eq!(record_batch, result); } + + #[test] + fn test_convert_flat_batch() { + let metadata = build_test_region_metadata(); + let write_format = PrimaryKeyWriteFormat::new(metadata); + + let num_rows = 4; + // Build a flat record batch: tag0, tag1, field1, field0, ts, __primary_key, __sequence, __op_type + let flat_columns: Vec = input_columns_for_flat_batch(num_rows); + let flat_batch = RecordBatch::try_new(build_test_flat_sst_schema(), flat_columns).unwrap(); + + // num_fields = 2 (field1, field0) + let result = write_format.convert_flat_batch(&flat_batch, 2).unwrap(); + + // Expected: tag columns stripped, only field1, field0, ts, __primary_key, __sequence, __op_type + let expected_columns: Vec = vec![ + Arc::new(Int64Array::from(vec![2; num_rows])), // field1 + Arc::new(Int64Array::from(vec![3; num_rows])), // field0 + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts + build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key + Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; 
num_rows])), // __sequence + Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type + ]; + let expected = RecordBatch::try_new(build_test_arrow_schema(), expected_columns).unwrap(); + + assert_eq!(expected, result); + } + + #[test] + fn test_convert_flat_batch_with_override_sequence() { + let metadata = build_test_region_metadata(); + let write_format = PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(999)); + + let num_rows = 4; + let flat_columns: Vec = input_columns_for_flat_batch(num_rows); + let flat_batch = RecordBatch::try_new(build_test_flat_sst_schema(), flat_columns).unwrap(); + + let result = write_format.convert_flat_batch(&flat_batch, 2).unwrap(); + + let expected_columns: Vec = vec![ + Arc::new(Int64Array::from(vec![2; num_rows])), // field1 + Arc::new(Int64Array::from(vec![3; num_rows])), // field0 + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts + build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key + Arc::new(UInt64Array::from(vec![999; num_rows])), // overridden __sequence + Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type + ]; + let expected = RecordBatch::try_new(build_test_arrow_schema(), expected_columns).unwrap(); + + assert_eq!(expected, result); + } + + #[test] + fn test_convert_flat_batch_no_tags() { + // Test with a region that has no primary key columns (no tags to strip). 
+ let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field0", + ConcreteDataType::int64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }); + let metadata = Arc::new(builder.build().unwrap()); + let write_format = PrimaryKeyWriteFormat::new(metadata); + + let num_rows = 3; + // No tag columns, so flat batch is: field0, ts, __primary_key, __sequence, __op_type + let sst_schema = write_format.arrow_schema().clone(); + let columns: Vec = vec![ + Arc::new(Int64Array::from(vec![10; num_rows])), // field0 + Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), // ts + build_test_pk_array(&[(b"".to_vec(), num_rows)]), // __primary_key + Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // __sequence + Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type + ]; + let flat_batch = RecordBatch::try_new(sst_schema.clone(), columns.clone()).unwrap(); + + // num_fields = 1, num_tag_columns = 5 - 1 - 4 = 0, so nothing is stripped + let result = write_format.convert_flat_batch(&flat_batch, 1).unwrap(); + let expected = RecordBatch::try_new(sst_schema, columns).unwrap(); + + assert_eq!(expected, result); + } } diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs new file mode 100644 index 0000000000..5de2e3512f --- /dev/null +++ b/src/mito2/src/sst/parquet/prefilter.rs @@ -0,0 +1,528 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Helpers for parquet prefiltering. + +use std::ops::Range; + +use api::v1::SemanticType; +use common_recordbatch::filter::SimpleFilterEvaluator; +use datatypes::arrow::array::{BinaryArray, BooleanArray}; +use datatypes::arrow::record_batch::RecordBatch; +use mito_codec::primary_key_filter::is_partition_column; +use mito_codec::row_converter::PrimaryKeyFilter; +use snafu::{OptionExt, ResultExt}; +use store_api::metadata::{RegionMetadata, RegionMetadataRef}; + +use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu}; +use crate::sst::parquet::flat_format::primary_key_column_index; +use crate::sst::parquet::format::PrimaryKeyArray; + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn matching_row_ranges_by_primary_key( + input: &RecordBatch, + pk_filter: &mut dyn PrimaryKeyFilter, +) -> Result>> { + let primary_key_index = primary_key_column_index(input.num_columns()); + let pk_dict_array = input + .column(primary_key_index) + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key column is not a dictionary array", + })?; + let pk_values = pk_dict_array + .values() + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key values are not binary array", + })?; + let keys = pk_dict_array.keys(); + let key_values = keys.values(); + + if key_values.is_empty() { + return Ok(std::iter::once(0..input.num_rows()).collect()); + } + + let mut matched_row_ranges: Vec> = Vec::new(); + let mut start = 0; + while start < key_values.len() { + let key = key_values[start]; + let mut end = start + 
1; + while end < key_values.len() && key_values[end] == key { + end += 1; + } + + if pk_filter.matches(pk_values.value(key as usize)) { + if let Some(last) = matched_row_ranges.last_mut() + && last.end == start + { + last.end = end; + } else { + matched_row_ranges.push(start..end); + } + } + + start = end; + } + + Ok(matched_row_ranges) +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn prefilter_flat_batch_by_primary_key( + input: RecordBatch, + pk_filter: &mut dyn PrimaryKeyFilter, +) -> Result> { + if input.num_rows() == 0 { + return Ok(Some(input)); + } + + let matched_row_ranges = matching_row_ranges_by_primary_key(&input, pk_filter)?; + if matched_row_ranges.is_empty() { + return Ok(None); + } + + if matched_row_ranges.len() == 1 + && matched_row_ranges[0].start == 0 + && matched_row_ranges[0].end == input.num_rows() + { + return Ok(Some(input)); + } + + if matched_row_ranges.len() == 1 { + let span = &matched_row_ranges[0]; + return Ok(Some(input.slice(span.start, span.end - span.start))); + } + + let mut mask = vec![false; input.num_rows()]; + for span in matched_row_ranges { + mask[span].fill(true); + } + + let filtered = + datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask)) + .context(ComputeArrowSnafu)?; + if filtered.num_rows() == 0 { + Ok(None) + } else { + Ok(Some(filtered)) + } +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn retain_usable_primary_key_filters( + sst_metadata: &RegionMetadataRef, + expected_metadata: Option<&RegionMetadata>, + filters: &mut Vec, +) { + filters.retain(|filter| is_usable_primary_key_filter(sst_metadata, expected_metadata, filter)); +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn is_usable_primary_key_filter( + sst_metadata: &RegionMetadataRef, + expected_metadata: Option<&RegionMetadata>, + filter: &SimpleFilterEvaluator, +) -> bool { + // TODO(yingwen): The primary key filter always skips the partition column. 
Consider using a flag + // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable. + if is_partition_column(filter.column_name()) { + return false; + } + + let sst_column = match expected_metadata { + Some(expected_metadata) => { + let Some(expected_column) = expected_metadata.column_by_name(filter.column_name()) + else { + return false; + }; + let Some(sst_column) = sst_metadata.column_by_id(expected_column.column_id) else { + return false; + }; + + if sst_column.column_schema.name != expected_column.column_schema.name + || sst_column.semantic_type != expected_column.semantic_type + || sst_column.column_schema.data_type != expected_column.column_schema.data_type + { + return false; + } + + sst_column + } + None => { + let Some(sst_column) = sst_metadata.column_by_name(filter.column_name()) else { + return false; + }; + sst_column + } + }; + + sst_column.semantic_type == SemanticType::Tag + && sst_metadata + .primary_key_index(sst_column.column_id) + .is_some() +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) struct CachedPrimaryKeyFilter { + inner: Box, + last_primary_key: Vec, + last_match: Option, +} + +impl CachedPrimaryKeyFilter { + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn new(inner: Box) -> Self { + Self { + inner, + last_primary_key: Vec::new(), + last_match: None, + } + } +} + +impl PrimaryKeyFilter for CachedPrimaryKeyFilter { + fn matches(&mut self, pk: &[u8]) -> bool { + if let Some(last_match) = self.last_match + && self.last_primary_key == pk + { + return last_match; + } + + let matched = self.inner.matches(pk); + self.last_primary_key.clear(); + self.last_primary_key.extend_from_slice(pk); + self.last_match = Some(matched); + matched + } +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result> { + let primary_key_index = primary_key_column_index(batch.num_columns()); + let pk_dict_array = batch + .column(primary_key_index) + 
.as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key column is not a dictionary array", + })?; + let pk_values = pk_dict_array + .values() + .as_any() + .downcast_ref::() + .context(UnexpectedSnafu { + reason: "Primary key values are not binary array", + })?; + let keys = pk_dict_array.keys(); + if keys.is_empty() { + return Ok(None); + } + + let first_key = keys.value(0); + if first_key != keys.value(keys.len() - 1) { + return Ok(None); + } + + Ok(Some(pk_values.value(first_key as usize))) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use api::v1::SemanticType; + use common_recordbatch::filter::SimpleFilterEvaluator; + use datafusion_expr::{col, lit}; + use datatypes::arrow::array::{ + ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array, + UInt64Array, + }; + use datatypes::arrow::datatypes::{Schema, UInt32Type}; + use datatypes::arrow::record_batch::RecordBatch; + use datatypes::prelude::ConcreteDataType; + use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec}; + use store_api::codec::PrimaryKeyEncoding; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; + use store_api::storage::ColumnSchema; + + use super::*; + use crate::sst::internal_fields; + use crate::sst::parquet::format::ReadFormat; + use crate::test_util::sst_util::{ + new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding, + }; + + fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec { + exprs + .iter() + .filter_map(SimpleFilterEvaluator::try_new) + .collect() + } + + fn expected_metadata_with_reused_tag_name( + old_metadata: &RegionMetadata, + ) -> Arc { + let mut builder = RegionMetadataBuilder::new(old_metadata.region_id); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "tag_0".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + 
semantic_type: SemanticType::Tag, + column_id: 10, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "tag_1".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field_0".to_string(), + ConcreteDataType::uint64_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts".to_string(), + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 3, + }) + .primary_key(vec![10, 1]); + + Arc::new(builder.build().unwrap()) + } + + fn new_raw_batch_with_metadata( + metadata: Arc, + primary_keys: &[&[u8]], + field_values: &[u64], + ) -> RecordBatch { + assert_eq!(primary_keys.len(), field_values.len()); + + let arrow_schema = metadata.schema.arrow_schema(); + let field_column = arrow_schema + .field(arrow_schema.index_of("field_0").unwrap()) + .clone(); + let time_index_column = arrow_schema + .field(arrow_schema.index_of("ts").unwrap()) + .clone(); + let mut fields = vec![field_column, time_index_column]; + fields.extend( + internal_fields() + .into_iter() + .map(|field| field.as_ref().clone()), + ); + let schema = Arc::new(Schema::new(fields)); + + let mut dict_values = Vec::new(); + let mut keys = Vec::with_capacity(primary_keys.len()); + for pk in primary_keys { + let key = dict_values + .iter() + .position(|existing: &&[u8]| existing == pk) + .unwrap_or_else(|| { + dict_values.push(*pk); + dict_values.len() - 1 + }); + keys.push(key as u32); + } + + let pk_array: ArrayRef = Arc::new(DictionaryArray::::new( + UInt32Array::from(keys), + Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())), + )); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(field_values.to_vec())), + 
Arc::new(TimestampMillisecondArray::from_iter_values( + 0..primary_keys.len() as i64, + )), + pk_array, + Arc::new(UInt64Array::from(vec![1; primary_keys.len()])), + Arc::new(UInt8Array::from(vec![1; primary_keys.len()])), + ], + ) + .unwrap() + } + + fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch { + new_raw_batch_with_metadata(Arc::new(sst_region_metadata()), primary_keys, field_values) + } + + fn field_values(batch: &RecordBatch) -> Vec { + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + } + + #[test] + fn test_retain_usable_primary_key_filters_skips_non_tag_filters() { + let metadata = Arc::new(sst_region_metadata()); + let mut filters = + new_test_filters(&[col("field_0").eq(lit(1_u64)), col("ts").gt(lit(0_i64))]); + + retain_usable_primary_key_filters(&metadata, None, &mut filters); + + assert!(filters.is_empty()); + } + + #[test] + fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() { + let metadata = Arc::new(sst_region_metadata()); + let expected_metadata = expected_metadata_with_reused_tag_name(&metadata); + let mut filters = new_test_filters(&[col("tag_0").eq(lit("b"))]); + + retain_usable_primary_key_filters( + &metadata, + Some(expected_metadata.as_ref()), + &mut filters, + ); + + assert!(filters.is_empty()); + } + + #[test] + fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() { + let metadata = Arc::new(sst_region_metadata_with_encoding( + PrimaryKeyEncoding::Sparse, + )); + let read_format = ReadFormat::new_flat( + metadata.clone(), + metadata.column_metadatas.iter().map(|c| c.column_id), + None, + "test", + true, + ) + .unwrap(); + assert!(read_format.as_flat().is_some()); + + let filter = SimpleFilterEvaluator::try_new(&col("tag_0").eq(lit("b"))).unwrap(); + assert!(is_usable_primary_key_filter(&metadata, None, &filter)); + } + + #[test] + fn test_prefilter_primary_key_drops_single_dictionary_batch() { + let metadata = 
Arc::new(sst_region_metadata()); + let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))])); + let mut primary_key_filter = + build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters); + let pk_a = new_primary_key(&["a", "x"]); + let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); + + let filtered = + prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap(); + + assert!(filtered.is_none()); + } + + #[test] + fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() { + let metadata = Arc::new(sst_region_metadata()); + let filters = Arc::new(new_test_filters(&[col("tag_0") + .eq(lit("a")) + .or(col("tag_0").eq(lit("c")))])); + let mut primary_key_filter = + build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters); + let pk_a = new_primary_key(&["a", "x"]); + let pk_b = new_primary_key(&["b", "x"]); + let pk_c = new_primary_key(&["c", "x"]); + let pk_d = new_primary_key(&["d", "x"]); + let batch = new_raw_batch( + &[ + pk_a.as_slice(), + pk_a.as_slice(), + pk_b.as_slice(), + pk_b.as_slice(), + pk_c.as_slice(), + pk_c.as_slice(), + pk_d.as_slice(), + pk_d.as_slice(), + ], + &[10, 11, 12, 13, 14, 15, 16, 17], + ); + + let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()) + .unwrap() + .unwrap(); + + assert_eq!(filtered.num_rows(), 4); + assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]); + } + + struct CountingPrimaryKeyFilter { + hits: Arc, + expected: Vec, + } + + impl PrimaryKeyFilter for CountingPrimaryKeyFilter { + fn matches(&mut self, pk: &[u8]) -> bool { + self.hits.fetch_add(1, Ordering::Relaxed); + pk == self.expected.as_slice() + } + } + + #[test] + fn test_cached_primary_key_filter_reuses_previous_result() { + let expected = new_primary_key(&["a", "x"]); + let hits = Arc::new(AtomicUsize::new(0)); + let mut filter = CachedPrimaryKeyFilter::new(Box::new(CountingPrimaryKeyFilter { + hits: 
Arc::clone(&hits), + expected: expected.clone(), + })); + + assert!(filter.matches(expected.as_slice())); + assert!(filter.matches(expected.as_slice())); + assert!(!filter.matches(new_primary_key(&["b", "x"]).as_slice())); + + assert_eq!(hits.load(Ordering::Relaxed), 2); + } + + #[test] + fn test_batch_single_primary_key() { + let pk_a = new_primary_key(&["a", "x"]); + let pk_b = new_primary_key(&["b", "x"]); + + let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); + assert_eq!( + batch_single_primary_key(&batch).unwrap(), + Some(pk_a.as_slice()) + ); + + let batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]); + assert_eq!(batch_single_primary_key(&batch).unwrap(), None); + } +} diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 500f32ae91..f152c97075 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -21,43 +21,40 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use api::v1::SemanticType; -use async_trait::async_trait; use common_recordbatch::filter::SimpleFilterEvaluator; -use common_telemetry::{debug, tracing, warn}; +use common_telemetry::{tracing, warn}; use datafusion_expr::Expr; use datatypes::arrow::array::ArrayRef; use datatypes::arrow::datatypes::Field; -use datatypes::arrow::error::ArrowError; use datatypes::arrow::record_batch::RecordBatch; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::DataType; +use futures::StreamExt; use mito_codec::row_converter::build_primary_key_codec; use object_store::ObjectStore; -use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection}; -use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels}; -use parquet::file::metadata::{KeyValue, PageIndexPolicy, ParquetMetaData}; +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions, RowSelection}; +use 
parquet::arrow::async_reader::{ParquetRecordBatchStream, ParquetRecordBatchStreamBuilder}; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use partition::expr::PartitionExpr; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use store_api::codec::PrimaryKeyEncoding; use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; use store_api::region_request::PathType; use store_api::storage::{ColumnId, FileId}; use table::predicate::Predicate; -use crate::cache::CacheStrategy; use crate::cache::index::result_cache::PredicateKey; +use crate::cache::{CacheStrategy, CachedSstMeta}; #[cfg(feature = "vector_index")] use crate::error::ApplyVectorIndexSnafu; -use crate::error::{ - ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadDataPartSnafu, - ReadParquetSnafu, Result, SerializePartitionExprSnafu, -}; +use crate::error::{ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu}; use crate::metrics::{ PRECISE_FILTER_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_TOTAL, READ_STAGE_ELAPSED, }; use crate::read::flat_projection::CompactionProjectionMapper; -use crate::read::prune::{PruneReader, Source}; +use crate::read::prune::FlatPruneReader; use crate::read::{Batch, BatchReader}; use crate::sst::file::FileHandle; use crate::sst::index::bloom_filter::applier::{ @@ -71,16 +68,17 @@ use crate::sst::index::inverted_index::applier::{ }; #[cfg(feature = "vector_index")] use crate::sst::index::vector_index::applier::VectorIndexApplierRef; +use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE; +use crate::sst::parquet::async_reader::SstAsyncFileReader; use crate::sst::parquet::file_range::{ FileRangeContext, FileRangeContextRef, PartitionFilterContext, PreFilterMode, RangeBase, row_group_contains_delete, }; use crate::sst::parquet::format::{ReadFormat, need_override_sequence}; use crate::sst::parquet::metadata::MetadataLoader; -use crate::sst::parquet::row_group::{InMemoryRowGroup, 
ParquetFetchMetrics}; +use crate::sst::parquet::row_group::ParquetFetchMetrics; use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::stats::RowGroupPruningStats; -use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY}; use crate::sst::tag_maybe_to_dictionary_field; const INDEX_TYPE_FULLTEXT: &str = "fulltext"; @@ -303,7 +301,8 @@ impl ParquetReaderBuilder { pub async fn build(&self) -> Result> { let mut metrics = ReaderMetrics::default(); - let Some((context, selection)) = self.build_reader_input(&mut metrics).await? else { + let Some((context, selection)) = self.build_reader_input_inner(&mut metrics, true).await? + else { return Ok(None); }; ParquetReader::new(Arc::new(context), selection) @@ -325,12 +324,14 @@ impl ParquetReaderBuilder { &self, metrics: &mut ReaderMetrics, ) -> Result> { - self.build_reader_input_inner(metrics).await + self.build_reader_input_inner(metrics, self.flat_format) + .await } async fn build_reader_input_inner( &self, metrics: &mut ReaderMetrics, + flat_format: bool, ) -> Result> { let start = Instant::now(); @@ -338,7 +339,7 @@ impl ParquetReaderBuilder { let file_size = self.file_handle.meta_ref().file_size; // Loads parquet metadata of the file. - let (parquet_meta, cache_miss) = self + let (sst_meta, cache_miss) = self .read_parquet_metadata( &file_path, file_size, @@ -346,9 +347,8 @@ impl ParquetReaderBuilder { self.page_index_policy, ) .await?; - // Decodes region metadata. - let key_value_meta = parquet_meta.file_metadata().key_value_metadata(); - let region_meta = Arc::new(Self::get_region_metadata(&file_path, key_value_meta)?); + let parquet_meta = sst_meta.parquet_metadata(); + let region_meta = sst_meta.region_metadata(); let region_partition_expr_str = self .expected_metadata .as_ref() @@ -373,7 +373,7 @@ impl ParquetReaderBuilder { // before compat handling. 
let compaction_projection_mapper = if self.compaction && !is_same_region_partition - && self.flat_format + && flat_format && region_meta.primary_key_encoding == PrimaryKeyEncoding::Sparse { Some(CompactionProjectionMapper::try_new(®ion_meta)?) @@ -385,7 +385,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(column_ids), - self.flat_format, + flat_format, Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -401,7 +401,7 @@ impl ParquetReaderBuilder { ReadFormat::new( region_meta.clone(), Some(&column_ids), - self.flat_format, + flat_format, Some(parquet_meta.file_metadata().schema_descr().num_columns()), &file_path, skip_auto_convert, @@ -415,6 +415,12 @@ impl ParquetReaderBuilder { .set_override_sequence(self.file_handle.meta_ref().sequence.map(|x| x.get())); } + // Computes the projection mask. + let parquet_schema_desc = parquet_meta.file_metadata().schema_descr(); + let indices = read_format.projection_indices(); + // Now we assumes we don't have nested schemas. + // TODO(yingwen): Revisit this if we introduce nested types such as JSON type. + let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied()); let selection = self .row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics) .await; @@ -446,26 +452,20 @@ impl ParquetReaderBuilder { .map(|meta| meta.schema.clone()) .unwrap_or_else(|| region_meta.schema.clone()); - // Computes the projection mask. - let parquet_schema_desc = parquet_meta.file_metadata().schema_descr(); - let indices = read_format.projection_indices(); - // Now we assumes we don't have nested schemas. - // TODO(yingwen): Revisit this if we introduce nested types such as JSON type. - let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied()); - - // Computes the field levels. 
- let hint = Some(read_format.arrow_schema().fields()); - let field_levels = - parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint) + // Create ArrowReaderMetadata for async stream building. + let arrow_reader_options = + ArrowReaderOptions::new().with_schema(read_format.arrow_schema().clone()); + let arrow_metadata = + ArrowReaderMetadata::try_new(parquet_meta.clone(), arrow_reader_options) .context(ReadDataPartSnafu)?; let reader_builder = RowGroupReaderBuilder { file_handle: self.file_handle.clone(), file_path, parquet_meta, + arrow_metadata, object_store: self.object_store.clone(), projection: projection_mask, - field_levels, cache_strategy: self.cache_strategy.clone(), }; @@ -599,42 +599,15 @@ impl ParquetReaderBuilder { })) } - /// Decodes region metadata from key value. - fn get_region_metadata( - file_path: &str, - key_value_meta: Option<&Vec>, - ) -> Result { - let key_values = key_value_meta.context(InvalidParquetSnafu { - file: file_path, - reason: "missing key value meta", - })?; - let meta_value = key_values - .iter() - .find(|kv| kv.key == PARQUET_METADATA_KEY) - .with_context(|| InvalidParquetSnafu { - file: file_path, - reason: format!("key {} not found", PARQUET_METADATA_KEY), - })?; - let json = meta_value - .value - .as_ref() - .with_context(|| InvalidParquetSnafu { - file: file_path, - reason: format!("No value for key {}", PARQUET_METADATA_KEY), - })?; - - RegionMetadata::from_json(json).context(InvalidMetadataSnafu) - } - /// Reads parquet metadata of specific file. - /// Returns (metadata, cache_miss_flag). + /// Returns (fused metadata, cache_miss_flag). 
async fn read_parquet_metadata( &self, file_path: &str, file_size: u64, cache_metrics: &mut MetadataCacheMetrics, page_index_policy: PageIndexPolicy, - ) -> Result<(Arc, bool)> { + ) -> Result<(Arc, bool)> { let start = Instant::now(); let _t = READ_STAGE_ELAPSED .with_label_values(&["read_parquet_metadata"]) @@ -644,7 +617,7 @@ impl ParquetReaderBuilder { // Tries to get from cache with metrics tracking. if let Some(metadata) = self .cache_strategy - .get_parquet_meta_data(file_id, cache_metrics, page_index_policy) + .get_sst_meta_data(file_id, cache_metrics, page_index_policy) .await { cache_metrics.metadata_load_cost += start.elapsed(); @@ -657,10 +630,10 @@ impl ParquetReaderBuilder { metadata_loader.with_page_index_policy(page_index_policy); let metadata = metadata_loader.load(cache_metrics).await?; - let metadata = Arc::new(metadata); + let metadata = Arc::new(CachedSstMeta::try_new(file_path, metadata)?); // Cache the metadata. self.cache_strategy - .put_parquet_meta_data(file_id, metadata.clone()); + .put_sst_meta_data(file_id, metadata.clone()); cache_metrics.metadata_load_cost += start.elapsed(); Ok((metadata, true)) @@ -1667,7 +1640,7 @@ impl ReaderMetrics { } } -/// Builder to build a [ParquetRecordBatchReader] for a row group. +/// Builder to build a [ParquetRecordBatchStream] for a row group. pub(crate) struct RowGroupReaderBuilder { /// SST file to read. /// @@ -1677,12 +1650,12 @@ pub(crate) struct RowGroupReaderBuilder { file_path: String, /// Metadata of the parquet file. parquet_meta: Arc, + /// Arrow reader metadata for building async stream. + arrow_metadata: ArrowReaderMetadata, /// Object store as an Operator. object_store: ObjectStore, /// Projection mask. projection: ProjectionMask, - /// Field levels to read. - field_levels: FieldLevels, /// Cache. cache_strategy: CacheStrategy, } @@ -1706,66 +1679,43 @@ impl RowGroupReaderBuilder { &self.cache_strategy } - /// Builds a [ParquetRecordBatchReader] to read the row group at `row_group_idx`. 
+ /// Builds a [ParquetRecordBatchStream] to read the row group at `row_group_idx`. pub(crate) async fn build( &self, row_group_idx: usize, row_selection: Option, fetch_metrics: Option<&ParquetFetchMetrics>, - ) -> Result { - let fetch_start = Instant::now(); - - let mut row_group = InMemoryRowGroup::create( - self.file_handle.region_id(), - self.file_handle.file_id().file_id(), - &self.parquet_meta, - row_group_idx, - self.cache_strategy.clone(), - &self.file_path, + ) -> Result> { + // Create async file reader with caching support. + let async_reader = SstAsyncFileReader::new( + self.file_handle.file_id(), + self.file_path.clone(), self.object_store.clone(), - ); - // Fetches data into memory. - row_group - .fetch(&self.projection, row_selection.as_ref(), fetch_metrics) - .await - .context(ReadParquetSnafu { - path: &self.file_path, - })?; - - // Record total fetch elapsed time. - if let Some(metrics) = fetch_metrics { - metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed(); - } - - // Builds the parquet reader. - // Now the row selection is None. - ParquetRecordBatchReader::try_new_with_row_groups( - &self.field_levels, - &row_group, - DEFAULT_READ_BATCH_SIZE, - row_selection, + self.cache_strategy.clone(), + self.parquet_meta.clone(), + row_group_idx, ) - .context(ReadParquetSnafu { - path: &self.file_path, - }) - } -} + .with_fetch_metrics(fetch_metrics.cloned()); -/// The state of a [ParquetReader]. -enum ReaderState { - /// The reader is reading a row group. - Readable(PruneReader), - /// The reader is exhausted. - Exhausted(ReaderMetrics), -} + // Build the async stream using ArrowReaderBuilder API. + let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata( + async_reader, + self.arrow_metadata.clone(), + ); + builder = builder + .with_row_groups(vec![row_group_idx]) + .with_projection(self.projection.clone()) + .with_batch_size(DEFAULT_READ_BATCH_SIZE); -impl ReaderState { - /// Returns the metrics of the reader. 
- fn metrics(&self) -> ReaderMetrics { - match self { - ReaderState::Readable(reader) => reader.metrics(), - ReaderState::Exhausted(m) => m.clone(), + if let Some(selection) = row_selection { + builder = builder.with_row_selection(selection); } + + let stream = builder.build().context(ReadParquetSnafu { + path: &self.file_path, + })?; + + Ok(stream) } } @@ -1879,13 +1829,12 @@ pub struct ParquetReader { /// Row group selection to read. selection: RowGroupSelection, /// Reader of current row group. - reader_state: ReaderState, + reader: Option, /// Metrics for tracking row group fetch operations. fetch_metrics: ParquetFetchMetrics, } -#[async_trait] -impl BatchReader for ParquetReader { +impl ParquetReader { #[tracing::instrument( skip_all, fields( @@ -1893,18 +1842,20 @@ impl BatchReader for ParquetReader { file_id = %self.context.reader_builder().file_handle.file_id() ) )] - async fn next_batch(&mut self) -> Result> { - let ReaderState::Readable(reader) = &mut self.reader_state else { - return Ok(None); - }; + pub async fn next_record_batch(&mut self) -> Result> { + loop { + if let Some(reader) = &mut self.reader { + if let Some(batch) = reader.next_batch().await? { + return Ok(Some(batch)); + } + self.reader = None; + continue; + } - // We don't collect the elapsed time if the reader returns an error. - if let Some(batch) = reader.next_batch().await? { - return Ok(Some(batch)); - } + let Some((row_group_idx, row_selection)) = self.selection.pop_first() else { + return Ok(None); + }; - // No more items in current row group, reads next row group. - while let Some((row_group_idx, row_selection)) = self.selection.pop_first() { let parquet_reader = self .context .reader_builder() @@ -1915,54 +1866,14 @@ impl BatchReader for ParquetReader { ) .await?; - // Resets the parquet reader. 
- // Compute skip_fields for this row group let skip_fields = self.context.should_skip_fields(row_group_idx); - reader.reset_source( - Source::RowGroup(RowGroupReader::new(self.context.clone(), parquet_reader)), + self.reader = Some(FlatPruneReader::new_with_row_group_reader( + self.context.clone(), + FlatRowGroupReader::new(self.context.clone(), parquet_reader), skip_fields, - ); - if let Some(batch) = reader.next_batch().await? { - return Ok(Some(batch)); - } + )); } - - // The reader is exhausted. - self.reader_state = ReaderState::Exhausted(reader.metrics().clone()); - Ok(None) } -} - -impl Drop for ParquetReader { - fn drop(&mut self) { - let metrics = self.reader_state.metrics(); - debug!( - "Read parquet {} {}, range: {:?}, {}/{} row groups, metrics: {:?}", - self.context.reader_builder().file_handle.region_id(), - self.context.reader_builder().file_handle.file_id(), - self.context.reader_builder().file_handle.time_range(), - metrics.filter_metrics.rg_total - - metrics.filter_metrics.rg_inverted_filtered - - metrics.filter_metrics.rg_minmax_filtered - - metrics.filter_metrics.rg_fulltext_filtered - - metrics.filter_metrics.rg_bloom_filtered, - metrics.filter_metrics.rg_total, - metrics - ); - - // Report metrics. - READ_STAGE_ELAPSED - .with_label_values(&["build_parquet_reader"]) - .observe(metrics.build_cost.as_secs_f64()); - READ_STAGE_ELAPSED - .with_label_values(&["scan_row_groups"]) - .observe(metrics.scan_cost.as_secs_f64()); - metrics.observe_rows("parquet_reader"); - metrics.filter_metrics.observe(); - } -} - -impl ParquetReader { /// Creates a new reader. #[tracing::instrument( skip_all, @@ -1975,28 +1886,27 @@ impl ParquetReader { context: FileRangeContextRef, mut selection: RowGroupSelection, ) -> Result { + debug_assert!(context.read_format().as_flat().is_some()); let fetch_metrics = ParquetFetchMetrics::default(); - // No more items in current row group, reads next row group. 
- let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() { + let reader = if let Some((row_group_idx, row_selection)) = selection.pop_first() { let parquet_reader = context .reader_builder() .build(row_group_idx, Some(row_selection), Some(&fetch_metrics)) .await?; - // Compute skip_fields once for this row group let skip_fields = context.should_skip_fields(row_group_idx); - ReaderState::Readable(PruneReader::new_with_row_group_reader( + Some(FlatPruneReader::new_with_row_group_reader( context.clone(), - RowGroupReader::new(context.clone(), parquet_reader), + FlatRowGroupReader::new(context.clone(), parquet_reader), skip_fields, )) } else { - ReaderState::Exhausted(ReaderMetrics::default()) + None }; Ok(ParquetReader { context, selection, - reader_state, + reader, fetch_metrics, }) } @@ -2014,27 +1924,19 @@ impl ParquetReader { /// RowGroupReaderContext represents the fields that cannot be shared /// between different `RowGroupReader`s. pub(crate) trait RowGroupReaderContext: Send { - fn map_result( - &self, - result: std::result::Result, ArrowError>, - ) -> Result>; - fn read_format(&self) -> &ReadFormat; + + fn file_path(&self) -> &str; } impl RowGroupReaderContext for FileRangeContextRef { - fn map_result( - &self, - result: std::result::Result, ArrowError>, - ) -> Result> { - result.context(ArrowReaderSnafu { - path: self.file_path(), - }) - } - fn read_format(&self) -> &ReadFormat { self.as_ref().read_format() } + + fn file_path(&self) -> &str { + self.as_ref().file_path() + } } /// [RowGroupReader] that reads from [FileRange]. @@ -2042,8 +1944,11 @@ pub(crate) type RowGroupReader = RowGroupReaderBase; impl RowGroupReader { /// Creates a new reader from file range. 
- pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self { - Self::create(context, reader) + pub(crate) fn new( + context: FileRangeContextRef, + stream: ParquetRecordBatchStream, + ) -> Self { + Self::create(context, stream) } } @@ -2051,8 +1956,8 @@ impl RowGroupReader { pub(crate) struct RowGroupReaderBase { /// Context of [RowGroupReader] so adapts to different underlying implementation. context: T, - /// Inner parquet reader. - reader: ParquetRecordBatchReader, + /// Inner parquet record batch stream. + stream: ParquetRecordBatchStream, /// Buffered batches to return. batches: VecDeque, /// Local scan metrics. @@ -2066,7 +1971,7 @@ where T: RowGroupReaderContext, { /// Creates a new reader to read the primary key format. - pub(crate) fn create(context: T, reader: ParquetRecordBatchReader) -> Self { + pub(crate) fn create(context: T, stream: ParquetRecordBatchStream) -> Self { // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE. let override_sequence = context .read_format() @@ -2075,7 +1980,7 @@ where Self { context, - reader, + stream, batches: VecDeque::new(), metrics: ReaderMetrics::default(), override_sequence, @@ -2092,13 +1997,18 @@ where self.context.read_format() } - /// Tries to fetch next [RecordBatch] from the reader. - fn fetch_next_record_batch(&mut self) -> Result> { - self.context.map_result(self.reader.next().transpose()) + /// Tries to fetch next [RecordBatch] from the stream asynchronously. + async fn fetch_next_record_batch(&mut self) -> Result> { + match self.stream.next().await.transpose() { + Ok(batch) => Ok(batch), + Err(e) => Err(e).context(ReadParquetSnafu { + path: self.context.file_path(), + }), + } } /// Returns the next [Batch]. 
- pub(crate) fn next_inner(&mut self) -> Result> { + pub(crate) async fn next_inner(&mut self) -> Result> { let scan_start = Instant::now(); if let Some(batch) = self.batches.pop_front() { self.metrics.num_rows += batch.num_rows(); @@ -2108,7 +2018,7 @@ where // We need to fetch next record batch and convert it to batches. while self.batches.is_empty() { - let Some(record_batch) = self.fetch_next_record_batch()? else { + let Some(record_batch) = self.fetch_next_record_batch().await? else { self.metrics.scan_cost += scan_start.elapsed(); return Ok(None); }; @@ -2136,10 +2046,10 @@ where #[async_trait::async_trait] impl BatchReader for RowGroupReaderBase where - T: RowGroupReaderContext, + T: RowGroupReaderContext + Send + Sync, { async fn next_batch(&mut self) -> Result> { - self.next_inner() + self.next_inner().await } } @@ -2147,15 +2057,18 @@ where pub(crate) struct FlatRowGroupReader { /// Context for file ranges. context: FileRangeContextRef, - /// Inner parquet reader. - reader: ParquetRecordBatchReader, + /// Inner parquet record batch stream. + stream: ParquetRecordBatchStream, /// Cached sequence array to override sequences. override_sequence: Option, } impl FlatRowGroupReader { /// Creates a new flat reader from file range. - pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self { + pub(crate) fn new( + context: FileRangeContextRef, + stream: ParquetRecordBatchStream, + ) -> Self { // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE. let override_sequence = context .read_format() @@ -2163,16 +2076,16 @@ impl FlatRowGroupReader { Self { context, - reader, + stream, override_sequence, } } /// Returns the next RecordBatch. 
- pub(crate) fn next_batch(&mut self) -> Result> { - match self.reader.next() { + pub(crate) async fn next_batch(&mut self) -> Result> { + match self.stream.next().await { Some(batch_result) => { - let record_batch = batch_result.context(ArrowReaderSnafu { + let record_batch = batch_result.context(ReadParquetSnafu { path: self.context.file_path(), })?; diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs index 8f3f6c5f62..38ef62c6b8 100644 --- a/src/mito2/src/sst/parquet/row_group.rs +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -12,28 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Ports private structs from [parquet crate](https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/arrow/async_reader/mod.rs#L644-L650). +//! Parquet row group reading utilities. use std::ops::Range; use std::sync::Arc; -use bytes::{Buf, Bytes}; -use object_store::ObjectStore; -use parquet::arrow::ProjectionMask; -use parquet::arrow::arrow_reader::{RowGroups, RowSelection}; -use parquet::column::page::{PageIterator, PageReader}; -use parquet::errors::{ParquetError, Result}; -use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; -use parquet::file::page_index::offset_index::OffsetIndexMetaData; -use parquet::file::reader::{ChunkReader, Length}; -use parquet::file::serialized_reader::SerializedPageReader; -use store_api::storage::{FileId, RegionId}; -use tokio::task::yield_now; - -use crate::cache::file_cache::{FileType, IndexKey}; -use crate::cache::{CacheStrategy, PageKey, PageValue}; -use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES}; -use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges}; +use crate::sst::parquet::helper::MERGE_GAP; /// Inner data for ParquetFetchMetrics. 
#[derive(Default, Debug, Clone)] @@ -74,9 +58,9 @@ impl ParquetFetchMetricsData { } /// Metrics for tracking page/row group fetch operations. -#[derive(Default)] +#[derive(Default, Clone)] pub struct ParquetFetchMetrics { - pub data: std::sync::Mutex, + pub data: Arc>, } impl std::fmt::Debug for ParquetFetchMetrics { @@ -204,363 +188,12 @@ impl ParquetFetchMetrics { } } -pub(crate) struct RowGroupBase<'a> { - parquet_metadata: &'a ParquetMetaData, - row_group_idx: usize, - pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>, - /// Compressed page of each column. - column_chunks: Vec>>, - pub(crate) row_count: usize, -} - -impl<'a> RowGroupBase<'a> { - pub(crate) fn new(parquet_meta: &'a ParquetMetaData, row_group_idx: usize) -> Self { - let metadata = parquet_meta.row_group(row_group_idx); - // `offset_index` is always `None` if we don't set - // [with_page_index()](https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index) - // to `true`. 
- let offset_index = parquet_meta - .offset_index() - // filter out empty offset indexes (old versions specified Some(vec![]) when no present) - .filter(|index| !index.is_empty()) - .map(|x| x[row_group_idx].as_slice()); - - Self { - parquet_metadata: parquet_meta, - row_group_idx, - offset_index, - column_chunks: vec![None; metadata.columns().len()], - row_count: metadata.num_rows() as usize, - } - } - - pub(crate) fn calc_sparse_read_ranges( - &self, - projection: &ProjectionMask, - offset_index: &[OffsetIndexMetaData], - selection: &RowSelection, - ) -> (Vec>, Vec>) { - // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the - // `RowSelection` - let mut page_start_offsets: Vec> = vec![]; - let ranges = self - .column_chunks - .iter() - .zip(self.row_group_metadata().columns()) - .enumerate() - .filter(|&(idx, (chunk, _chunk_meta))| chunk.is_none() && projection.leaf_included(idx)) - .flat_map(|(idx, (_chunk, chunk_meta))| { - // If the first page does not start at the beginning of the column, - // then we need to also fetch a dictionary page. 
- let mut ranges = vec![]; - let (start, _len) = chunk_meta.byte_range(); - match offset_index[idx].page_locations.first() { - Some(first) if first.offset as u64 != start => { - ranges.push(start..first.offset as u64); - } - _ => (), - } - - ranges.extend( - selection - .scan_ranges(&offset_index[idx].page_locations) - .iter() - .map(|range| range.start..range.end), - ); - page_start_offsets.push(ranges.iter().map(|range| range.start as usize).collect()); - - ranges - }) - .collect::>(); - (ranges, page_start_offsets) - } - - pub(crate) fn assign_sparse_chunk( - &mut self, - projection: &ProjectionMask, - data: Vec, - page_start_offsets: Vec>, - ) { - let mut page_start_offsets = page_start_offsets.into_iter(); - let mut chunk_data = data.into_iter(); - - for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { - if chunk.is_some() || !projection.leaf_included(idx) { - continue; - } - - if let Some(offsets) = page_start_offsets.next() { - let mut chunks = Vec::with_capacity(offsets.len()); - for _ in 0..offsets.len() { - chunks.push(chunk_data.next().unwrap()); - } - - let column = self - .parquet_metadata - .row_group(self.row_group_idx) - .column(idx); - *chunk = Some(Arc::new(ColumnChunkData::Sparse { - length: column.byte_range().1 as usize, - data: offsets.into_iter().zip(chunks).collect(), - })) - } - } - } - - pub(crate) fn calc_dense_read_ranges(&self, projection: &ProjectionMask) -> Vec> { - self.column_chunks - .iter() - .enumerate() - .filter(|&(idx, chunk)| chunk.is_none() && projection.leaf_included(idx)) - .map(|(idx, _chunk)| { - let column = self.row_group_metadata().column(idx); - let (start, length) = column.byte_range(); - start..(start + length) - }) - .collect::>() - } - - /// Assigns compressed chunk binary data to [RowGroupBase::column_chunks] - /// and returns the chunk offset and binary data assigned. 
- pub(crate) fn assign_dense_chunk( - &mut self, - projection: &ProjectionMask, - chunk_data: Vec, - ) { - let mut chunk_data = chunk_data.into_iter(); - - for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { - if chunk.is_some() || !projection.leaf_included(idx) { - continue; - } - - // Get the fetched page. - let Some(data) = chunk_data.next() else { - continue; - }; - - let column = self - .parquet_metadata - .row_group(self.row_group_idx) - .column(idx); - *chunk = Some(Arc::new(ColumnChunkData::Dense { - offset: column.byte_range().0 as usize, - data, - })); - } - } - - /// Create [PageReader] from [RowGroupBase::column_chunks] - pub(crate) fn column_reader( - &self, - col_idx: usize, - ) -> Result> { - let page_reader = match &self.column_chunks[col_idx] { - None => { - return Err(ParquetError::General(format!( - "Invalid column index {col_idx}, column was not fetched" - ))); - } - Some(data) => { - let page_locations = self - .offset_index - // filter out empty offset indexes (old versions specified Some(vec![]) when no present) - .filter(|index| !index.is_empty()) - .map(|index| index[col_idx].page_locations.clone()); - SerializedPageReader::new( - data.clone(), - self.row_group_metadata().column(col_idx), - self.row_count, - page_locations, - )? - } - }; - - Ok(page_reader) - } - - pub(crate) fn parquet_metadata(&self) -> &ParquetMetaData { - self.parquet_metadata - } - - pub(crate) fn row_group_metadata(&self) -> &RowGroupMetaData { - self.parquet_metadata().row_group(self.row_group_idx) - } -} - -/// An in-memory collection of column chunks -pub struct InMemoryRowGroup<'a> { - region_id: RegionId, - file_id: FileId, - row_group_idx: usize, - cache_strategy: CacheStrategy, - file_path: &'a str, - /// Object store. - object_store: ObjectStore, - base: RowGroupBase<'a>, -} - -impl<'a> InMemoryRowGroup<'a> { - /// Creates a new [InMemoryRowGroup] by `row_group_idx`. - /// - /// # Panics - /// Panics if the `row_group_idx` is invalid. 
- pub fn create( - region_id: RegionId, - file_id: FileId, - parquet_meta: &'a ParquetMetaData, - row_group_idx: usize, - cache_strategy: CacheStrategy, - file_path: &'a str, - object_store: ObjectStore, - ) -> Self { - Self { - region_id, - file_id, - row_group_idx, - cache_strategy, - file_path, - object_store, - base: RowGroupBase::new(parquet_meta, row_group_idx), - } - } - - /// Fetches the necessary column data into memory - pub async fn fetch( - &mut self, - projection: &ProjectionMask, - selection: Option<&RowSelection>, - metrics: Option<&ParquetFetchMetrics>, - ) -> Result<()> { - if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) { - let (fetch_ranges, page_start_offsets) = - self.base - .calc_sparse_read_ranges(projection, offset_index, selection); - - let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?; - // Assign sparse chunk data to base. - self.base - .assign_sparse_chunk(projection, chunk_data, page_start_offsets); - } else { - // Release the CPU to avoid blocking the runtime. Since `fetch_pages_from_cache` - // is a synchronous, CPU-bound operation. - yield_now().await; - - // Calculate ranges to read. - let fetch_ranges = self.base.calc_dense_read_ranges(projection); - - if fetch_ranges.is_empty() { - // Nothing to fetch. - return Ok(()); - } - - // Fetch data with ranges - let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?; - - // Assigns fetched data to base. - self.base.assign_dense_chunk(projection, chunk_data); - } - - Ok(()) - } - - /// Try to fetch data from the memory cache or the WriteCache, - /// if not in WriteCache, fetch data from object store directly. - async fn fetch_bytes( - &self, - ranges: &[Range], - metrics: Option<&ParquetFetchMetrics>, - ) -> Result> { - // Now fetch page timer includes the whole time to read pages. 
- let _timer = READ_STAGE_FETCH_PAGES.start_timer(); - - let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec()); - if let Some(pages) = self.cache_strategy.get_pages(&page_key) { - if let Some(metrics) = metrics { - let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - let mut metrics_data = metrics.data.lock().unwrap(); - metrics_data.page_cache_hit += 1; - metrics_data.pages_to_fetch_mem += ranges.len(); - metrics_data.page_size_to_fetch_mem += total_size; - metrics_data.page_size_needed += total_size; - } - return Ok(pages.compressed.clone()); - } - - // Calculate total range size for metrics. - let (total_range_size, unaligned_size) = compute_total_range_size(ranges); - - let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet); - let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now()); - let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await; - let pages = match write_cache_result { - Some(data) => { - if let Some(metrics) = metrics { - let elapsed = fetch_write_cache_start - .map(|start| start.elapsed()) - .unwrap_or_default(); - let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - let mut metrics_data = metrics.data.lock().unwrap(); - metrics_data.write_cache_fetch_elapsed += elapsed; - metrics_data.write_cache_hit += 1; - metrics_data.pages_to_fetch_write_cache += ranges.len(); - metrics_data.page_size_to_fetch_write_cache += unaligned_size; - metrics_data.page_size_needed += range_size_needed; - } - data - } - None => { - // Fetch data from object store. 
- let _timer = READ_STAGE_ELAPSED - .with_label_values(&["cache_miss_read"]) - .start_timer(); - - let start = metrics.map(|_| std::time::Instant::now()); - let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges) - .await - .map_err(|e| ParquetError::External(Box::new(e)))?; - if let Some(metrics) = metrics { - let elapsed = start.map(|start| start.elapsed()).unwrap_or_default(); - let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - let mut metrics_data = metrics.data.lock().unwrap(); - metrics_data.store_fetch_elapsed += elapsed; - metrics_data.cache_miss += 1; - metrics_data.pages_to_fetch_store += ranges.len(); - metrics_data.page_size_to_fetch_store += unaligned_size; - metrics_data.page_size_needed += range_size_needed; - } - data - } - }; - - // Put pages back to the cache. - let page_value = PageValue::new(pages.clone(), total_range_size); - self.cache_strategy - .put_pages(page_key, Arc::new(page_value)); - - Ok(pages) - } - - /// Fetches data from write cache. - /// Returns `None` if the data is not in the cache. - async fn fetch_ranges_from_write_cache( - &self, - key: IndexKey, - ranges: &[Range], - ) -> Option> { - if let Some(cache) = self.cache_strategy.write_cache() { - return cache.file_cache().read_ranges(key, ranges).await; - } - None - } -} - /// Computes the max possible buffer size to read the given `ranges`. 
/// Returns (aligned_size, unaligned_size) where: /// - aligned_size: total size aligned to pooled buffer size /// - unaligned_size: actual total size without alignment // See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192 -fn compute_total_range_size(ranges: &[Range]) -> (u64, u64) { +pub(crate) fn compute_total_range_size(ranges: &[Range]) -> (u64, u64) { if ranges.is_empty() { return (0, 0); } @@ -602,96 +235,3 @@ fn align_to_pooled_buf_size(size: u64) -> u64 { const POOLED_BUF_SIZE: u64 = 2 * 1024 * 1024; size.div_ceil(POOLED_BUF_SIZE) * POOLED_BUF_SIZE } - -impl RowGroups for InMemoryRowGroup<'_> { - fn num_rows(&self) -> usize { - self.base.row_count - } - - fn column_chunks(&self, i: usize) -> Result> { - // Creates a page reader to read column at `i`. - let page_reader = self.base.column_reader(i)?; - - Ok(Box::new(ColumnChunkIterator { - reader: Some(Ok(Box::new(page_reader))), - })) - } - - fn row_groups(&self) -> Box + '_> { - Box::new(std::iter::once(self.base.row_group_metadata())) - } - - fn metadata(&self) -> &ParquetMetaData { - self.base.parquet_metadata() - } -} - -/// An in-memory column chunk -#[derive(Clone)] -pub(crate) enum ColumnChunkData { - /// Column chunk data representing only a subset of data pages - Sparse { - /// Length of the full column chunk - length: usize, - /// Set of data pages included in this sparse chunk. Each element is a tuple - /// of (page offset, page data) - data: Vec<(usize, Bytes)>, - }, - /// Full column chunk and its offset - Dense { offset: usize, data: Bytes }, -} - -impl ColumnChunkData { - fn get(&self, start: u64) -> Result { - match &self { - ColumnChunkData::Sparse { data, .. 
} => data - .binary_search_by_key(&start, |(offset, _)| *offset as u64) - .map(|idx| data[idx].1.clone()) - .map_err(|_| { - ParquetError::General(format!( - "Invalid offset in sparse column chunk data: {start}" - )) - }), - ColumnChunkData::Dense { offset, data } => { - let start = start as usize - *offset; - Ok(data.slice(start..)) - } - } - } -} - -impl Length for ColumnChunkData { - fn len(&self) -> u64 { - match &self { - ColumnChunkData::Sparse { length, .. } => *length as u64, - ColumnChunkData::Dense { data, .. } => data.len() as u64, - } - } -} - -impl ChunkReader for ColumnChunkData { - type T = bytes::buf::Reader; - - fn get_read(&self, start: u64) -> Result { - Ok(self.get(start)?.reader()) - } - - fn get_bytes(&self, start: u64, length: usize) -> Result { - Ok(self.get(start)?.slice(..length)) - } -} - -/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`] -pub(crate) struct ColumnChunkIterator { - pub(crate) reader: Option>>, -} - -impl Iterator for ColumnChunkIterator { - type Item = Result>; - - fn next(&mut self) -> Option { - self.reader.take() - } -} - -impl PageIterator for ColumnChunkIterator {} diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index b207f11ef8..4e75073e26 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -50,7 +50,7 @@ use crate::config::{IndexBuildMode, IndexConfig}; use crate::error::{ InvalidMetadataSnafu, OpenDalSnafu, Result, UnexpectedSnafu, WriteParquetSnafu, }; -use crate::read::{Batch, FlatSource, Source}; +use crate::read::FlatSource; use crate::sst::file::RegionFileId; use crate::sst::index::{IndexOutput, Indexer, IndexerBuilder}; use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index}; @@ -60,6 +60,35 @@ use crate::sst::{ DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator, }; +/// Converts a flat RecordBatch for writing to parquet. 
+enum FlatBatchConverter { + /// Write as-is in flat format. + Flat(FlatWriteFormat), + /// Convert flat batch to primary-key format by stripping tag columns. + PrimaryKey { + format: PrimaryKeyWriteFormat, + num_fields: usize, + }, +} + +impl FlatBatchConverter { + fn arrow_schema(&self) -> &SchemaRef { + match self { + FlatBatchConverter::Flat(f) => f.arrow_schema(), + FlatBatchConverter::PrimaryKey { format, .. } => format.arrow_schema(), + } + } + + fn convert_batch(&self, batch: &RecordBatch) -> Result { + match self { + FlatBatchConverter::Flat(f) => f.convert_batch(batch), + FlatBatchConverter::PrimaryKey { format, num_fields } => { + format.convert_flat_batch(batch, *num_fields) + } + } + } +} + /// Parquet SST writer. pub struct ParquetWriter<'a, F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> { /// Path provider that creates SST and index file paths according to file id. @@ -240,81 +269,6 @@ where Ok(()) } - /// Iterates source and writes all rows to Parquet file. - /// - /// Returns the [SstInfo] if the SST is written. - pub async fn write_all( - &mut self, - source: Source, - override_sequence: Option, // override the `sequence` field from `Source` - opts: &WriteOptions, - ) -> Result { - let res = self - .write_all_without_cleaning(source, override_sequence, opts) - .await; - if res.is_err() { - // Clean tmp files explicitly on failure. 
- let file_id = self.current_file; - if let Some(cleaner) = &self.file_cleaner { - cleaner.clean_by_file_id(file_id).await; - } - } - res - } - - async fn write_all_without_cleaning( - &mut self, - mut source: Source, - override_sequence: Option, // override the `sequence` field from `Source` - opts: &WriteOptions, - ) -> Result { - let mut results = smallvec![]; - let write_format = PrimaryKeyWriteFormat::new(self.metadata.clone()) - .with_override_sequence(override_sequence); - let mut stats = SourceStats::default(); - - while let Some(res) = self - .write_next_batch(&mut source, &write_format, opts) - .await - .transpose() - { - match res { - Ok(mut batch) => { - stats.update(&batch); - let start = Instant::now(); - // safety: self.current_indexer must be set when first batch has been written. - match self.index_config.build_mode { - IndexBuildMode::Sync => { - self.current_indexer - .as_mut() - .unwrap() - .update(&mut batch) - .await; - } - IndexBuildMode::Async => {} - } - self.metrics.update_index += start.elapsed(); - if let Some(max_file_size) = opts.max_file_size - && self.bytes_written.load(Ordering::Relaxed) > max_file_size - { - self.finish_current_file(&mut results, &mut stats).await?; - } - } - Err(e) => { - if let Some(indexer) = &mut self.current_indexer { - indexer.abort().await; - } - return Err(e); - } - } - } - - self.finish_current_file(&mut results, &mut stats).await?; - - // object_store.write will make sure all bytes are written or an error is raised. - Ok(results) - } - /// Iterates FlatSource and writes all RecordBatch in flat format to Parquet file. /// /// Returns the [SstInfo] if the SST is written. 
@@ -324,11 +278,15 @@ where override_sequence: Option, opts: &WriteOptions, ) -> Result { - let res = self - .write_all_flat_without_cleaning(source, override_sequence, opts) - .await; + let converter = FlatBatchConverter::Flat( + FlatWriteFormat::new( + self.metadata.clone(), + &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding), + ) + .with_override_sequence(override_sequence), + ); + let res = self.write_all_flat_inner(source, &converter, opts).await; if res.is_err() { - // Clean tmp files explicitly on failure. let file_id = self.current_file; if let Some(cleaner) = &self.file_cleaner { cleaner.clean_by_file_id(file_id).await; @@ -337,36 +295,58 @@ where res } - async fn write_all_flat_without_cleaning( + /// Iterates FlatSource and writes all RecordBatch in primary-key format to Parquet file. + /// + /// Returns the [SstInfo] if the SST is written. + pub async fn write_all_flat_as_primary_key( &mut self, - mut source: FlatSource, + source: FlatSource, override_sequence: Option, opts: &WriteOptions, + ) -> Result { + let num_fields = self.metadata.field_columns().count(); + let converter = FlatBatchConverter::PrimaryKey { + format: PrimaryKeyWriteFormat::new(self.metadata.clone()) + .with_override_sequence(override_sequence), + num_fields, + }; + let res = self.write_all_flat_inner(source, &converter, opts).await; + if res.is_err() { + let file_id = self.current_file; + if let Some(cleaner) = &self.file_cleaner { + cleaner.clean_by_file_id(file_id).await; + } + } + res + } + + async fn write_all_flat_inner( + &mut self, + mut source: FlatSource, + converter: &FlatBatchConverter, + opts: &WriteOptions, ) -> Result { let mut results = smallvec![]; - let flat_format = FlatWriteFormat::new( - self.metadata.clone(), - &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding), - ) - .with_override_sequence(override_sequence); let mut stats = SourceStats::default(); while let Some(record_batch) = self - .write_next_flat_batch(&mut source, 
&flat_format, opts) + .write_next_flat_batch(&mut source, converter, opts) .await .transpose() { match record_batch { Ok(batch) => { stats.update_flat(&batch)?; - let start = Instant::now(); - // safety: self.current_indexer must be set when first batch has been written. - self.current_indexer - .as_mut() - .unwrap() - .update_flat(&batch) - .await; - self.metrics.update_index += start.elapsed(); + if matches!(self.index_config.build_mode, IndexBuildMode::Sync) { + let start = Instant::now(); + // safety: self.current_indexer must be set when first batch has been written. + self.current_indexer + .as_mut() + .unwrap() + .update_flat(&batch) + .await; + self.metrics.update_index += start.elapsed(); + } if let Some(max_file_size) = opts.max_file_size && self.bytes_written.load(Ordering::Relaxed) > max_file_size { @@ -411,34 +391,10 @@ where .set_column_compression(op_type_col, Compression::UNCOMPRESSED) } - async fn write_next_batch( - &mut self, - source: &mut Source, - write_format: &PrimaryKeyWriteFormat, - opts: &WriteOptions, - ) -> Result> { - let start = Instant::now(); - let Some(batch) = source.next_batch().await? else { - return Ok(None); - }; - self.metrics.iter_source += start.elapsed(); - - let arrow_batch = write_format.convert_batch(&batch)?; - - let start = Instant::now(); - self.maybe_init_writer(write_format.arrow_schema(), opts) - .await? 
- .write(&arrow_batch) - .await - .context(WriteParquetSnafu)?; - self.metrics.write_batch += start.elapsed(); - Ok(Some(batch)) - } - async fn write_next_flat_batch( &mut self, source: &mut FlatSource, - flat_format: &FlatWriteFormat, + converter: &FlatBatchConverter, opts: &WriteOptions, ) -> Result> { let start = Instant::now(); @@ -447,15 +403,16 @@ where }; self.metrics.iter_source += start.elapsed(); - let arrow_batch = flat_format.convert_batch(&record_batch)?; + let arrow_batch = converter.convert_batch(&record_batch)?; let start = Instant::now(); - self.maybe_init_writer(flat_format.arrow_schema(), opts) + self.maybe_init_writer(converter.arrow_schema(), opts) .await? .write(&arrow_batch) .await .context(WriteParquetSnafu)?; self.metrics.write_batch += start.elapsed(); + // Return original flat batch for stats/indexer which use flat layout. Ok(Some(record_batch)) } @@ -515,26 +472,6 @@ struct SourceStats { } impl SourceStats { - fn update(&mut self, batch: &Batch) { - if batch.is_empty() { - return; - } - - self.num_rows += batch.num_rows(); - self.series_estimator.update(batch); - // Safety: batch is not empty. - let (min_in_batch, max_in_batch) = ( - batch.first_timestamp().unwrap(), - batch.last_timestamp().unwrap(), - ); - if let Some(time_range) = &mut self.time_range { - time_range.0 = time_range.0.min(min_in_batch); - time_range.1 = time_range.1.max(max_in_batch); - } else { - self.time_range = Some((min_in_batch, max_in_batch)); - } - } - fn update_flat(&mut self, record_batch: &RecordBatch) -> Result<()> { if record_batch.num_rows() == 0 { return Ok(()); diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index 842689bba6..350195bfa9 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -15,6 +15,7 @@ //! Utilities for testing. 
pub mod batch_util; +pub mod bench_util; pub mod memtable_util; pub mod scheduler_util; pub mod sst_util; diff --git a/src/mito2/src/test_util/bench_util.rs b/src/mito2/src/test_util/bench_util.rs new file mode 100644 index 0000000000..8f182e4157 --- /dev/null +++ b/src/mito2/src/test_util/bench_util.rs @@ -0,0 +1,259 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared utilities for mito2 benchmarks. +//! +//! Provides a TSBS cpu-like data generator ([`CpuDataGenerator`]) and schema +//! ([`cpu_metadata`]) used by multiple benchmark binaries in this directory. 
+ +use api::v1::value::ValueData; +use api::v1::{Row, Rows, SemanticType}; +use datafusion_common::Column; +use datafusion_expr::{Expr, lit}; +use datatypes::data_type::ConcreteDataType; +use datatypes::schema::ColumnSchema; +use rand::Rng; +use rand::rngs::ThreadRng; +use rand::seq::IndexedRandom; +use store_api::metadata::{ + ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, +}; +use store_api::storage::RegionId; +use table::predicate::Predicate; + +use crate::memtable::KeyValues; +use crate::test_util::memtable_util::region_metadata_to_row_schema; + +pub struct Host { + pub hostname: String, + pub region: String, + pub datacenter: String, + pub rack: String, + pub os: String, + pub arch: String, + pub team: String, + pub service: String, + pub service_version: String, + pub service_environment: String, +} + +impl Host { + pub fn random_with_id(id: usize) -> Host { + let mut rng = rand::rng(); + let region = format!("ap-southeast-{}", rng.random_range(0..10)); + let datacenter = format!( + "{}{}", + region, + ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap() + ); + Host { + hostname: format!("host_{id}"), + region, + datacenter, + rack: rng.random_range(0..100).to_string(), + os: "Ubuntu16.04LTS".to_string(), + arch: "x86".to_string(), + team: "CHI".to_string(), + service: rng.random_range(0..100).to_string(), + service_version: rng.random_range(0..10).to_string(), + service_environment: "test".to_string(), + } + } + + pub fn fill_values(&self, values: &mut Vec) { + let tags = [ + api::v1::Value { + value_data: Some(ValueData::StringValue(self.hostname.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.region.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.datacenter.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.rack.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.os.clone())), + }, + api::v1::Value { + 
value_data: Some(ValueData::StringValue(self.arch.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.team.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.service.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.service_version.clone())), + }, + api::v1::Value { + value_data: Some(ValueData::StringValue(self.service_environment.clone())), + }, + ]; + for tag in tags { + values.push(tag); + } + } +} + +pub struct CpuDataGenerator { + pub metadata: RegionMetadataRef, + column_schemas: Vec, + hosts: Vec, + start_sec: i64, + end_sec: i64, +} + +impl CpuDataGenerator { + pub fn new( + metadata: RegionMetadataRef, + num_hosts: usize, + start_sec: i64, + end_sec: i64, + ) -> Self { + let column_schemas = region_metadata_to_row_schema(&metadata); + Self { + metadata, + column_schemas, + hosts: Self::generate_hosts(num_hosts), + start_sec, + end_sec, + } + } + + pub fn iter(&self) -> impl Iterator + '_ { + // point per 10s. 
+ (self.start_sec..self.end_sec) + .step_by(10) + .enumerate() + .map(|(seq, ts)| self.build_key_values(seq, ts)) + } + + pub fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues { + let rows = self + .hosts + .iter() + .map(|host| { + let mut rng = rand::rng(); + let mut values = Vec::with_capacity(21); + values.push(api::v1::Value { + value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)), + }); + host.fill_values(&mut values); + for _ in 0..10 { + values.push(api::v1::Value { + value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))), + }); + } + Row { values } + }) + .collect(); + let mutation = api::v1::Mutation { + op_type: api::v1::OpType::Put as i32, + sequence: seq as u64, + rows: Some(Rows { + schema: self.column_schemas.clone(), + rows, + }), + write_hint: None, + }; + + KeyValues::new(&self.metadata, mutation).unwrap() + } + + pub fn random_host_filter(&self) -> Predicate { + let host = self.random_hostname(); + let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host)); + Predicate::new(vec![expr]) + } + + pub fn random_host_filter_exprs(&self) -> Vec { + let host = self.random_hostname(); + vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))] + } + + pub fn random_hostname(&self) -> String { + let mut rng = rand::rng(); + self.hosts.choose(&mut rng).unwrap().hostname.clone() + } + + pub fn random_f64(rng: &mut ThreadRng) -> f64 { + let base: u32 = rng.random_range(30..95); + base as f64 + } + + pub fn generate_hosts(num_hosts: usize) -> Vec { + (0..num_hosts).map(Host::random_with_id).collect() + } +} + +/// Creates a metadata for TSBS cpu-like table. 
+pub fn cpu_metadata() -> RegionMetadata { + let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 0, + }); + let mut column_id = 1; + let tags = [ + "hostname", + "region", + "datacenter", + "rack", + "os", + "arch", + "team", + "service", + "service_version", + "service_environment", + ]; + for tag in tags { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true), + semantic_type: SemanticType::Tag, + column_id, + }); + column_id += 1; + } + let fields = [ + "usage_user", + "usage_system", + "usage_idle", + "usage_nice", + "usage_iowait", + "usage_irq", + "usage_softirq", + "usage_steal", + "usage_guest", + "usage_guest_nice", + ]; + for field in fields { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true), + semantic_type: SemanticType::Field, + column_id, + }); + column_id += 1; + } + builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + builder.build().unwrap() +} diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs index 7ddac4ee0d..25ab9bb8b4 100644 --- a/src/mito2/src/test_util/memtable_util.rs +++ b/src/mito2/src/test_util/memtable_util.rs @@ -30,8 +30,7 @@ use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortFi use store_api::metadata::{ ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, }; -use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange}; -use table::predicate::Predicate; +use store_api::storage::{ColumnId, RegionId, SequenceNumber}; use crate::error::Result; use crate::memtable::bulk::part::BulkPart; @@ -83,16 +82,6 @@ impl Memtable for EmptyMemtable { 
Ok(()) } - #[cfg(any(test, feature = "test"))] - fn iter( - &self, - _projection: Option<&[ColumnId]>, - _filters: Option, - _sequence: Option, - ) -> Result { - Ok(Box::new(std::iter::empty())) - } - fn ranges( &self, _projection: Option<&[ColumnId]>, diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index 389d9bf107..e9515030c0 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs @@ -18,7 +18,11 @@ use std::sync::Arc; use api::v1::{OpType, SemanticType}; use common_time::Timestamp; -use datatypes::arrow::array::{BinaryArray, TimestampMillisecondArray, UInt8Array, UInt64Array}; +use datatypes::arrow::array::{ + ArrayRef, BinaryDictionaryBuilder, RecordBatch, StringDictionaryBuilder, + TimestampMillisecondArray, UInt8Array, UInt64Array, +}; +use datatypes::arrow::datatypes::UInt32Type; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, SkippingIndexOptions}; use datatypes::value::ValueRef; @@ -32,8 +36,9 @@ use store_api::metric_engine_consts::{ use store_api::storage::consts::ReservedColumnId; use store_api::storage::{FileId, RegionId}; -use crate::read::{Batch, BatchBuilder, Source}; +use crate::read::{Batch, FlatSource, Source}; use crate::sst::file::{FileHandle, FileMeta}; +use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; use crate::test_util::{VecBatchReader, new_batch_builder, new_noop_file_purger}; /// Test region id. @@ -246,34 +251,68 @@ pub fn new_batch_by_range(tags: &[&str], start: usize, end: usize) -> Batch { new_batch_with_custom_sequence(tags, start, end, 1000) } -pub fn new_batch_with_binary(tags: &[&str], start: usize, end: usize) -> Batch { +/// Creates a flat format RecordBatch for testing. +/// Similar to `new_batch_by_range` but returns a RecordBatch in flat format. 
+pub fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch { + new_record_batch_with_custom_sequence(tags, start, end, 1000) +} + +/// Creates a flat format RecordBatch for testing with a custom sequence. +pub fn new_record_batch_with_custom_sequence( + tags: &[&str], + start: usize, + end: usize, + sequence: u64, +) -> RecordBatch { assert!(end >= start); + let metadata = Arc::new(sst_region_metadata()); + let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()); + + let num_rows = end - start; + let mut columns = Vec::new(); + + // Add primary key columns (tag_0, tag_1) as dictionary arrays + let mut tag_0_builder = StringDictionaryBuilder::::new(); + let mut tag_1_builder = StringDictionaryBuilder::::new(); + + for _ in 0..num_rows { + tag_0_builder.append_value(tags[0]); + tag_1_builder.append_value(tags[1]); + } + + columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef); + columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef); + + // Add field column (field_0) + let field_values: Vec = (start..end).map(|v| v as u64).collect(); + columns.push(Arc::new(UInt64Array::from(field_values))); + + // Add time index column (ts) + let timestamps: Vec = (start..end).map(|v| v as i64).collect(); + columns.push(Arc::new(TimestampMillisecondArray::from(timestamps))); + + // Add encoded primary key column let pk = new_primary_key(tags); - let timestamps: Vec<_> = (start..end).map(|v| v as i64).collect(); - let sequences = vec![1000; end - start]; - let op_types = vec![OpType::Put; end - start]; + let mut pk_builder = BinaryDictionaryBuilder::::new(); + for _ in 0..num_rows { + pk_builder.append(&pk).unwrap(); + } + columns.push(Arc::new(pk_builder.finish())); - let field: Vec<_> = (start..end) - .map(|_v| "some data".as_bytes().to_vec()) - .collect(); + // Add sequence column + columns.push(Arc::new(UInt64Array::from_value(sequence, num_rows))); - let mut builder = BatchBuilder::new(pk); - builder - 
.timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values( - timestamps.iter().copied(), - ))) - .unwrap() - .sequences_array(Arc::new(UInt64Array::from_iter_values( - sequences.iter().copied(), - ))) - .unwrap() - .op_types_array(Arc::new(UInt8Array::from_iter_values( - op_types.iter().map(|v| *v as u8), - ))) - .unwrap() - .push_field_array(1, Arc::new(BinaryArray::from_iter_values(field))) - .unwrap(); - builder.build().unwrap() + // Add op_type column + columns.push(Arc::new(UInt8Array::from_value( + OpType::Put as u8, + num_rows, + ))); + RecordBatch::try_new(flat_schema, columns).unwrap() +} + +/// Creates a FlatSource from flat format RecordBatches. +pub fn new_flat_source_from_record_batches(batches: Vec) -> FlatSource { + FlatSource::Iter(Box::new(batches.into_iter().map(Ok))) } /// Creates a new region metadata for testing SSTs with binary datatype. diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 71896b3d5d..fd5ad82f3f 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -207,6 +207,7 @@ impl WorkerGroup { .vector_cache_size(config.vector_cache_size.as_bytes()) .page_cache_size(config.page_cache_size.as_bytes()) .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) + .range_result_cache_size(config.range_result_cache_size.as_bytes()) .index_metadata_size(config.index.metadata_cache_size.as_bytes()) .index_content_size(config.index.content_cache_size.as_bytes()) .index_content_page_size(config.index.content_cache_page_size.as_bytes()) @@ -421,6 +422,7 @@ impl WorkerGroup { .vector_cache_size(config.vector_cache_size.as_bytes()) .page_cache_size(config.page_cache_size.as_bytes()) .selector_result_cache_size(config.selector_result_cache_size.as_bytes()) + .range_result_cache_size(config.range_result_cache_size.as_bytes()) .write_cache(write_cache) .build(), ); diff --git a/src/partition/src/cache.rs b/src/partition/src/cache.rs index a886e1e08d..4066b69aa3 100644 --- 
a/src/partition/src/cache.rs +++ b/src/partition/src/cache.rs @@ -121,10 +121,12 @@ pub fn new_partition_info_cache( CacheContainer::new( name, cache, - Box::new(|cache, ident| { + Box::new(|cache, idents| { Box::pin(async move { - if let CacheIdent::TableId(table_id) = ident { - cache.invalidate(table_id).await + for ident in idents { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } } Ok(()) }) diff --git a/src/pipeline/src/manager/pipeline_operator.rs b/src/pipeline/src/manager/pipeline_operator.rs index 77ef8ade23..6c4256db69 100644 --- a/src/pipeline/src/manager/pipeline_operator.rs +++ b/src/pipeline/src/manager/pipeline_operator.rs @@ -20,6 +20,7 @@ use api::v1::CreateTableExpr; use catalog::{CatalogManagerRef, RegisterSystemTableRequest}; use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine}; use common_telemetry::info; +use common_time::FOREVER; use datatypes::timestamp::TimestampNanosecond; use futures::FutureExt; use operator::insert::InserterRef; @@ -28,6 +29,7 @@ use query::QueryEngineRef; use session::context::QueryContextRef; use snafu::{OptionExt, ResultExt}; use table::TableRef; +use table::requests::TTL_KEY; use crate::Pipeline; use crate::error::{CatalogSnafu, CreateTableSnafu, PipelineTableNotFoundSnafu, Result}; @@ -59,6 +61,9 @@ impl PipelineOperator { fn create_table_request(&self, catalog: &str) -> RegisterSystemTableRequest { let (time_index, primary_keys, column_defs) = PipelineTable::build_pipeline_schema(); + let mut table_options = HashMap::new(); + table_options.insert(TTL_KEY.to_string(), FOREVER.to_string()); + let create_table_expr = CreateTableExpr { catalog_name: catalog.to_string(), schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(), @@ -68,7 +73,7 @@ impl PipelineOperator { time_index, primary_keys, create_if_not_exists: true, - table_options: Default::default(), + table_options, table_id: None, // Should and will be assigned by Meta. 
engine: default_engine().to_string(), }; diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index dc84c4afac..e2e577debf 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -354,25 +354,6 @@ impl DatafusionQueryEngine { Ok(physical_plan) } - #[tracing::instrument(skip_all)] - pub fn optimize( - &self, - context: &QueryEngineContext, - plan: &LogicalPlan, - ) -> Result { - let _timer = metrics::OPTIMIZE_LOGICAL_ELAPSED.start_timer(); - - // Optimized by extension rules - let optimized_plan = self - .state - .optimize_by_extension_rules(plan.clone(), context)?; - - // Optimized by datafusion optimizer - let optimized_plan = self.state.session_state().optimize(&optimized_plan)?; - - Ok(optimized_plan) - } - #[tracing::instrument(skip_all)] fn optimize_physical_plan( &self, @@ -444,32 +425,17 @@ impl QueryEngine for DatafusionQueryEngine { async fn describe( &self, plan: LogicalPlan, - query_ctx: QueryContextRef, + _query_ctx: QueryContextRef, ) -> Result { - let ctx = self.engine_context(query_ctx); - if let Ok(optimised_plan) = self.optimize(&ctx, &plan) { - let schema = optimised_plan - .schema() - .clone() - .try_into() - .context(ConvertSchemaSnafu)?; - Ok(DescribeResult { - schema, - logical_plan: optimised_plan, - }) - } else { - // Table's like those in information_schema cannot be optimized when - // it contains parameters. So we fallback to original plans. 
- let schema = plan - .schema() - .clone() - .try_into() - .context(ConvertSchemaSnafu)?; - Ok(DescribeResult { - schema, - logical_plan: plan, - }) - } + let schema = plan + .schema() + .clone() + .try_into() + .context(ConvertSchemaSnafu)?; + Ok(DescribeResult { + schema, + logical_plan: plan, + }) } async fn execute(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result { @@ -924,7 +890,7 @@ mod tests { ) ); assert_eq!( - "Limit: skip=0, fetch=20\n Aggregate: groupBy=[[]], aggr=[[sum(CAST(numbers.number AS UInt64))]]\n TableScan: numbers projection=[number]", + "Limit: skip=0, fetch=20\n Projection: sum(numbers.number)\n Aggregate: groupBy=[[]], aggr=[[sum(numbers.number)]]\n TableScan: numbers", format!("{}", logical_plan.display_indent()) ); } diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs index 4259b587ba..aaac1e3124 100644 --- a/src/query/src/optimizer.rs +++ b/src/query/src/optimizer.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod constant_term; +pub mod count_nest_aggr; pub mod count_wildcard; pub mod parallelize_scan; pub mod pass_distribution; diff --git a/src/query/src/optimizer/count_nest_aggr.rs b/src/query/src/optimizer/count_nest_aggr.rs new file mode 100644 index 0000000000..89ba426074 --- /dev/null +++ b/src/query/src/optimizer/count_nest_aggr.rs @@ -0,0 +1,346 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::config::ConfigOptions; +use datafusion::functions_aggregate::count::count_udaf; +use datafusion::logical_expr::{Extension, LogicalPlan, LogicalPlanBuilder, Sort}; +use datafusion_common::Result; +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_expr::{Expr, UserDefinedLogicalNodeCore, lit}; +use promql::extension_plan::{InstantManipulate, SeriesDivide, SeriesNormalize}; +use store_api::metric_engine_consts::DATA_SCHEMA_TSID_COLUMN_NAME; + +use crate::QueryEngineContext; +use crate::optimizer::ExtensionAnalyzerRule; + +/// Rewrites `count(() by (...))` into a presence-based +/// group count. +/// +/// This stays intentionally narrow: +/// - the outer aggregate must be plain `count` +/// - the inner aggregate must be a plain aggregate whose result existence is equivalent to input +/// group existence +/// - the inner input must be the direct instant-vector-selector plan +/// - the outer count must only group by the evaluation timestamp +#[derive(Debug)] +pub struct CountNestAggrRule; + +impl ExtensionAnalyzerRule for CountNestAggrRule { + fn analyze( + &self, + plan: LogicalPlan, + _ctx: &QueryEngineContext, + _config: &ConfigOptions, + ) -> Result { + plan.transform_down(&Self::rewrite_plan).map(|x| x.data) + } +} + +impl CountNestAggrRule { + fn rewrite_plan(plan: LogicalPlan) -> Result> { + let LogicalPlan::Sort(sort) = plan else { + return Ok(Transformed::no(plan)); + }; + + if let Some(rewritten) = Self::try_rewrite_sort(&sort)? 
{ + Ok(Transformed::yes(rewritten)) + } else { + Ok(Transformed::no(LogicalPlan::Sort(sort))) + } + } + + fn try_rewrite_sort(sort: &Sort) -> Result> { + if sort.fetch.is_some() { + return Ok(None); + } + + let LogicalPlan::Aggregate(outer_agg) = sort.input.as_ref() else { + return Ok(None); + }; + if outer_agg.group_expr.len() != 1 || outer_agg.aggr_expr.len() != 1 { + return Ok(None); + } + let outer_time_expr = outer_agg.group_expr[0].clone(); + let outer_count_arg = + match Self::aggregate_if(&outer_agg.aggr_expr[0], |name| name == "count") { + Some((_, arg)) => arg, + None => return Ok(None), + }; + + let LogicalPlan::Sort(inner_sort) = outer_agg.input.as_ref() else { + return Ok(None); + }; + if inner_sort.fetch.is_some() { + return Ok(None); + } + + let LogicalPlan::Aggregate(inner_agg) = inner_sort.input.as_ref() else { + return Ok(None); + }; + if inner_agg.aggr_expr.len() != 1 || inner_agg.group_expr.is_empty() { + return Ok(None); + } + let (inner_is_count, inner_value_expr) = + match Self::aggregate_if(&inner_agg.aggr_expr[0], |name| { + Self::is_supported_inner_aggregate(name) + }) { + Some((name, arg)) => (name == "count", arg), + None => return Ok(None), + }; + let Expr::Column(_) = inner_value_expr else { + return Ok(None); + }; + + let Expr::Column(outer_count_column) = outer_count_arg else { + return Ok(None); + }; + let inner_output_field = inner_agg.schema.field(inner_agg.group_expr.len()); + if outer_count_column.name != *inner_output_field.name() { + return Ok(None); + } + + if !Self::is_projection_chain_to_instant(inner_agg.input.as_ref()) { + return Ok(None); + } + + if !inner_agg + .group_expr + .iter() + .all(|expr| matches!(expr, Expr::Column(_))) + { + return Ok(None); + } + + let Some(time_expr_pos) = inner_agg + .group_expr + .iter() + .position(|expr| expr == &outer_time_expr) + else { + return Ok(None); + }; + + let mut presence_group_exprs = Vec::with_capacity(inner_agg.group_expr.len()); + 
presence_group_exprs.push(outer_time_expr.clone()); + presence_group_exprs.extend( + inner_agg + .group_expr + .iter() + .enumerate() + .filter(|(idx, _)| *idx != time_expr_pos) + .map(|(_, expr)| expr.clone()), + ); + + let mut required_input_columns = + Self::collect_required_input_columns(&presence_group_exprs, inner_value_expr); + required_input_columns.extend(Self::collect_required_instant_columns( + inner_agg.input.as_ref(), + )); + let presence_source = Self::rebuild_projection_chain_to_instant( + inner_agg.input.as_ref(), + &required_input_columns, + )?; + + let outer_value_name = outer_agg + .schema + .field(outer_agg.group_expr.len()) + .name() + .clone(); + let mut presence_input = LogicalPlanBuilder::from(presence_source); + if !inner_is_count { + presence_input = presence_input.filter(inner_value_expr.clone().is_not_null())?; + } + let presence_input = presence_input + .project(presence_group_exprs.clone())? + .distinct()? + .build()?; + + let rewritten = LogicalPlanBuilder::from(presence_input) + .aggregate( + outer_agg.group_expr.clone(), + vec![count_udaf().call(vec![lit(1_i64)]).alias(outer_value_name)], + )? + .sort(sort.expr.clone())? + .build()?; + + Ok(Some(rewritten)) + } + + fn collect_required_input_columns(group_exprs: &[Expr], value_expr: &Expr) -> HashSet { + let mut required = HashSet::new(); + + for expr in group_exprs { + if let Expr::Column(column) = expr { + required.insert(column.name.clone()); + } + } + if let Expr::Column(column) = value_expr { + // Keep the value column in the pruned instant input so `InstantManipulate` + // can still perform stale-NaN filtering before we project down to keys. 
+ required.insert(column.name.clone()); + } + + required + } + + fn collect_required_instant_columns(plan: &LogicalPlan) -> HashSet { + let mut required = HashSet::new(); + Self::collect_required_instant_columns_into(plan, &mut required); + required + } + + fn collect_required_instant_columns_into(plan: &LogicalPlan, required: &mut HashSet) { + match plan { + LogicalPlan::Projection(projection) => { + Self::collect_required_instant_columns_into(projection.input.as_ref(), required); + } + LogicalPlan::Extension(extension) => { + for expr in extension.node.expressions() { + if let Expr::Column(column) = expr { + required.insert(column.name); + } + } + + if extension.node.as_any().is::() + && extension.node.inputs()[0] + .schema() + .fields() + .iter() + .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME) + { + required.insert(DATA_SCHEMA_TSID_COLUMN_NAME.to_string()); + } + + if let Some(input) = extension.node.inputs().into_iter().next() { + Self::collect_required_instant_columns_into(input, required); + } + } + _ => {} + } + } + + fn aggregate_if(expr: &Expr, accept_name: F) -> Option<(&str, &Expr)> + where + F: FnOnce(&str) -> bool, + { + let Expr::AggregateFunction(func) = expr else { + return None; + }; + let name = func.func.name(); + if !accept_name(name) + || func.params.filter.is_some() + || func.params.distinct + || !func.params.order_by.is_empty() + || func.params.args.len() != 1 + { + return None; + } + + Some((name, &func.params.args[0])) + } + + fn is_supported_inner_aggregate(name: &str) -> bool { + matches!( + name, + "count" | "sum" | "avg" | "min" | "max" | "stddev_pop" | "var_pop" + ) + } + + fn is_projection_chain_to_instant(plan: &LogicalPlan) -> bool { + let mut current = plan; + loop { + match current { + LogicalPlan::Projection(projection) => current = projection.input.as_ref(), + LogicalPlan::Extension(ext) => { + return ext.node.as_any().is::(); + } + _ => return false, + } + } + } + + fn rebuild_projection_chain_to_instant( + plan: 
&LogicalPlan, + required_columns: &HashSet, + ) -> Result { + match plan { + LogicalPlan::Projection(projection) => { + let input = Self::rebuild_projection_chain_to_instant( + projection.input.as_ref(), + required_columns, + )?; + LogicalPlanBuilder::from(input) + .project(projection.expr.clone())? + .build() + } + LogicalPlan::Extension(extension) => { + if let Some(instant) = extension.node.as_any().downcast_ref::() { + let input = + Self::prune_instant_input(extension.node.inputs()[0], required_columns)?; + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(instant.with_exprs_and_inputs(vec![], vec![input])?), + })); + } + + Ok(plan.clone()) + } + _ => Ok(plan.clone()), + } + } + + fn prune_instant_input( + plan: &LogicalPlan, + required_columns: &HashSet, + ) -> Result { + match plan { + LogicalPlan::Extension(extension) => { + if let Some(normalize) = extension.node.as_any().downcast_ref::() { + let input = + Self::prune_instant_input(extension.node.inputs()[0], required_columns)?; + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(normalize.with_exprs_and_inputs(vec![], vec![input])?), + })); + } + + if let Some(divide) = extension.node.as_any().downcast_ref::() { + let divide_input = extension.node.inputs()[0].clone(); + + let projection_exprs = divide_input + .schema() + .fields() + .iter() + .filter(|field| required_columns.contains(field.name())) + .map(|field| { + Expr::Column(datafusion_common::Column::from_name(field.name().clone())) + }) + .collect::>(); + let projected_input = LogicalPlanBuilder::from(divide_input) + .project(projection_exprs)? 
+ .build()?; + + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new( + divide.with_exprs_and_inputs(vec![], vec![projected_input])?, + ), + })); + } + + Ok(plan.clone()) + } + _ => Ok(plan.clone()), + } + } +} diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs index 44c9bc3956..6b206b9d8d 100644 --- a/src/query/src/planner.rs +++ b/src/query/src/planner.rs @@ -28,6 +28,7 @@ use datafusion::execution::context::SessionState; use datafusion::sql::planner::PlannerContext; use datafusion_common::ToDFSchema; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_expr::expr::{Exists, InSubquery}; use datafusion_expr::{ Analyze, Explain, ExplainFormat, Expr as DfExpr, LogicalPlan, LogicalPlanBuilder, PlanType, ToStringifiedPlan, col, @@ -277,17 +278,22 @@ impl DfLogicalPlanner { let table_provider = DfTableSourceProvider::new( self.engine_state.catalog_manager().clone(), self.engine_state.disallow_cross_catalog_query(), - query_ctx, + query_ctx.clone(), plan_decoder, self.session_state .config_options() .sql_parser .enable_ident_normalization, ); - PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state) + let plan = PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state) .await .map_err(BoxedError::new) - .context(QueryPlanSnafu) + .context(QueryPlanSnafu)?; + + let context = QueryEngineContext::new(self.session_state.clone(), query_ctx); + Ok(self + .engine_state + .optimize_by_extension_rules(plan, &context)?) 
} #[tracing::instrument(skip_all)] @@ -424,9 +430,20 @@ impl DfLogicalPlanner { let mut placeholder_types = HashMap::new(); let mut casted_placeholders = HashSet::new(); + Self::extract_from_plan(plan, &mut placeholder_types, &mut casted_placeholders)?; + + Ok(placeholder_types) + } + + fn extract_from_plan( + plan: &LogicalPlan, + placeholder_types: &mut HashMap>, + casted_placeholders: &mut HashSet, + ) -> Result<()> { plan.apply(|node| { for expr in node.expressions() { let _ = expr.apply(|e| { + // Handle casted placeholders if let DfExpr::Cast(cast) = e && let DfExpr::Placeholder(ph) = &*cast.expr { @@ -434,6 +451,7 @@ impl DfLogicalPlanner { casted_placeholders.insert(ph.id.clone()); } + // Handle bare (non-casted) placeholders if let DfExpr::Placeholder(ph) = e && !casted_placeholders.contains(&ph.id) && !placeholder_types.contains_key(&ph.id) @@ -441,13 +459,26 @@ impl DfLogicalPlanner { placeholder_types.insert(ph.id.clone(), None); } + // Recurse into subquery plans embedded in expressions + match e { + DfExpr::Exists(Exists { subquery, .. }) + | DfExpr::InSubquery(InSubquery { subquery, .. }) + | DfExpr::ScalarSubquery(subquery) => { + Self::extract_from_plan( + &subquery.subquery, + placeholder_types, + casted_placeholders, + )?; + } + _ => {} + } + Ok(TreeNodeRecursion::Continue) }); } Ok(TreeNodeRecursion::Continue) })?; - - Ok(placeholder_types) + Ok(()) } /// Gets inferred parameter types from a logical plan. 
@@ -545,15 +576,22 @@ mod tests { use std::sync::Arc; use arrow_schema::DataType; + use catalog::RegisterTableRequest; + use catalog::memory::MemoryCatalogManager; + use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; use session::context::QueryContext; + use store_api::metric_engine_consts::{ + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY, + METRIC_ENGINE_NAME, + }; use table::metadata::{TableInfoBuilder, TableMetaBuilder}; use table::test_util::EmptyTable; use super::*; - use crate::QueryEngineRef; - use crate::parser::QueryLanguageParser; + use crate::parser::{PromQuery, QueryLanguageParser}; + use crate::{QueryEngineFactory, QueryEngineRef}; async fn create_test_engine() -> QueryEngineRef { let columns = vec![ @@ -574,6 +612,109 @@ mod tests { crate::tests::new_query_engine_with_table(table) } + fn create_promql_test_engine() -> QueryEngineRef { + let catalog_manager = MemoryCatalogManager::with_default_setup(); + let physical_table_name = "phy"; + let physical_table_id = 999u32; + + let physical_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new( + DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(), + ConcreteDataType::uint32_datatype(), + false, + ), + ColumnSchema::new( + DATA_SCHEMA_TSID_COLUMN_NAME.to_string(), + ConcreteDataType::uint64_datatype(), + false, + ), + ColumnSchema::new("tag_0", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("tag_1", ConcreteDataType::string_datatype(), false), + ColumnSchema::new( + "timestamp", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true), + ])); + let physical_meta = TableMetaBuilder::empty() + .schema(physical_schema) + .primary_key_indices(vec![0, 1, 2, 3]) + .value_indices(vec![4, 5]) + .engine(METRIC_ENGINE_NAME.to_string()) + 
.next_column_id(1024) + .build() + .unwrap(); + let physical_info = TableInfoBuilder::default() + .table_id(physical_table_id) + .name(physical_table_name) + .meta(physical_meta) + .build() + .unwrap(); + catalog_manager + .register_table_sync(RegisterTableRequest { + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + table_name: physical_table_name.to_string(), + table_id: physical_table_id, + table: EmptyTable::from_table_info(&physical_info), + }) + .unwrap(); + + let mut options = table::requests::TableOptions::default(); + options.extra_options.insert( + LOGICAL_TABLE_METADATA_KEY.to_string(), + physical_table_name.to_string(), + ); + let logical_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("tag_0", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("tag_1", ConcreteDataType::string_datatype(), false), + ColumnSchema::new( + "timestamp", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true), + ])); + let logical_meta = TableMetaBuilder::empty() + .schema(logical_schema) + .primary_key_indices(vec![0, 1]) + .value_indices(vec![3]) + .engine(METRIC_ENGINE_NAME.to_string()) + .options(options) + .next_column_id(1024) + .build() + .unwrap(); + let logical_info = TableInfoBuilder::default() + .table_id(1024) + .name("some_metric") + .meta(logical_meta) + .build() + .unwrap(); + catalog_manager + .register_table_sync(RegisterTableRequest { + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + table_name: "some_metric".to_string(), + table_id: 1024, + table: EmptyTable::from_table_info(&logical_info), + }) + .unwrap(); + + QueryEngineFactory::new( + catalog_manager, + None, + None, + None, + None, + false, + crate::options::QueryOptions::default(), + ) + .query_engine() + } + async fn parse_sql_to_plan(sql: &str) -> LogicalPlan { let stmt = 
QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap(); let engine = create_test_engine().await; @@ -584,6 +725,25 @@ mod tests { .unwrap() } + async fn parse_promql_to_plan(query: &str) -> LogicalPlan { + let engine = create_promql_test_engine(); + let query_ctx = QueryContext::arc(); + let stmt = QueryLanguageParser::parse_promql( + &PromQuery { + query: query.to_string(), + start: "0".to_string(), + end: "10".to_string(), + step: "5s".to_string(), + lookback: "300s".to_string(), + alias: None, + }, + &query_ctx, + ) + .unwrap(); + + engine.planner().plan(&stmt, query_ctx).await.unwrap() + } + #[tokio::test] async fn test_extract_placeholder_cast_types_multiple() { let plan = parse_sql_to_plan( @@ -619,4 +779,82 @@ mod tests { assert_eq!(type_2, &Some(DataType::Utf8)); assert_eq!(type_3, &Some(DataType::Int32)); } + + #[tokio::test] + async fn test_plan_pql_applies_extension_rules() { + for inner_agg in ["count", "sum", "avg", "min", "max", "stddev", "stdvar"] { + let plan = parse_promql_to_plan(&format!( + "sum(irate(some_metric[1h])) / scalar(count({inner_agg}(some_metric) by (tag_0)))" + )) + .await; + let plan_str = plan.display_indent_schema().to_string(); + assert!(plan_str.contains("Distinct:"), "{inner_agg}: {plan_str}"); + } + } + + #[tokio::test] + async fn test_plan_pql_filters_null_only_groups_for_non_count_inner_aggs() { + let count_plan = parse_promql_to_plan("scalar(count(count(some_metric) by (tag_0)))").await; + let count_plan_str = count_plan.display_indent_schema().to_string(); + assert!( + !count_plan_str.contains("field_0 IS NOT NULL"), + "{count_plan_str}" + ); + + for inner_agg in ["sum", "avg", "min", "max", "stddev", "stdvar"] { + let plan = parse_promql_to_plan(&format!( + "scalar(count({inner_agg}(some_metric) by (tag_0)))" + )) + .await; + let plan_str = plan.display_indent_schema().to_string(); + assert!( + plan_str.contains("field_0 IS NOT NULL"), + "{inner_agg}: {plan_str}" + ); + } + } + + #[tokio::test] + async fn 
test_plan_pql_skips_extension_rules_for_non_direct_or_unsupported_inner_agg() { + for query in [ + "sum(irate(some_metric[1h])) / scalar(count(sum(irate(some_metric[1h])) by (tag_0)))", + "sum(irate(some_metric[1h])) / scalar(count(group(some_metric) by (tag_0)))", + ] { + let plan = parse_promql_to_plan(query).await; + let plan_str = plan.display_indent_schema().to_string(); + assert!(!plan_str.contains("Distinct:"), "{query}: {plan_str}"); + } + } + + #[tokio::test] + async fn test_plan_sql_does_not_apply_nested_count_rule() { + let plan = parse_sql_to_plan( + "SELECT id, count(inner_count) \ + FROM ( \ + SELECT id, count(name) AS inner_count \ + FROM test \ + GROUP BY id \ + ORDER BY id \ + LIMIT 1000000 \ + ) t \ + GROUP BY id \ + ORDER BY id", + ) + .await; + + let plan_str = plan.display_indent_schema().to_string(); + assert!(!plan_str.contains("Distinct:"), "{plan_str}"); + } + + #[tokio::test] + async fn test_get_inferred_parameter_types_subquery() { + let plan = parse_sql_to_plan( + r#"SELECT * FROM test WHERE id = (SELECT id FROM test CROSS JOIN (SELECT parse_ident($1::TEXT) AS parts) p LIMIT 1)"#, + ).await; + let types = DfLogicalPlanner::get_inferred_parameter_types(&plan).unwrap(); + + assert_eq!(types.len(), 1); + let type_1 = types.get("$1").unwrap(); + assert_eq!(type_1, &Some(DataType::Utf8)); + } } diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index 427644e26a..23d654d2b6 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -3323,28 +3323,55 @@ impl PromPlanner { fn prom_token_to_binary_expr_builder( token: TokenType, ) -> Result Result>> { + let cast_float = |expr| { + if matches!( + &expr, + DfExpr::Cast(Cast { + data_type: ArrowDataType::Float64, + .. 
+ }) + ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _)) + { + expr + } else { + DfExpr::Cast(Cast { + expr: Box::new(expr), + data_type: ArrowDataType::Float64, + }) + } + }; match token.id() { - token::T_ADD => Ok(Box::new(|lhs, rhs| Ok(lhs + rhs))), - token::T_SUB => Ok(Box::new(|lhs, rhs| Ok(lhs - rhs))), - token::T_MUL => Ok(Box::new(|lhs, rhs| Ok(lhs * rhs))), - token::T_DIV => Ok(Box::new(|lhs, rhs| Ok(lhs / rhs))), - token::T_MOD => Ok(Box::new(|lhs: DfExpr, rhs| Ok(lhs % rhs))), + token::T_ADD => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) + cast_float(rhs)) + })), + token::T_SUB => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) - cast_float(rhs)) + })), + token::T_MUL => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) * cast_float(rhs)) + })), + token::T_DIV => Ok(Box::new(move |lhs, rhs| { + Ok(cast_float(lhs) / cast_float(rhs)) + })), + token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| { + Ok(cast_float(lhs) % cast_float(rhs)) + })), token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))), token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))), token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))), token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))), token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))), token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))), - token::T_POW => Ok(Box::new(|lhs, rhs| { + token::T_POW => Ok(Box::new(move |lhs, rhs| { Ok(DfExpr::ScalarFunction(ScalarFunction { func: datafusion_functions::math::power(), - args: vec![lhs, rhs], + args: vec![cast_float(lhs), cast_float(rhs)], })) })), - token::T_ATAN2 => Ok(Box::new(|lhs, rhs| { + token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| { Ok(DfExpr::ScalarFunction(ScalarFunction { func: datafusion_functions::math::atan2(), - args: vec![lhs, rhs], + args: vec![cast_float(lhs), cast_float(rhs)], })) })), _ => UnexpectedTokenSnafu { token }.fail(), @@ -4029,6 +4056,7 @@ mod test { use table::test_util::EmptyTable; use super::*; + 
use crate::QueryEngineContext; use crate::options::QueryOptions; use crate::parser::QueryLanguageParser; @@ -4046,6 +4074,64 @@ mod test { ) } + async fn build_optimized_promql_plan( + table_provider: DfTableSourceProvider, + eval_stmt: &EvalStmt, + ) -> LogicalPlan { + let state = build_query_engine_state(); + let raw_plan = PromPlanner::stmt_to_plan(table_provider, eval_stmt, &state) + .await + .unwrap(); + let context = QueryEngineContext::new(state.session_state(), QueryContext::arc()); + state + .optimize_by_extension_rules(raw_plan, &context) + .unwrap() + } + + async fn build_optimized_tsid_plan( + query: &str, + num_tag: usize, + num_field: usize, + end_secs: u64, + lookback_secs: u64, + ) -> String { + let eval_stmt = EvalStmt { + expr: parser::parse(query).unwrap(), + start: UNIX_EPOCH, + end: UNIX_EPOCH + .checked_add(Duration::from_secs(end_secs)) + .unwrap(), + interval: Duration::from_secs(5), + lookback_delta: Duration::from_secs(lookback_secs), + }; + let table_provider = build_test_table_provider_with_tsid( + &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())], + num_tag, + num_field, + ) + .await; + + build_optimized_promql_plan(table_provider, &eval_stmt) + .await + .display_indent_schema() + .to_string() + } + + async fn assert_nested_count_rewrite_applies(query: &str, expected_outer_agg: &str) { + let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await; + + assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]")); + assert!(plan_str.contains("Projection: some_metric.timestamp, some_metric.tag_0")); + assert!(plan_str.contains("Distinct:")); + assert!(plan_str.contains(expected_outer_agg), "{plan_str}"); + assert!(!plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]")); + } + + async fn assert_nested_count_rewrite_missing(query: &str, num_tag: usize, lookback_secs: u64) { + let plan_str = build_optimized_tsid_plan(query, num_tag, 1, 100_000, lookback_secs).await; + assert!(!plan_str.contains("Distinct:"), 
"{plan_str}"); + } + async fn build_test_table_provider( table_name_tuples: &[(String, String)], num_tag: usize, @@ -4658,6 +4744,117 @@ mod test { ); } + #[tokio::test] + async fn scalar_count_count_range_keeps_full_window() { + let plan_str = build_optimized_tsid_plan( + "scalar(count(count(some_metric) by (tag_0)))", + 1, + 1, + 100_000, + 1, + ) + .await; + assert!(plan_str.contains("ScalarCalculate: tags=[]")); + assert!(plan_str.contains("PromInstantManipulate: range=[0..100000000]")); + assert!(!plan_str.contains("PromInstantManipulate: range=[99999000..99999000]")); + } + + #[tokio::test] + async fn scalar_count_count_rewrite_applies_inside_binary_expr_for_tsid_input() { + let plan_str = build_optimized_tsid_plan( + "sum(irate(some_metric[1h])) / scalar(count(count(some_metric) by (tag_0)))", + 2, + 1, + 10, + 300, + ) + .await; + assert!(plan_str.contains("Distinct:"), "{plan_str}"); + } + + #[tokio::test] + async fn nested_count_rewrite_keeps_full_series_key_with_tsid_input() { + assert_nested_count_rewrite_applies( + "count(count(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(count(some_metric.field_0))]]" + ) + .await; + } + + #[tokio::test] + async fn nested_sum_count_rewrite_keeps_full_series_key_with_tsid_input() { + assert_nested_count_rewrite_applies( + "count(sum(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(sum(some_metric.field_0))]]" + ) + .await; + } + + #[tokio::test] + async fn nested_supported_inner_aggs_rewrite_apply_for_tsid_input() { + for (query, expected_outer_agg) in [ + ( + "count(avg(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(avg(some_metric.field_0))]]", + ), + ( + "count(min(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(min(some_metric.field_0))]]", + ), + ( + "count(max(some_metric) 
by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(max(some_metric.field_0))]]", + ), + ( + "count(stddev(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(stddev_pop(some_metric.field_0))]]", + ), + ( + "count(stdvar(some_metric) by (tag_0))", + "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(var_pop(some_metric.field_0))]]", + ), + ] { + assert_nested_count_rewrite_applies(query, expected_outer_agg).await; + } + } + + #[tokio::test] + async fn nested_non_count_inner_aggs_rewrite_filter_null_values_for_tsid_input() { + let count_plan = + build_optimized_tsid_plan("count(count(some_metric) by (tag_0))", 2, 1, 100_000, 1) + .await; + assert!( + !count_plan.contains("some_metric.field_0 IS NOT NULL"), + "{count_plan}" + ); + + for query in [ + "count(sum(some_metric) by (tag_0))", + "count(avg(some_metric) by (tag_0))", + "count(min(some_metric) by (tag_0))", + "count(max(some_metric) by (tag_0))", + "count(stddev(some_metric) by (tag_0))", + "count(stdvar(some_metric) by (tag_0))", + ] { + let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await; + assert!( + plan_str.contains("Filter: some_metric.field_0 IS NOT NULL"), + "{query}: {plan_str}" + ); + } + } + + #[tokio::test] + async fn nested_unsupported_or_non_direct_inner_aggs_do_not_rewrite() { + assert_nested_count_rewrite_missing("count(group(some_metric) by (tag_0))", 2, 1).await; + assert_nested_count_rewrite_missing( + "count(sum(irate(some_metric[1h])) by (tag_0))", + 2, + 300, + ) + .await; + } + #[tokio::test] async fn physical_table_name_is_not_leaked_in_plan() { let prom_expr = parser::parse("some_metric").unwrap(); @@ -5169,7 +5366,7 @@ mod test { .unwrap(); let expected = String::from( - "Projection: rhs.tag_0, rhs.timestamp, lhs.field_0 + rhs.field_0 AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\ + 
"Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\ \n Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ @@ -5224,7 +5421,7 @@ mod test { async fn binary_op_literal_column() { let query = r#"1 + some_metric{tag_0="bar"}"#; let expected = String::from( - "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + some_metric.field_0 AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\ + "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ @@ -5262,7 +5459,7 @@ mod test { async fn bool_with_additional_arithmetic() { let query = "some_metric + (1 == bool 2)"; let expected = String::from( - "Projection: some_metric.tag_0, some_metric.timestamp, some_metric.field_0 + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\ + "Projection: some_metric.tag_0, 
some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ @@ -5372,7 +5569,7 @@ mod test { PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state()) .await .unwrap(); - let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\ + let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\ \n Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\ \n SubqueryAlias: http_server_requests_seconds_sum\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], 
interval=[5000], time index=[greptime_timestamp]\ @@ -5763,7 +5960,7 @@ mod test { let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric"; let expected = String::from( - "Projection: some_metric.tag_0, some_metric.timestamp, greptime_private.some_alt_metric.field_0 / some_metric.field_0 AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\ + "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\ \n Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ \n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\ diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index a45fc4c896..f696c8b53e 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -60,6 +60,7 @@ use crate::dist_plan::{ use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES}; use crate::optimizer::ExtensionAnalyzerRule; use crate::optimizer::constant_term::MatchesConstantTermOptimizer; +use crate::optimizer::count_nest_aggr::CountNestAggrRule; use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule; use crate::optimizer::parallelize_scan::ParallelizeScan; use 
crate::optimizer::pass_distribution::PassDistribution; @@ -146,6 +147,7 @@ impl QueryEngineState { // The [`TypeConversionRule`] must be at first extension_rules.insert(0, Arc::new(TypeConversionRule) as _); + extension_rules.push(Arc::new(CountNestAggrRule) as _); // Apply the datafusion rules let mut analyzer = Analyzer::new(); diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index e75192c9ba..8b64a256e7 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -89,7 +89,7 @@ operator.workspace = true otel-arrow-rust.workspace = true parking_lot.workspace = true pg_interval = { version = "0.5.2", package = "pg_interval_2" } -pgwire = { version = "0.38", default-features = false, features = [ +pgwire = { version = "0.38.1", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index 03ee1a5314..87a1cf595a 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.11.13 +v0.12.0 diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index ca6a77a077..506a240cac 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -78,7 +78,7 @@ use crate::metrics_handler::MetricsHandler; use crate::prometheus_handler::PrometheusHandlerRef; use crate::query_handler::sql::ServerSqlQueryHandlerRef; use crate::query_handler::{ - InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef, + DashboardHandlerRef, InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef, OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef, PipelineHandlerRef, PromStoreProtocolHandlerRef, }; @@ -112,8 +112,8 @@ pub mod utils; use result::HttpOutputWriter; pub(crate) use timeout::DynamicTimeoutLayer; +mod client_ip; use crate::prom_remote_write::validation::PromValidationMode; - mod hints; mod read_preference; #[cfg(any(test, feature = "testing"))] @@ -507,6 +507,11 @@ pub struct 
GreptimeOptionsConfigState { pub greptime_config_options: String, } +#[derive(Clone)] +pub struct DashboardState { + pub handler: DashboardHandlerRef, +} + pub struct HttpServerBuilder { options: HttpOptions, plugins: Plugins, @@ -703,6 +708,16 @@ impl HttpServerBuilder { } } + pub fn with_dashboard_handler(self, handler: DashboardHandlerRef) -> Self { + Self { + router: self.router.nest( + &format!("/{HTTP_API_VERSION}/dashboards"), + HttpServer::route_dashboard(handler), + ), + ..self + } + } + pub fn with_extra_router(self, router: Router) -> Self { Self { router: self.router.merge(router), @@ -868,6 +883,7 @@ impl HttpServer { authorize::check_http_auth, )) .layer(middleware::from_fn(hints::extract_hints)) + .layer(middleware::from_fn(client_ip::log_error_with_client_ip)) .layer(middleware::from_fn( read_preference::extract_read_preference, )), @@ -1169,6 +1185,26 @@ impl HttpServer { ) .with_state(handler) } + + #[cfg(feature = "dashboard")] + fn route_dashboard(handler: DashboardHandlerRef) -> Router { + use crate::http::dashboard::{add_dashboard, delete_dashboard, list_dashboards}; + + Router::new() + .route("/", routing::get(list_dashboards)) + .route("/{dashboard_name}", routing::post(add_dashboard)) + .route("/{dashboard_name}", routing::delete(delete_dashboard)) + .layer( + ServiceBuilder::new() + .layer(RequestDecompressionLayer::new().pass_through_unaccepted(true)), + ) + .with_state(DashboardState { handler }) + } + + #[cfg(not(feature = "dashboard"))] + fn route_dashboard(handler: DashboardHandlerRef) -> Router { + Router::new().with_state(DashboardState { handler }) + } } pub const HTTP_SERVER: &str = "HTTP_SERVER"; @@ -1212,7 +1248,10 @@ impl Server for HttpServer { error!(e; "Failed to set TCP_NODELAY on incoming connection"); } }); - let serve = axum::serve(listener, app.into_make_service()); + let serve = axum::serve( + listener, + app.into_make_service_with_connect_info::(), + ); // FIXME(yingwen): Support keepalive. 
// See: diff --git a/src/servers/src/http/client_ip.rs b/src/servers/src/http/client_ip.rs new file mode 100644 index 0000000000..70df554ebb --- /dev/null +++ b/src/servers/src/http/client_ip.rs @@ -0,0 +1,109 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::net::SocketAddr; + +use axum::body::Body; +use axum::extract::{ConnectInfo, MatchedPath}; +use axum::http::Request; +use axum::middleware::Next; +use axum::response::Response; +use common_telemetry::warn; + +/// Middleware that logs HTTP error responses (4xx/5xx) with client IP address. +/// +/// Extracts client address from [`ConnectInfo`] if available. 
+pub async fn log_error_with_client_ip(req: Request, next: Next) -> Response { + let request_info = req + .extensions() + .get::>() + .map(|c| c.0) + .map(|addr| { + let method = req.method().clone(); + let uri = req.uri().clone(); + let matched_path = req.extensions().get::().cloned(); + (addr, method, uri, matched_path) + }); + + let response = next.run(req).await; + + if (response.status().is_client_error() || response.status().is_server_error()) + && let Some((addr, method, uri, matched_path)) = request_info + { + warn!( + "HTTP error response {} for {} {} (matched: {}) from client {}", + response.status(), + method, + uri, + matched_path + .as_ref() + .map(|p| p.as_str()) + .unwrap_or(""), + addr + ); + } + + response +} + +#[cfg(test)] +mod tests { + use axum::Router; + use axum::routing::get; + use http::StatusCode; + use tower::ServiceExt; + + use super::*; + + #[tokio::test] + async fn test_middleware_passes_error_response() { + async fn not_found_handler() -> StatusCode { + StatusCode::NOT_FOUND + } + + let app = Router::new() + .route("/not-found", get(not_found_handler)) + .layer(axum::middleware::from_fn(log_error_with_client_ip)); + + let response = app + .oneshot( + Request::builder() + .uri("/not-found") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::NOT_FOUND); + } + + #[tokio::test] + async fn test_middleware_passes_success_response() { + async fn ok_handler() -> StatusCode { + StatusCode::OK + } + + let app = Router::new() + .route("/ok", get(ok_handler)) + .layer(axum::middleware::from_fn(log_error_with_client_ip)); + + let response = app + .oneshot(Request::builder().uri("/ok").body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + } +} diff --git a/src/servers/src/http/dashboard.rs b/src/servers/src/http/dashboard.rs index bdb98490f0..ea894ca7d0 100644 --- a/src/servers/src/http/dashboard.rs +++ b/src/servers/src/http/dashboard.rs @@ 
-12,14 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -use axum::body::Body; +use std::sync::Arc; +use std::time::Instant; + +use axum::body::{Body, Bytes}; +use axum::extract::{Extension, Path, State}; use axum::http::{StatusCode, Uri, header}; use axum::response::Response; -use common_telemetry::debug; +use common_telemetry::{debug, error}; use rust_embed::RustEmbed; -use snafu::ResultExt; +use session::context::{Channel, QueryContext}; +use snafu::{ResultExt, ensure}; -use crate::error::{BuildHttpResponseSnafu, Result}; +use crate::error::{BuildHttpResponseSnafu, InvalidParameterSnafu, Result}; +use crate::http::DashboardState; +use crate::http::result::greptime_manage_resp::{DashboardOutput, GreptimedbManageResponse}; #[derive(RustEmbed)] #[folder = "dashboard/dist/"] @@ -61,3 +68,102 @@ fn get_assets(path: &str) -> Result { } .context(BuildHttpResponseSnafu) } + +#[axum_macros::debug_handler] +pub async fn add_dashboard( + State(state): State, + Path(dashboard_name): Path, + Extension(mut query_ctx): Extension, + payload: Bytes, +) -> Result { + let start = Instant::now(); + let handler = state.handler; + ensure!( + !dashboard_name.is_empty(), + InvalidParameterSnafu { + reason: "dashboard_name is required in path", + } + ); + + let definition = String::from_utf8_lossy(&payload).to_string(); + + query_ctx.set_channel(Channel::HttpSql); + let query_ctx = Arc::new(query_ctx); + + handler + .save(&dashboard_name, &definition, query_ctx) + .await + .map(|_| { + GreptimedbManageResponse::from_dashboard( + dashboard_name, + start.elapsed().as_millis() as u64, + ) + }) + .map_err(|e| { + error!(e; "failed to save dashboard"); + e + }) +} + +#[axum_macros::debug_handler] +pub async fn list_dashboards( + State(state): State, + Extension(mut query_ctx): Extension, +) -> Result { + let start = Instant::now(); + let handler = state.handler; + + query_ctx.set_channel(Channel::HttpSql); + let query_ctx = 
Arc::new(query_ctx); + + handler + .list(query_ctx) + .await + .map(|dashboards| { + let outputs: Vec = dashboards + .into_iter() + .map(|d| DashboardOutput { + name: d.name, + definition: d.definition, + }) + .collect(); + GreptimedbManageResponse::from_dashboards(outputs, start.elapsed().as_millis() as u64) + }) + .map_err(|e| { + error!(e; "failed to list dashboards"); + e + }) +} + +#[axum_macros::debug_handler] +pub async fn delete_dashboard( + State(state): State, + Extension(mut query_ctx): Extension, + Path(dashboard_name): Path, +) -> Result { + let start = Instant::now(); + let handler = state.handler; + ensure!( + !dashboard_name.is_empty(), + InvalidParameterSnafu { + reason: "dashboard_name is required", + } + ); + + query_ctx.set_channel(Channel::HttpSql); + let query_ctx = Arc::new(query_ctx); + + handler + .delete(&dashboard_name, query_ctx) + .await + .map(|_| { + GreptimedbManageResponse::from_dashboard( + dashboard_name, + start.elapsed().as_millis() as u64, + ) + }) + .map_err(|e| { + error!(e; "failed to delete dashboard"); + e + }) +} diff --git a/src/servers/src/http/result/error_result.rs b/src/servers/src/http/result/error_result.rs index 7b70066b68..9bd6e1a7a3 100644 --- a/src/servers/src/http/result/error_result.rs +++ b/src/servers/src/http/result/error_result.rs @@ -32,17 +32,24 @@ pub struct ErrorResponse { impl ErrorResponse { pub fn from_error(error: impl ErrorExt) -> Self { let code = error.status_code(); - if code.should_log_error() { error!(error; "Failed to handle HTTP request"); } else { debug!("Failed to handle HTTP request, err: {:?}", error); } - - Self::from_error_message(code, error.output_msg()) + ErrorResponse { + code: code as u32, + error: error.output_msg(), + execution_time_ms: 0, + } } pub fn from_error_message(code: StatusCode, msg: String) -> Self { + if code.should_log_error() { + error!("Failed to handle HTTP request: {}", msg); + } else { + debug!("Failed to handle HTTP request: {}", msg); + } ErrorResponse { 
code: code as u32, error: msg, diff --git a/src/servers/src/http/result/greptime_manage_resp.rs b/src/servers/src/http/result/greptime_manage_resp.rs index 3f7f3c6eec..2b3a5d455c 100644 --- a/src/servers/src/http/result/greptime_manage_resp.rs +++ b/src/servers/src/http/result/greptime_manage_resp.rs @@ -62,6 +62,25 @@ impl GreptimedbManageResponse { } } + pub fn from_dashboard(name: String, execution_time_ms: u64) -> Self { + GreptimedbManageResponse { + manage_result: ManageResult::Dashboards { + dashboards: vec![DashboardOutput { + name, + definition: String::new(), + }], + }, + execution_time_ms, + } + } + + pub fn from_dashboards(dashboards: Vec, execution_time_ms: u64) -> Self { + GreptimedbManageResponse { + manage_result: ManageResult::Dashboards { dashboards }, + execution_time_ms, + } + } + pub fn with_execution_time(mut self, execution_time: u64) -> Self { self.execution_time_ms = execution_time; self @@ -77,6 +96,7 @@ impl GreptimedbManageResponse { pub enum ManageResult { Pipelines { pipelines: Vec }, Sql { sql: SqlOutput }, + Dashboards { dashboards: Vec }, } #[derive(Serialize, Deserialize, Debug)] @@ -87,6 +107,13 @@ pub struct PipelineOutput { pipeline: Option, } +#[derive(Serialize, Deserialize, Debug)] +pub struct DashboardOutput { + pub name: String, + #[serde(skip_serializing_if = "String::is_empty")] + pub definition: String, +} + #[derive(Serialize, Deserialize, Debug)] pub struct SqlOutput { pub(crate) sql: String, diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs index a95890e78c..d4d15ef64a 100644 --- a/src/servers/src/postgres/types.rs +++ b/src/servers/src/postgres/types.rs @@ -235,7 +235,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result { match origin { &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN), &ConcreteDataType::Boolean(_) => Ok(Type::BOOL), - &ConcreteDataType::Int8(_) => Ok(Type::CHAR), + &ConcreteDataType::Int8(_) => Ok(Type::INT2), &ConcreteDataType::Int16(_) | 
&ConcreteDataType::UInt8(_) => Ok(Type::INT2), &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4), &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8), @@ -253,7 +253,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result { ConcreteDataType::List(list) => match list.item_type() { &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN), &ConcreteDataType::Boolean(_) => Ok(Type::BOOL_ARRAY), - &ConcreteDataType::Int8(_) => Ok(Type::CHAR_ARRAY), + &ConcreteDataType::Int8(_) => Ok(Type::INT2_ARRAY), &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2_ARRAY), &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4_ARRAY), &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8_ARRAY), @@ -1151,7 +1151,7 @@ mod test { let pg_field_info = vec![ FieldInfo::new("nulls".into(), None, None, Type::UNKNOWN, FieldFormat::Text), FieldInfo::new("bools".into(), None, None, Type::BOOL, FieldFormat::Text), - FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text), + FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text), FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text), @@ -1230,7 +1230,7 @@ mod test { Type::NUMERIC, FieldFormat::Text, ), - FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text), + FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text), FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text), FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text), diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 60efe69faa..21c7646560 100644 --- 
a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -44,6 +44,12 @@ use pipeline::{GreptimePipelineParams, Pipeline, PipelineInfo, PipelineVersion, use serde_json::Value; use session::context::{QueryContext, QueryContextRef}; +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct DashboardDefinition { + pub name: String, + pub definition: String, +} + use crate::error::Result; use crate::http::jaeger::QueryTraceParams; use crate::influxdb::InfluxdbRequest; @@ -176,6 +182,18 @@ pub trait PipelineHandler { ) -> Result<(String, TimestampNanosecond)>; } +/// Handling dashboard as code CRUD +pub type DashboardHandlerRef = Arc; + +#[async_trait] +pub trait DashboardHandler { + async fn save(&self, name: &str, definition: &str, ctx: QueryContextRef) -> Result<()>; + + async fn list(&self, ctx: QueryContextRef) -> Result>; + + async fn delete(&self, name: &str, ctx: QueryContextRef) -> Result<()>; +} + /// Handle log query requests. #[async_trait] pub trait LogQueryHandler { diff --git a/src/servers/src/query_handler/grpc.rs b/src/servers/src/query_handler/grpc.rs index 67d8b3890e..d66a76464e 100644 --- a/src/servers/src/query_handler/grpc.rs +++ b/src/servers/src/query_handler/grpc.rs @@ -17,15 +17,13 @@ use std::sync::Arc; use api::v1::greptime_request::Request; use async_trait::async_trait; -use common_base::AffectedRows; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; use futures::Stream; use session::context::QueryContextRef; -use table::TableRef; use crate::error::Result; -use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream}; +use crate::grpc::flight::PutRecordBatchRequestStream; pub type ServerGrpcQueryHandlerRef = Arc; @@ -35,13 +33,6 @@ pub type RawRecordBatch = bytes::Bytes; pub trait GrpcQueryHandler { async fn do_query(&self, query: Request, ctx: QueryContextRef) -> Result; - async fn put_record_batch( - &self, - request: PutRecordBatchRequest, - table_ref: &mut 
Option, - ctx: QueryContextRef, - ) -> Result; - fn handle_put_record_batch_stream( &self, stream: PutRecordBatchRequestStream, diff --git a/src/servers/tests/mod.rs b/src/servers/tests/mod.rs index e3f8f8fc79..c4f83c5e6c 100644 --- a/src/servers/tests/mod.rs +++ b/src/servers/tests/mod.rs @@ -18,7 +18,6 @@ use api::v1::greptime_request::Request; use api::v1::query_request::Query; use async_trait::async_trait; use catalog::memory::MemoryCatalogManager; -use common_base::AffectedRows; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; @@ -149,15 +148,6 @@ impl GrpcQueryHandler for DummyInstance { Ok(output) } - async fn put_record_batch( - &self, - _request: servers::grpc::flight::PutRecordBatchRequest, - _table_ref: &mut Option, - _ctx: QueryContextRef, - ) -> Result { - unimplemented!() - } - fn handle_put_record_batch_stream( &self, _stream: servers::grpc::flight::PutRecordBatchRequestStream, diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs index d571a5392f..0c663bccc0 100644 --- a/src/store-api/src/metadata.rs +++ b/src/store-api/src/metadata.rs @@ -18,8 +18,8 @@ use std::any::Any; use std::collections::{HashMap, HashSet}; -use std::fmt; use std::sync::Arc; +use std::{fmt, mem}; use api::v1::SemanticType; use api::v1::column_def::try_as_column_schema; @@ -99,6 +99,12 @@ impl ColumnMetadata { pub fn is_same_datatype(&self, other: &Self) -> bool { self.column_schema.data_type == other.column_schema.data_type } + + /// Returns the estimated memory footprint of this metadata. + pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) - mem::size_of_val(&self.column_schema) + + self.column_schema.estimated_size() + } } #[cfg_attr(doc, aquamarine::aquamarine)] @@ -226,6 +232,25 @@ impl RegionMetadata { serde_json::from_str(s).context(SerdeJsonSnafu) } + /// Returns the estimated memory footprint of this metadata. 
+ pub fn estimated_size(&self) -> usize { + mem::size_of_val(self) + + mem::size_of::() * self.column_metadatas.capacity() + + self + .column_metadatas + .iter() + .map(|column| column.estimated_size() - mem::size_of::()) + .sum::() + + mem::size_of::() * self.primary_key.capacity() + + mem::size_of::<(ColumnId, usize)>() * self.id_to_index.capacity() + + self.schema.estimated_size() + + self + .partition_expr + .as_ref() + .map(|expr| expr.capacity()) + .unwrap_or_default() + } + /// Encode the metadata to a JSON string. pub fn to_json(&self) -> Result { serde_json::to_string(&self).context(SerdeJsonSnafu) diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs index f9be7be16e..2c9ac41560 100644 --- a/src/table/src/predicate.rs +++ b/src/table/src/predicate.rs @@ -203,7 +203,7 @@ pub fn build_time_range_predicate( /// Extract time range filter from `WHERE`/`IN (...)`/`BETWEEN` clauses. /// Return None if no time range can be found in expr. -fn extract_time_range_from_expr( +pub fn extract_time_range_from_expr( ts_col_name: &str, ts_col_unit: TimeUnit, expr: &Expr, diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs index 43fc36644b..15b4278f51 100644 --- a/src/table/src/requests.rs +++ b/src/table/src/requests.rs @@ -36,8 +36,9 @@ use store_api::metric_engine_consts::{ LOGICAL_TABLE_METADATA_KEY, PHYSICAL_TABLE_METADATA_KEY, is_metric_engine_option_key, }; use store_api::mito_engine_options::{ - APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, TWCS_FALLBACK_TO_LOCAL, - TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, is_mito_engine_option_key, + APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, SST_FORMAT_KEY, + TWCS_FALLBACK_TO_LOCAL, TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, + is_mito_engine_option_key, }; use store_api::region_request::{SetRegionOption, UnsetRegionOption}; @@ -56,13 +57,14 @@ pub const TABLE_DATA_MODEL_TRACE_V1: &str = "greptime_trace_v1"; 
pub const OTLP_METRIC_COMPAT_KEY: &str = "otlp_metric_compat"; pub const OTLP_METRIC_COMPAT_PROM: &str = "prom"; -pub const VALID_TABLE_OPTION_KEYS: [&str; 12] = [ +pub const VALID_TABLE_OPTION_KEYS: [&str; 13] = [ // common keys: WRITE_BUFFER_SIZE_KEY, TTL_KEY, STORAGE_KEY, COMMENT_KEY, SKIP_WAL_KEY, + SST_FORMAT_KEY, // file engine keys: FILE_TABLE_LOCATION_KEY, FILE_TABLE_FORMAT_KEY, @@ -94,6 +96,7 @@ static VALID_DB_OPT_KEYS: Lazy> = Lazy::new(|| { set.insert(TWCS_TIME_WINDOW); set.insert(TWCS_TRIGGER_FILE_NUM); set.insert(TWCS_MAX_OUTPUT_FILE_SIZE); + set.insert(SST_FORMAT_KEY); set }); diff --git a/tests-fuzz/Cargo.toml b/tests-fuzz/Cargo.toml index a537ca0687..bc687092c0 100644 --- a/tests-fuzz/Cargo.toml +++ b/tests-fuzz/Cargo.toml @@ -100,6 +100,13 @@ test = false bench = false doc = false +[[bin]] +name = "fuzz_repartition_metric_table" +path = "targets/ddl/fuzz_repartition_metric_table.rs" +test = false +bench = false +doc = false + [[bin]] name = "fuzz_alter_table" path = "targets/ddl/fuzz_alter_table.rs" diff --git a/tests-fuzz/README.md b/tests-fuzz/README.md index 6807e19a1c..cc9d7eb84e 100644 --- a/tests-fuzz/README.md +++ b/tests-fuzz/README.md @@ -66,3 +66,23 @@ GT_FUZZ_OVERRIDE_SEED=6666 GT_FUZZ_OVERRIDE_ACTIONS=175 cargo fuzz run fuzz_targ ``` For more details, visit [cargo fuzz](https://rust-fuzz.github.io/book/cargo-fuzz/tutorial.html) or run the command `cargo fuzz --help`. + +## Repartition Metric Dump Artifacts + +For `fuzz_repartition_metric_table`, dump artifacts are written under one run directory. + +- Table data snapshots: `.table-data.csv` +- SQL traces per logical table: `.trace.sql` +- Seed metadata: `seed.meta` + +SQL trace behavior: + +- Insert SQL is appended after successful execution with comment fields including + `started_at_ms` and `elapsed_ms`. +- Repartition events are broadcast to all logical table trace files with comment fields including + `action_idx`, `started_at_ms`, `elapsed_ms`, and SQL text. 
+ +Run directory lifecycle: + +- On success, the run directory is cleaned up. +- On failure, the run directory is retained for CI/local diffing. diff --git a/tests-fuzz/src/fake.rs b/tests-fuzz/src/fake.rs index aa92e0293a..8910a39206 100644 --- a/tests-fuzz/src/fake.rs +++ b/tests-fuzz/src/fake.rs @@ -65,6 +65,26 @@ where _v: PhantomData, } +pub struct ConstGenerator { + value: V, +} + +impl ConstGenerator { + pub fn new(value: V) -> Self { + Self { value } + } +} + +impl Random for ConstGenerator +where + R: Rng, + V: Clone, +{ + fn choose(&self, _rng: &mut R, amount: usize) -> Vec { + vec![self.value.clone(); amount] + } +} + pub fn random_capitalize_map(rng: &mut R, s: Ident) -> Ident { let mut v = s.value.chars().collect::>(); diff --git a/tests-fuzz/src/generator/create_expr.rs b/tests-fuzz/src/generator/create_expr.rs index fae6a95eda..261a310db2 100644 --- a/tests-fuzz/src/generator/create_expr.rs +++ b/tests-fuzz/src/generator/create_expr.rs @@ -193,6 +193,26 @@ fn generate_partition_def( } } +fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> { + if partitions <= 1 { + return None; + } + + let partition_column = Column { + name: Ident::new("host"), + column_type: ConcreteDataType::string_datatype(), + options: vec![ColumnOption::PrimaryKey], + }; + let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1); + let partitions = SimplePartitions::new(partition_column.name.clone(), bounds); + let partition_def = PartitionDef { + columns: vec![partitions.column_name.clone()], + exprs: partitions.generate().unwrap(), + }; + + Some((partition_column, partition_def)) +} + /// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type. 
#[derive(Builder)] #[builder(pattern = "owned")] @@ -201,6 +221,8 @@ pub struct CreatePhysicalTableExprGenerator { name_generator: Box>, #[builder(default = "false")] if_not_exists: bool, + #[builder(default = "0")] + partition: usize, #[builder(default, setter(into))] with_clause: HashMap, } @@ -215,25 +237,35 @@ impl Generator for CreatePhysicalTableExpr options.insert(key.clone(), Value::from(value.clone())); } + let mut columns = vec![ + Column { + name: Ident::new("ts"), + column_type: ConcreteDataType::timestamp_millisecond_datatype(), + options: vec![ColumnOption::TimeIndex], + }, + Column { + name: Ident::new("val"), + column_type: ConcreteDataType::float64_datatype(), + options: vec![], + }, + ]; + + let mut partition = None; + let mut primary_keys = vec![]; + if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) { + columns.push(partition_column); + partition = Some(partition_def); + primary_keys.push(columns.len() - 1); + } + Ok(CreateTableExpr { table_name: self.name_generator.generate(rng), - columns: vec![ - Column { - name: Ident::new("ts"), - column_type: ConcreteDataType::timestamp_millisecond_datatype(), - options: vec![ColumnOption::TimeIndex], - }, - Column { - name: Ident::new("val"), - column_type: ConcreteDataType::float64_datatype(), - options: vec![], - }, - ], + columns, if_not_exists: self.if_not_exists, - partition: None, + partition, engine: "metric".to_string(), options, - primary_keys: vec![], + primary_keys, }) } } @@ -245,6 +277,8 @@ pub struct CreateLogicalTableExprGenerator { physical_table_ctx: TableContextRef, labels: usize, if_not_exists: bool, + #[builder(default = "true")] + include_partition_column: bool, #[builder(default = "Box::new(WordGenerator)")] name_generator: Box>, } @@ -253,11 +287,11 @@ impl Generator for CreateLogicalTableExprG type Error = Error; fn generate(&self, rng: &mut R) -> Result { - // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical 
table must have two columns. + // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical table must have ts and val. ensure!( - self.physical_table_ctx.columns.len() == 2, + self.physical_table_ctx.columns.len() >= 2, error::UnexpectedSnafu { - violated: "The physical table must have two columns" + violated: "The physical table must have at least two columns" } ); @@ -265,9 +299,16 @@ impl Generator for CreateLogicalTableExprG let logical_table_name = self .physical_table_ctx .generate_unique_table_name(rng, self.name_generator.as_ref()); + let mut physical_columns = self.physical_table_ctx.columns.clone(); + if !self.include_partition_column + && let Some(partition_def) = &self.physical_table_ctx.partition + { + physical_columns.retain(|column| !partition_def.columns.contains(&column.name)); + } + let mut logical_table = CreateTableExpr { table_name: logical_table_name, - columns: self.physical_table_ctx.columns.clone(), + columns: physical_columns, if_not_exists: self.if_not_exists, partition: None, engine: "metric".to_string(), @@ -459,6 +500,58 @@ mod tests { })); } + #[test] + fn test_create_physical_table_expr_generator_with_partition() { + let mut rng = rand::rng(); + let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default() + .partition(3) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + + assert_eq!(physical_table_expr.engine, "metric"); + assert!(physical_table_expr.partition.is_some()); + assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3); + } + + #[test] + fn test_create_logical_table_expr_generator_without_partition_column() { + let mut rng = rand::rng(); + let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default() + .partition(3) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + let partition_columns = physical_table_expr + .partition + .as_ref() + .unwrap() + .columns + .clone(); + let physical_table_ctx = 
Arc::new(TableContext::from(&physical_table_expr)); + + let logical_table_expr = CreateLogicalTableExprGeneratorBuilder::default() + .physical_table_ctx(physical_table_ctx) + .labels(3) + .include_partition_column(false) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + + assert!( + logical_table_expr + .columns + .iter() + .all(|column| !partition_columns.contains(&column.name)) + ); + } + #[test] fn test_create_logical_table_expr_generator_deterministic() { let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0); diff --git a/tests-fuzz/src/ir.rs b/tests-fuzz/src/ir.rs index e8c15dcf95..ce1628cd61 100644 --- a/tests-fuzz/src/ir.rs +++ b/tests-fuzz/src/ir.rs @@ -20,6 +20,7 @@ pub(crate) mod insert_expr; pub(crate) mod partition_expr; pub(crate) mod repartition_expr; pub(crate) mod select_expr; +pub(crate) mod string_value; use core::fmt; use std::collections::HashMap; @@ -126,20 +127,7 @@ pub fn generate_partition_bounds(datatype: &ConcreteDataType, bounds: usize) -> ConcreteDataType::Int64(_) => generate_values!(i64, bounds), ConcreteDataType::Float32(_) => generate_values!(f32, bounds), ConcreteDataType::Float64(_) => generate_values!(f64, bounds), - ConcreteDataType::String(_) => { - let base = b'A'; - let range = b'z' - b'A'; - let step = range / (bounds as u8 + 1); - (1..=bounds) - .map(|i| { - Value::from( - char::from(base + step * i as u8) - .escape_default() - .to_string(), - ) - }) - .collect() - } + ConcreteDataType::String(_) => string_value::generate_partition_bounds(bounds), _ => unimplemented!("unsupported type: {datatype}"), } } @@ -157,10 +145,7 @@ pub fn generate_random_value( ConcreteDataType::Int64(_) => Value::from(rng.random::()), ConcreteDataType::Float32(_) => Value::from(rng.random::()), ConcreteDataType::Float64(_) => Value::from(rng.random::()), - ConcreteDataType::String(_) => match random_str { - Some(random) => Value::from(random.generate(rng).value), - None => Value::from(rng.random::().to_string()), 
- }, + ConcreteDataType::String(_) => string_value::generate_data_string_value(rng, random_str), ConcreteDataType::Date(_) => generate_random_date(rng), _ => unimplemented!("unsupported type: {datatype}"), @@ -341,21 +326,7 @@ pub fn generate_partition_value( } } datatypes::data_type::ConcreteDataType::String(_) => { - let upper = match first { - datatypes::value::Value::String(v) => v.as_utf8(), - _ => "", - }; - if bound_idx == 0 { - if upper <= "A" { - datatypes::value::Value::from("") - } else { - datatypes::value::Value::from("A") - } - } else if bound_idx < bounds.len() { - bounds[bound_idx - 1].clone() - } else { - last.clone() - } + string_value::generate_partition_value(bounds, bound_idx) } _ => unimplemented!("unsupported partition column type: {column_type}"), } diff --git a/tests-fuzz/src/ir/partition_expr.rs b/tests-fuzz/src/ir/partition_expr.rs index c91dd487ae..908223366c 100644 --- a/tests-fuzz/src/ir/partition_expr.rs +++ b/tests-fuzz/src/ir/partition_expr.rs @@ -20,7 +20,7 @@ use snafu::ensure; use crate::context::TableContext; use crate::error::{self, Result}; -use crate::ir::{Ident, generate_random_value}; +use crate::ir::{Ident, generate_random_value, string_value}; /// A partitioning scheme that divides a single column into multiple ranges based on provided bounds. 
/// @@ -245,6 +245,10 @@ pub fn generate_unique_bound( datatype: &ConcreteDataType, bounds: &[Value], ) -> Result { + if matches!(datatype, ConcreteDataType::String(_)) { + return string_value::generate_unique_partition_bound(rng, bounds); + } + for _ in 0..16 { let candidate = generate_random_value(rng, datatype, None); if !bounds.contains(&candidate) { diff --git a/tests-fuzz/src/ir/string_value.rs b/tests-fuzz/src/ir/string_value.rs new file mode 100644 index 0000000000..6a53aa69de --- /dev/null +++ b/tests-fuzz/src/ir/string_value.rs @@ -0,0 +1,162 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datatypes::value::Value; +use rand::Rng; + +use crate::error::{self, Result}; +use crate::generator::Random; +use crate::ir::Ident; + +const READABLE_CHARSET: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +fn readable_token(index: usize) -> String { + let base = READABLE_CHARSET.len(); + let mut n = index + 1; + let mut buf = Vec::new(); + + while n > 0 { + let rem = (n - 1) % base; + buf.push(READABLE_CHARSET[rem] as char); + n = (n - 1) / base; + } + + buf.iter().rev().collect() +} + +pub fn generate_data_string_value( + rng: &mut R, + random_str: Option<&dyn Random>, +) -> Value { + match random_str { + Some(random) => Value::from(random.generate(rng).value), + None => { + let idx = rng.random_range(0..(READABLE_CHARSET.len() * READABLE_CHARSET.len() * 4)); + Value::from(readable_token(idx)) + } + } +} + +/// Generates ordered readable string bounds for partition expressions. +pub fn generate_partition_bounds(bounds: usize) -> Vec { + let token_space = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024; + (1..=bounds) + .map(|i| { + let idx = i * token_space / (bounds + 1); + Value::from(readable_token(idx)) + }) + .collect() +} + +/// Picks a representative string value for the target partition range. +pub fn generate_partition_value(bounds: &[Value], bound_idx: usize) -> Value { + let first = bounds.first().unwrap(); + let last = bounds.last().unwrap(); + let upper = match first { + Value::String(v) => v.as_utf8(), + _ => "", + }; + + if bound_idx == 0 { + if upper <= "0" { + Value::from("") + } else { + Value::from("0") + } + } else if bound_idx < bounds.len() { + bounds[bound_idx - 1].clone() + } else { + last.clone() + } +} + +/// Generates a unique readable bound not present in existing bounds. 
+pub fn generate_unique_partition_bound(rng: &mut R, bounds: &[Value]) -> Result { + let search_space = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024; + let start = rng.random_range(0..search_space); + for offset in 0..search_space { + let idx = start + offset; + let candidate = Value::from(readable_token(idx)); + if !bounds.contains(&candidate) { + return Ok(candidate); + } + } + + error::UnexpectedSnafu { + violated: "unable to generate unique string partition bound".to_string(), + } + .fail() +} + +#[cfg(test)] +mod tests { + use rand::SeedableRng; + use rand_chacha::ChaCha8Rng; + + use super::*; + + #[test] + fn test_readable_token_grows_length() { + assert_eq!("0", readable_token(0)); + assert_eq!("9", readable_token(9)); + assert_eq!("A", readable_token(10)); + assert_eq!("z", readable_token(61)); + assert_eq!("00", readable_token(62)); + } + + #[test] + fn test_generate_partition_bounds_are_readable_and_unique() { + let bounds = generate_partition_bounds(8); + assert_eq!(8, bounds.len()); + + let mut values = bounds + .iter() + .map(|v| match v { + Value::String(s) => s.as_utf8().to_string(), + _ => panic!("expected string value"), + }) + .collect::>(); + let mut dedup = values.clone(); + dedup.sort(); + dedup.dedup(); + assert_eq!(values.len(), dedup.len()); + + for s in values.drain(..) 
{ + assert!(s.chars().all(|c| c.is_ascii_alphanumeric())); + } + } + + #[test] + fn test_generate_partition_value_for_string_bounds() { + let bounds = vec![Value::from("A"), Value::from("M")]; + assert_eq!(Value::from("0"), generate_partition_value(&bounds, 0)); + assert_eq!(Value::from("A"), generate_partition_value(&bounds, 1)); + assert_eq!(Value::from("M"), generate_partition_value(&bounds, 2)); + } + + #[test] + fn test_generate_unique_partition_bound_not_in_existing() { + let mut rng = ChaCha8Rng::seed_from_u64(42); + let bounds = vec![Value::from("0"), Value::from("1"), Value::from("2")]; + let candidate = generate_unique_partition_bound(&mut rng, &bounds).unwrap(); + assert!(!bounds.contains(&candidate)); + match candidate { + Value::String(s) => { + assert!(!s.as_utf8().is_empty()); + assert!(s.as_utf8().chars().all(|c| c.is_ascii_alphanumeric())); + } + _ => panic!("expected string value"), + } + } +} diff --git a/tests-fuzz/src/translator.rs b/tests-fuzz/src/translator.rs index 673b543f2c..4c5e0bb6a4 100644 --- a/tests-fuzz/src/translator.rs +++ b/tests-fuzz/src/translator.rs @@ -13,6 +13,8 @@ // limitations under the License. mod common; +/// Translator that converts insert expressions into CSV records. +pub mod csv; pub mod mysql; pub mod postgres; diff --git a/tests-fuzz/src/translator/csv.rs b/tests-fuzz/src/translator/csv.rs new file mode 100644 index 0000000000..e95956862c --- /dev/null +++ b/tests-fuzz/src/translator/csv.rs @@ -0,0 +1,121 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::Error; +use crate::ir::insert_expr::{InsertIntoExpr, RowValue}; +use crate::translator::DslTranslator; + +/// One CSV record converted from an insert row. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CsvRecord { + /// Cell values in column order. + pub values: Vec, +} + +/// CSV records converted from an insert expression. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CsvRecords { + /// Target table name from insert expression. + pub table_name: String, + /// Header values from insert columns. + pub headers: Vec, + /// Converted row records. + pub records: Vec, +} + +/// Translates `InsertIntoExpr` into CSV-writer-ready records. +pub struct InsertExprToCsvRecordsTranslator; + +impl DslTranslator for InsertExprToCsvRecordsTranslator { + type Error = Error; + + fn translate(&self, input: &InsertIntoExpr) -> Result { + let headers = input + .columns + .iter() + .map(|column| column.name.to_string()) + .collect::>(); + let records = input + .values_list + .iter() + .map(|row| CsvRecord { + values: row.iter().map(Self::format_row_value).collect(), + }) + .collect::>(); + + Ok(CsvRecords { + table_name: input.table_name.to_string(), + headers, + records, + }) + } +} + +impl InsertExprToCsvRecordsTranslator { + fn format_row_value(value: &RowValue) -> String { + match value { + RowValue::Value(datatypes::value::Value::Null) => String::new(), + RowValue::Value(v) => v.to_string(), + RowValue::Default => "DEFAULT".to_string(), + } + } +} + +#[cfg(test)] +mod tests { + use datatypes::data_type::ConcreteDataType; + + use super::InsertExprToCsvRecordsTranslator; + use crate::ir::create_expr::ColumnOption; + use crate::ir::insert_expr::{InsertIntoExpr, RowValue}; + use crate::ir::{Column, Ident}; + use crate::translator::DslTranslator; + + #[test] + fn test_translate_insert_expr_to_csv_records() { + let input = InsertIntoExpr { + 
table_name: Ident::new("metric_a"), + omit_column_list: false, + columns: vec![ + Column { + name: "host".into(), + column_type: ConcreteDataType::string_datatype(), + options: vec![ColumnOption::PrimaryKey], + }, + Column { + name: "value".into(), + column_type: ConcreteDataType::float64_datatype(), + options: vec![], + }, + ], + values_list: vec![ + vec![ + RowValue::Value(datatypes::value::Value::String("web-1".into())), + RowValue::Value(datatypes::value::Value::Int32(15)), + ], + vec![ + RowValue::Value(datatypes::value::Value::Null), + RowValue::Default, + ], + ], + }; + + let output = InsertExprToCsvRecordsTranslator.translate(&input).unwrap(); + assert_eq!(output.table_name, "metric_a"); + assert_eq!(output.headers, vec!["host", "value"]); + assert_eq!(output.records.len(), 2); + assert_eq!(output.records[0].values, vec!["web-1", "15"]); + assert_eq!(output.records[1].values, vec!["", "DEFAULT"]); + } +} diff --git a/tests-fuzz/src/utils.rs b/tests-fuzz/src/utils.rs index 0780f6c93d..d55abab3c2 100644 --- a/tests-fuzz/src/utils.rs +++ b/tests-fuzz/src/utils.rs @@ -15,6 +15,8 @@ pub mod cluster_info; pub mod config; pub mod crd; +/// CSV dump writer utilities for fuzz tests. +pub mod csv_dump_writer; pub mod health; pub mod migration; pub mod partition; @@ -22,10 +24,15 @@ pub mod pod_failure; pub mod procedure; #[cfg(feature = "unstable")] pub mod process; +pub mod retry; +/// SQL dump writer utilities for fuzz tests. 
+pub mod sql_dump_writer; pub mod wait; use std::env; +use std::str::FromStr; +use common_base::readable_size::ReadableSize; use common_telemetry::info; use common_telemetry::tracing::log::LevelFilter; use paste::paste; @@ -126,6 +133,14 @@ pub const GT_FUZZ_INPUT_MAX_COLUMNS: &str = "GT_FUZZ_INPUT_MAX_COLUMNS"; pub const GT_FUZZ_INPUT_MAX_ALTER_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_ALTER_ACTIONS"; pub const GT_FUZZ_INPUT_MAX_INSERT_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_INSERT_ACTIONS"; pub const FUZZ_OVERRIDE_PREFIX: &str = "GT_FUZZ_OVERRIDE_"; +/// Enables CSV dump generation for fuzz runs. +pub const GT_FUZZ_DUMP_TABLE_CSV: &str = "GT_FUZZ_DUMP_TABLE_CSV"; +/// Base directory for CSV dump sessions. +pub const GT_FUZZ_DUMP_DIR: &str = "GT_FUZZ_DUMP_DIR"; +/// Directory suffix used by one CSV dump session. +pub const GT_FUZZ_DUMP_SUFFIX: &str = "GT_FUZZ_DUMP_SUFFIX"; +/// Max in-memory CSV buffer size before auto flush. +pub const GT_FUZZ_DUMP_BUFFER_MAX_BYTES: &str = "GT_FUZZ_DUMP_BUFFER_MAX_BYTES"; /// Reads an override value for a fuzz parameter from env `GT_FUZZ_OVERRIDE_`. pub fn get_fuzz_override(name: &str) -> Option @@ -137,6 +152,33 @@ where env::var(&key).ok().and_then(|v| v.parse().ok()) } +/// Returns CSV dump base directory. +pub fn get_gt_fuzz_dump_dir() -> String { + let _ = dotenv::dotenv(); + env::var(GT_FUZZ_DUMP_DIR).unwrap_or_else(|_| "/tmp/greptime-fuzz-dumps".to_string()) +} + +/// Returns CSV dump directory suffix. +pub fn get_gt_fuzz_dump_suffix() -> String { + let _ = dotenv::dotenv(); + env::var(GT_FUZZ_DUMP_SUFFIX).unwrap_or_else(|_| ".repartition-metric-csv".to_string()) +} + +/// Returns max CSV in-memory buffer size. +pub fn get_gt_fuzz_dump_buffer_max_bytes() -> usize { + let _ = dotenv::dotenv(); + env::var(GT_FUZZ_DUMP_BUFFER_MAX_BYTES) + .ok() + .and_then(|value| { + value.parse::().ok().or_else(|| { + ReadableSize::from_str(&value) + .ok() + .map(|size| size.as_bytes() as usize) + }) + }) + .unwrap_or(8 * 1024 * 1024) +} + macro_rules! 
make_get_from_env_helper { ($key:expr, $default: expr) => { paste! { diff --git a/tests-fuzz/src/utils/csv_dump_writer.rs b/tests-fuzz/src/utils/csv_dump_writer.rs new file mode 100644 index 0000000000..de16a23c24 --- /dev/null +++ b/tests-fuzz/src/utils/csv_dump_writer.rs @@ -0,0 +1,383 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::fs::{File, OpenOptions, create_dir_all, remove_dir_all}; +use std::io::Write; +use std::path::{Path, PathBuf}; + +use common_telemetry::{info, warn}; +use common_time::util::current_time_millis; +use snafu::ResultExt; + +use crate::error::{self, Result}; +use crate::translator::csv::CsvRecords; +use crate::utils::{ + get_gt_fuzz_dump_buffer_max_bytes, get_gt_fuzz_dump_dir, get_gt_fuzz_dump_suffix, +}; + +/// Metadata for one CSV dump session. +#[derive(Debug, Clone)] +pub struct CsvDumpMetadata { + /// Fuzz target name. + pub target: String, + /// Seed used by current fuzz input. + pub seed: u64, + /// Repartition action count. + pub actions: usize, + /// Initial partition count. + pub partitions: usize, + /// Logical table count. + pub tables: usize, + /// Session start time in unix milliseconds. + pub started_at_unix_ms: i64, +} + +impl CsvDumpMetadata { + /// Builds dump metadata with current timestamp. 
+ pub fn new( + target: impl Into, + seed: u64, + actions: usize, + partitions: usize, + tables: usize, + ) -> Self { + Self { + target: target.into(), + seed, + actions, + partitions, + tables, + started_at_unix_ms: current_time_millis(), + } + } +} + +/// Session writer for staged CSV dump records. +#[derive(Debug)] +pub struct CsvDumpSession { + /// Session metadata. + pub metadata: CsvDumpMetadata, + /// Session directory path. + pub run_dir: PathBuf, + /// Max in-memory buffer size before auto flush. + pub max_buffer_bytes: usize, + records: Vec, + buffered_bytes: usize, + written_tables: HashSet, + full_headers_by_table: HashMap>, +} + +impl CsvDumpSession { + /// Creates session directory and writes seed metadata file. + pub fn new(metadata: CsvDumpMetadata) -> Result { + Self::new_with_buffer_limit(metadata, get_gt_fuzz_dump_buffer_max_bytes()) + } + + /// Creates session with a custom in-memory buffer limit. + pub fn new_with_buffer_limit( + metadata: CsvDumpMetadata, + max_buffer_bytes: usize, + ) -> Result { + let run_dir = build_run_dir(&metadata); + create_dir_all(&run_dir).context(error::CreateFileSnafu { + path: run_dir.to_string_lossy().to_string(), + })?; + write_seed_meta(&run_dir, &metadata)?; + info!( + "Create csv dump session, target: {}, run_dir: {}, max_buffer_bytes: {}", + metadata.target, + run_dir.display(), + max_buffer_bytes + ); + + Ok(Self { + metadata, + run_dir, + max_buffer_bytes, + records: Vec::new(), + buffered_bytes: 0, + written_tables: HashSet::new(), + full_headers_by_table: HashMap::new(), + }) + } + + /// Appends one table CSV records batch with full table headers. 
+ pub fn append(&mut self, records: CsvRecords, full_headers: Vec) -> Result<()> { + self.full_headers_by_table + .entry(records.table_name.clone()) + .or_insert(full_headers); + self.buffered_bytes += estimate_csv_records_size(&records); + self.records.push(records); + if self.buffered_bytes >= self.max_buffer_bytes { + self.flush_buffered_records()?; + } + Ok(()) + } + + /// Flushes all appended batches to CSV files. + pub fn flush_all(&mut self) -> Result<()> { + self.flush_buffered_records() + } + + /// Removes session directory after successful validation. + pub fn cleanup_on_success(&self) -> std::io::Result<()> { + match remove_dir_all(&self.run_dir) { + Ok(_) => { + info!( + "Cleanup csv dump directory on success: {}", + self.run_dir.display() + ); + Ok(()) + } + Err(err) => { + warn!( + "Cleanup csv dump directory failed: {}, error: {:?}", + self.run_dir.display(), + err + ); + Err(err) + } + } + } + + fn flush_buffered_records(&mut self) -> Result<()> { + if self.records.is_empty() { + return Ok(()); + } + for batch in &self.records { + write_batch_csv( + &self.run_dir, + batch, + &mut self.written_tables, + &self.full_headers_by_table, + )?; + } + self.records.clear(); + self.buffered_bytes = 0; + Ok(()) + } +} + +fn write_seed_meta(run_dir: &Path, metadata: &CsvDumpMetadata) -> Result<()> { + let path = run_dir.join("seed.meta"); + let mut file = File::create(&path).context(error::CreateFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + + let content = format!( + "target={}\nseed={}\nactions={}\npartitions={}\ntables={}\nstarted_at_unix_ms={}\n", + metadata.target, + metadata.seed, + metadata.actions, + metadata.partitions, + metadata.tables, + metadata.started_at_unix_ms, + ); + file.write_all(content.as_bytes()) + .context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + }) +} + +fn write_batch_csv( + run_dir: &Path, + batch: &CsvRecords, + written_tables: &mut HashSet, + full_headers_by_table: &HashMap>, +) -> 
Result<()> { + let output_headers = full_headers_by_table + .get(&batch.table_name) + .cloned() + .unwrap_or_else(|| batch.headers.clone()); + let file_name = format!("{}.table-data.csv", sanitize_file_name(&batch.table_name)); + let path = run_dir.join(file_name); + let mut file = OpenOptions::new() + .create(true) + .append(true) + .open(&path) + .context(error::CreateFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + + if written_tables.insert(batch.table_name.clone()) { + file.write_all(join_line(&output_headers).as_bytes()) + .context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + file.write_all(b"\n").context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + } + + let header_index = batch + .headers + .iter() + .enumerate() + .map(|(idx, header)| (header.as_str(), idx)) + .collect::>(); + + for record in &batch.records { + let aligned_values = output_headers + .iter() + .map(|header| { + header_index + .get(header.as_str()) + .and_then(|idx| record.values.get(*idx)) + .cloned() + .unwrap_or_default() + }) + .collect::>(); + file.write_all(join_line(&aligned_values).as_bytes()) + .context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + file.write_all(b"\n").context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + } + + Ok(()) +} + +fn estimate_csv_records_size(records: &CsvRecords) -> usize { + let headers = records.headers.iter().map(String::len).sum::(); + let rows = records + .records + .iter() + .flat_map(|record| record.values.iter()) + .map(String::len) + .sum::(); + headers + rows +} + +fn join_line(cells: &[String]) -> String { + cells + .iter() + .map(|cell| escape_csv_cell(cell)) + .collect::>() + .join(",") +} + +fn escape_csv_cell(value: &str) -> String { + if value.contains([',', '"', '\n', '\r']) { + format!("\"{}\"", value.replace('"', "\"\"")) + } else { + value.to_string() + } +} + +fn sanitize_file_name(raw: &str) -> 
String { + raw.chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' { + ch + } else { + '_' + } + }) + .collect() +} + +fn build_run_dir(metadata: &CsvDumpMetadata) -> PathBuf { + let base = PathBuf::from(get_gt_fuzz_dump_dir()); + let suffix = get_gt_fuzz_dump_suffix(); + let name = format!( + "{}_seed_{}_actions_{}_ts_{}{}", + metadata.target, metadata.seed, metadata.actions, metadata.started_at_unix_ms, suffix + ); + base.join(name) +} + +#[cfg(test)] +mod tests { + use super::{CsvDumpMetadata, CsvDumpSession}; + use crate::translator::csv::{CsvRecord, CsvRecords}; + + #[test] + fn test_create_session_and_flush() { + let mut session = CsvDumpSession::new_with_buffer_limit( + CsvDumpMetadata::new("fuzz_case", 1, 2, 3, 4), + 1024, + ) + .unwrap(); + session + .append( + CsvRecords { + table_name: "metric-a".to_string(), + headers: vec!["host".to_string(), "value".to_string()], + records: vec![CsvRecord { + values: vec!["web-1".to_string(), "10".to_string()], + }], + }, + vec!["host".to_string(), "value".to_string()], + ) + .unwrap(); + session.flush_all().unwrap(); + + assert!(session.run_dir.exists()); + assert!(session.run_dir.join("seed.meta").exists()); + assert!(session.run_dir.join("metric-a.table-data.csv").exists()); + } + + #[test] + fn test_auto_flush_on_buffer_limit() { + let mut session = + CsvDumpSession::new_with_buffer_limit(CsvDumpMetadata::new("fuzz_case", 5, 2, 3, 4), 1) + .unwrap(); + session + .append( + CsvRecords { + table_name: "metric-b".to_string(), + headers: vec!["host".to_string()], + records: vec![CsvRecord { + values: vec!["web-2".to_string()], + }], + }, + vec!["host".to_string()], + ) + .unwrap(); + + assert!(session.run_dir.join("metric-b.table-data.csv").exists()); + assert_eq!(session.buffered_bytes, 0); + } + + #[test] + fn test_flush_with_partial_headers_uses_full_headers() { + let mut session = CsvDumpSession::new_with_buffer_limit( + CsvDumpMetadata::new("fuzz_case", 7, 2, 3, 4), + 1024, + ) + 
.unwrap(); + session + .append( + CsvRecords { + table_name: "metric-c".to_string(), + headers: vec!["host".to_string(), "value".to_string()], + records: vec![CsvRecord { + values: vec!["web-3".to_string(), "12".to_string()], + }], + }, + vec!["host".to_string(), "idc".to_string(), "value".to_string()], + ) + .unwrap(); + session.flush_all().unwrap(); + + let file = + std::fs::read_to_string(session.run_dir.join("metric-c.table-data.csv")).unwrap(); + let mut lines = file.lines(); + assert_eq!(lines.next().unwrap(), "host,idc,value"); + assert_eq!(lines.next().unwrap(), "web-3,,12"); + } +} diff --git a/tests-fuzz/src/utils/partition.rs b/tests-fuzz/src/utils/partition.rs index d3dc30061d..89a684326b 100644 --- a/tests-fuzz/src/utils/partition.rs +++ b/tests-fuzz/src/utils/partition.rs @@ -36,7 +36,7 @@ pub struct PartitionCount { } pub async fn count_partitions(db: &MySqlPool, datanode_id: u64) -> Result { - let sql = "select count(1) as count from information_schema.region_peers where peer_id == ?"; + let sql = "select count(1) as count from information_schema.region_peers where peer_id = ?"; sqlx::query_as::<_, PartitionCount>(sql) .bind(datanode_id) .fetch_one(db) diff --git a/tests-fuzz/src/utils/retry.rs b/tests-fuzz/src/utils/retry.rs new file mode 100644 index 0000000000..06d1ede54f --- /dev/null +++ b/tests-fuzz/src/utils/retry.rs @@ -0,0 +1,49 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::future::Future; +use std::time::Duration; + +use common_telemetry::warn; + +pub async fn retry_with_backoff( + mut operation: F, + max_attempts: usize, + init_backoff: Duration, + max_backoff: Duration, +) -> Result +where + F: FnMut() -> Fut, + Fut: Future>, + E: std::fmt::Debug, +{ + let mut backoff = init_backoff; + for attempt in 0..max_attempts { + match operation().await { + Ok(result) => return Ok(result), + Err(err) if attempt + 1 == max_attempts => return Err(err), + Err(err) => { + let current_attempt = attempt + 1; + warn!( + "Retryable operation failed, attempt: {}, max_attempts: {}, backoff: {:?}, error: {:?}", + current_attempt, max_attempts, backoff, err + ); + tokio::time::sleep(backoff).await; + backoff = std::cmp::min(backoff * 2, max_backoff); + } + } + } + + panic!("retry loop should always return") +} diff --git a/tests-fuzz/src/utils/sql_dump_writer.rs b/tests-fuzz/src/utils/sql_dump_writer.rs new file mode 100644 index 0000000000..6f098d9584 --- /dev/null +++ b/tests-fuzz/src/utils/sql_dump_writer.rs @@ -0,0 +1,267 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fs::{OpenOptions, create_dir_all}; +use std::io::Write; +use std::path::PathBuf; + +use snafu::ResultExt; + +use crate::error::{self, Result}; +use crate::utils::get_gt_fuzz_dump_buffer_max_bytes; + +/// Session writer for table-scoped SQL trace files. 
+#[derive(Debug)] +pub struct SqlDumpSession { + /// Session directory path. + pub run_dir: PathBuf, + /// Max in-memory buffer size before auto flush. + pub max_buffer_bytes: usize, + buffered_bytes: usize, + entries_by_table: HashMap>, +} + +impl SqlDumpSession { + /// Creates SQL dump session with default buffer limit. + pub fn new(run_dir: PathBuf) -> Result { + Self::new_with_buffer_limit(run_dir, get_gt_fuzz_dump_buffer_max_bytes()) + } + + /// Creates SQL dump session with custom buffer limit. + pub fn new_with_buffer_limit(run_dir: PathBuf, max_buffer_bytes: usize) -> Result { + create_dir_all(&run_dir).context(error::CreateFileSnafu { + path: run_dir.to_string_lossy().to_string(), + })?; + + Ok(Self { + run_dir, + max_buffer_bytes, + buffered_bytes: 0, + entries_by_table: HashMap::new(), + }) + } + + /// Appends one SQL statement for a logical table. + pub fn append_sql(&mut self, table: &str, sql: &str, comment: Option<&str>) -> Result<()> { + let entry = format_sql_entry(sql, comment); + self.push_entry(table, entry)?; + Ok(()) + } + + /// Broadcasts one comment event to all table trace files. + pub fn broadcast_event(&mut self, tables: I, event: &str, sql: &str) -> Result<()> + where + I: IntoIterator, + T: AsRef, + { + let entry = format_sql_entry(sql, Some(event)); + for table in tables { + self.push_entry(table.as_ref(), entry.clone())?; + } + Ok(()) + } + + /// Flushes all staged SQL traces to table-scoped files. 
+ pub fn flush_all(&mut self) -> Result<()> { + self.flush_buffered_entries() + } + + fn push_entry(&mut self, table: &str, entry: String) -> Result<()> { + self.buffered_bytes += entry.len(); + self.entries_by_table + .entry(table.to_string()) + .or_default() + .push(entry); + + if self.buffered_bytes >= self.max_buffer_bytes { + self.flush_buffered_entries()?; + } + Ok(()) + } + + fn flush_buffered_entries(&mut self) -> Result<()> { + if self.entries_by_table.is_empty() { + return Ok(()); + } + + for (table, entries) in &self.entries_by_table { + let path = self + .run_dir + .join(format!("{}.trace.sql", sanitize_file_name(table))); + let mut file = OpenOptions::new() + .create(true) + .append(true) + .open(&path) + .context(error::CreateFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + + for entry in entries { + file.write_all(entry.as_bytes()) + .context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + file.write_all(b"\n").context(error::WriteFileSnafu { + path: path.to_string_lossy().to_string(), + })?; + } + } + + self.entries_by_table.clear(); + self.buffered_bytes = 0; + Ok(()) + } +} + +fn format_sql_entry(sql: &str, comment: Option<&str>) -> String { + let normalized_sql = normalize_sql(sql); + if let Some(comment) = comment { + format!("{}\n{normalized_sql}", format_comment(comment)) + } else { + normalized_sql + } +} + +fn format_comment(comment: &str) -> String { + comment + .lines() + .map(|line| format!("-- {line}")) + .collect::>() + .join("\n") +} + +fn normalize_sql(sql: &str) -> String { + let trimmed = sql.trim_end(); + if trimmed.ends_with(';') { + trimmed.to_string() + } else { + format!("{trimmed};") + } +} + +fn sanitize_file_name(raw: &str) -> String { + raw.chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' { + ch + } else { + '_' + } + }) + .collect() +} + +#[cfg(test)] +mod tests { + use std::time::{SystemTime, UNIX_EPOCH}; + + use super::SqlDumpSession; + + #[test] 
+ fn test_append_sql_writes_table_trace_file() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-dump-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap(); + session + .append_sql( + "metric-a", + "INSERT INTO t VALUES(1)", + Some("kind=insert elapsed_ms=10"), + ) + .unwrap(); + session.flush_all().unwrap(); + + let content = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap(); + assert!(content.contains("-- kind=insert elapsed_ms=10")); + assert!(content.contains("INSERT INTO t VALUES(1);")); + } + + #[test] + fn test_broadcast_event_writes_to_all_tables() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-broadcast-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap(); + session + .broadcast_event( + ["metric-a", "metric-b"], + "repartition action_idx=3", + "ALTER TABLE t REPARTITION", + ) + .unwrap(); + session.flush_all().unwrap(); + + let content_a = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap(); + let content_b = std::fs::read_to_string(run_dir.join("metric-b.trace.sql")).unwrap(); + assert!(content_a.contains("-- repartition action_idx=3")); + assert!(content_a.contains("ALTER TABLE t REPARTITION;")); + assert!(content_b.contains("-- repartition action_idx=3")); + assert!(content_b.contains("ALTER TABLE t REPARTITION;")); + } + + #[test] + fn test_multiline_comment_is_prefixed_per_line() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-dump-comment-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap(); + session + .append_sql( + "metric-a", + "INSERT INTO t VALUES(1)", + 
Some("kind=insert\nstarted_at_ms=1 elapsed_ms=2"), + ) + .unwrap(); + session.flush_all().unwrap(); + + let content = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap(); + assert!(content.contains("-- kind=insert\n-- started_at_ms=1 elapsed_ms=2")); + } + + #[test] + fn test_auto_flush_on_buffer_limit() { + let run_dir = std::env::temp_dir().join(format!( + "tests-fuzz-sql-dump-limit-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + )); + + let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1).unwrap(); + session + .append_sql("metric-a", "INSERT INTO t VALUES(1)", None) + .unwrap(); + + assert!(run_dir.join("metric-a.trace.sql").exists()); + assert_eq!(session.buffered_bytes, 0); + } +} diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs new file mode 100644 index 0000000000..7932bc7759 --- /dev/null +++ b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs @@ -0,0 +1,684 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#![no_main] + +use std::collections::{BTreeMap, HashMap}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use arbitrary::{Arbitrary, Unstructured}; +use common_telemetry::{info, warn}; +use common_time::Timestamp; +use common_time::util::current_time_millis; +use libfuzzer_sys::fuzz_target; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaChaRng; +use snafu::{ResultExt, ensure}; +use sqlx::{MySql, Pool}; +use tests_fuzz::context::{TableContext, TableContextRef}; +use tests_fuzz::error::{self, Result}; +use tests_fuzz::fake::{ + ConstGenerator, MappedGenerator, WordGenerator, merge_two_word_map_fn, random_capitalize_map, + uppercase_and_keyword_backtick_map, +}; +use tests_fuzz::generator::Generator; +use tests_fuzz::generator::create_expr::{ + CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder, +}; +use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder; +use tests_fuzz::generator::repartition_expr::{ + MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder, +}; +use tests_fuzz::ir::{ + CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value, + generate_unique_timestamp_for_mysql_with_clock, +}; +use tests_fuzz::translator::DslTranslator; +use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator; +use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator; +use tests_fuzz::translator::mysql::insert_expr::InsertIntoExprTranslator; +use tests_fuzz::translator::mysql::repartition_expr::RepartitionExprTranslator; +use tests_fuzz::utils::csv_dump_writer::{CsvDumpMetadata, CsvDumpSession}; +use tests_fuzz::utils::retry::retry_with_backoff; +use tests_fuzz::utils::sql_dump_writer::SqlDumpSession; +use tests_fuzz::utils::{ + Connections, get_fuzz_override, get_gt_fuzz_input_max_alter_actions, + get_gt_fuzz_input_max_tables, init_greptime_connections_via_env, +}; +use tests_fuzz::validator::row::count_values; +use tokio::sync::{mpsc, oneshot}; 
+ +const BARRIER_ACK_TIMEOUT_SECS: u64 = 10; +const VALIDATE_QUERY_MAX_ATTEMPTS: usize = 6; +const VALIDATE_QUERY_INIT_BACKOFF: Duration = Duration::from_millis(50); +const VALIDATE_QUERY_MAX_BACKOFF: Duration = Duration::from_millis(800); + +#[derive(Clone)] +struct FuzzContext { + greptime: Pool, +} + +impl FuzzContext { + async fn close(self) { + self.greptime.close().await; + } +} + +#[derive(Clone, Debug)] +struct FuzzInput { + seed: u64, + actions: usize, + partitions: usize, + tables: usize, +} + +fn generate_create_physical_table_expr( + partitions: usize, + rng: &mut R, +) -> Result { + CreatePhysicalTableExprGeneratorBuilder::default() + .name_generator(Box::new(ConstGenerator::new(Ident::new( + "fuzz_repartition_metric_physical", + )))) + .if_not_exists(rng.random_bool(0.5)) + .partition(partitions) + .build() + .unwrap() + .generate(rng) +} + +fn generate_create_logical_table_expr( + physical_table_ctx: TableContextRef, + include_partition_column: bool, + rng: &mut R, +) -> Result { + CreateLogicalTableExprGeneratorBuilder::default() + .name_generator(Box::new(MappedGenerator::new( + WordGenerator, + merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map), + ))) + .physical_table_ctx(physical_table_ctx) + .labels(rng.random_range(1..=5)) + .if_not_exists(rng.random_bool(0.5)) + .include_partition_column(include_partition_column) + .build() + .unwrap() + .generate(rng) +} + +fn generate_insert_expr( + rows: usize, + rng: &mut R, + table_ctx: TableContextRef, + clock: Arc>, +) -> Result { + let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock); + InsertExprGeneratorBuilder::default() + .omit_column_list(false) + .table_ctx(table_ctx) + .rows(rows) + .value_generator(Box::new(generate_random_value)) + .ts_value_generator(ts_value_generator) + .build() + .unwrap() + .generate(rng) +} + +async fn create_metric_tables( + ctx: &FuzzContext, + rng: &mut R, + partitions: usize, + table_count: usize, +) -> 
Result<( + TableContextRef, + BTreeMap, + HashMap, + String, +)> { + let create_physical_expr = generate_create_physical_table_expr(partitions, rng)?; + let translator = CreateTableExprTranslator; + let create_physical_sql = translator.translate(&create_physical_expr)?; + let result = sqlx::query(&create_physical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &create_physical_sql, + })?; + info!("Create physical table: {create_physical_sql}, result: {result:?}"); + let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr)); + ensure!( + physical_table_ctx.partition.is_some(), + error::AssertSnafu { + reason: "Physical metric table must have partition".to_string() + } + ); + + let mut logical_tables = BTreeMap::new(); + let mut create_logical_sqls = HashMap::new(); + let max_attempts = table_count * 3; + for _ in 0..max_attempts { + if logical_tables.len() >= table_count { + break; + } + + let include_partition_column = rng.random_bool(0.5); + let create_logical_expr = generate_create_logical_table_expr( + physical_table_ctx.clone(), + include_partition_column, + rng, + )?; + if logical_tables.contains_key(&create_logical_expr.table_name) { + continue; + } + + let create_logical_sql = translator.translate(&create_logical_expr)?; + let result = sqlx::query(&create_logical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &create_logical_sql, + })?; + info!("Create logical table: {create_logical_sql}, result: {result:?}"); + let logical_ctx = Arc::new(TableContext::from(&create_logical_expr)); + create_logical_sqls.insert(logical_ctx.name.to_string(), create_logical_sql); + logical_tables.insert(logical_ctx.name.clone(), logical_ctx); + } + + ensure!( + !logical_tables.is_empty(), + error::AssertSnafu { + reason: "No logical table created".to_string() + } + ); + + Ok(( + physical_table_ctx, + logical_tables, + create_logical_sqls, + create_physical_sql, + )) +} + +async fn 
execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> { + let mut delay = Duration::from_millis(100); + let mut attempt = 0; + let max_attempts = 10; + loop { + match sqlx::query(sql) + .persistent(false) + .execute(&ctx.greptime) + .await + { + Ok(_) => return Ok(()), + Err(err) => { + tokio::time::sleep(delay).await; + delay = std::cmp::min(delay * 2, Duration::from_secs(1)); + attempt += 1; + warn!("Execute insert with retry: {sql}, attempt: {attempt}, error: {err:?}"); + if attempt >= max_attempts { + return Err(err).context(error::ExecuteQuerySnafu { sql }); + } + } + } + } +} + +struct SharedState { + clock: Arc>, + inserted_rows: HashMap, + csv_dump_session: Option, + sql_dump_session: Option, + running: bool, +} + +enum WriterControl { + Barrier { + epoch: usize, + ack: oneshot::Sender<()>, + }, + Resume { + epoch: usize, + }, + Stop, +} + +fn handle_writer_control(control: WriterControl, paused: &mut bool) -> bool { + match control { + WriterControl::Barrier { epoch, ack } => { + info!("Writer received barrier control, epoch: {epoch}"); + *paused = true; + let _ = ack.send(()); + false + } + WriterControl::Resume { epoch } => { + info!("Writer received resume control, epoch: {epoch}"); + *paused = false; + false + } + WriterControl::Stop => { + info!("Writer received stop control"); + true + } + } +} + +async fn write_loop( + mut rng: R, + ctx: FuzzContext, + logical_tables: BTreeMap, + shared_state: Arc>, + mut control_rx: mpsc::UnboundedReceiver, +) -> Result<()> { + info!("Start write loop"); + let mut paused = false; + loop { + while let Ok(control) = control_rx.try_recv() { + if handle_writer_control(control, &mut paused) { + return Ok(()); + } + } + + if paused { + match control_rx.recv().await { + Some(control) => { + if handle_writer_control(control, &mut paused) { + return Ok(()); + } + } + None => return Ok(()), + } + continue; + } + + let (running, clock) = { + let state = shared_state.lock().unwrap(); + (state.running, 
state.clock.clone()) + }; + if !running { + break; + } + + for table_ctx in logical_tables.values() { + let rows = rng.random_range(1..=3); + let insert_expr = + generate_insert_expr(rows, &mut rng, table_ctx.clone(), clock.clone())?; + let translator = InsertIntoExprTranslator; + let sql = translator.translate(&insert_expr)?; + let inserted = insert_expr.values_list.len() as u64; + let csv_records = InsertExprToCsvRecordsTranslator.translate(&insert_expr)?; + let table_name = table_ctx.name.to_string(); + let full_headers = table_ctx + .columns + .iter() + .map(|column| column.name.value.clone()) + .collect::>(); + + let started_at_ms = current_time_millis(); + let now = Instant::now(); + execute_insert_with_retry(&ctx, &sql).await?; + let elapsed = now.elapsed(); + info!("Execute insert sql: {sql}, elapsed: {elapsed:?}"); + + let mut state = shared_state.lock().unwrap(); + if let Some(csv_dump_session) = state.csv_dump_session.as_mut() { + csv_dump_session.append(csv_records, full_headers)?; + } + if let Some(sql_dump_session) = state.sql_dump_session.as_mut() { + let comment = format!( + "kind=insert table={} started_at_ms={} elapsed_ms={}", + table_name, + started_at_ms, + elapsed.as_millis() + ); + sql_dump_session.append_sql(&table_name, &sql, Some(&comment))?; + } + *state.inserted_rows.entry(table_name).or_insert(0) += inserted; + } + + tokio::time::sleep(Duration::from_millis(100)).await; + } + info!("Write loop ended"); + + Ok(()) +} + +async fn validate_rows( + ctx: &FuzzContext, + logical_tables: &BTreeMap, + inserted_rows: &HashMap, +) -> Result<()> { + for table_ctx in logical_tables.values() { + let expected = *inserted_rows.get(&table_ctx.name.to_string()).unwrap_or(&0) as usize; + let count_sql = format!("SELECT COUNT(1) AS count FROM {}", table_ctx.name); + let count = retry_with_backoff( + || count_values(&ctx.greptime, &count_sql), + VALIDATE_QUERY_MAX_ATTEMPTS, + VALIDATE_QUERY_INIT_BACKOFF, + VALIDATE_QUERY_MAX_BACKOFF, + ) + .await?; + let 
distinct_count_sql = format!( + "SELECT COUNT(DISTINCT {}) AS count FROM {}", + table_ctx.timestamp_column().unwrap().name, + table_ctx.name + ); + let distinct_count = retry_with_backoff( + || count_values(&ctx.greptime, &distinct_count_sql), + VALIDATE_QUERY_MAX_ATTEMPTS, + VALIDATE_QUERY_INIT_BACKOFF, + VALIDATE_QUERY_MAX_BACKOFF, + ) + .await?; + info!( + "Validate rows for table: {}, expected: {}, count: {}, distinct_count: {}", + table_ctx.name, expected, count.count as usize, distinct_count.count as usize + ); + assert_eq!(count.count as usize, expected); + + assert_eq!(distinct_count.count as usize, expected); + } + Ok(()) +} + +fn flush_dump_sessions_and_snapshot( + shared_state: &Arc>, +) -> Result> { + let mut state = shared_state.lock().unwrap(); + if let Some(csv_dump_session) = state.csv_dump_session.as_mut() { + csv_dump_session.flush_all()?; + } + if let Some(sql_dump_session) = state.sql_dump_session.as_mut() { + sql_dump_session.flush_all()?; + } + Ok(state.inserted_rows.clone()) +} + +async fn cleanup_tables( + ctx: &FuzzContext, + physical_table_ctx: &TableContextRef, + logical_tables: &BTreeMap, +) -> Result<()> { + for table_ctx in logical_tables.values() { + let drop_logical_sql = format!("DROP TABLE {}", table_ctx.name); + let result = sqlx::query(&drop_logical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &drop_logical_sql, + })?; + info!("Drop logical table: {drop_logical_sql}, result: {result:?}"); + } + + let drop_physical_sql = format!("DROP TABLE {}", physical_table_ctx.name); + let result = sqlx::query(&drop_physical_sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { + sql: &drop_physical_sql, + })?; + info!("Drop physical table: {drop_physical_sql}, result: {result:?}"); + Ok(()) +} + +fn repartition_operation( + table_ctx: &TableContextRef, + rng: &mut R, +) -> Result { + let split = rng.random_bool(0.5); + if table_ctx.partition.as_ref().unwrap().exprs.len() <= 2 || 
split { + let expr = SplitPartitionExprGeneratorBuilder::default() + .table_ctx(table_ctx.clone()) + .build() + .unwrap() + .generate(rng)?; + Ok(RepartitionExpr::Split(expr)) + } else { + let expr = MergePartitionExprGeneratorBuilder::default() + .table_ctx(table_ctx.clone()) + .build() + .unwrap() + .generate(rng)?; + Ok(RepartitionExpr::Merge(expr)) + } +} + +impl Arbitrary<'_> for FuzzInput { + fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result { + let seed = get_fuzz_override::("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?); + let mut rng = ChaChaRng::seed_from_u64(seed); + let partitions = + get_fuzz_override::("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8)); + let max_tables = get_gt_fuzz_input_max_tables(); + let tables = get_fuzz_override::("TABLES") + .unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables))); + let max_actions = get_gt_fuzz_input_max_alter_actions(); + let actions = get_fuzz_override::("ACTIONS") + .unwrap_or_else(|| rng.random_range(1..max_actions)); + + Ok(FuzzInput { + seed, + actions, + partitions, + tables, + }) + } +} + +async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) -> Result<()> { + info!("input: {input:?}"); + let mut rng = ChaChaRng::seed_from_u64(input.seed); + let clock = Arc::new(Mutex::new(Timestamp::current_millis())); + + let (mut physical_table_ctx, logical_tables, create_logical_sqls, create_physical_sql) = + create_metric_tables(&ctx, &mut rng, input.partitions, input.tables).await?; + + let mut inserted_rows = HashMap::with_capacity(logical_tables.len()); + for table_ctx in logical_tables.values() { + inserted_rows.insert(table_ctx.name.to_string(), 0); + } + let csv_dump_session = CsvDumpSession::new(CsvDumpMetadata::new( + "fuzz_repartition_metric_table", + input.seed, + input.actions, + input.partitions, + input.tables, + ))?; + let sql_dump_session = SqlDumpSession::new(csv_dump_session.run_dir.clone())?; + let logical_table_names = 
logical_tables + .values() + .map(|table_ctx| table_ctx.name.to_string()) + .collect::>(); + + let mut sql_dump_session = sql_dump_session; + sql_dump_session.append_sql( + &physical_table_ctx.name.to_string(), + &create_physical_sql, + Some("kind=create_physical_table"), + )?; + for table_name in &logical_table_names { + if let Some(create_sql) = create_logical_sqls.get(table_name) { + sql_dump_session.append_sql( + table_name, + create_sql, + Some("kind=create_logical_table"), + )?; + } + } + + let shared_state = Arc::new(Mutex::new(SharedState { + clock, + inserted_rows, + csv_dump_session: Some(csv_dump_session), + sql_dump_session: Some(sql_dump_session), + running: true, + })); + let writer_rng = ChaChaRng::seed_from_u64(input.seed ^ 0xA5A5_A5A5_A5A5_A5A5); + let (control_tx, control_rx) = mpsc::unbounded_channel::(); + let writer_task = tokio::spawn(write_loop( + writer_rng, + ctx.clone(), + logical_tables.clone(), + shared_state.clone(), + control_rx, + )); + tokio::time::sleep(Duration::from_millis(100)).await; + + for i in 0..input.actions { + let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len(); + info!( + "partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}", + i + 1, + input.actions, + physical_table_ctx.name, + logical_tables.len() + ); + + let repartition_expr = repartition_operation(&physical_table_ctx, &mut rng)?; + let translator = RepartitionExprTranslator; + let sql = translator.translate(&repartition_expr)?; + info!("Repartition sql: {sql}"); + let started_at_ms = current_time_millis(); + let now = Instant::now(); + let result = sqlx::query(&sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { sql: &sql })?; + let elapsed = now.elapsed(); + info!("Repartition result: {result:?}, elapsed: {elapsed:?}"); + + physical_table_ctx = Arc::new( + Arc::unwrap_or_clone(physical_table_ctx) + .repartition(repartition_expr) + .unwrap(), + ); + + let partition_entries = 
tests_fuzz::validator::partition::fetch_partitions_info_schema( + &ctx.greptime, + "public".into(), + &physical_table_ctx.name, + ) + .await?; + tests_fuzz::validator::partition::assert_partitions( + physical_table_ctx.partition.as_ref().unwrap(), + &partition_entries, + )?; + + { + let mut state = shared_state.lock().unwrap(); + if let Some(sql_dump_session) = state.sql_dump_session.as_mut() { + let repartition_comment = format!( + "kind=repartition table={} action_idx={} started_at_ms={} elapsed_ms={}", + physical_table_ctx.name, + i + 1, + started_at_ms, + elapsed.as_millis() + ); + sql_dump_session.append_sql( + &physical_table_ctx.name.to_string(), + &sql, + Some(&repartition_comment), + )?; + let event = format!( + "repartition action_idx={} started_at_ms={} elapsed_ms={} sql={}", + i + 1, + started_at_ms, + elapsed.as_millis(), + sql + ); + sql_dump_session.broadcast_event(logical_table_names.iter(), &event, &sql)?; + } + } + + let (ack_tx, ack_rx) = oneshot::channel(); + control_tx + .send(WriterControl::Barrier { + epoch: i + 1, + ack: ack_tx, + }) + .expect("barrier control send must succeed"); + tokio::time::timeout(Duration::from_secs(BARRIER_ACK_TIMEOUT_SECS), ack_rx) + .await + .expect("barrier ack timeout") + .expect("barrier ack dropped"); + + let inserted_rows_snapshot = flush_dump_sessions_and_snapshot(&shared_state)?; + info!("validate rows, epoch: {}", i + 1); + validate_rows(&ctx, &logical_tables, &inserted_rows_snapshot).await?; + + control_tx + .send(WriterControl::Resume { epoch: i + 1 }) + .expect("resume control send must succeed"); + } + + let _ = control_tx.send(WriterControl::Stop); + shared_state.lock().unwrap().running = false; + writer_task.await.unwrap().unwrap(); + let inserted_rows = flush_dump_sessions_and_snapshot(&shared_state)?; + let (mut csv_dump_session, mut sql_dump_session) = { + let mut state = shared_state.lock().unwrap(); + (state.csv_dump_session.take(), state.sql_dump_session.take()) + }; + + let run_result = async { 
+ validate_rows(&ctx, &logical_tables, &inserted_rows).await?; + cleanup_tables(&ctx, &physical_table_ctx, &logical_tables).await?; + Ok(()) + } + .await; + + if let Some(csv_dump_session) = csv_dump_session.take() { + match &run_result { + Ok(_) => { + if let Err(err) = csv_dump_session.cleanup_on_success() { + warn!( + "Cleanup csv dump directory failed, path: {}, error: {:?}", + csv_dump_session.run_dir.display(), + err + ); + } + } + Err(_) => { + warn!( + "Keep csv dump directory for failure analysis, path: {}", + csv_dump_session.run_dir.display() + ); + } + } + } + if let Some(sql_dump_session) = sql_dump_session.take() + && run_result.is_err() + { + warn!( + "Keep sql dump directory for failure analysis, path: {}", + sql_dump_session.run_dir.display() + ); + } + + ctx.close().await; + run_result +} + +fuzz_target!(|input: FuzzInput| { + common_telemetry::init_default_ut_logging(); + common_runtime::block_on_global(async { + let Connections { mysql } = init_greptime_connections_via_env().await; + let ctx = FuzzContext { + greptime: mysql.expect("mysql connection init must be succeed"), + }; + execute_repartition_metric_table(ctx, input) + .await + .unwrap_or_else(|err| panic!("fuzz test must be succeed: {err:?}")); + }) +}); diff --git a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs index c8ebbb54af..17cbfb9251 100644 --- a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs +++ b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs @@ -261,13 +261,18 @@ async fn migrate_regions(ctx: &FuzzContext, migrations: &[Migration]) -> Result< { let output = procedure_state(&greptime, &procedure_id).await; info!("Checking procedure: {procedure_id}, output: {output}"); - (fetch_partition(&greptime, region_id).await.unwrap(), output) + (fetch_partition(&greptime, region_id).await.ok(), output) } }) }, |(partition, output)| { - info!("Region: {region_id}, datanode: {}", 
partition.datanode_id); - partition.datanode_id == migration.to_peer && output.contains("Done") + if let Some(partition) = partition { + info!("Region: {region_id}, datanode: {}", partition.datanode_id); + partition.datanode_id == migration.to_peer && output.contains("Done") + } else { + info!("Region: {region_id}, partition not found yet"); + false + } }, Duration::from_secs(5), ) diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index 0c6b965fd3..ec35205a55 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -5,7 +5,7 @@ edition.workspace = true license.workspace = true [features] -dashboard = [] +dashboard = ["servers/dashboard"] vector_index = [] [lints] diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index fd0d1ef3c4..2bf6e812c7 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -534,6 +534,7 @@ pub async fn setup_test_http_app_with_frontend_and_custom_options( .with_influxdb_handler(instance.fe_instance().clone()) .with_otlp_handler(instance.fe_instance().clone(), true) .with_jaeger_handler(instance.fe_instance().clone()) + .with_dashboard_handler(instance.fe_instance().clone()) .with_greptime_config_options(instance.opts.to_toml().unwrap()); if let Some(user_provider) = user_provider { diff --git a/tests-integration/src/tests/promql_test.rs b/tests-integration/src/tests/promql_test.rs index 7fbce91ea6..ede4663118 100644 --- a/tests-integration/src/tests/promql_test.rs +++ b/tests-integration/src/tests/promql_test.rs @@ -15,7 +15,9 @@ use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use common_query::Output; +use common_query::{Output, OutputData}; +use common_recordbatch::util::collect_batches; +use datatypes::arrow::array::{Float64Array, Int64Array}; use frontend::instance::Instance; use query::parser::{PromQuery, QueryLanguageParser, QueryStatement}; use rstest::rstest; @@ -151,6 +153,103 @@ async fn 
create_insert_tql_assert( check_unordered_output_stream(query_output, expected).await; } +async fn execute_all(instance: &Arc, sql: &str, query_ctx: Arc) { + instance + .do_query(sql, query_ctx) + .await + .into_iter() + .for_each(|v| { + let _ = v.unwrap(); + }); +} + +#[allow(clippy::too_many_arguments)] +async fn promql_query_as_batches( + ins: Arc, + promql: &str, + alias: Option, + query_ctx: Arc, + start: SystemTime, + end: SystemTime, + interval: Duration, + lookback: Duration, +) -> common_recordbatch::RecordBatches { + let output = promql_query( + ins, promql, alias, query_ctx, start, end, interval, lookback, + ) + .await + .unwrap(); + match output.data { + OutputData::Stream(stream) => collect_batches(stream).await.unwrap(), + OutputData::RecordBatches(recordbatches) => recordbatches, + _ => unreachable!(), + } +} + +const ANON_PROMQL_RATIO_REPRO_DB: &str = "repro_db"; + +const ANON_PROMQL_RATIO_REPRO_CREATE: &str = r#" +CREATE TABLE phy ( + t TIMESTAMP TIME INDEX, + v DOUBLE +) ENGINE=metric WITH ("physical_metric_table" = ""); + +CREATE TABLE metric_a ( + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + l5 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l1, l2, l3, l4, l5) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +CREATE TABLE metric_b ( + l6 STRING NULL, + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l6, l1, l2, l3, l4) +) ENGINE=metric WITH (on_physical_table = 'phy'); +"#; + +const ANON_PROMQL_RATIO_REPRO_INSERT: &str = r#" +INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES + ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60), + ('v1', 'v2', 'v3-b', 
'v4b', 'v5c', 1, 0), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120); + +INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES + ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2); +"#; + +const ANON_PROMQL_RATIO_REPRO_NUMERATOR: &str = r#"count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50))"#; + +const ANON_PROMQL_RATIO_REPRO_DENOMINATOR: &str = + r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))"#; + +const ANON_PROMQL_RATIO_REPRO_WHOLE: &str = r#"(count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))) * 100"#; + +const ANON_PROMQL_RATIO_REPRO_SCALAR_DIV: &str = + r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m])) / 2"#; + #[apply(both_instances_cases)] async fn sql_insert_tql_query_ceil(instance: Arc) { let instance = instance.frontend(); @@ -709,3 +808,140 @@ async fn cross_schema_query(instance: Arc) { check_unordered_output_stream(query_output, expected).await; } + +#[apply(both_instances_cases)] +async fn anon_promql_ratio_repro(instance: Arc) { + let ins = instance.frontend(); + + execute_all( + &ins, + &format!("CREATE DATABASE {ANON_PROMQL_RATIO_REPRO_DB}"), + QueryContext::arc(), + ) + .await; + + let repro_ctx: Arc = + QueryContext::with_db_name(Some(ANON_PROMQL_RATIO_REPRO_DB)).into(); + execute_all(&ins, ANON_PROMQL_RATIO_REPRO_CREATE, repro_ctx.clone()).await; + 
execute_all(&ins, ANON_PROMQL_RATIO_REPRO_INSERT, repro_ctx).await; + + let start = UNIX_EPOCH.checked_add(Duration::from_secs(180)).unwrap(); + let end = UNIX_EPOCH.checked_add(Duration::from_secs(360)).unwrap(); + let interval = Duration::from_secs(180); + let lookback = Duration::from_secs(1); + + let numerator = promql_query_as_batches( + ins.clone(), + ANON_PROMQL_RATIO_REPRO_NUMERATOR, + Some("num".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + let denominator = promql_query_as_batches( + ins.clone(), + ANON_PROMQL_RATIO_REPRO_DENOMINATOR, + Some("den".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + let whole = promql_query_as_batches( + ins.clone(), + ANON_PROMQL_RATIO_REPRO_WHOLE, + Some("pct".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + let scalar_div = promql_query_as_batches( + ins, + ANON_PROMQL_RATIO_REPRO_SCALAR_DIV, + Some("half_den".to_string()), + QueryContext::arc(), + start, + end, + interval, + lookback, + ) + .await; + + let numerator = numerator.iter().collect::>(); + let denominator = denominator.iter().collect::>(); + let whole = whole.iter().collect::>(); + let scalar_div = scalar_div.iter().collect::>(); + + let numerator_values = numerator[0] + .column_by_name("num") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let denominator_values = denominator[0] + .column_by_name("den") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let percentage_values = whole[0] + .column_by_name("pct") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let scalar_div_values = scalar_div[0] + .column_by_name("half_den") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(numerator_values.len(), 1, "{}", numerator[0].pretty_print()); + assert_eq!( + denominator_values.len(), + 1, + "{}", + denominator[0].pretty_print() + ); + assert_eq!(percentage_values.len(), 1, "{}", 
whole[0].pretty_print()); + assert_eq!( + scalar_div_values.len(), + 1, + "{}", + scalar_div[0].pretty_print() + ); + + assert_eq!( + numerator_values.value(0), + 1, + "{}", + numerator[0].pretty_print() + ); + assert_eq!( + denominator_values.value(0), + 3, + "{}", + denominator[0].pretty_print() + ); + assert!( + (scalar_div_values.value(0) - 1.5).abs() < 1e-9, + "{}", + scalar_div[0].pretty_print() + ); + + let expected = 100.0 / 3.0; + assert!( + (percentage_values.value(0) - expected).abs() < 1e-9, + "{}", + whole[0].pretty_print() + ); +} diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 68fa2a228d..7ae59ae9fc 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -106,6 +106,7 @@ macro_rules! http_tests { test_config_api, test_dynamic_tracer_toggle, test_dashboard_path, + test_dashboard_api, test_prometheus_remote_write, test_prometheus_remote_special_labels, test_prometheus_remote_schema_labels, @@ -147,6 +148,7 @@ macro_rules! http_tests { test_jaeger_query_api_for_trace_v1, test_influxdb_write, + test_influxdb_write_with_hints, test_http_memory_limit, ); )* @@ -1640,6 +1642,7 @@ fn drop_lines_with_inconsistent_results(input: String) -> String { "metadata_cache_size =", "content_cache_size =", "result_cache_size =", + "range_result_cache_size =", "name =", "recovery_parallelism =", "max_background_index_builds =", @@ -1720,6 +1723,121 @@ pub async fn test_dashboard_path(store_type: StorageType) { #[cfg(not(feature = "dashboard"))] pub async fn test_dashboard_path(_: StorageType) {} +#[cfg(feature = "dashboard")] +pub async fn test_dashboard_api(store_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "dashboard_api").await; + let client = TestClient::new(app).await; + + // 1. 
List dashboards - should be empty initially + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert!(dashboards.is_empty()); + + // 2. Save a dashboard + let dashboard_definition = r#"{"title": "My Dashboard", "panels": []}"#; + let res = client + .post("/v1/dashboards/test_dashboard") + .body(dashboard_definition) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard"); + + // 3. Save another dashboard + let res = client + .post("/v1/dashboards/another_dashboard") + .body(r#"{"title": "Another Dashboard"}"#) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 4. List dashboards - should have 2 + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 2); + + let names: Vec<&str> = dashboards + .iter() + .map(|d| d.get("name").unwrap().as_str().unwrap()) + .collect(); + assert!(names.contains(&"test_dashboard")); + assert!(names.contains(&"another_dashboard")); + + // 5. 
Update a dashboard by posting again with new definition + let updated_definition = r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"#; + let res = client + .post("/v1/dashboards/test_dashboard") + .body(updated_definition) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard"); + + // Verify the definition was updated by listing again + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 2); + + // Find test_dashboard and verify it has updated definition + let test_db = dashboards + .iter() + .find(|d| d.get("name").unwrap() == "test_dashboard") + .unwrap(); + assert_eq!( + test_db.get("definition").unwrap(), + r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"# + ); + + // 6. Delete one dashboard + let res = client.delete("/v1/dashboards/test_dashboard").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard"); + + // 7. List dashboards - should have 1 + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert_eq!(dashboards.len(), 1); + assert_eq!(dashboards[0].get("name").unwrap(), "another_dashboard"); + + // 8. Delete the remaining dashboard + let res = client + .delete("/v1/dashboards/another_dashboard") + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 9. 
List dashboards - should be empty + let res = client.get("/v1/dashboards").send().await; + assert_eq!(res.status(), StatusCode::OK); + let body: Value = res.json().await; + let dashboards = body.get("dashboards").unwrap().as_array().unwrap(); + assert!(dashboards.is_empty()); + + guard.remove_all().await; +} + +#[cfg(not(feature = "dashboard"))] +pub async fn test_dashboard_api(_: StorageType) {} + pub async fn test_prometheus_remote_write(store_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = @@ -3522,6 +3640,43 @@ transform: guard.remove_all().await; } +pub async fn test_influxdb_write_with_hints(storage_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_http_app_with_frontend(storage_type, "test_influxdb_write_with_hints").await; + + let client = TestClient::new(app).await; + + let result = client + .post("/v1/influxdb/write?db=public") + .header("x-greptime-hints", "sst_format=flat,ttl=30d,skip_wal=true") + .body("sst_fmt_table,host=host1 cpu=1.2 1664370459457010101") + .send() + .await; + assert_eq!(result.status(), 204); + + let res = client + .get("/v1/sql?sql=show create table sst_fmt_table") + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let resp = res.text().await; + assert!( + resp.contains("sst_format = 'flat'"), + "expected sst_format = 'flat' in SHOW CREATE TABLE output, got: {resp}" + ); + assert!( + resp.contains("ttl = '30days'"), + "expected ttl = '30days' in SHOW CREATE TABLE output, got: {resp}" + ); + assert!( + resp.contains("skip_wal = 'true'"), + "expected skip_wal = 'true' in SHOW CREATE TABLE output, got: {resp}" + ); + + guard.remove_all().await; +} + /// Test one-to-many VRL pipeline expansion. /// This test verifies that a VRL processor can return an array, which results in /// multiple output rows from a single input row. 
diff --git a/tests/cases/distributed/explain/step_aggr_advance.result b/tests/cases/distributed/explain/step_aggr_advance.result index 4bd83b7afa..5938fa202d 100644 --- a/tests/cases/distributed/explain/step_aggr_advance.result +++ b/tests/cases/distributed/explain/step_aggr_advance.result @@ -442,54 +442,54 @@ Affected Rows: 0 -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m])); -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, 
aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) | -| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp | -| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | SubqueryAlias: aggr_optimize_not | -| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | -| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | 
PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| | SubqueryAlias: aggr_optimize_not_count | -| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | -| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | -| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c | -| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, 
aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | -| | TableScan: aggr_optimize_not_count | -| | ]] | -| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, CAST(aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) / CAST(aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) | +| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp | +| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | SubqueryAlias: aggr_optimize_not | +| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: 
groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | +| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| | SubqueryAlias: aggr_optimize_not_count | +| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | +| | Filter: 
prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | +| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c | +| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None) | +| | TableScan: aggr_optimize_not_count | +| | ]] | +| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | | | REDACTED -| | CoalescePartitionsExec | -| | AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], 
aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | -| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL | -| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] | -| | PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] | -| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] | -| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | +| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL | +| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] | +| | PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] | +| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] | +| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] | | | MergeScanExec: REDACTED -| | 
SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | -| | CooperativeExec | +| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | +| | CooperativeExec | | | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| | | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED diff --git a/tests/cases/standalone/common/alter/alter_database.result b/tests/cases/standalone/common/alter/alter_database.result index 911ef5ddfc..2fccce10de 100644 --- a/tests/cases/standalone/common/alter/alter_database.result +++ 
b/tests/cases/standalone/common/alter/alter_database.result @@ -314,6 +314,85 @@ SHOW CREATE DATABASE alter_database; | | ) | +----------------+----------------------------------------------+ +-- Test sst_format option +ALTER DATABASE alter_database SET 'sst_format'='flat'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs', | +| | sst_format = 'flat' | +| | ) | ++----------------+----------------------------------------------+ + +USE alter_database; + +Affected Rows: 0 + +CREATE TABLE monitor(ts TIMESTAMP TIME INDEX); + +Affected Rows: 0 + +SHOW CREATE TABLE monitor; + ++---------+----------------------------------------+ +| Table | Create Table | ++---------+----------------------------------------+ +| monitor | CREATE TABLE IF NOT EXISTS "monitor" ( | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | sst_format = 'flat' | +| | ) | ++---------+----------------------------------------+ + +USE public; + +Affected Rows: 0 + +ALTER DATABASE alter_database SET 'sst_format'='primary_key'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs', | +| | sst_format = 'primary_key' | +| | ) | ++----------------+----------------------------------------------+ + +ALTER DATABASE alter_database UNSET 'sst_format'; + +Affected Rows: 0 + +SHOW CREATE DATABASE 
alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+----------------------------------------------+ + DROP DATABASE alter_database; Affected Rows: 0 diff --git a/tests/cases/standalone/common/alter/alter_database.sql b/tests/cases/standalone/common/alter/alter_database.sql index 1b2f75637a..33b309153e 100644 --- a/tests/cases/standalone/common/alter/alter_database.sql +++ b/tests/cases/standalone/common/alter/alter_database.sql @@ -90,5 +90,25 @@ ALTER DATABASE alter_database UNSET 'ttl'; SHOW CREATE DATABASE alter_database; -DROP DATABASE alter_database; +-- Test sst_format option +ALTER DATABASE alter_database SET 'sst_format'='flat'; +SHOW CREATE DATABASE alter_database; + +USE alter_database; + +CREATE TABLE monitor(ts TIMESTAMP TIME INDEX); + +SHOW CREATE TABLE monitor; + +USE public; + +ALTER DATABASE alter_database SET 'sst_format'='primary_key'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database UNSET 'sst_format'; + +SHOW CREATE DATABASE alter_database; + +DROP DATABASE alter_database; diff --git a/tests/cases/standalone/common/flow/flow_tql_avg.result b/tests/cases/standalone/common/flow/flow_tql_avg.result new file mode 100644 index 0000000000..8438f41eb6 --- /dev/null +++ b/tests/cases/standalone/common/flow/flow_tql_avg.result @@ -0,0 +1,126 @@ +CREATE TABLE sensor_readings ( + `value` DOUBLE, + ts TIMESTAMP TIME INDEX, + sensor STRING, + loc STRING, + PRIMARY KEY (sensor, loc) +); + +Affected Rows: 0 + +CREATE TABLE sensor_readings_avg ( + `value` DOUBLE, + ts TIMESTAMP TIME INDEX, + sensor STRING, + PRIMARY KEY (sensor) +); + +Affected Rows: 0 + +INSERT INTO sensor_readings VALUES + (20, now() - '30s'::interval, 
'test', 'A'); + +Affected Rows: 1 + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +TQL EVAL (now() - '1m'::interval, now(), '1m') +avg by(sensor) (sensor_readings) AS value; + ++-------+--------+---------------------+ +| value | sensor | ts | ++-------+--------+---------------------+ +| 20.0 | test | TS | ++-------+--------+---------------------+ + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value; + ++-------+--------+---------------------+ +| value | sensor | ts | ++-------+--------+---------------------+ +| 20.0 | test | TS | ++-------+--------+---------------------+ + +CREATE FLOW sensor_readings_avg_flow +SINK TO sensor_readings_avg +EVAL INTERVAL '1m' AS +TQL EVAL (now() - '1m'::interval, now(), '1m') +avg by(sensor) (sensor_readings) AS value; + +Affected Rows: 0 + +-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | +ADMIN FLUSH_FLOW('sensor_readings_avg_flow'); + ++----------------------------------------------+ +| ADMIN FLUSH_FLOW('sensor_readings_avg_flow') | ++----------------------------------------------+ +| FLOW_FLUSHED | ++----------------------------------------------+ + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1; + ++-------+---------------------+--------+ +| value | ts | sensor | ++-------+---------------------+--------+ +| 20.0 | TS | test | ++-------+---------------------+--------+ + +DROP FLOW sensor_readings_avg_flow; + +Affected Rows: 0 + +-- SQLNESS SLEEP 1s +INSERT INTO sensor_readings VALUES + (30, now() - '40s'::interval, 'test', 'B'); + +Affected Rows: 1 + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +TQL EVAL (now() - '1m'::interval, now(), '1m') +avg by(sensor) (sensor_readings) AS value; + ++-------+--------+---------------------+ +| value | 
sensor | ts | ++-------+--------+---------------------+ +| 25.0 | test | TS | ++-------+--------+---------------------+ + +CREATE FLOW sensor_readings_avg_flow +SINK TO sensor_readings_avg +EVAL INTERVAL '1m' AS +TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value; + +Affected Rows: 0 + +-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | +ADMIN FLUSH_FLOW('sensor_readings_avg_flow'); + ++----------------------------------------------+ +| ADMIN FLUSH_FLOW('sensor_readings_avg_flow') | ++----------------------------------------------+ +| FLOW_FLUSHED | ++----------------------------------------------+ + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1; + ++-------+---------------------+--------+ +| value | ts | sensor | ++-------+---------------------+--------+ +| 25.0 | TS | test | ++-------+---------------------+--------+ + +DROP FLOW sensor_readings_avg_flow; + +Affected Rows: 0 + +DROP TABLE sensor_readings_avg; + +Affected Rows: 0 + +DROP TABLE sensor_readings; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/flow/flow_tql_avg.sql b/tests/cases/standalone/common/flow/flow_tql_avg.sql new file mode 100644 index 0000000000..a5d6ab9d2b --- /dev/null +++ b/tests/cases/standalone/common/flow/flow_tql_avg.sql @@ -0,0 +1,63 @@ +CREATE TABLE sensor_readings ( + `value` DOUBLE, + ts TIMESTAMP TIME INDEX, + sensor STRING, + loc STRING, + PRIMARY KEY (sensor, loc) +); + +CREATE TABLE sensor_readings_avg ( + `value` DOUBLE, + ts TIMESTAMP TIME INDEX, + sensor STRING, + PRIMARY KEY (sensor) +); + +INSERT INTO sensor_readings VALUES + (20, now() - '30s'::interval, 'test', 'A'); + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +TQL EVAL (now() - '1m'::interval, now(), '1m') +avg by(sensor) (sensor_readings) AS value; + +-- SQLNESS REPLACE 
(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value; + +CREATE FLOW sensor_readings_avg_flow +SINK TO sensor_readings_avg +EVAL INTERVAL '1m' AS +TQL EVAL (now() - '1m'::interval, now(), '1m') +avg by(sensor) (sensor_readings) AS value; + +-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | +ADMIN FLUSH_FLOW('sensor_readings_avg_flow'); + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1; + +DROP FLOW sensor_readings_avg_flow; + +-- SQLNESS SLEEP 1s +INSERT INTO sensor_readings VALUES + (30, now() - '40s'::interval, 'test', 'B'); + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +TQL EVAL (now() - '1m'::interval, now(), '1m') +avg by(sensor) (sensor_readings) AS value; + + +CREATE FLOW sensor_readings_avg_flow +SINK TO sensor_readings_avg +EVAL INTERVAL '1m' AS +TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value; + +-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | +ADMIN FLUSH_FLOW('sensor_readings_avg_flow'); + +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS +SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1; + +DROP FLOW sensor_readings_avg_flow; + +DROP TABLE sensor_readings_avg; +DROP TABLE sensor_readings; diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.result b/tests/cases/standalone/common/prepare/mysql_prepare.result index abc267b50e..5ef242a891 100644 --- a/tests/cases/standalone/common/prepare/mysql_prepare.result +++ b/tests/cases/standalone/common/prepare/mysql_prepare.result @@ -42,7 +42,7 @@ affected_rows: 0 -- SQLNESS PROTOCOL MYSQL EXECUTE stmt USING 'a'; -Failed to execute query, err: MySqlError { ERROR 1815 (HY000): (EngineExecuteQuery): Cast 
error: Cannot cast string 'a' to value of Int32 type } +Failed to execute query, err: MySqlError { ERROR 1210 (HY000): (InvalidArguments): Invalid request parameter: Unable to convert a to datatype Int32(Int32Type) } -- SQLNESS PROTOCOL MYSQL DEALLOCATE stmt; @@ -124,6 +124,25 @@ DEALLOCATE stmt; affected_rows: 0 +-- SQLNESS PROTOCOL MYSQL +PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?'; + +affected_rows: 0 + +-- SQLNESS PROTOCOL MYSQL +EXECUTE stmt USING 'cake'; + ++------------+--------------+ +| table_name | table_schema | ++------------+--------------+ +| cake | public | ++------------+--------------+ + +-- SQLNESS PROTOCOL MYSQL +DEALLOCATE stmt; + +affected_rows: 0 + -- SQLNESS PROTOCOL MYSQL DROP TABLE cake; diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.sql b/tests/cases/standalone/common/prepare/mysql_prepare.sql index 8e80a0a867..e96e945f88 100644 --- a/tests/cases/standalone/common/prepare/mysql_prepare.sql +++ b/tests/cases/standalone/common/prepare/mysql_prepare.sql @@ -72,5 +72,14 @@ EXECUTE stmt USING 'happy', 42, 0; -- SQLNESS PROTOCOL MYSQL DEALLOCATE stmt; +-- SQLNESS PROTOCOL MYSQL +PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?'; + +-- SQLNESS PROTOCOL MYSQL +EXECUTE stmt USING 'cake'; + +-- SQLNESS PROTOCOL MYSQL +DEALLOCATE stmt; + -- SQLNESS PROTOCOL MYSQL DROP TABLE cake; diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result new file mode 100644 index 0000000000..ab3c4db715 --- /dev/null +++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result @@ -0,0 +1,106 @@ +CREATE TABLE phy ( + t TIMESTAMP TIME INDEX, + v DOUBLE +) ENGINE=metric WITH ("physical_metric_table" = ""); + +Affected Rows: 0 + +CREATE TABLE metric_a ( + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + l5 STRING 
NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l1, l2, l3, l4, l5) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +Affected Rows: 0 + +CREATE TABLE metric_b ( + l6 STRING NULL, + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l6, l1, l2, l3, l4) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +Affected Rows: 0 + +INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES + ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120); + +Affected Rows: 9 + +INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES + ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2); + +Affected Rows: 6 + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)); + ++---------------------+-------------------------------------------------------------------+ +| t | count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) | ++---------------------+-------------------------------------------------------------------+ +| 1970-01-01T00:03:00 | 1 | ++---------------------+-------------------------------------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') 
count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])); + ++---------------------+---------------------------------------------+ +| t | count(prom_rate(t_range,v,t,Int64(180000))) | ++---------------------+---------------------------------------------+ +| 1970-01-01T00:03:00 | 3 | ++---------------------+---------------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2; + ++---------------------+----------------------------------------------------------+ +| t | count(prom_rate(t_range,v,t,Int64(180000))) / Float64(2) | ++---------------------+----------------------------------------------------------+ +| 1970-01-01T00:03:00 | 1.5 | ++---------------------+----------------------------------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100; + ++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| t | metric_b.count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) / metric_a.count(prom_rate(t_range,v,t,Int64(180000))) * Float64(100) | ++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| 1970-01-01T00:03:00 | 33.33333333333333 | ++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ + +DROP TABLE metric_a; + +Affected Rows: 0 + +DROP TABLE metric_b; + +Affected Rows: 0 + +DROP TABLE phy; + +Affected Rows: 0 + diff --git 
a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql new file mode 100644 index 0000000000..946d4f93a1 --- /dev/null +++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql @@ -0,0 +1,63 @@ +CREATE TABLE phy ( + t TIMESTAMP TIME INDEX, + v DOUBLE +) ENGINE=metric WITH ("physical_metric_table" = ""); + +CREATE TABLE metric_a ( + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + l5 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l1, l2, l3, l4, l5) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +CREATE TABLE metric_b ( + l6 STRING NULL, + l1 STRING NULL, + l2 STRING NULL, + l3 STRING NULL, + l4 STRING NULL, + t TIMESTAMP NOT NULL, + v DOUBLE NULL, + TIME INDEX (t), + PRIMARY KEY (l6, l1, l2, l3, l4) +) ENGINE=metric WITH (on_physical_table = 'phy'); + +INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES + ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120), + ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30), + ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60), + ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120); + +INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES + ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1), + ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2), + ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') 
count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2; + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100; + +DROP TABLE metric_a; +DROP TABLE metric_b; +DROP TABLE phy; diff --git a/tests/cases/standalone/common/promql/scalar.result b/tests/cases/standalone/common/promql/scalar.result index c5c3e5ebd1..c3292b4f5c 100644 --- a/tests/cases/standalone/common/promql/scalar.result +++ b/tests/cases/standalone/common/promql/scalar.result @@ -136,6 +136,42 @@ TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host))); | 1970-01-01T00:00:15 | 2.0 | +---------------------+--------------------------------+ +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host))); + ++---------------------+------------------------------+ +| ts | scalar(count(sum(host.val))) | ++---------------------+------------------------------+ +| 1970-01-01T00:00:00 | 2.0 | +| 1970-01-01T00:00:05 | 2.0 | +| 1970-01-01T00:00:10 | 2.0 | +| 1970-01-01T00:00:15 | 2.0 | ++---------------------+------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host))); + ++---------------------+------------------------------+ +| ts | scalar(count(avg(host.val))) | ++---------------------+------------------------------+ +| 1970-01-01T00:00:00 | 2.0 | +| 1970-01-01T00:00:05 | 2.0 | +| 1970-01-01T00:00:10 | 2.0 | +| 1970-01-01T00:00:15 | 2.0 | ++---------------------+------------------------------+ + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host))); + ++---------------------+-------------------------------------+ +| ts | 
scalar(count(stddev_pop(host.val))) | ++---------------------+-------------------------------------+ +| 1970-01-01T00:00:00 | 2.0 | +| 1970-01-01T00:00:05 | 2.0 | +| 1970-01-01T00:00:10 | 2.0 | +| 1970-01-01T00:00:15 | 2.0 | ++---------------------+-------------------------------------+ + -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"})); @@ -516,7 +552,99 @@ TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6); | 1970-01-01T00:00:15 | 6.0 | host1 | +---------------------+---------------------------------------------------------+-------+ -Drop table host; +DROP TABLE host; + +Affected Rows: 0 + +CREATE TABLE presence_metric ( + ts timestamp(3) time index, + instance STRING, + cpu STRING, + shard STRING, + val DOUBLE, + PRIMARY KEY (instance, cpu, shard), +); + +Affected Rows: 0 + +INSERT INTO TABLE presence_metric VALUES + (0, 'i1', 'cpu0', 'a', 1.0), + (0, 'i1', 'cpu0', 'b', 2.0), + (0, 'i1', 'cpu1', 'a', 10.0), + (0, 'i1', 'cpu2', 'a', 20.0), + (0, 'i2', 'cpu9', 'a', 100.0), + (200000, 'i1', 'cpu0', 'a', 'NAN'::DOUBLE), + (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE), + (200000, 'i1', 'cpu1', 'a', 11.0), + (200000, 'i1', 'cpu2', 'a', NULL), + (200000, 'i2', 'cpu9', 'a', 101.0), + (400000, 'i1', 'cpu1', 'a', 12.0), + (400000, 'i2', 'cpu9', 'a', 102.0), + (600000, 'i1', 'cpu0', 'a', 7.0), + (600000, 'i1', 'cpu0', 'b', 8.0), + (600000, 'i2', 'cpu9', 'a', 103.0); + +Affected Rows: 15 + +-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2` +-- still leaves a zero-valued row in `count(...) by (cpu)`. 
+-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu); + ++------+---------------------+----------------------------+ +| cpu | ts | count(presence_metric.val) | ++------+---------------------+----------------------------+ +| cpu0 | 1970-01-01T00:00:00 | 2 | +| cpu0 | 1970-01-01T00:10:00 | 2 | +| cpu1 | 1970-01-01T00:00:00 | 1 | +| cpu1 | 1970-01-01T00:03:20 | 1 | +| cpu1 | 1970-01-01T00:06:40 | 1 | +| cpu1 | 1970-01-01T00:10:00 | 1 | +| cpu2 | 1970-01-01T00:00:00 | 1 | +| cpu2 | 1970-01-01T00:03:20 | 0 | +| cpu2 | 1970-01-01T00:06:40 | 0 | ++------+---------------------+----------------------------+ + +-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu))); + ++---------------------+-------------------------------------------+ +| ts | scalar(count(count(presence_metric.val))) | ++---------------------+-------------------------------------------+ +| 1970-01-01T00:00:00 | 3.0 | +| 1970-01-01T00:03:20 | 2.0 | +| 1970-01-01T00:06:40 | 2.0 | +| 1970-01-01T00:10:00 | 2.0 | ++---------------------+-------------------------------------------+ + +-- Non-count inner aggregates must drop NULL-only groups before the outer count. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu))); + ++---------------------+-----------------------------------------+ +| ts | scalar(count(sum(presence_metric.val))) | ++---------------------+-----------------------------------------+ +| 1970-01-01T00:00:00 | 3.0 | +| 1970-01-01T00:03:20 | 1.0 | +| 1970-01-01T00:06:40 | 1.0 | +| 1970-01-01T00:10:00 | 2.0 | ++---------------------+-----------------------------------------+ + +-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN. 
+-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance)); + ++---------------------+-------------------------------------------+ +| ts | scalar(count(count(presence_metric.val))) | ++---------------------+-------------------------------------------+ +| 1970-01-01T00:00:00 | NaN | +| 1970-01-01T00:03:20 | NaN | +| 1970-01-01T00:06:40 | NaN | +| 1970-01-01T00:10:00 | NaN | ++---------------------+-------------------------------------------+ + +DROP TABLE presence_metric; Affected Rows: 0 diff --git a/tests/cases/standalone/common/promql/scalar.sql b/tests/cases/standalone/common/promql/scalar.sql index b4007bbf15..662f9665fe 100644 --- a/tests/cases/standalone/common/promql/scalar.sql +++ b/tests/cases/standalone/common/promql/scalar.sql @@ -43,6 +43,15 @@ TQL EVAL (0, 15, '5s') scalar(host{host="host1"}) + host; -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host))); +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host))); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host))); + +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host))); + -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"})); @@ -149,4 +158,49 @@ TQL EVAL (0, 15, '5s') clamp(clamp_min(host{host="host1"}, 1), 0, 12); -- SQLNESS SORT_RESULT 3 1 TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6); -Drop table host; +DROP TABLE host; + +CREATE TABLE presence_metric ( + ts timestamp(3) time index, + instance STRING, + cpu STRING, + shard STRING, + val DOUBLE, + PRIMARY KEY (instance, cpu, shard), +); + +INSERT INTO TABLE presence_metric VALUES + (0, 'i1', 'cpu0', 'a', 1.0), + (0, 'i1', 'cpu0', 'b', 2.0), + (0, 'i1', 'cpu1', 'a', 10.0), + (0, 'i1', 'cpu2', 'a', 20.0), + (0, 'i2', 'cpu9', 'a', 100.0), + (200000, 'i1', 'cpu0', 'a', 
'NAN'::DOUBLE), + (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE), + (200000, 'i1', 'cpu1', 'a', 11.0), + (200000, 'i1', 'cpu2', 'a', NULL), + (200000, 'i2', 'cpu9', 'a', 101.0), + (400000, 'i1', 'cpu1', 'a', 12.0), + (400000, 'i2', 'cpu9', 'a', 102.0), + (600000, 'i1', 'cpu0', 'a', 7.0), + (600000, 'i1', 'cpu0', 'b', 8.0), + (600000, 'i2', 'cpu9', 'a', 103.0); + +-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2` +-- still leaves a zero-valued row in `count(...) by (cpu)`. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu); + +-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu))); + +-- Non-count inner aggregates must drop NULL-only groups before the outer count. +-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu))); + +-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN. 
+-- SQLNESS SORT_RESULT 3 1 +TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance)); + +DROP TABLE presence_metric; diff --git a/tests/cases/standalone/common/tql/tql-cte.result b/tests/cases/standalone/common/tql/tql-cte.result index a8c0c45d5d..e8278e80bd 100644 --- a/tests/cases/standalone/common/tql/tql-cte.result +++ b/tests/cases/standalone/common/tql/tql-cte.result @@ -427,8 +427,8 @@ SELECT min(val) as min_computed, max(val) as max_computed FROM computed; | | Aggregate: groupBy=[[]], aggr=[[min(computed.val), max(computed.val)]] | | | SubqueryAlias: computed | | | Projection: metric.ts AS ts, val * Float64(2) + Float64(1) AS val | -| | Projection: metric.ts, val * Float64(2) + Float64(1) AS val * Float64(2) + Float64(1) | -| | Projection: metric.ts, metric.val * Float64(2) AS val * Float64(2) | +| | Projection: metric.ts, CAST(val * Float64(2) AS Float64) + Float64(1) AS val * Float64(2) + Float64(1) | +| | Projection: metric.ts, CAST(metric.val AS Float64) * Float64(2) AS val * Float64(2) | | | PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts] | | | PromSeriesDivide: tags=[] | | | Filter: metric.ts >= TimestampMillisecond(-299999, None) AND metric.ts <= TimestampMillisecond(40000, None) | diff --git a/tests/cases/standalone/common/types/json/json.result b/tests/cases/standalone/common/types/json/json.result index 8c4755f4ae..8fad9632b1 100644 --- a/tests/cases/standalone/common/types/json/json.result +++ b/tests/cases/standalone/common/types/json/json.result @@ -37,22 +37,23 @@ INSERT INTO jsons VALUES('[null]', 0), } ] } -}}', 11); +}}', 11), +('{"a":"abc\u2028tom"}', 12); -Affected Rows: 12 +Affected Rows: 13 -INSERT INTO jsons VALUES(parse_json('[null]'), 12), -(parse_json('[true]'), 13), -(parse_json('[false]'), 14), -(parse_json('[0]'), 15), -(parse_json('["foo"]'), 16), -(parse_json('[]'), 17), -(parse_json('{}'), 18), -(parse_json('[0,1]'), 19), 
-(parse_json('{"foo":"bar"}'), 20), -(parse_json('{"a":null,"foo":"bar"}'), 21), -(parse_json('[-1]'), 22), -(parse_json('[-2147483648]'), 23), +INSERT INTO jsons VALUES(parse_json('[null]'), 1000), +(parse_json('[true]'), 1001), +(parse_json('[false]'), 1002), +(parse_json('[0]'), 1003), +(parse_json('["foo"]'), 1004), +(parse_json('[]'), 1005), +(parse_json('{}'), 1006), +(parse_json('[0,1]'), 1007), +(parse_json('{"foo":"bar"}'), 1008), +(parse_json('{"a":null,"foo":"bar"}'), 1009), +(parse_json('[-1]'), 1010), +(parse_json('[-2147483648]'), 1011), (parse_json('{"entities": { "description": { "urls": [ @@ -76,9 +77,10 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12), } ] } - }}'), 24); + }}'), 1012), +(parse_json('{"a":"abc\u2028tom"}'), 1013); -Affected Rows: 13 +Affected Rows: 14 SELECT json_to_string(j), t FROM jsons; @@ -97,25 +99,27 @@ SELECT json_to_string(j), t FROM jsons; | {"a":null,"foo":"bar"} | 1970-01-01T00:00:00.009 | | [-1] | 1970-01-01T00:00:00.010 | | {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.011 | -| [null] | 1970-01-01T00:00:00.012 | -| [true] | 1970-01-01T00:00:00.013 | -| [false] | 1970-01-01T00:00:00.014 | -| [0] | 1970-01-01T00:00:00.015 | -| ["foo"] | 1970-01-01T00:00:00.016 | -| [] | 1970-01-01T00:00:00.017 | -| {} | 1970-01-01T00:00:00.018 | -| [0,1] | 1970-01-01T00:00:00.019 | -| {"foo":"bar"} | 1970-01-01T00:00:00.020 | -| {"a":null,"foo":"bar"} | 1970-01-01T00:00:00.021 | -| [-1] | 1970-01-01T00:00:00.022 | -| [-2147483648] | 1970-01-01T00:00:00.023 | -| 
{"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.024 | +| {"a":"abc\u2028tom"} | 1970-01-01T00:00:00.012 | +| [null] | 1970-01-01T00:00:01 | +| [true] | 1970-01-01T00:00:01.001 | +| [false] | 1970-01-01T00:00:01.002 | +| [0] | 1970-01-01T00:00:01.003 | +| ["foo"] | 1970-01-01T00:00:01.004 | +| [] | 1970-01-01T00:00:01.005 | +| {} | 1970-01-01T00:00:01.006 | +| [0,1] | 1970-01-01T00:00:01.007 | +| {"foo":"bar"} | 1970-01-01T00:00:01.008 | +| {"a":null,"foo":"bar"} | 1970-01-01T00:00:01.009 | +| [-1] | 1970-01-01T00:00:01.010 | +| [-2147483648] | 1970-01-01T00:00:01.011 | +| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:01.012 | +| {"a":"abc\u2028tom"} | 1970-01-01T00:00:01.013 | +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+ --Insert invalid json strings-- DELETE FROM jsons; -Affected Rows: 25 +Affected Rows: 27 INSERT INTO jsons VALUES(parse_json('{"a":1, "b":2, "c":3'), 4); diff --git a/tests/cases/standalone/common/types/json/json.sql b/tests/cases/standalone/common/types/json/json.sql index 868edc59e8..5a521ee1c6 100644 --- a/tests/cases/standalone/common/types/json/json.sql +++ 
b/tests/cases/standalone/common/types/json/json.sql @@ -35,20 +35,21 @@ INSERT INTO jsons VALUES('[null]', 0), } ] } -}}', 11); +}}', 11), +('{"a":"abc\u2028tom"}', 12); -INSERT INTO jsons VALUES(parse_json('[null]'), 12), -(parse_json('[true]'), 13), -(parse_json('[false]'), 14), -(parse_json('[0]'), 15), -(parse_json('["foo"]'), 16), -(parse_json('[]'), 17), -(parse_json('{}'), 18), -(parse_json('[0,1]'), 19), -(parse_json('{"foo":"bar"}'), 20), -(parse_json('{"a":null,"foo":"bar"}'), 21), -(parse_json('[-1]'), 22), -(parse_json('[-2147483648]'), 23), +INSERT INTO jsons VALUES(parse_json('[null]'), 1000), +(parse_json('[true]'), 1001), +(parse_json('[false]'), 1002), +(parse_json('[0]'), 1003), +(parse_json('["foo"]'), 1004), +(parse_json('[]'), 1005), +(parse_json('{}'), 1006), +(parse_json('[0,1]'), 1007), +(parse_json('{"foo":"bar"}'), 1008), +(parse_json('{"a":null,"foo":"bar"}'), 1009), +(parse_json('[-1]'), 1010), +(parse_json('[-2147483648]'), 1011), (parse_json('{"entities": { "description": { "urls": [ @@ -72,7 +73,8 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12), } ] } - }}'), 24); + }}'), 1012), +(parse_json('{"a":"abc\u2028tom"}'), 1013); SELECT json_to_string(j), t FROM jsons; diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result index 1c6e0ee50b..76b9838628 100644 --- a/tests/cases/standalone/common/view/create.result +++ b/tests/cases/standalone/common/view/create.result @@ -30,6 +30,10 @@ CREATE VIEW test_view as SELECT * FROM public.numbers; Affected Rows: 0 +CREATE VIEW test_view2 as SELECT * FROM test_view; + +Affected Rows: 0 + --- View already exists ---- CREATE VIEW test_view as SELECT * FROM public.numbers; @@ -51,6 +55,7 @@ SHOW TABLES; | numbers | | test_table | | test_view | +| test_view2 | +------------------+ SHOW FULL TABLES; @@ -61,6 +66,7 @@ SHOW FULL TABLES; | numbers | LOCAL TEMPORARY | | test_table | BASE TABLE | | test_view | VIEW | +| test_view2 | VIEW | 
+------------------+-----------------+ -- psql: \dv @@ -124,17 +130,19 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE; |greptime|information_schema|tables|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|public|test_table|BASETABLE|ID|ID|ID|ID|ID|ID|mito|ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N| |greptime|public|test_view|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N| +|greptime|public|test_view2|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N| |greptime|information_schema|views|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| +++++++++++++++++++++++++ -- SQLNESS REPLACE (\s\d+\s) ID -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME -SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW'; +SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME; +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+ | table_catalog | table_schema | table_name | table_type | table_id | data_length | max_data_length | index_length | max_index_length | avg_row_length | engine | version | row_format | table_rows | data_free | auto_increment | create_time | update_time | check_time | table_collation | checksum | create_options | table_comment | temporary | 
+---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+ | greptime | public | test_view | VIEW |ID |ID |ID |ID |ID |ID | |ID | Fixed |ID |ID |ID |DATETIME |DATETIME | | utf8_bin |ID | | | N | +| greptime | public | test_view2 | VIEW |ID |ID |ID |ID |ID |ID | |ID | Fixed |ID |ID |ID |DATETIME |DATETIME | | utf8_bin |ID | | | N | +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+ SHOW COLUMNS FROM test_view; @@ -169,10 +177,31 @@ SELECT * FROM test_view LIMIT 10; | 9 | +--------+ +SELECT * FROM test_view2 LIMIT 10; + ++--------+ +| number | ++--------+ +| 0 | +| 1 | +| 2 | +| 3 | +| 4 | +| 5 | +| 6 | +| 7 | +| 8 | +| 9 | ++--------+ + DROP VIEW test_view; Affected Rows: 0 +DROP VIEW test_view2; + +Affected Rows: 0 + DROP TABLE test_table; Affected Rows: 0 diff --git a/tests/cases/standalone/common/view/create.sql b/tests/cases/standalone/common/view/create.sql index b82704d3a9..91149f44f4 100644 --- a/tests/cases/standalone/common/view/create.sql +++ b/tests/cases/standalone/common/view/create.sql @@ -16,6 +16,8 @@ CREATE OR REPLACE VIEW test_table as SELECT * FROM public.numbers; CREATE VIEW test_view as SELECT * FROM public.numbers; +CREATE VIEW test_view2 as SELECT * FROM test_view; + --- View already exists ---- CREATE VIEW test_view as SELECT * FROM public.numbers; @@ -48,7 +50,7 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE; -- SQLNESS 
REPLACE (\s\d+\s) ID -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME -SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW'; +SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME; SHOW COLUMNS FROM test_view; @@ -58,8 +60,12 @@ SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'test_view'; SELECT * FROM test_view LIMIT 10; +SELECT * FROM test_view2 LIMIT 10; + DROP VIEW test_view; +DROP VIEW test_view2; + DROP TABLE test_table; SELECT * FROM test_view LIMIT 10; diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.result b/tests/cases/standalone/tql-explain-analyze/tsid_column.result index 84544b1655..4a7a875060 100644 --- a/tests/cases/standalone/tql-explain-analyze/tsid_column.result +++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.result @@ -112,10 +112,63 @@ TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(count(tsid |_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED |_|_|_RepartitionExec: partitioning=REDACTED |_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED -|_|_|_ProjectionExec: expr=[ts@1 as ts, count(tsid_metric.val)@2 as count(tsid_metric.val)] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[job@0 as job, ts@1 as ts], aggr=[count(tsid_metric.val)] REDACTED +|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED |_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[job@1 as job, ts@2 as ts], aggr=[count(tsid_metric.val)] REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED +|_|_|_ProjectionExec: expr=[ts@3 as ts, job@1 as job] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: 
tags=["__tsid"] REDACTED +|_|_|_ProjectionExec: expr=[val@1 as val, job@3 as job, __tsid@2 as __tsid, ts@0 as ts] REDACTED +|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED +|_|_|_| +| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED +|_|_|_FilterExec: prom_irate(ts_range,val)@1 IS NOT NULL REDACTED +|_|_|_ProjectionExec: expr=[ts@2 as ts, prom_irate(ts_range@3, val@0) as prom_irate(ts_range,val)] REDACTED +|_|_|_PromRangeManipulateExec: req range=[0..10000], interval=[5000], eval range=[3600000], time index=[ts] REDACTED +|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[ts], filter NaN: [true] REDACTED +|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED +|_|_|_ProjectionExec: expr=[val@1 as val, __tsid@2 as __tsid, ts@0 as ts] REDACTED +|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED +|_|_|_| +|_|_| Total rows: 2_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job))); + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_ProjectionExec: expr=[ts@1 as ts, sum(prom_irate(ts_range,val))@2 / scalar(count(sum(tsid_metric.val)))@0 as lhs.sum(prom_irate(ts_range,val)) / 
rhs.scalar(count(sum(tsid_metric.val)))] REDACTED +|_|_|_REDACTED +|_|_|_ScalarCalculateExec: tags=[] REDACTED +|_|_|_CoalescePartitionsExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED +|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED +|_|_|_ProjectionExec: expr=[ts@1 as ts, job@0 as job] REDACTED +|_|_|_FilterExec: val@0 IS NOT NULL, projection=[job@1, ts@2] REDACTED |_|_|_ProjectionExec: expr=[val@0 as val, job@1 as job, ts@3 as ts] REDACTED |_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED |_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql index 7b3de23f33..dedce2dfb1 100644 --- a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql +++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql @@ -51,6 +51,14 @@ TQL ANALYZE (0, 10, '5s') sum by (job, instance) (tsid_metric); -- SQLNESS REPLACE (Hash.*) REDACTED TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(count(tsid_metric) by (job))); +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE 
(peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +TQL ANALYZE (0, 10, '5s') sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job))); + DROP TABLE tsid_metric; DROP TABLE tsid_physical; - diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template index 4cb0423c72..3ec8a2f695 100644 --- a/tests/conf/datanode-test.toml.template +++ b/tests/conf/datanode-test.toml.template @@ -28,7 +28,7 @@ type = 'File' data_home = '{data_home}' [meta_client_options] -metasrv_addrs = ['{metasrv_addr}'] +metasrv_addrs = ['{addrs.metasrv_addr}'] timeout_millis = 3000 connect_timeout_millis = 5000 tcp_nodelay = false diff --git a/tests/conf/frontend-test.toml.template b/tests/conf/frontend-test.toml.template index de4ce86adc..25d44ff6e4 100644 --- a/tests/conf/frontend-test.toml.template +++ b/tests/conf/frontend-test.toml.template @@ -1,3 +1,3 @@ [grpc] -bind_addr = "{grpc_addr}" -server_addr = "{grpc_addr}" +bind_addr = "{addrs.grpc_addr}" +server_addr = "{addrs.grpc_addr}" diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template index 509eac7ca6..50c014e991 100644 --- a/tests/conf/standalone-test.toml.template +++ b/tests/conf/standalone-test.toml.template @@ -26,12 +26,12 @@ type = 'File' data_home = '{data_home}' [grpc] -bind_addr = '{grpc_addr}' +bind_addr = '{addrs.grpc_addr}' runtime_size = 8 [mysql] enable = true -addr = "{mysql_addr}" +addr = "{addrs.mysql_addr}" runtime_size = 2 prepared_stmt_cache_size= 10000 @@ -40,7 +40,7 @@ mode = "disable" [postgres] enable = true -addr = "{postgres_addr}" +addr = "{addrs.postgres_addr}" runtime_size = 2 [procedure] diff --git a/tests/runner/src/server_mode.rs b/tests/runner/src/server_mode.rs index 172baf32ff..1f7cb72bf4 100644 --- a/tests/runner/src/server_mode.rs +++ b/tests/runner/src/server_mode.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::Path; use std::sync::{Mutex, OnceLock}; @@ -96,15 +96,7 @@ struct ConfigContext { use_etcd: bool, store_addrs: String, instance_id: usize, - // for following addrs, leave it empty if not needed - // required for datanode - metasrv_addr: String, - // for frontend and standalone - grpc_addr: String, - // for standalone - mysql_addr: String, - // for standalone - postgres_addr: String, + addrs: HashMap, // enable flat format for storage engine enable_flat_format: bool, } @@ -275,40 +267,26 @@ impl ServerMode { let procedure_dir = data_home.join("procedure").display().to_string(); // Get the required addresses based on server mode - let (metasrv_addr, grpc_addr, mysql_addr, postgres_addr) = match self { + let addrs: HashMap = match self { ServerMode::Standalone { rpc_bind_addr, mysql_addr, postgres_addr, - .. - } => ( - String::new(), - rpc_bind_addr.clone(), - mysql_addr.clone(), - postgres_addr.clone(), - ), - ServerMode::Frontend { - rpc_bind_addr, - mysql_addr, - postgres_addr, - .. - } => ( - String::new(), - rpc_bind_addr.clone(), - mysql_addr.clone(), - postgres_addr.clone(), - ), - ServerMode::Datanode { - rpc_bind_addr, - metasrv_addr, - .. - } => ( - metasrv_addr.clone(), - rpc_bind_addr.clone(), - String::new(), - String::new(), - ), - _ => (String::new(), String::new(), String::new(), String::new()), + http_addr, + } => [ + ("http_addr".to_string(), http_addr.clone()), + ("grpc_addr".to_string(), rpc_bind_addr.clone()), + ("mysql_addr".to_string(), mysql_addr.clone()), + ("postgres_addr".to_string(), postgres_addr.clone()), + ] + .into(), + ServerMode::Frontend { rpc_bind_addr, .. } => { + [("grpc_addr".to_string(), rpc_bind_addr.clone())].into() + } + ServerMode::Datanode { metasrv_addr, .. 
} => { + [("metasrv_addr".to_string(), metasrv_addr.clone())].into() + } + _ => HashMap::new(), }; let ctx = ConfigContext { @@ -326,10 +304,7 @@ impl ServerMode { .collect::>() .join(","), instance_id: id, - metasrv_addr, - grpc_addr, - mysql_addr, - postgres_addr, + addrs, enable_flat_format: db_ctx.store_config().enable_flat_format, };