From 6c57f4b7e43e909924d40cfa97815d052453631f Mon Sep 17 00:00:00 2001
From: Weny Xu
Date: Thu, 7 Aug 2025 20:16:48 +0800
Subject: [PATCH] feat: pick automated metadata recovery feature (#6676)

* feat: persist column ids in table metadata (#6457)
  * feat: persist column ids in table metadata
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: add column metadata to response extensions (#6451)
  Signed-off-by: WenyXu

* refactor(meta): extract `AlterTableExecutor` from `AlterTableProcedure` (#6470)
  * refactor(meta): extract `AlterTableExecutor` from `AlterTableProcedure`
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* refactor(meta): separate validation and execution logic in alter logical tables procedure (#6478)
  * refactor(meta): separate validation and execution logic in alter logical tables procedure
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* fix: fix state transition in create table procedure (#6523)
  Signed-off-by: WenyXu

* feat: add table reconciliation utilities (#6519)
  * feat: add table reconciliation utilities
    Signed-off-by: WenyXu
  * fix: fix unit tests
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: update comment
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: Support ListMetadataRequest to retrieve regions' metadata (#6348)
  * feat: support list metadata in region server
    Signed-off-by: evenyag
  * test: add test for list region metadata
    Signed-off-by: evenyag
  * feat: return null if region not exists
    Signed-off-by: evenyag
  * chore: update greptime-proto
    Signed-off-by: evenyag
  ---------
  Signed-off-by: evenyag

* refactor: support multiple index operations in single alter region request (#6487)
  * refactor: support multiple index operations in single alter region request
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: update greptime-proto
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: implement pause/resume functionality for procedure manager (#6393)
  * feat: implement pause/resume functionality for procedure manager
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: move metasrv admin to http server while keeping tonic for backward compatibility (#6466)
  * feat: move metasrv admin to http server while keeping tonic for backward compatibility
    Signed-off-by: lyang24
  * refactor with nest method
    Signed-off-by: lyang24
  ---------
  Signed-off-by: lyang24
  Co-authored-by: lyang24

* feat: allow ignoring nonexistent regions in recovery mode (#6592)
  * feat: allow ignoring nonexistent regions
    Signed-off-by: WenyXu
  * feat: ignore nonexistent regions during startup in recovery mode
    Signed-off-by: WenyXu
  * feat: allow enabling recovery mode via http api
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: allow setting next table id via http api (#6597)
  * feat: allow reset next table id via http api
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: ignore internal keys in metadata snapshots (#6606)
  feat: ignore dumping internal keys
  Signed-off-by: WenyXu

* feat: introduce reconcile table procedure (#6584)
  * feat: introduce `SyncColumns`
    Signed-off-by: WenyXu
  * feat: introduce reconcile table procedure
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: add tests
    Signed-off-by: WenyXu
  * chore: add comments
    Signed-off-by: WenyXu
  * chore: update proto
    Signed-off-by: WenyXu
  * chore: apply suggestions
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: introduce reconcile database procedure (#6612)
  * feat: introduce reconcile database procedure
    Signed-off-by: WenyXu
  * feat: hold the schema lock
    Signed-off-by: WenyXu
  * chore: add todo
    Signed-off-by: WenyXu
  * chore: update comments
    Signed-off-by: WenyXu
  * chore: rename to `fast_fail`
    Signed-off-by: WenyXu
  * chore: add logs
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: introduce reconcile logical tables procedure (#6588)
  * feat: introduce reconcile logical tables procedure
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * fix: lock logical tables
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* refactor: remove procedure executor from DDL manager (#6625)
  * refactor: remove procedure executor from DDL manager
    Signed-off-by: WenyXu
  * chore: clippy
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat: introduce reconcile catalog procedure (#6613)
  Signed-off-by: WenyXu

* feat: introduce reconciliation interface (#6614)
  * feat: introduce reconcile interface
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * chore: upgrade proto
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* fix: fix sequence peek method to return correct values when sequence is not initialized (#6643)
  fix: improve sequence peek method to handle uninitialized sequences
  Signed-off-by: WenyXu

* fix: sequence peek with remote value (#6648)
  * fix: sequence peek with remote value
  * chore: more ut
  * chore: add more ut

* feat: add metrics for reconciliation procedures (#6652)
  * feat: add metrics for reconciliation procedures
    Signed-off-by: WenyXu
  * refactor: improve error handling
    Signed-off-by: WenyXu
  * fix(datanode): handle ignore_nonexistent_region flag in open_all_regions
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  * refactor: merge metrics
    Signed-off-by: WenyXu
  * chore: minor refactor
    Signed-off-by: WenyXu
  * chore: apply suggestions from CR
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* feat(metric-engine): add metadata region cache (#6657)
  * feat(metric-engine): add metadata region cache
    Signed-off-by: WenyXu
  * feat: use lru
    Signed-off-by: WenyXu
  * chore: rename
    Signed-off-by: WenyXu
  * chore: rename
    Signed-off-by: WenyXu
  * chore: add comments
    Signed-off-by: WenyXu
  * chore: default ttl
    Signed-off-by: WenyXu
  * chore: longer ttl
    Signed-off-by: WenyXu
  ---------
  Signed-off-by: WenyXu

* chore: update greptime-proto
  Signed-off-by: WenyXu

* chore: bump version to 0.15.5
  Signed-off-by: WenyXu

---------
Signed-off-by: WenyXu
Signed-off-by: evenyag
Signed-off-by: lyang24
Co-authored-by: Yingwen
Co-authored-by: Lanqing Yang
Co-authored-by: lyang24
Co-authored-by: jeremyhi
---
 Cargo.lock | 163 ++-
 Cargo.toml |
6 +- src/api/src/region.rs | 12 + src/api/src/v1/column_def.rs | 44 +- src/catalog/src/information_extension.rs | 2 +- src/cli/src/bench.rs | 1 + src/cli/src/metadata/repair.rs | 1 - src/cli/src/metadata/repair/alter_table.rs | 3 +- src/cmd/src/metasrv.rs | 2 +- src/cmd/src/standalone.rs | 16 +- src/common/function/src/admin.rs | 9 + .../function/src/admin/reconcile_catalog.rs | 179 +++ .../function/src/admin/reconcile_database.rs | 198 +++ .../function/src/admin/reconcile_table.rs | 149 ++ src/common/function/src/handlers.rs | 4 + src/common/function/src/helper.rs | 68 +- src/common/function/src/lib.rs | 1 + src/common/function/src/state.rs | 6 +- src/common/grpc-expr/src/alter.rs | 149 +- src/common/meta/Cargo.toml | 1 + src/common/meta/src/ddl.rs | 68 +- .../meta/src/ddl/alter_logical_tables.rs | 184 +-- .../src/ddl/alter_logical_tables/check.rs | 136 -- .../src/ddl/alter_logical_tables/executor.rs | 216 +++ .../src/ddl/alter_logical_tables/metadata.rs | 158 -- .../alter_logical_tables/region_request.rs | 113 -- .../alter_logical_tables/table_cache_keys.rs | 50 - .../alter_logical_tables/update_metadata.rs | 53 +- .../src/ddl/alter_logical_tables/validator.rs | 279 ++++ src/common/meta/src/ddl/alter_table.rs | 183 ++- src/common/meta/src/ddl/alter_table/check.rs | 62 - .../meta/src/ddl/alter_table/executor.rs | 308 ++++ .../src/ddl/alter_table/region_request.rs | 48 +- .../src/ddl/alter_table/update_metadata.rs | 103 -- .../meta/src/ddl/create_logical_tables.rs | 43 +- .../create_logical_tables/update_metadata.rs | 4 +- src/common/meta/src/ddl/create_table.rs | 43 +- .../meta/src/ddl/create_table_template.rs | 50 +- .../meta/src/ddl/drop_table/executor.rs | 8 +- .../meta/src/ddl/physical_table_metadata.rs | 60 - src/common/meta/src/ddl/table_meta.rs | 5 + src/common/meta/src/ddl/test_util.rs | 103 +- .../meta/src/ddl/test_util/create_table.rs | 1 + .../src/ddl/test_util/datanode_handler.rs | 62 +- .../meta/src/ddl/test_util/region_metadata.rs | 34 + .../src/ddl/tests/alter_logical_tables.rs | 204 ++- src/common/meta/src/ddl/tests/alter_table.rs | 66 +- .../src/ddl/tests/create_logical_tables.rs | 140 +- src/common/meta/src/ddl/tests/create_table.rs | 90 +- src/common/meta/src/ddl/utils.rs | 60 +- .../meta/src/ddl/utils/raw_table_info.rs | 123 ++ .../src/ddl/utils/region_metadata_lister.rs | 240 ++++ src/common/meta/src/ddl/utils/table_id.rs | 46 + src/common/meta/src/ddl/utils/table_info.rs | 100 ++ src/common/meta/src/ddl_manager.rs | 188 +-- src/common/meta/src/error.rs | 124 +- src/common/meta/src/key.rs | 12 +- src/common/meta/src/key/maintenance.rs | 86 -- src/common/meta/src/key/runtime_switch.rs | 250 ++++ src/common/meta/src/key/table_info.rs | 1 + src/common/meta/src/key/table_name.rs | 20 + src/common/meta/src/key/table_route.rs | 11 + src/common/meta/src/lib.rs | 2 + src/common/meta/src/metrics.rs | 42 + src/common/meta/src/procedure_executor.rs | 173 +++ src/common/meta/src/reconciliation.rs | 20 + src/common/meta/src/reconciliation/manager.rs | 246 ++++ .../src/reconciliation/reconcile_catalog.rs | 237 +++ .../reconciliation/reconcile_catalog/end.rs | 48 + .../reconcile_catalog/reconcile_databases.rs | 104 ++ .../reconciliation/reconcile_catalog/start.rs | 58 + .../src/reconciliation/reconcile_database.rs | 285 ++++ .../reconciliation/reconcile_database/end.rs | 49 + .../reconcile_logical_tables.rs | 248 ++++ .../reconcile_database/reconcile_tables.rs | 166 +++ .../reconcile_database/start.rs | 63 + .../reconcile_logical_tables.rs | 272 ++++ .../reconcile_regions.rs | 146 
++ .../reconciliation_end.rs | 53 + .../reconciliation_start.rs | 192 +++ .../resolve_table_metadatas.rs | 156 ++ .../update_table_infos.rs | 182 +++ .../src/reconciliation/reconcile_table.rs | 280 ++++ .../reconcile_table/reconcile_regions.rs | 199 +++ .../reconcile_table/reconciliation_end.rs | 53 + .../reconcile_table/reconciliation_start.rs | 134 ++ .../resolve_column_metadata.rs | 170 +++ .../reconcile_table/update_table_info.rs | 129 ++ src/common/meta/src/reconciliation/utils.rs | 1267 +++++++++++++++++ src/common/meta/src/rpc/ddl.rs | 3 +- src/common/meta/src/sequence.rs | 289 +++- src/common/meta/src/snapshot.rs | 13 +- src/common/procedure-test/Cargo.toml | 1 + src/common/procedure-test/src/lib.rs | 8 + src/common/procedure/src/error.rs | 19 +- src/common/procedure/src/local.rs | 64 +- src/common/procedure/src/local/runner.rs | 8 + src/common/procedure/src/procedure.rs | 14 + src/common/procedure/src/watcher.rs | 2 +- src/datanode/src/datanode.rs | 85 +- src/datanode/src/error.rs | 9 + src/datanode/src/region_server.rs | 351 ++++- src/datanode/src/tests.rs | 33 +- src/datatypes/src/schema/constraint.rs | 9 + src/flow/src/server.rs | 2 +- src/frontend/src/instance.rs | 3 + src/frontend/src/instance/builder.rs | 5 +- src/meta-client/src/client.rs | 20 +- src/meta-client/src/client/procedure.rs | 28 +- src/meta-srv/Cargo.toml | 5 + src/meta-srv/src/bootstrap.rs | 17 +- src/meta-srv/src/election.rs | 3 - src/meta-srv/src/election/etcd.rs | 3 +- src/meta-srv/src/election/rds/mysql.rs | 2 +- src/meta-srv/src/election/rds/postgres.rs | 2 +- src/meta-srv/src/error.rs | 47 +- src/meta-srv/src/metasrv.rs | 28 +- src/meta-srv/src/metasrv/builder.rs | 32 +- .../procedure/region_migration/test_util.rs | 1 + src/meta-srv/src/procedure/utils.rs | 2 + .../src/procedure/wal_prune/test_util.rs | 1 + src/meta-srv/src/region/supervisor.rs | 20 +- src/meta-srv/src/service.rs | 1 + src/meta-srv/src/service/admin.rs | 947 +++++++++++- src/meta-srv/src/service/admin/health.rs | 1 + src/meta-srv/src/service/admin/leader.rs | 1 + src/meta-srv/src/service/admin/maintenance.rs | 140 +- src/meta-srv/src/service/admin/node_lease.rs | 1 + src/meta-srv/src/service/admin/procedure.rs | 125 ++ src/meta-srv/src/service/admin/recovery.rs | 63 + src/meta-srv/src/service/admin/sequencer.rs | 100 ++ src/meta-srv/src/service/admin/util.rs | 40 + src/meta-srv/src/service/cluster.rs | 41 +- src/meta-srv/src/service/procedure.rs | 128 +- src/meta-srv/src/service/utils.rs | 34 + src/metric-engine/Cargo.toml | 2 + src/metric-engine/src/data_region.rs | 7 +- src/metric-engine/src/engine.rs | 4 + src/metric-engine/src/engine/alter.rs | 123 +- src/metric-engine/src/engine/create.rs | 59 +- src/metric-engine/src/error.rs | 10 + src/metric-engine/src/metadata_region.rs | 107 +- src/metric-engine/src/test_util.rs | 116 +- src/mito2/src/engine.rs | 85 +- src/mito2/src/engine/alter_test.rs | 55 +- src/mito2/src/error.rs | 11 +- src/operator/src/expr_helper.rs | 140 +- src/operator/src/procedure.rs | 14 +- src/operator/src/statement.rs | 2 +- src/operator/src/statement/ddl.rs | 3 +- src/query/src/dist_plan/analyzer/test.rs | 1 + src/store-api/src/metadata.rs | 238 ++-- src/store-api/src/metric_engine_consts.rs | 4 + src/store-api/src/region_engine.rs | 3 +- src/store-api/src/region_request.rs | 389 +++-- src/table/src/metadata.rs | 363 ++--- src/table/src/requests.rs | 34 +- src/table/src/table/numbers.rs | 1 + tests-integration/src/standalone.rs | 11 +- 159 files changed, 12353 insertions(+), 2303 deletions(-) create mode 100644 
src/common/function/src/admin/reconcile_catalog.rs create mode 100644 src/common/function/src/admin/reconcile_database.rs create mode 100644 src/common/function/src/admin/reconcile_table.rs delete mode 100644 src/common/meta/src/ddl/alter_logical_tables/check.rs create mode 100644 src/common/meta/src/ddl/alter_logical_tables/executor.rs delete mode 100644 src/common/meta/src/ddl/alter_logical_tables/metadata.rs delete mode 100644 src/common/meta/src/ddl/alter_logical_tables/region_request.rs delete mode 100644 src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs create mode 100644 src/common/meta/src/ddl/alter_logical_tables/validator.rs delete mode 100644 src/common/meta/src/ddl/alter_table/check.rs create mode 100644 src/common/meta/src/ddl/alter_table/executor.rs delete mode 100644 src/common/meta/src/ddl/alter_table/update_metadata.rs delete mode 100644 src/common/meta/src/ddl/physical_table_metadata.rs create mode 100644 src/common/meta/src/ddl/test_util/region_metadata.rs create mode 100644 src/common/meta/src/ddl/utils/raw_table_info.rs create mode 100644 src/common/meta/src/ddl/utils/region_metadata_lister.rs create mode 100644 src/common/meta/src/ddl/utils/table_id.rs create mode 100644 src/common/meta/src/ddl/utils/table_info.rs delete mode 100644 src/common/meta/src/key/maintenance.rs create mode 100644 src/common/meta/src/key/runtime_switch.rs create mode 100644 src/common/meta/src/procedure_executor.rs create mode 100644 src/common/meta/src/reconciliation.rs create mode 100644 src/common/meta/src/reconciliation/manager.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_catalog.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_catalog/end.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_catalog/reconcile_databases.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_catalog/start.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_database.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_database/end.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_database/reconcile_logical_tables.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_database/reconcile_tables.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_database/start.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_logical_tables.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_logical_tables/reconcile_regions.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_end.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_start.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_logical_tables/resolve_table_metadatas.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_table.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_table/reconcile_regions.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_table/reconciliation_end.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_table/reconciliation_start.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_table/resolve_column_metadata.rs create mode 100644 src/common/meta/src/reconciliation/reconcile_table/update_table_info.rs create mode 100644 src/common/meta/src/reconciliation/utils.rs create mode 100644 
src/meta-srv/src/service/admin/procedure.rs create mode 100644 src/meta-srv/src/service/admin/recovery.rs create mode 100644 src/meta-srv/src/service/admin/sequencer.rs create mode 100644 src/meta-srv/src/service/utils.rs diff --git a/Cargo.lock b/Cargo.lock index e222c5a760..eb1eba4864 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -211,7 +211,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "0.15.4" +version = "0.15.5" dependencies = [ "common-base", "common-decimal", @@ -944,7 +944,7 @@ dependencies = [ [[package]] name = "auth" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", @@ -1586,7 +1586,7 @@ dependencies = [ [[package]] name = "cache" -version = "0.15.4" +version = "0.15.5" dependencies = [ "catalog", "common-error", @@ -1621,7 +1621,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arrow 54.2.1", @@ -1961,7 +1961,7 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" [[package]] name = "cli" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-stream", "async-trait", @@ -2006,7 +2006,7 @@ dependencies = [ "session", "snafu 0.8.5", "store-api", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tempfile", "tokio", @@ -2015,7 +2015,7 @@ dependencies = [ [[package]] name = "client" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arc-swap", @@ -2045,7 +2045,7 @@ dependencies = [ "rand 0.9.0", "serde_json", "snafu 0.8.5", - "substrait 0.15.4", + "substrait 0.15.5", "substrait 0.37.3", "tokio", "tokio-stream", @@ -2086,7 +2086,7 @@ dependencies = [ [[package]] name = "cmd" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "auth", @@ -2147,7 +2147,7 @@ dependencies = [ "snafu 0.8.5", "stat", "store-api", - "substrait 0.15.4", + "substrait 0.15.5", "table", "temp-env", "tempfile", @@ -2194,7 +2194,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "0.15.4" +version = "0.15.5" dependencies = [ "anymap2", "async-trait", @@ -2216,11 +2216,11 @@ dependencies = [ [[package]] name = "common-catalog" -version = "0.15.4" +version = "0.15.5" [[package]] name = "common-config" -version = "0.15.4" +version = "0.15.5" dependencies = [ "common-base", "common-error", @@ -2245,7 +2245,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "0.15.4" +version = "0.15.5" dependencies = [ "arrow 54.2.1", "arrow-schema 54.3.1", @@ -2282,7 +2282,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "0.15.4" +version = "0.15.5" dependencies = [ "bigdecimal 0.4.8", "common-error", @@ -2295,7 +2295,7 @@ dependencies = [ [[package]] name = "common-error" -version = "0.15.4" +version = "0.15.5" dependencies = [ "common-macro", "http 1.1.0", @@ -2306,7 +2306,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "common-error", @@ -2323,7 +2323,7 @@ dependencies = [ [[package]] name = "common-function" -version = "0.15.4" +version = "0.15.5" dependencies = [ "ahash 0.8.11", "api", @@ -2376,7 +2376,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "common-runtime", @@ -2393,7 +2393,7 @@ dependencies = [ [[package]] name = "common-grpc" -version 
= "0.15.4" +version = "0.15.5" dependencies = [ "api", "arrow-flight", @@ -2425,7 +2425,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "common-base", @@ -2444,7 +2444,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "0.15.4" +version = "0.15.5" dependencies = [ "arc-swap", "common-query", @@ -2458,7 +2458,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "0.15.4" +version = "0.15.5" dependencies = [ "anyhow", "common-error", @@ -2474,7 +2474,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "0.15.4" +version = "0.15.5" dependencies = [ "anymap2", "api", @@ -2494,6 +2494,7 @@ dependencies = [ "common-procedure-test", "common-query", "common-recordbatch", + "common-runtime", "common-telemetry", "common-test-util", "common-time", @@ -2539,7 +2540,7 @@ dependencies = [ [[package]] name = "common-options" -version = "0.15.4" +version = "0.15.5" dependencies = [ "common-grpc", "humantime-serde", @@ -2548,11 +2549,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "0.15.4" +version = "0.15.5" [[package]] name = "common-pprof" -version = "0.15.4" +version = "0.15.5" dependencies = [ "common-error", "common-macro", @@ -2564,7 +2565,7 @@ dependencies = [ [[package]] name = "common-procedure" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-stream", "async-trait", @@ -2591,16 +2592,17 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "common-procedure", "snafu 0.8.5", + "tokio", ] [[package]] name = "common-query" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", @@ -2626,7 +2628,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "0.15.4" +version = "0.15.5" dependencies = [ "arc-swap", "common-error", @@ -2646,7 +2648,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "clap 4.5.19", @@ -2676,14 +2678,14 @@ dependencies = [ [[package]] name = "common-session" -version = "0.15.4" +version = "0.15.5" dependencies = [ "strum 0.27.1", ] [[package]] name = "common-telemetry" -version = "0.15.4" +version = "0.15.5" dependencies = [ "backtrace", "common-error", @@ -2711,7 +2713,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "0.15.4" +version = "0.15.5" dependencies = [ "client", "common-grpc", @@ -2724,7 +2726,7 @@ dependencies = [ [[package]] name = "common-time" -version = "0.15.4" +version = "0.15.5" dependencies = [ "arrow 54.2.1", "chrono", @@ -2742,7 +2744,7 @@ dependencies = [ [[package]] name = "common-version" -version = "0.15.4" +version = "0.15.5" dependencies = [ "build-data", "cargo-manifest", @@ -2753,7 +2755,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "0.15.4" +version = "0.15.5" dependencies = [ "common-base", "common-error", @@ -2776,7 +2778,7 @@ dependencies = [ [[package]] name = "common-workload" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "common-telemetry", @@ -3732,7 +3734,7 @@ dependencies = [ [[package]] name = "datanode" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arrow-flight", @@ -3785,7 +3787,7 @@ dependencies = [ "session", "snafu 0.8.5", "store-api", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tokio", "toml 0.8.19", @@ -3794,7 +3796,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "0.15.4" 
+version = "0.15.5" dependencies = [ "arrow 54.2.1", "arrow-array 54.2.1", @@ -4454,7 +4456,7 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "file-engine" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", @@ -4591,7 +4593,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" [[package]] name = "flow" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arrow 54.2.1", @@ -4656,7 +4658,7 @@ dependencies = [ "sql", "store-api", "strum 0.27.1", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tokio", "tonic 0.12.3", @@ -4711,7 +4713,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa" [[package]] name = "frontend" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arc-swap", @@ -4771,7 +4773,7 @@ dependencies = [ "sqlparser 0.54.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0cf6c04490d59435ee965edd2078e8855bd8471e)", "store-api", "strfmt", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tokio", "tokio-util", @@ -5161,7 +5163,7 @@ dependencies = [ [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=a5d256ba4abb7393e0859ffbf7fca1e38f3433dc#a5d256ba4abb7393e0859ffbf7fca1e38f3433dc" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=f3103a8c9b8ce162457d0a3e3ca00d53d1a8bd06#f3103a8c9b8ce162457d0a3e3ca00d53d1a8bd06" dependencies = [ "prost 0.13.5", "serde", @@ -5932,7 +5934,7 @@ dependencies = [ [[package]] name = "index" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "asynchronous-codec", @@ -6817,7 +6819,7 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "log-query" -version = "0.15.4" +version = "0.15.5" dependencies = [ "chrono", "common-error", @@ -6829,7 +6831,7 @@ dependencies = [ [[package]] name = "log-store" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-stream", "async-trait", @@ -7127,7 +7129,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", @@ -7155,10 +7157,13 @@ dependencies = [ [[package]] name = "meta-srv" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", + "axum 0.8.1", + "axum-extra", + "axum-macros", "bytes", "chrono", "clap 4.5.19", @@ -7191,6 +7196,7 @@ dependencies = [ "http-body-util", "humantime", "humantime-serde", + "hyper 0.14.30", "hyper-util", "itertools 0.14.0", "lazy_static", @@ -7218,6 +7224,7 @@ dependencies = [ "toml 0.8.19", "tonic 0.12.3", "tower 0.5.2", + "tower-http 0.6.2", "tracing", "tracing-subscriber", "typetag", @@ -7246,7 +7253,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "aquamarine", @@ -7256,6 +7263,7 @@ dependencies = [ "common-base", "common-error", "common-macro", + "common-meta", "common-query", "common-recordbatch", "common-runtime", @@ -7270,6 +7278,7 @@ dependencies = [ "lazy_static", "mito-codec", "mito2", + "moka", "mur3", "object-store", "prometheus", @@ -7336,7 +7345,7 @@ dependencies = [ [[package]] name = "mito-codec" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "bytes", @@ -7359,7 +7368,7 @@ dependencies = [ [[package]] name = "mito2" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "aquamarine", @@ -8109,7 +8118,7 @@ dependencies = [ [[package]] name 
= "object-store" -version = "0.15.4" +version = "0.15.5" dependencies = [ "anyhow", "bytes", @@ -8423,7 +8432,7 @@ dependencies = [ [[package]] name = "operator" -version = "0.15.4" +version = "0.15.5" dependencies = [ "ahash 0.8.11", "api", @@ -8478,7 +8487,7 @@ dependencies = [ "sql", "sqlparser 0.54.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0cf6c04490d59435ee965edd2078e8855bd8471e)", "store-api", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tokio", "tokio-util", @@ -8745,7 +8754,7 @@ dependencies = [ [[package]] name = "partition" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", @@ -9033,7 +9042,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "0.15.4" +version = "0.15.5" dependencies = [ "ahash 0.8.11", "api", @@ -9176,7 +9185,7 @@ dependencies = [ [[package]] name = "plugins" -version = "0.15.4" +version = "0.15.5" dependencies = [ "auth", "clap 4.5.19", @@ -9489,7 +9498,7 @@ dependencies = [ [[package]] name = "promql" -version = "0.15.4" +version = "0.15.5" dependencies = [ "ahash 0.8.11", "async-trait", @@ -9771,7 +9780,7 @@ dependencies = [ [[package]] name = "puffin" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-compression 0.4.13", "async-trait", @@ -9813,7 +9822,7 @@ dependencies = [ [[package]] name = "query" -version = "0.15.4" +version = "0.15.5" dependencies = [ "ahash 0.8.11", "api", @@ -9879,7 +9888,7 @@ dependencies = [ "sqlparser 0.54.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0cf6c04490d59435ee965edd2078e8855bd8471e)", "statrs", "store-api", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tokio", "tokio-stream", @@ -11165,7 +11174,7 @@ dependencies = [ [[package]] name = "servers" -version = "0.15.4" +version = "0.15.5" dependencies = [ "ahash 0.8.11", "api", @@ -11286,7 +11295,7 @@ dependencies = [ [[package]] name = "session" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arc-swap", @@ -11625,7 +11634,7 @@ dependencies = [ [[package]] name = "sql" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "chrono", @@ -11680,7 +11689,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "clap 4.5.19", @@ -11980,7 +11989,7 @@ dependencies = [ [[package]] name = "stat" -version = "0.15.4" +version = "0.15.5" dependencies = [ "nix 0.30.1", ] @@ -12006,7 +12015,7 @@ dependencies = [ [[package]] name = "store-api" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "aquamarine", @@ -12167,7 +12176,7 @@ dependencies = [ [[package]] name = "substrait" -version = "0.15.4" +version = "0.15.5" dependencies = [ "async-trait", "bytes", @@ -12347,7 +12356,7 @@ dependencies = [ [[package]] name = "table" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "async-trait", @@ -12608,7 +12617,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "tests-fuzz" -version = "0.15.4" +version = "0.15.5" dependencies = [ "arbitrary", "async-trait", @@ -12652,7 +12661,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "0.15.4" +version = "0.15.5" dependencies = [ "api", "arrow-flight", @@ -12719,7 +12728,7 @@ dependencies = [ "sql", "sqlx", "store-api", - "substrait 0.15.4", + "substrait 0.15.5", "table", "tempfile", "time", diff --git a/Cargo.toml b/Cargo.toml index 1246ee621d..007bd61efe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ 
-71,7 +71,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.15.4" +version = "0.15.5" edition = "2021" license = "Apache-2.0" @@ -134,7 +134,7 @@ etcd-client = "0.14" fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "a5d256ba4abb7393e0859ffbf7fca1e38f3433dc" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "f3103a8c9b8ce162457d0a3e3ca00d53d1a8bd06" } hex = "0.4" http = "1" humantime = "2.1" @@ -220,6 +220,8 @@ tokio-util = { version = "0.7", features = ["io-util", "compat"] } toml = "0.8.8" tonic = { version = "0.12", features = ["tls", "gzip", "zstd"] } tower = "0.5" +tower-http = "0.6" +tracing = "0.1" tracing-appender = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] } typetag = "0.2" diff --git a/src/api/src/region.rs b/src/api/src/region.rs index d752382534..e7000cd744 100644 --- a/src/api/src/region.rs +++ b/src/api/src/region.rs @@ -22,6 +22,7 @@ use greptime_proto::v1::region::RegionResponse as RegionResponseV1; pub struct RegionResponse { pub affected_rows: AffectedRows, pub extensions: HashMap>, + pub metadata: Vec, } impl RegionResponse { @@ -29,6 +30,7 @@ impl RegionResponse { Self { affected_rows: region_response.affected_rows as _, extensions: region_response.extensions, + metadata: region_response.metadata, } } @@ -37,6 +39,16 @@ impl RegionResponse { Self { affected_rows, extensions: Default::default(), + metadata: Vec::new(), + } + } + + /// Creates one response with metadata. + pub fn from_metadata(metadata: Vec) -> Self { + Self { + affected_rows: 0, + extensions: Default::default(), + metadata, } } } diff --git a/src/api/src/v1/column_def.rs b/src/api/src/v1/column_def.rs index 316d5342db..3c3d37aa3a 100644 --- a/src/api/src/v1/column_def.rs +++ b/src/api/src/v1/column_def.rs @@ -24,7 +24,7 @@ use greptime_proto::v1::{ }; use snafu::ResultExt; -use crate::error::{self, Result}; +use crate::error::{self, ConvertColumnDefaultConstraintSnafu, Result}; use crate::helper::ColumnDataTypeWrapper; use crate::v1::{ColumnDef, ColumnOptions, SemanticType}; @@ -77,6 +77,48 @@ pub fn try_as_column_schema(column_def: &ColumnDef) -> Result { }) } +/// Tries to construct a `ColumnDef` from the given `ColumnSchema`. +/// +/// TODO(weny): Add tests for this function. +pub fn try_as_column_def(column_schema: &ColumnSchema, is_primary_key: bool) -> Result { + let column_datatype = + ColumnDataTypeWrapper::try_from(column_schema.data_type.clone()).map(|w| w.to_parts())?; + + let semantic_type = if column_schema.is_time_index() { + SemanticType::Timestamp + } else if is_primary_key { + SemanticType::Tag + } else { + SemanticType::Field + } as i32; + let comment = column_schema + .metadata() + .get(COMMENT_KEY) + .cloned() + .unwrap_or_default(); + + let default_constraint = match column_schema.default_constraint() { + None => vec![], + Some(v) => v + .clone() + .try_into() + .context(ConvertColumnDefaultConstraintSnafu { + column: &column_schema.name, + })?, + }; + let options = options_from_column_schema(column_schema); + Ok(ColumnDef { + name: column_schema.name.clone(), + data_type: column_datatype.0 as i32, + is_nullable: column_schema.is_nullable(), + default_constraint, + semantic_type, + comment, + datatype_extension: column_datatype.1, + options, + }) +} + /// Constructs a `ColumnOptions` from the given `ColumnSchema`. 
pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option { let mut options = ColumnOptions::default(); diff --git a/src/catalog/src/information_extension.rs b/src/catalog/src/information_extension.rs index 4d829ae01a..e481d469bd 100644 --- a/src/catalog/src/information_extension.rs +++ b/src/catalog/src/information_extension.rs @@ -16,8 +16,8 @@ use api::v1::meta::ProcedureStatus; use common_error::ext::BoxedError; use common_meta::cluster::{ClusterInfo, NodeInfo}; use common_meta::datanode::RegionStat; -use common_meta::ddl::{ExecutorContext, ProcedureExecutor}; use common_meta::key::flow::flow_state::FlowStat; +use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutor}; use common_meta::rpc::procedure; use common_procedure::{ProcedureInfo, ProcedureState}; use meta_client::MetaClientRef; diff --git a/src/cli/src/bench.rs b/src/cli/src/bench.rs index 2bd5d709b2..d08bb06948 100644 --- a/src/cli/src/bench.rs +++ b/src/cli/src/bench.rs @@ -160,6 +160,7 @@ fn create_table_info(table_id: TableId, table_name: TableName) -> RawTableInfo { options: Default::default(), region_numbers: (1..=100).collect(), partition_key_indices: vec![], + column_ids: vec![], }; RawTableInfo { diff --git a/src/cli/src/metadata/repair.rs b/src/cli/src/metadata/repair.rs index 212a52d6e1..98b2f8fee9 100644 --- a/src/cli/src/metadata/repair.rs +++ b/src/cli/src/metadata/repair.rs @@ -241,7 +241,6 @@ impl RepairTool { let alter_table_request = alter_table::make_alter_region_request_for_peer( logical_table_id, &alter_table_expr, - full_table_metadata.table_info.ident.version, peer, physical_region_routes, )?; diff --git a/src/cli/src/metadata/repair/alter_table.rs b/src/cli/src/metadata/repair/alter_table.rs index adfdd95ef7..53827d0b42 100644 --- a/src/cli/src/metadata/repair/alter_table.rs +++ b/src/cli/src/metadata/repair/alter_table.rs @@ -66,7 +66,6 @@ pub fn generate_alter_table_expr_for_all_columns( pub fn make_alter_region_request_for_peer( logical_table_id: TableId, alter_table_expr: &AlterTableExpr, - schema_version: u64, peer: &Peer, region_routes: &[RegionRoute], ) -> Result { @@ -74,7 +73,7 @@ pub fn make_alter_region_request_for_peer( let mut requests = Vec::with_capacity(regions_on_this_peer.len()); for region_number in ®ions_on_this_peer { let region_id = RegionId::new(logical_table_id, *region_number); - let request = make_alter_region_request(region_id, alter_table_expr, schema_version); + let request = make_alter_region_request(region_id, alter_table_expr); requests.push(request); } diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index 4e26a71cde..ea2262e243 100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -341,7 +341,7 @@ impl StartCommand { .context(error::BuildMetaServerSnafu)?; let metasrv = builder.build().await.context(error::BuildMetaServerSnafu)?; - let instance = MetasrvInstance::new(opts, plugins, metasrv) + let instance = MetasrvInstance::new(metasrv) .await .context(error::BuildMetaServerSnafu)?; diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 282789d626..830fce9f94 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -34,13 +34,14 @@ use common_meta::cluster::{NodeInfo, NodeStatus}; use common_meta::datanode::RegionStat; use common_meta::ddl::flow_meta::FlowMetadataAllocator; use common_meta::ddl::table_meta::TableMetadataAllocator; -use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl, ProcedureExecutorRef}; +use common_meta::ddl::{DdlContext, 
NoopRegionFailureDetectorControl}; use common_meta::ddl_manager::DdlManager; use common_meta::key::flow::flow_state::FlowStat; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; use common_meta::peer::Peer; +use common_meta::procedure_executor::LocalProcedureExecutor; use common_meta::region_keeper::MemoryRegionKeeper; use common_meta::region_registry::LeaderRegionRegistry; use common_meta::sequence::SequenceBuilder; @@ -609,9 +610,8 @@ impl StartCommand { flow_metadata_allocator: flow_metadata_allocator.clone(), region_failure_detector_controller: Arc::new(NoopRegionFailureDetectorControl), }; - let procedure_manager_c = procedure_manager.clone(); - let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager_c, true) + let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager.clone(), true) .context(error::InitDdlManagerSnafu)?; #[cfg(feature = "enterprise")] let ddl_manager = { @@ -619,7 +619,11 @@ impl StartCommand { plugins.get(); ddl_manager.with_trigger_ddl_manager(trigger_ddl_manager) }; - let ddl_task_executor: ProcedureExecutorRef = Arc::new(ddl_manager); + + let procedure_executor = Arc::new(LocalProcedureExecutor::new( + Arc::new(ddl_manager), + procedure_manager.clone(), + )); let fe_instance = FrontendBuilder::new( fe_opts.clone(), @@ -627,7 +631,7 @@ impl StartCommand { layered_cache_registry.clone(), catalog_manager.clone(), node_manager.clone(), - ddl_task_executor.clone(), + procedure_executor.clone(), process_manager, ) .with_plugin(plugins.clone()) @@ -652,7 +656,7 @@ impl StartCommand { catalog_manager.clone(), kv_backend.clone(), layered_cache_registry.clone(), - ddl_task_executor.clone(), + procedure_executor, node_manager, ) .await diff --git a/src/common/function/src/admin.rs b/src/common/function/src/admin.rs index c06b28e7d5..1a02caa088 100644 --- a/src/common/function/src/admin.rs +++ b/src/common/function/src/admin.rs @@ -16,6 +16,9 @@ mod add_region_follower; mod flush_compact_region; mod flush_compact_table; mod migrate_region; +mod reconcile_catalog; +mod reconcile_database; +mod reconcile_table; mod remove_region_follower; use std::sync::Arc; @@ -24,6 +27,9 @@ use add_region_follower::AddRegionFollowerFunction; use flush_compact_region::{CompactRegionFunction, FlushRegionFunction}; use flush_compact_table::{CompactTableFunction, FlushTableFunction}; use migrate_region::MigrateRegionFunction; +use reconcile_catalog::ReconcileCatalogFunction; +use reconcile_database::ReconcileDatabaseFunction; +use reconcile_table::ReconcileTableFunction; use remove_region_follower::RemoveRegionFollowerFunction; use crate::flush_flow::FlushFlowFunction; @@ -43,5 +49,8 @@ impl AdminFunction { registry.register_async(Arc::new(FlushTableFunction)); registry.register_async(Arc::new(CompactTableFunction)); registry.register_async(Arc::new(FlushFlowFunction)); + registry.register_async(Arc::new(ReconcileCatalogFunction)); + registry.register_async(Arc::new(ReconcileDatabaseFunction)); + registry.register_async(Arc::new(ReconcileTableFunction)); } } diff --git a/src/common/function/src/admin/reconcile_catalog.rs b/src/common/function/src/admin/reconcile_catalog.rs new file mode 100644 index 0000000000..fc2fec3273 --- /dev/null +++ b/src/common/function/src/admin/reconcile_catalog.rs @@ -0,0 +1,179 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance 
with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::reconcile_request::Target; +use api::v1::meta::{ReconcileCatalog, ReconcileRequest}; +use common_macro::admin_fn; +use common_query::error::{ + InvalidFuncArgsSnafu, MissingProcedureServiceHandlerSnafu, Result, + UnsupportedInputDataTypeSnafu, +}; +use common_query::prelude::{Signature, TypeSignature, Volatility}; +use common_telemetry::info; +use datatypes::prelude::*; +use session::context::QueryContextRef; + +use crate::handlers::ProcedureServiceHandlerRef; +use crate::helper::{ + cast_u32, default_parallelism, default_resolve_strategy, get_string_from_params, + parse_resolve_strategy, +}; + +const FN_NAME: &str = "reconcile_catalog"; + +/// A function to reconcile a catalog. +/// Returns the procedure id if success. +/// +/// - `reconcile_catalog(resolve_strategy)`. +/// - `reconcile_catalog(resolve_strategy, parallelism)`. +/// +/// - `reconcile_catalog()`. +#[admin_fn( + name = ReconcileCatalogFunction, + display_name = reconcile_catalog, + sig_fn = signature, + ret = string +)] +pub(crate) async fn reconcile_catalog( + procedure_service_handler: &ProcedureServiceHandlerRef, + query_ctx: &QueryContextRef, + params: &[ValueRef<'_>], +) -> Result { + let (resolve_strategy, parallelism) = match params.len() { + 0 => (default_resolve_strategy(), default_parallelism()), + 1 => ( + parse_resolve_strategy(get_string_from_params(params, 0, FN_NAME)?)?, + default_parallelism(), + ), + 2 => { + let Some(parallelism) = cast_u32(¶ms[1])? 
else { + return UnsupportedInputDataTypeSnafu { + function: FN_NAME, + datatypes: params.iter().map(|v| v.data_type()).collect::>(), + } + .fail(); + }; + ( + parse_resolve_strategy(get_string_from_params(params, 0, FN_NAME)?)?, + parallelism, + ) + } + size => { + return InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect 0, 1 or 2, have: {}", + size + ), + } + .fail(); + } + }; + info!( + "Reconciling catalog with resolve_strategy: {:?}, parallelism: {}", + resolve_strategy, parallelism + ); + let pid = procedure_service_handler + .reconcile(ReconcileRequest { + target: Some(Target::ReconcileCatalog(ReconcileCatalog { + catalog_name: query_ctx.current_catalog().to_string(), + parallelism, + resolve_strategy: resolve_strategy as i32, + })), + ..Default::default() + }) + .await?; + match pid { + Some(pid) => Ok(Value::from(pid)), + None => Ok(Value::Null), + } +} + +fn signature() -> Signature { + let nums = ConcreteDataType::numerics(); + let mut signs = Vec::with_capacity(2 + nums.len()); + signs.extend([ + // reconcile_catalog() + TypeSignature::NullAry, + // reconcile_catalog(resolve_strategy) + TypeSignature::Exact(vec![ConcreteDataType::string_datatype()]), + ]); + for sign in nums { + // reconcile_catalog(resolve_strategy, parallelism) + signs.push(TypeSignature::Exact(vec![ + ConcreteDataType::string_datatype(), + sign, + ])); + } + Signature::one_of(signs, Volatility::Immutable) +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::sync::Arc; + + use common_query::error::Error; + use datatypes::vectors::{StringVector, UInt64Vector, VectorRef}; + + use crate::admin::reconcile_catalog::ReconcileCatalogFunction; + use crate::function::{AsyncFunction, FunctionContext}; + + #[tokio::test] + async fn test_reconcile_catalog() { + common_telemetry::init_default_ut_logging(); + + // reconcile_catalog() + let f = ReconcileCatalogFunction; + let args = vec![]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // reconcile_catalog(resolve_strategy) + let f = ReconcileCatalogFunction; + let args = vec![Arc::new(StringVector::from(vec!["UseMetasrv"])) as _]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // reconcile_catalog(resolve_strategy, parallelism) + let f = ReconcileCatalogFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + Arc::new(UInt64Vector::from_slice([10])) as _, + ]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // unsupported input data type + let f = ReconcileCatalogFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + Arc::new(StringVector::from(vec!["test"])) as _, + ]; + let err = f.eval(FunctionContext::mock(), &args).await.unwrap_err(); + assert_matches!(err, Error::UnsupportedInputDataType { .. 
}); + + // invalid function args + let f = ReconcileCatalogFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + Arc::new(UInt64Vector::from_slice([10])) as _, + Arc::new(StringVector::from(vec!["10"])) as _, + ]; + let err = f.eval(FunctionContext::mock(), &args).await.unwrap_err(); + assert_matches!(err, Error::InvalidFuncArgs { .. }); + } +} diff --git a/src/common/function/src/admin/reconcile_database.rs b/src/common/function/src/admin/reconcile_database.rs new file mode 100644 index 0000000000..622d2bb069 --- /dev/null +++ b/src/common/function/src/admin/reconcile_database.rs @@ -0,0 +1,198 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::reconcile_request::Target; +use api::v1::meta::{ReconcileDatabase, ReconcileRequest}; +use common_macro::admin_fn; +use common_query::error::{ + InvalidFuncArgsSnafu, MissingProcedureServiceHandlerSnafu, Result, + UnsupportedInputDataTypeSnafu, +}; +use common_query::prelude::{Signature, TypeSignature, Volatility}; +use common_telemetry::info; +use datatypes::prelude::*; +use session::context::QueryContextRef; + +use crate::handlers::ProcedureServiceHandlerRef; +use crate::helper::{ + cast_u32, default_parallelism, default_resolve_strategy, get_string_from_params, + parse_resolve_strategy, +}; + +const FN_NAME: &str = "reconcile_database"; + +/// A function to reconcile a database. +/// Returns the procedure id if success. +/// +/// - `reconcile_database(database_name)`. +/// - `reconcile_database(database_name, resolve_strategy)`. +/// - `reconcile_database(database_name, resolve_strategy, parallelism)`. +/// +/// The parameters: +/// - `database_name`: the database name +#[admin_fn( + name = ReconcileDatabaseFunction, + display_name = reconcile_database, + sig_fn = signature, + ret = string +)] +pub(crate) async fn reconcile_database( + procedure_service_handler: &ProcedureServiceHandlerRef, + query_ctx: &QueryContextRef, + params: &[ValueRef<'_>], +) -> Result { + let (database_name, resolve_strategy, parallelism) = match params.len() { + 1 => ( + get_string_from_params(params, 0, FN_NAME)?, + default_resolve_strategy(), + default_parallelism(), + ), + 2 => ( + get_string_from_params(params, 0, FN_NAME)?, + parse_resolve_strategy(get_string_from_params(params, 1, FN_NAME)?)?, + default_parallelism(), + ), + 3 => { + let Some(parallelism) = cast_u32(¶ms[2])? 
else { + return UnsupportedInputDataTypeSnafu { + function: FN_NAME, + datatypes: params.iter().map(|v| v.data_type()).collect::>(), + } + .fail(); + }; + ( + get_string_from_params(params, 0, FN_NAME)?, + parse_resolve_strategy(get_string_from_params(params, 1, FN_NAME)?)?, + parallelism, + ) + } + size => { + return InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect 1, 2 or 3, have: {}", + size + ), + } + .fail(); + } + }; + info!( + "Reconciling database: {}, resolve_strategy: {:?}, parallelism: {}", + database_name, resolve_strategy, parallelism + ); + let pid = procedure_service_handler + .reconcile(ReconcileRequest { + target: Some(Target::ReconcileDatabase(ReconcileDatabase { + catalog_name: query_ctx.current_catalog().to_string(), + database_name: database_name.to_string(), + parallelism, + resolve_strategy: resolve_strategy as i32, + })), + ..Default::default() + }) + .await?; + match pid { + Some(pid) => Ok(Value::from(pid)), + None => Ok(Value::Null), + } +} + +fn signature() -> Signature { + let nums = ConcreteDataType::numerics(); + let mut signs = Vec::with_capacity(2 + nums.len()); + signs.extend([ + // reconcile_database(datanode_name) + TypeSignature::Exact(vec![ConcreteDataType::string_datatype()]), + // reconcile_database(database_name, resolve_strategy) + TypeSignature::Exact(vec![ + ConcreteDataType::string_datatype(), + ConcreteDataType::string_datatype(), + ]), + ]); + for sign in nums { + // reconcile_database(database_name, resolve_strategy, parallelism) + signs.push(TypeSignature::Exact(vec![ + ConcreteDataType::string_datatype(), + ConcreteDataType::string_datatype(), + sign, + ])); + } + Signature::one_of(signs, Volatility::Immutable) +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::sync::Arc; + + use common_query::error::Error; + use datatypes::vectors::{StringVector, UInt32Vector, VectorRef}; + + use crate::admin::reconcile_database::ReconcileDatabaseFunction; + use crate::function::{AsyncFunction, FunctionContext}; + + #[tokio::test] + async fn test_reconcile_catalog() { + common_telemetry::init_default_ut_logging(); + + // reconcile_database(database_name) + let f = ReconcileDatabaseFunction; + let args = vec![Arc::new(StringVector::from(vec!["test"])) as _]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // reconcile_database(database_name, resolve_strategy) + let f = ReconcileDatabaseFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["test"])) as _, + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + ]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // reconcile_database(database_name, resolve_strategy, parallelism) + let f = ReconcileDatabaseFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["test"])) as _, + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + Arc::new(UInt32Vector::from_slice([10])) as _, + ]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // invalid function args + let f = ReconcileDatabaseFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + Arc::new(UInt32Vector::from_slice([10])) as _, + 
Arc::new(StringVector::from(vec!["v1"])) as _, + Arc::new(StringVector::from(vec!["v2"])) as _, + ]; + let err = f.eval(FunctionContext::mock(), &args).await.unwrap_err(); + assert_matches!(err, Error::InvalidFuncArgs { .. }); + + // unsupported input data type + let f = ReconcileDatabaseFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["UseLatest"])) as _, + Arc::new(UInt32Vector::from_slice([10])) as _, + Arc::new(StringVector::from(vec!["v1"])) as _, + ]; + let err = f.eval(FunctionContext::mock(), &args).await.unwrap_err(); + assert_matches!(err, Error::UnsupportedInputDataType { .. }); + } +} diff --git a/src/common/function/src/admin/reconcile_table.rs b/src/common/function/src/admin/reconcile_table.rs new file mode 100644 index 0000000000..61e54e47bc --- /dev/null +++ b/src/common/function/src/admin/reconcile_table.rs @@ -0,0 +1,149 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::reconcile_request::Target; +use api::v1::meta::{ReconcileRequest, ReconcileTable, ResolveStrategy}; +use common_catalog::format_full_table_name; +use common_error::ext::BoxedError; +use common_macro::admin_fn; +use common_query::error::{ + MissingProcedureServiceHandlerSnafu, Result, TableMutationSnafu, UnsupportedInputDataTypeSnafu, +}; +use common_query::prelude::{Signature, TypeSignature, Volatility}; +use common_telemetry::info; +use datatypes::prelude::*; +use session::context::QueryContextRef; +use session::table_name::table_name_to_full_name; +use snafu::ResultExt; + +use crate::handlers::ProcedureServiceHandlerRef; +use crate::helper::parse_resolve_strategy; + +const FN_NAME: &str = "reconcile_table"; + +/// A function to reconcile a table. +/// Returns the procedure id if success. +/// +/// - `reconcile_table(table_name)`. +/// - `reconcile_table(table_name, resolve_strategy)`. +/// +/// The parameters: +/// - `table_name`: the table name +#[admin_fn( + name = ReconcileTableFunction, + display_name = reconcile_table, + sig_fn = signature, + ret = string +)] +pub(crate) async fn reconcile_table( + procedure_service_handler: &ProcedureServiceHandlerRef, + query_ctx: &QueryContextRef, + params: &[ValueRef<'_>], +) -> Result { + let (table_name, resolve_strategy) = match params { + [ValueRef::String(table_name)] => (table_name, ResolveStrategy::UseLatest), + [ValueRef::String(table_name), ValueRef::String(resolve_strategy)] => { + (table_name, parse_resolve_strategy(resolve_strategy)?) 
+ } + _ => { + return UnsupportedInputDataTypeSnafu { + function: FN_NAME, + datatypes: params.iter().map(|v| v.data_type()).collect::>(), + } + .fail() + } + }; + let (catalog_name, schema_name, table_name) = table_name_to_full_name(table_name, query_ctx) + .map_err(BoxedError::new) + .context(TableMutationSnafu)?; + info!( + "Reconciling table: {} with resolve_strategy: {:?}", + format_full_table_name(&catalog_name, &schema_name, &table_name), + resolve_strategy + ); + let pid = procedure_service_handler + .reconcile(ReconcileRequest { + target: Some(Target::ReconcileTable(ReconcileTable { + catalog_name, + schema_name, + table_name, + resolve_strategy: resolve_strategy as i32, + })), + ..Default::default() + }) + .await?; + match pid { + Some(pid) => Ok(Value::from(pid)), + None => Ok(Value::Null), + } +} + +fn signature() -> Signature { + Signature::one_of( + vec![ + // reconcile_table(table_name) + TypeSignature::Exact(vec![ConcreteDataType::string_datatype()]), + // reconcile_table(table_name, resolve_strategy) + TypeSignature::Exact(vec![ + ConcreteDataType::string_datatype(), + ConcreteDataType::string_datatype(), + ]), + ], + Volatility::Immutable, + ) +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::sync::Arc; + + use common_query::error::Error; + use datatypes::vectors::{StringVector, VectorRef}; + + use crate::admin::reconcile_table::ReconcileTableFunction; + use crate::function::{AsyncFunction, FunctionContext}; + + #[tokio::test] + async fn test_reconcile_table() { + common_telemetry::init_default_ut_logging(); + + // reconcile_table(table_name) + let f = ReconcileTableFunction; + let args = vec![Arc::new(StringVector::from(vec!["test"])) as _]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // reconcile_table(table_name, resolve_strategy) + let f = ReconcileTableFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["test"])) as _, + Arc::new(StringVector::from(vec!["UseMetasrv"])) as _, + ]; + let result = f.eval(FunctionContext::mock(), &args).await.unwrap(); + let expect: VectorRef = Arc::new(StringVector::from(vec!["test_pid"])); + assert_eq!(expect, result); + + // unsupported input data type + let f = ReconcileTableFunction; + let args = vec![ + Arc::new(StringVector::from(vec!["test"])) as _, + Arc::new(StringVector::from(vec!["UseMetasrv"])) as _, + Arc::new(StringVector::from(vec!["10"])) as _, + ]; + let err = f.eval(FunctionContext::mock(), &args).await.unwrap_err(); + assert_matches!(err, Error::UnsupportedInputDataType { .. }); + } +} diff --git a/src/common/function/src/handlers.rs b/src/common/function/src/handlers.rs index bcb6ce5460..7289de6763 100644 --- a/src/common/function/src/handlers.rs +++ b/src/common/function/src/handlers.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use api::v1::meta::ReconcileRequest; use async_trait::async_trait; use catalog::CatalogManagerRef; use common_base::AffectedRows; @@ -65,6 +66,9 @@ pub trait ProcedureServiceHandler: Send + Sync { /// Migrate a region from source peer to target peer, returns the procedure id if success. async fn migrate_region(&self, request: MigrateRegionRequest) -> Result>; + /// Reconcile a table, database or catalog, returns the procedure id if success. 
+ async fn reconcile(&self, request: ReconcileRequest) -> Result>; + /// Query the procedure' state by its id async fn query_procedure_state(&self, pid: &str) -> Result; diff --git a/src/common/function/src/helper.rs b/src/common/function/src/helper.rs index e4a1cd1af8..e572c2df1c 100644 --- a/src/common/function/src/helper.rs +++ b/src/common/function/src/helper.rs @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use common_query::error::{InvalidInputTypeSnafu, Result}; +use api::v1::meta::ResolveStrategy; +use common_query::error::{ + InvalidFuncArgsSnafu, InvalidInputTypeSnafu, Result, UnsupportedInputDataTypeSnafu, +}; use common_query::prelude::{Signature, TypeSignature, Volatility}; use datatypes::prelude::ConcreteDataType; use datatypes::types::cast::cast; use datatypes::value::ValueRef; -use snafu::ResultExt; +use snafu::{OptionExt, ResultExt}; /// Create a function signature with oneof signatures of interleaving two arguments. pub fn one_of_sigs2(args1: Vec, args2: Vec) -> Signature { @@ -43,3 +46,64 @@ pub fn cast_u64(value: &ValueRef) -> Result> { }) .map(|v| v.as_u64()) } + +/// Cast a [`ValueRef`] to u32, returns `None` if fails +pub fn cast_u32(value: &ValueRef) -> Result> { + cast((*value).into(), &ConcreteDataType::uint32_datatype()) + .context(InvalidInputTypeSnafu { + err_msg: format!( + "Failed to cast input into uint32, actual type: {:#?}", + value.data_type(), + ), + }) + .map(|v| v.as_u64().map(|v| v as u32)) +} + +/// Parse a resolve strategy from a string. +pub fn parse_resolve_strategy(strategy: &str) -> Result { + ResolveStrategy::from_str_name(strategy).context(InvalidFuncArgsSnafu { + err_msg: format!("Invalid resolve strategy: {}", strategy), + }) +} + +/// Default parallelism for reconcile operations. +pub fn default_parallelism() -> u32 { + 64 +} + +/// Default resolve strategy for reconcile operations. +pub fn default_resolve_strategy() -> ResolveStrategy { + ResolveStrategy::UseLatest +} + +/// Get the string value from the params. +/// +/// # Errors +/// Returns an error if the input type is not a string. 
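+/// +/// # Panics +/// Panics if `index` is out of bounds of `params`.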
+pub fn get_string_from_params<'a>( + params: &'a [ValueRef<'a>], + index: usize, + fn_name: &'a str, +) -> Result<&'a str> { + let ValueRef::String(s) = ¶ms[index] else { + return UnsupportedInputDataTypeSnafu { + function: fn_name, + datatypes: params.iter().map(|v| v.data_type()).collect::>(), + } + .fail(); + }; + Ok(s) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_resolve_strategy() { + assert_eq!( + parse_resolve_strategy("UseLatest").unwrap(), + ResolveStrategy::UseLatest + ); + } +} diff --git a/src/common/function/src/lib.rs b/src/common/function/src/lib.rs index 95b8b6a3b1..429d99892d 100644 --- a/src/common/function/src/lib.rs +++ b/src/common/function/src/lib.rs @@ -14,6 +14,7 @@ #![feature(let_chains)] #![feature(try_blocks)] +#![feature(assert_matches)] mod admin; mod flush_flow; diff --git a/src/common/function/src/state.rs b/src/common/function/src/state.rs index 211f7e1438..510bb613a6 100644 --- a/src/common/function/src/state.rs +++ b/src/common/function/src/state.rs @@ -32,7 +32,7 @@ impl FunctionState { pub fn mock() -> Self { use std::sync::Arc; - use api::v1::meta::ProcedureStatus; + use api::v1::meta::{ProcedureStatus, ReconcileRequest}; use async_trait::async_trait; use catalog::CatalogManagerRef; use common_base::AffectedRows; @@ -63,6 +63,10 @@ impl FunctionState { Ok(Some("test_pid".to_string())) } + async fn reconcile(&self, _request: ReconcileRequest) -> Result> { + Ok(Some("test_pid".to_string())) + } + async fn query_procedure_state(&self, _pid: &str) -> Result { Ok(ProcedureStateResponse { status: ProcedureStatus::Done.into(), diff --git a/src/common/grpc-expr/src/alter.rs b/src/common/grpc-expr/src/alter.rs index ee80d9551e..ba3791432d 100644 --- a/src/common/grpc-expr/src/alter.rs +++ b/src/common/grpc-expr/src/alter.rs @@ -29,8 +29,8 @@ use snafu::{ensure, OptionExt, ResultExt}; use store_api::region_request::{SetRegionOption, UnsetRegionOption}; use table::metadata::TableId; use table::requests::{ - AddColumnRequest, AlterKind, AlterTableRequest, ModifyColumnTypeRequest, SetIndexOptions, - UnsetIndexOptions, + AddColumnRequest, AlterKind, AlterTableRequest, ModifyColumnTypeRequest, SetIndexOption, + UnsetIndexOption, }; use crate::error::{ @@ -43,6 +43,59 @@ use crate::error::{ const LOCATION_TYPE_FIRST: i32 = LocationType::First as i32; const LOCATION_TYPE_AFTER: i32 = LocationType::After as i32; +fn set_index_option_from_proto(set_index: api::v1::SetIndex) -> Result { + let options = set_index.options.context(MissingAlterIndexOptionSnafu)?; + Ok(match options { + api::v1::set_index::Options::Fulltext(f) => SetIndexOption::Fulltext { + column_name: f.column_name.clone(), + options: FulltextOptions::new( + f.enable, + as_fulltext_option_analyzer( + Analyzer::try_from(f.analyzer).context(InvalidSetFulltextOptionRequestSnafu)?, + ), + f.case_sensitive, + as_fulltext_option_backend( + PbFulltextBackend::try_from(f.backend) + .context(InvalidSetFulltextOptionRequestSnafu)?, + ), + f.granularity as u32, + f.false_positive_rate, + ) + .context(InvalidIndexOptionSnafu)?, + }, + api::v1::set_index::Options::Inverted(i) => SetIndexOption::Inverted { + column_name: i.column_name, + }, + api::v1::set_index::Options::Skipping(s) => SetIndexOption::Skipping { + column_name: s.column_name, + options: SkippingIndexOptions::new( + s.granularity as u32, + s.false_positive_rate, + as_skipping_index_type( + PbSkippingIndexType::try_from(s.skipping_index_type) + .context(InvalidSetSkippingIndexOptionRequestSnafu)?, + ), + ) + 
.context(InvalidIndexOptionSnafu)?, + }, + }) +} + +fn unset_index_option_from_proto(unset_index: api::v1::UnsetIndex) -> Result { + let options = unset_index.options.context(MissingAlterIndexOptionSnafu)?; + Ok(match options { + api::v1::unset_index::Options::Fulltext(f) => UnsetIndexOption::Fulltext { + column_name: f.column_name, + }, + api::v1::unset_index::Options::Inverted(i) => UnsetIndexOption::Inverted { + column_name: i.column_name, + }, + api::v1::unset_index::Options::Skipping(s) => UnsetIndexOption::Skipping { + column_name: s.column_name, + }, + }) +} + /// Convert an [`AlterTableExpr`] to an [`AlterTableRequest`] pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result { let catalog_name = expr.catalog_name; @@ -121,70 +174,34 @@ pub fn alter_expr_to_request(table_id: TableId, expr: AlterTableExpr) -> Result< .context(InvalidUnsetTableOptionRequestSnafu)?, } } - Kind::SetIndex(o) => match o.options { - Some(opt) => match opt { - api::v1::set_index::Options::Fulltext(f) => AlterKind::SetIndex { - options: SetIndexOptions::Fulltext { - column_name: f.column_name.clone(), - options: FulltextOptions::new( - f.enable, - as_fulltext_option_analyzer( - Analyzer::try_from(f.analyzer) - .context(InvalidSetFulltextOptionRequestSnafu)?, - ), - f.case_sensitive, - as_fulltext_option_backend( - PbFulltextBackend::try_from(f.backend) - .context(InvalidSetFulltextOptionRequestSnafu)?, - ), - f.granularity as u32, - f.false_positive_rate, - ) - .context(InvalidIndexOptionSnafu)?, - }, - }, - api::v1::set_index::Options::Inverted(i) => AlterKind::SetIndex { - options: SetIndexOptions::Inverted { - column_name: i.column_name, - }, - }, - api::v1::set_index::Options::Skipping(s) => AlterKind::SetIndex { - options: SetIndexOptions::Skipping { - column_name: s.column_name, - options: SkippingIndexOptions::new( - s.granularity as u32, - s.false_positive_rate, - as_skipping_index_type( - PbSkippingIndexType::try_from(s.skipping_index_type) - .context(InvalidSetSkippingIndexOptionRequestSnafu)?, - ), - ) - .context(InvalidIndexOptionSnafu)?, - }, - }, - }, - None => return MissingAlterIndexOptionSnafu.fail(), - }, - Kind::UnsetIndex(o) => match o.options { - Some(opt) => match opt { - api::v1::unset_index::Options::Fulltext(f) => AlterKind::UnsetIndex { - options: UnsetIndexOptions::Fulltext { - column_name: f.column_name, - }, - }, - api::v1::unset_index::Options::Inverted(i) => AlterKind::UnsetIndex { - options: UnsetIndexOptions::Inverted { - column_name: i.column_name, - }, - }, - api::v1::unset_index::Options::Skipping(s) => AlterKind::UnsetIndex { - options: UnsetIndexOptions::Skipping { - column_name: s.column_name, - }, - }, - }, - None => return MissingAlterIndexOptionSnafu.fail(), - }, + Kind::SetIndex(o) => { + let option = set_index_option_from_proto(o)?; + AlterKind::SetIndexes { + options: vec![option], + } + } + Kind::UnsetIndex(o) => { + let option = unset_index_option_from_proto(o)?; + AlterKind::UnsetIndexes { + options: vec![option], + } + } + Kind::SetIndexes(o) => { + let options = o + .set_indexes + .into_iter() + .map(set_index_option_from_proto) + .collect::>>()?; + AlterKind::SetIndexes { options } + } + Kind::UnsetIndexes(o) => { + let options = o + .unset_indexes + .into_iter() + .map(unset_index_option_from_proto) + .collect::>>()?; + AlterKind::UnsetIndexes { options } + } Kind::DropDefaults(o) => { let names = o .drop_defaults diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index 5f009305f6..2e12a49875 100644 --- 
a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -32,6 +32,7 @@ common-procedure.workspace = true common-procedure-test.workspace = true common-query.workspace = true common-recordbatch.workspace = true +common-runtime.workspace = true common-telemetry.workspace = true common-time.workspace = true common-wal.workspace = true diff --git a/src/common/meta/src/ddl.rs b/src/common/meta/src/ddl.rs index 3c822bae1b..0fc65419c6 100644 --- a/src/common/meta/src/ddl.rs +++ b/src/common/meta/src/ddl.rs @@ -15,25 +15,17 @@ use std::collections::HashMap; use std::sync::Arc; -use api::v1::meta::ProcedureDetailResponse; -use common_telemetry::tracing_context::W3cTrace; use store_api::storage::{RegionId, RegionNumber, TableId}; use crate::cache_invalidator::CacheInvalidatorRef; use crate::ddl::flow_meta::FlowMetadataAllocatorRef; use crate::ddl::table_meta::TableMetadataAllocatorRef; -use crate::error::{Result, UnsupportedSnafu}; use crate::key::flow::FlowMetadataManagerRef; use crate::key::table_route::PhysicalTableRouteValue; use crate::key::TableMetadataManagerRef; use crate::node_manager::NodeManagerRef; use crate::region_keeper::MemoryRegionKeeperRef; use crate::region_registry::LeaderRegionRegistryRef; -use crate::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse}; -use crate::rpc::procedure::{ - AddRegionFollowerRequest, MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse, - RemoveRegionFollowerRequest, -}; use crate::DatanodeId; pub mod alter_database; @@ -44,13 +36,13 @@ pub mod create_flow; pub mod create_logical_tables; pub mod create_table; mod create_table_template; +pub(crate) use create_table_template::{build_template_from_raw_table_info, CreateRequestBuilder}; pub mod create_view; pub mod drop_database; pub mod drop_flow; pub mod drop_table; pub mod drop_view; pub mod flow_meta; -mod physical_table_metadata; pub mod table_meta; #[cfg(any(test, feature = "testing"))] pub mod test_util; @@ -59,64 +51,6 @@ pub(crate) mod tests; pub mod truncate_table; pub mod utils; -#[derive(Debug, Default)] -pub struct ExecutorContext { - pub tracing_context: Option, -} - -/// The procedure executor that accepts ddl, region migration task etc. -#[async_trait::async_trait] -pub trait ProcedureExecutor: Send + Sync { - /// Submit a ddl task - async fn submit_ddl_task( - &self, - ctx: &ExecutorContext, - request: SubmitDdlTaskRequest, - ) -> Result; - - /// Add a region follower - async fn add_region_follower( - &self, - _ctx: &ExecutorContext, - _request: AddRegionFollowerRequest, - ) -> Result<()> { - UnsupportedSnafu { - operation: "add_region_follower", - } - .fail() - } - - /// Remove a region follower - async fn remove_region_follower( - &self, - _ctx: &ExecutorContext, - _request: RemoveRegionFollowerRequest, - ) -> Result<()> { - UnsupportedSnafu { - operation: "remove_region_follower", - } - .fail() - } - - /// Submit a region migration task - async fn migrate_region( - &self, - ctx: &ExecutorContext, - request: MigrateRegionRequest, - ) -> Result; - - /// Query the procedure state by its id - async fn query_procedure_state( - &self, - ctx: &ExecutorContext, - pid: &str, - ) -> Result; - - async fn list_procedures(&self, ctx: &ExecutorContext) -> Result; -} - -pub type ProcedureExecutorRef = Arc; - /// Metadata allocated to a table. 
#[derive(Default)] pub struct TableMetadata { diff --git a/src/common/meta/src/ddl/alter_logical_tables.rs b/src/common/meta/src/ddl/alter_logical_tables.rs index 68bbae6c41..ca1d4bf5af 100644 --- a/src/common/meta/src/ddl/alter_logical_tables.rs +++ b/src/common/meta/src/ddl/alter_logical_tables.rs @@ -12,32 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod check; -mod metadata; -mod region_request; -mod table_cache_keys; +mod executor; mod update_metadata; +mod validator; use api::region::RegionResponse; use async_trait::async_trait; use common_catalog::format_full_table_name; use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu}; use common_procedure::{Context, LockKey, Procedure, Status}; -use common_telemetry::{error, info, warn}; -use futures_util::future; -pub use region_request::make_alter_region_request; +use common_telemetry::{debug, error, info, warn}; +pub use executor::make_alter_region_request; use serde::{Deserialize, Serialize}; -use snafu::{ensure, ResultExt}; +use snafu::ResultExt; use store_api::metadata::ColumnMetadata; use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY; use strum::AsRefStr; use table::metadata::TableId; -use crate::ddl::utils::{ - add_peer_context_if_needed, map_to_procedure_error, sync_follower_regions, +use crate::cache_invalidator::Context as CacheContext; +use crate::ddl::alter_logical_tables::executor::AlterLogicalTablesExecutor; +use crate::ddl::alter_logical_tables::validator::{ + retain_unskipped, AlterLogicalTableValidator, ValidatorResult, }; +use crate::ddl::utils::{extract_column_metadatas, map_to_procedure_error, sync_follower_regions}; use crate::ddl::DdlContext; -use crate::error::{DecodeJsonSnafu, MetadataCorruptionSnafu, Result}; +use crate::error::Result; use crate::instruction::CacheIdent; use crate::key::table_info::TableInfoValue; use crate::key::table_route::PhysicalTableRouteValue; @@ -45,13 +45,38 @@ use crate::key::DeserializedValueWithBytes; use crate::lock_key::{CatalogLock, SchemaLock, TableLock}; use crate::metrics; use crate::rpc::ddl::AlterTableTask; -use crate::rpc::router::{find_leaders, RegionRoute}; +use crate::rpc::router::RegionRoute; pub struct AlterLogicalTablesProcedure { pub context: DdlContext, pub data: AlterTablesData, } +/// Builds the validator from the [`AlterTablesData`]. +fn build_validator_from_alter_table_data<'a>( + data: &'a AlterTablesData, +) -> AlterLogicalTableValidator<'a> { + let physical_table_id = data.physical_table_id; + let alters = data + .tasks + .iter() + .map(|task| &task.alter_table) + .collect::<Vec<_>>(); + AlterLogicalTableValidator::new(physical_table_id, alters) +} + +/// Builds the executor from the [`AlterTablesData`].
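+///
+/// Pairs each logical table id with its corresponding alter expression; `tasks` and `table_info_values` are expected to be aligned one-to-one.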
+fn build_executor_from_alter_expr<'a>(data: &'a AlterTablesData) -> AlterLogicalTablesExecutor<'a> { + debug_assert_eq!(data.tasks.len(), data.table_info_values.len()); + let alters = data + .tasks + .iter() + .zip(data.table_info_values.iter()) + .map(|(task, table_info)| (table_info.table_info.ident.table_id, &task.alter_table)) + .collect::>(); + AlterLogicalTablesExecutor::new(alters) +} + impl AlterLogicalTablesProcedure { pub const TYPE_NAME: &'static str = "metasrv-procedure::AlterLogicalTables"; @@ -81,35 +106,44 @@ impl AlterLogicalTablesProcedure { } pub(crate) async fn on_prepare(&mut self) -> Result { - // Checks all the tasks - self.check_input_tasks()?; - // Fills the table info values - self.fill_table_info_values().await?; - // Checks the physical table, must after [fill_table_info_values] - self.check_physical_table().await?; - // Fills the physical table info - self.fill_physical_table_info().await?; - // Filter the finished tasks - let finished_tasks = self.check_finished_tasks()?; - let already_finished_count = finished_tasks - .iter() - .map(|x| if *x { 1 } else { 0 }) - .sum::(); - let apply_tasks_count = self.data.tasks.len(); - if already_finished_count == apply_tasks_count { + let validator = build_validator_from_alter_table_data(&self.data); + let ValidatorResult { + num_skipped, + skip_alter, + table_info_values, + physical_table_info, + physical_table_route, + } = validator + .validate(&self.context.table_metadata_manager) + .await?; + + let num_tasks = self.data.tasks.len(); + if num_skipped == num_tasks { info!("All the alter tasks are finished, will skip the procedure."); + let cache_ident_keys = AlterLogicalTablesExecutor::build_cache_ident_keys( + &physical_table_info, + &table_info_values + .iter() + .map(|v| v.get_inner_ref()) + .collect::>(), + ); + self.data.table_cache_keys_to_invalidate = cache_ident_keys; // Re-invalidate the table cache self.data.state = AlterTablesState::InvalidateTableCache; return Ok(Status::executing(true)); - } else if already_finished_count > 0 { + } else if num_skipped > 0 { info!( "There are {} alter tasks, {} of them were already finished.", - apply_tasks_count, already_finished_count + num_tasks, num_skipped ); } - self.filter_task(&finished_tasks)?; - // Next state + // Updates the procedure state. 
+ retain_unskipped(&mut self.data.tasks, &skip_alter); + self.data.physical_table_info = Some(physical_table_info); + self.data.physical_table_route = Some(physical_table_route); + self.data.table_info_values = table_info_values; + debug_assert_eq!(self.data.tasks.len(), self.data.table_info_values.len()); self.data.state = AlterTablesState::SubmitAlterRegionRequests; Ok(Status::executing(true)) } @@ -117,57 +151,21 @@ impl AlterLogicalTablesProcedure { pub(crate) async fn on_submit_alter_region_requests(&mut self) -> Result { // Safety: we have checked the state in on_prepare let physical_table_route = &self.data.physical_table_route.as_ref().unwrap(); - let leaders = find_leaders(&physical_table_route.region_routes); - let mut alter_region_tasks = Vec::with_capacity(leaders.len()); + let executor = build_executor_from_alter_expr(&self.data); + let mut results = executor + .on_alter_regions( + &self.context.node_manager, + &physical_table_route.region_routes, + ) + .await?; - for peer in leaders { - let requester = self.context.node_manager.datanode(&peer).await; - let request = self.make_request(&peer, &physical_table_route.region_routes)?; - - alter_region_tasks.push(async move { - requester - .handle(request) - .await - .map_err(add_peer_context_if_needed(peer)) - }); - } - - let mut results = future::join_all(alter_region_tasks) - .await - .into_iter() - .collect::>>()?; - - // Collects responses from datanodes. - let phy_raw_schemas = results - .iter_mut() - .map(|res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)) - .collect::>(); - - if phy_raw_schemas.is_empty() { - self.submit_sync_region_requests(results, &physical_table_route.region_routes) - .await; - self.data.state = AlterTablesState::UpdateMetadata; - return Ok(Status::executing(true)); - } - - // Verify all the physical schemas are the same - // Safety: previous check ensures this vec is not empty - let first = phy_raw_schemas.first().unwrap(); - ensure!( - phy_raw_schemas.iter().all(|x| x == first), - MetadataCorruptionSnafu { - err_msg: "The physical schemas from datanodes are not the same." - } - ); - - // Decodes the physical raw schemas - if let Some(phy_raw_schema) = first { - self.data.physical_columns = - ColumnMetadata::decode_list(phy_raw_schema).context(DecodeJsonSnafu)?; + if let Some(column_metadatas) = + extract_column_metadatas(&mut results, ALTER_PHYSICAL_EXTENSION_KEY)? 
+ { + self.data.physical_columns = column_metadatas; } else { warn!("altering logical table result doesn't contains extension key `{ALTER_PHYSICAL_EXTENSION_KEY}`,leaving the physical table's schema unchanged"); } - self.submit_sync_region_requests(results, &physical_table_route.region_routes) .await; self.data.state = AlterTablesState::UpdateMetadata; @@ -183,7 +181,7 @@ impl AlterLogicalTablesProcedure { if let Err(err) = sync_follower_regions( &self.context, self.data.physical_table_id, - results, + &results, region_routes, table_info.meta.engine.as_str(), ) @@ -200,7 +198,18 @@ impl AlterLogicalTablesProcedure { self.update_physical_table_metadata().await?; self.update_logical_tables_metadata().await?; - self.data.build_cache_keys_to_invalidate(); + let logical_table_info_values = self + .data + .table_info_values + .iter() + .map(|v| v.get_inner_ref()) + .collect::>(); + + let cache_ident_keys = AlterLogicalTablesExecutor::build_cache_ident_keys( + self.data.physical_table_info.as_ref().unwrap(), + &logical_table_info_values, + ); + self.data.table_cache_keys_to_invalidate = cache_ident_keys; self.data.clear_metadata_fields(); self.data.state = AlterTablesState::InvalidateTableCache; @@ -210,9 +219,16 @@ impl AlterLogicalTablesProcedure { pub(crate) async fn on_invalidate_table_cache(&mut self) -> Result { let to_invalidate = &self.data.table_cache_keys_to_invalidate; + let ctx = CacheContext { + subject: Some(format!( + "Invalidate table cache by altering logical tables, physical_table_id: {}", + self.data.physical_table_id, + )), + }; + self.context .cache_invalidator - .invalidate(&Default::default(), to_invalidate) + .invalidate(&ctx, to_invalidate) .await?; Ok(Status::done()) } @@ -232,6 +248,10 @@ impl Procedure for AlterLogicalTablesProcedure { let _timer = metrics::METRIC_META_PROCEDURE_ALTER_TABLE .with_label_values(&[step]) .start_timer(); + debug!( + "Executing alter logical tables procedure, state: {:?}", + state + ); match state { AlterTablesState::Prepare => self.on_prepare().await, diff --git a/src/common/meta/src/ddl/alter_logical_tables/check.rs b/src/common/meta/src/ddl/alter_logical_tables/check.rs deleted file mode 100644 index a80ef3cd8c..0000000000 --- a/src/common/meta/src/ddl/alter_logical_tables/check.rs +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::collections::HashSet; - -use api::v1::alter_table_expr::Kind; -use snafu::{ensure, OptionExt}; - -use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure; -use crate::error::{AlterLogicalTablesInvalidArgumentsSnafu, Result}; -use crate::key::table_info::TableInfoValue; -use crate::key::table_route::TableRouteValue; -use crate::rpc::ddl::AlterTableTask; - -impl AlterLogicalTablesProcedure { - pub(crate) fn check_input_tasks(&self) -> Result<()> { - self.check_schema()?; - self.check_alter_kind()?; - Ok(()) - } - - pub(crate) async fn check_physical_table(&self) -> Result<()> { - let table_route_manager = self.context.table_metadata_manager.table_route_manager(); - let table_ids = self - .data - .table_info_values - .iter() - .map(|v| v.table_info.ident.table_id) - .collect::>(); - let table_routes = table_route_manager - .table_route_storage() - .batch_get(&table_ids) - .await?; - let physical_table_id = self.data.physical_table_id; - let is_same_physical_table = table_routes.iter().all(|r| { - if let Some(TableRouteValue::Logical(r)) = r { - r.physical_table_id() == physical_table_id - } else { - false - } - }); - - ensure!( - is_same_physical_table, - AlterLogicalTablesInvalidArgumentsSnafu { - err_msg: "All the tasks should have the same physical table id" - } - ); - - Ok(()) - } - - pub(crate) fn check_finished_tasks(&self) -> Result> { - let task = &self.data.tasks; - let table_info_values = &self.data.table_info_values; - - Ok(task - .iter() - .zip(table_info_values.iter()) - .map(|(task, table)| Self::check_finished_task(task, table)) - .collect()) - } - - // Checks if the schemas of the tasks are the same - fn check_schema(&self) -> Result<()> { - let is_same_schema = self.data.tasks.windows(2).all(|pair| { - pair[0].alter_table.catalog_name == pair[1].alter_table.catalog_name - && pair[0].alter_table.schema_name == pair[1].alter_table.schema_name - }); - - ensure!( - is_same_schema, - AlterLogicalTablesInvalidArgumentsSnafu { - err_msg: "Schemas of the tasks are not the same" - } - ); - - Ok(()) - } - - fn check_alter_kind(&self) -> Result<()> { - for task in &self.data.tasks { - let kind = task.alter_table.kind.as_ref().context( - AlterLogicalTablesInvalidArgumentsSnafu { - err_msg: "Alter kind is missing", - }, - )?; - let Kind::AddColumns(_) = kind else { - return AlterLogicalTablesInvalidArgumentsSnafu { - err_msg: "Only support add columns operation", - } - .fail(); - }; - } - - Ok(()) - } - - fn check_finished_task(task: &AlterTableTask, table: &TableInfoValue) -> bool { - let columns = table - .table_info - .meta - .schema - .column_schemas - .iter() - .map(|c| &c.name) - .collect::>(); - - let Some(kind) = task.alter_table.kind.as_ref() else { - return true; // Never get here since we have checked it in `check_alter_kind` - }; - let Kind::AddColumns(add_columns) = kind else { - return true; // Never get here since we have checked it in `check_alter_kind` - }; - - // We only check that all columns have been finished. That is to say, - // if one part is finished but another part is not, it will be considered - // unfinished. 
- add_columns - .add_columns - .iter() - .map(|add_column| add_column.column_def.as_ref().map(|c| &c.name)) - .all(|column| column.map(|c| columns.contains(c)).unwrap_or(false)) - } -} diff --git a/src/common/meta/src/ddl/alter_logical_tables/executor.rs b/src/common/meta/src/ddl/alter_logical_tables/executor.rs new file mode 100644 index 0000000000..fde4a334ca --- /dev/null +++ b/src/common/meta/src/ddl/alter_logical_tables/executor.rs @@ -0,0 +1,216 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use api::region::RegionResponse; +use api::v1::alter_table_expr::Kind; +use api::v1::region::{ + alter_request, region_request, AddColumn, AddColumns, AlterRequest, AlterRequests, + RegionColumnDef, RegionRequest, RegionRequestHeader, +}; +use api::v1::{self, AlterTableExpr}; +use common_telemetry::tracing_context::TracingContext; +use common_telemetry::{debug, warn}; +use futures::future; +use store_api::metadata::ColumnMetadata; +use store_api::storage::{RegionId, RegionNumber, TableId}; + +use crate::ddl::utils::{add_peer_context_if_needed, raw_table_info}; +use crate::error::Result; +use crate::instruction::CacheIdent; +use crate::key::table_info::TableInfoValue; +use crate::key::{DeserializedValueWithBytes, RegionDistribution, TableMetadataManagerRef}; +use crate::node_manager::NodeManagerRef; +use crate::rpc::router::{find_leaders, region_distribution, RegionRoute}; + +/// [AlterLogicalTablesExecutor] performs: +/// - Alters logical regions on the datanodes. +/// - Updates table metadata for alter table operation. +pub struct AlterLogicalTablesExecutor<'a> { + /// The alter table expressions. + /// + /// The first element is the logical table id, the second element is the alter table expression. + alters: Vec<(TableId, &'a AlterTableExpr)>, +} + +impl<'a> AlterLogicalTablesExecutor<'a> { + pub fn new(alters: Vec<(TableId, &'a AlterTableExpr)>) -> Self { + Self { alters } + } + + /// Alters logical regions on the datanodes. + pub(crate) async fn on_alter_regions( + &self, + node_manager: &NodeManagerRef, + region_routes: &[RegionRoute], + ) -> Result> { + let region_distribution = region_distribution(region_routes); + let leaders = find_leaders(region_routes) + .into_iter() + .map(|p| (p.id, p)) + .collect::>(); + let mut alter_region_tasks = Vec::with_capacity(leaders.len()); + for (datanode_id, region_role_set) in region_distribution { + if region_role_set.leader_regions.is_empty() { + continue; + } + // Safety: must exists. 
+ let peer = leaders.get(&datanode_id).unwrap(); + let requester = node_manager.datanode(peer).await; + let requests = self.make_alter_region_request(®ion_role_set.leader_regions); + let requester = requester.clone(); + let peer = peer.clone(); + + debug!("Sending alter region requests to datanode {}", peer); + alter_region_tasks.push(async move { + requester + .handle(make_request(requests)) + .await + .map_err(add_peer_context_if_needed(peer)) + }); + } + + future::join_all(alter_region_tasks) + .await + .into_iter() + .collect::>>() + } + + fn make_alter_region_request(&self, region_numbers: &[RegionNumber]) -> AlterRequests { + let mut requests = Vec::with_capacity(region_numbers.len() * self.alters.len()); + for (table_id, alter) in self.alters.iter() { + for region_number in region_numbers { + let region_id = RegionId::new(*table_id, *region_number); + let request = make_alter_region_request(region_id, alter); + requests.push(request); + } + } + + AlterRequests { requests } + } + + /// Updates table metadata for alter table operation. + /// + /// ## Panic: + /// - If the region distribution is not set when updating table metadata. + pub(crate) async fn on_alter_metadata( + physical_table_id: TableId, + table_metadata_manager: &TableMetadataManagerRef, + current_table_info_value: &DeserializedValueWithBytes, + region_distribution: RegionDistribution, + physical_columns: &[ColumnMetadata], + ) -> Result<()> { + if physical_columns.is_empty() { + warn!("No physical columns found, leaving the physical table's schema unchanged when altering logical tables"); + return Ok(()); + } + + let table_ref = current_table_info_value.table_ref(); + let table_id = physical_table_id; + + // Generates new table info + let old_raw_table_info = current_table_info_value.table_info.clone(); + let new_raw_table_info = + raw_table_info::build_new_physical_table_info(old_raw_table_info, physical_columns); + + debug!( + "Starting update table: {} metadata, table_id: {}, new table info: {:?}", + table_ref, table_id, new_raw_table_info + ); + + table_metadata_manager + .update_table_info( + current_table_info_value, + Some(region_distribution), + new_raw_table_info, + ) + .await?; + + Ok(()) + } + + /// Builds the cache ident keys for the alter logical tables. + /// + /// The cache ident keys are: + /// - The table id of the logical tables. + /// - The table name of the logical tables. + /// - The table id of the physical table. + pub(crate) fn build_cache_ident_keys( + physical_table_info: &TableInfoValue, + logical_table_info_values: &[&TableInfoValue], + ) -> Vec { + let mut cache_keys = Vec::with_capacity(logical_table_info_values.len() * 2 + 2); + cache_keys.extend(logical_table_info_values.iter().flat_map(|table| { + vec![ + CacheIdent::TableId(table.table_info.ident.table_id), + CacheIdent::TableName(table.table_name()), + ] + })); + cache_keys.push(CacheIdent::TableId( + physical_table_info.table_info.ident.table_id, + )); + cache_keys.push(CacheIdent::TableName(physical_table_info.table_name())); + + cache_keys + } +} + +fn make_request(alter_requests: AlterRequests) -> RegionRequest { + RegionRequest { + header: Some(RegionRequestHeader { + tracing_context: TracingContext::from_current_span().to_w3c(), + ..Default::default() + }), + body: Some(region_request::Body::Alters(alter_requests)), + } +} + +/// Makes an alter region request. 
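+/// +/// # Panics +/// Panics if the alter kind is missing or is not `AddColumns`; callers are expected to validate the kind before building region requests.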
+pub fn make_alter_region_request( + region_id: RegionId, + alter_table_expr: &AlterTableExpr, +) -> AlterRequest { + let region_id = region_id.as_u64(); + let kind = match &alter_table_expr.kind { + Some(Kind::AddColumns(add_columns)) => Some(alter_request::Kind::AddColumns( + to_region_add_columns(add_columns), + )), + _ => unreachable!(), // Safety: we have checked the kind in check_input_tasks + }; + + AlterRequest { + region_id, + schema_version: 0, + kind, + } +} + +fn to_region_add_columns(add_columns: &v1::AddColumns) -> AddColumns { + let add_columns = add_columns + .add_columns + .iter() + .map(|add_column| { + let region_column_def = RegionColumnDef { + column_def: add_column.column_def.clone(), + ..Default::default() // other fields are not used in alter logical table + }; + AddColumn { + column_def: Some(region_column_def), + ..Default::default() // other fields are not used in alter logical table + } + }) + .collect(); + AddColumns { add_columns } +} diff --git a/src/common/meta/src/ddl/alter_logical_tables/metadata.rs b/src/common/meta/src/ddl/alter_logical_tables/metadata.rs deleted file mode 100644 index 8734b4ef37..0000000000 --- a/src/common/meta/src/ddl/alter_logical_tables/metadata.rs +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_catalog::format_full_table_name; -use snafu::OptionExt; -use table::metadata::TableId; - -use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure; -use crate::error::{ - AlterLogicalTablesInvalidArgumentsSnafu, Result, TableInfoNotFoundSnafu, TableNotFoundSnafu, - TableRouteNotFoundSnafu, -}; -use crate::key::table_info::TableInfoValue; -use crate::key::table_name::TableNameKey; -use crate::key::table_route::TableRouteValue; -use crate::key::DeserializedValueWithBytes; -use crate::rpc::ddl::AlterTableTask; - -impl AlterLogicalTablesProcedure { - pub(crate) fn filter_task(&mut self, finished_tasks: &[bool]) -> Result<()> { - debug_assert_eq!(finished_tasks.len(), self.data.tasks.len()); - debug_assert_eq!(finished_tasks.len(), self.data.table_info_values.len()); - self.data.tasks = self - .data - .tasks - .drain(..) - .zip(finished_tasks.iter()) - .filter_map(|(task, finished)| if *finished { None } else { Some(task) }) - .collect(); - self.data.table_info_values = self - .data - .table_info_values - .drain(..) 
- .zip(finished_tasks.iter()) - .filter_map(|(table_info_value, finished)| { - if *finished { - None - } else { - Some(table_info_value) - } - }) - .collect(); - - Ok(()) - } - - pub(crate) async fn fill_physical_table_info(&mut self) -> Result<()> { - let (physical_table_info, physical_table_route) = self - .context - .table_metadata_manager - .get_full_table_info(self.data.physical_table_id) - .await?; - - let physical_table_info = physical_table_info.with_context(|| TableInfoNotFoundSnafu { - table: format!("table id - {}", self.data.physical_table_id), - })?; - let physical_table_route = physical_table_route - .context(TableRouteNotFoundSnafu { - table_id: self.data.physical_table_id, - })? - .into_inner(); - - self.data.physical_table_info = Some(physical_table_info); - let TableRouteValue::Physical(physical_table_route) = physical_table_route else { - return AlterLogicalTablesInvalidArgumentsSnafu { - err_msg: format!( - "expected a physical table but got a logical table: {:?}", - self.data.physical_table_id - ), - } - .fail(); - }; - self.data.physical_table_route = Some(physical_table_route); - - Ok(()) - } - - pub(crate) async fn fill_table_info_values(&mut self) -> Result<()> { - let table_ids = self.get_all_table_ids().await?; - let table_info_values = self.get_all_table_info_values(&table_ids).await?; - debug_assert_eq!(table_info_values.len(), self.data.tasks.len()); - self.data.table_info_values = table_info_values; - - Ok(()) - } - - async fn get_all_table_info_values( - &self, - table_ids: &[TableId], - ) -> Result>> { - let table_info_manager = self.context.table_metadata_manager.table_info_manager(); - let mut table_info_map = table_info_manager.batch_get_raw(table_ids).await?; - let mut table_info_values = Vec::with_capacity(table_ids.len()); - for (table_id, task) in table_ids.iter().zip(self.data.tasks.iter()) { - let table_info_value = - table_info_map - .remove(table_id) - .with_context(|| TableInfoNotFoundSnafu { - table: extract_table_name(task), - })?; - table_info_values.push(table_info_value); - } - - Ok(table_info_values) - } - - async fn get_all_table_ids(&self) -> Result> { - let table_name_manager = self.context.table_metadata_manager.table_name_manager(); - let table_name_keys = self - .data - .tasks - .iter() - .map(|task| extract_table_name_key(task)) - .collect(); - - let table_name_values = table_name_manager.batch_get(table_name_keys).await?; - let mut table_ids = Vec::with_capacity(table_name_values.len()); - for (value, task) in table_name_values.into_iter().zip(self.data.tasks.iter()) { - let table_id = value - .with_context(|| TableNotFoundSnafu { - table_name: extract_table_name(task), - })? 
- .table_id(); - table_ids.push(table_id); - } - - Ok(table_ids) - } -} - -#[inline] -fn extract_table_name(task: &AlterTableTask) -> String { - format_full_table_name( - &task.alter_table.catalog_name, - &task.alter_table.schema_name, - &task.alter_table.table_name, - ) -} - -#[inline] -fn extract_table_name_key(task: &AlterTableTask) -> TableNameKey { - TableNameKey::new( - &task.alter_table.catalog_name, - &task.alter_table.schema_name, - &task.alter_table.table_name, - ) -} diff --git a/src/common/meta/src/ddl/alter_logical_tables/region_request.rs b/src/common/meta/src/ddl/alter_logical_tables/region_request.rs deleted file mode 100644 index 6bd1a12193..0000000000 --- a/src/common/meta/src/ddl/alter_logical_tables/region_request.rs +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use api::v1::alter_table_expr::Kind; -use api::v1::region::{ - alter_request, region_request, AddColumn, AddColumns, AlterRequest, AlterRequests, - RegionColumnDef, RegionRequest, RegionRequestHeader, -}; -use api::v1::{self, AlterTableExpr}; -use common_telemetry::tracing_context::TracingContext; -use store_api::storage::RegionId; - -use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure; -use crate::error::Result; -use crate::peer::Peer; -use crate::rpc::router::{find_leader_regions, RegionRoute}; - -impl AlterLogicalTablesProcedure { - pub(crate) fn make_request( - &self, - peer: &Peer, - region_routes: &[RegionRoute], - ) -> Result { - let alter_requests = self.make_alter_region_requests(peer, region_routes)?; - let request = RegionRequest { - header: Some(RegionRequestHeader { - tracing_context: TracingContext::from_current_span().to_w3c(), - ..Default::default() - }), - body: Some(region_request::Body::Alters(alter_requests)), - }; - - Ok(request) - } - - fn make_alter_region_requests( - &self, - peer: &Peer, - region_routes: &[RegionRoute], - ) -> Result { - let tasks = &self.data.tasks; - let regions_on_this_peer = find_leader_regions(region_routes, peer); - let mut requests = Vec::with_capacity(tasks.len() * regions_on_this_peer.len()); - for (task, table) in self - .data - .tasks - .iter() - .zip(self.data.table_info_values.iter()) - { - for region_number in ®ions_on_this_peer { - let region_id = RegionId::new(table.table_info.ident.table_id, *region_number); - let request = make_alter_region_request( - region_id, - &task.alter_table, - table.table_info.ident.version, - ); - requests.push(request); - } - } - - Ok(AlterRequests { requests }) - } -} - -/// Makes an alter region request. 
-pub fn make_alter_region_request( - region_id: RegionId, - alter_table_expr: &AlterTableExpr, - schema_version: u64, -) -> AlterRequest { - let region_id = region_id.as_u64(); - let kind = match &alter_table_expr.kind { - Some(Kind::AddColumns(add_columns)) => Some(alter_request::Kind::AddColumns( - to_region_add_columns(add_columns), - )), - _ => unreachable!(), // Safety: we have checked the kind in check_input_tasks - }; - - AlterRequest { - region_id, - schema_version, - kind, - } -} - -fn to_region_add_columns(add_columns: &v1::AddColumns) -> AddColumns { - let add_columns = add_columns - .add_columns - .iter() - .map(|add_column| { - let region_column_def = RegionColumnDef { - column_def: add_column.column_def.clone(), - ..Default::default() // other fields are not used in alter logical table - }; - AddColumn { - column_def: Some(region_column_def), - ..Default::default() // other fields are not used in alter logical table - } - }) - .collect(); - AddColumns { add_columns } -} diff --git a/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs b/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs deleted file mode 100644 index 2c839da0fd..0000000000 --- a/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use table::metadata::RawTableInfo; -use table::table_name::TableName; - -use crate::ddl::alter_logical_tables::AlterTablesData; -use crate::instruction::CacheIdent; - -impl AlterTablesData { - pub(crate) fn build_cache_keys_to_invalidate(&mut self) { - let mut cache_keys = self - .table_info_values - .iter() - .flat_map(|table| { - vec![ - CacheIdent::TableId(table.table_info.ident.table_id), - CacheIdent::TableName(extract_table_name(&table.table_info)), - ] - }) - .collect::>(); - cache_keys.push(CacheIdent::TableId(self.physical_table_id)); - // Safety: physical_table_info already filled in previous steps - let physical_table_info = &self.physical_table_info.as_ref().unwrap().table_info; - cache_keys.push(CacheIdent::TableName(extract_table_name( - physical_table_info, - ))); - - self.table_cache_keys_to_invalidate = cache_keys; - } -} - -fn extract_table_name(table_info: &RawTableInfo) -> TableName { - TableName::new( - &table_info.catalog_name, - &table_info.schema_name, - &table_info.name, - ) -} diff --git a/src/common/meta/src/ddl/alter_logical_tables/update_metadata.rs b/src/common/meta/src/ddl/alter_logical_tables/update_metadata.rs index c05777bcc6..5379eaa457 100644 --- a/src/common/meta/src/ddl/alter_logical_tables/update_metadata.rs +++ b/src/common/meta/src/ddl/alter_logical_tables/update_metadata.rs @@ -13,66 +13,43 @@ // limitations under the License. 
use common_grpc_expr::alter_expr_to_request; -use common_telemetry::warn; -use itertools::Itertools; use snafu::ResultExt; use table::metadata::{RawTableInfo, TableInfo}; +use crate::ddl::alter_logical_tables::executor::AlterLogicalTablesExecutor; use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure; -use crate::ddl::physical_table_metadata; +use crate::ddl::utils::table_info::batch_update_table_info_values; use crate::error; use crate::error::{ConvertAlterTableRequestSnafu, Result}; use crate::key::table_info::TableInfoValue; use crate::key::DeserializedValueWithBytes; use crate::rpc::ddl::AlterTableTask; +use crate::rpc::router::region_distribution; impl AlterLogicalTablesProcedure { pub(crate) async fn update_physical_table_metadata(&mut self) -> Result<()> { - if self.data.physical_columns.is_empty() { - warn!("No physical columns found, leaving the physical table's schema unchanged when altering logical tables"); - return Ok(()); - } - // Safety: must exist. let physical_table_info = self.data.physical_table_info.as_ref().unwrap(); + let physical_table_route = self.data.physical_table_route.as_ref().unwrap(); + let region_distribution = region_distribution(&physical_table_route.region_routes); - // Generates new table info - let old_raw_table_info = physical_table_info.table_info.clone(); - let new_raw_table_info = physical_table_metadata::build_new_physical_table_info( - old_raw_table_info, + // Updates physical table's metadata. + AlterLogicalTablesExecutor::on_alter_metadata( + self.data.physical_table_id, + &self.context.table_metadata_manager, + physical_table_info, + region_distribution, &self.data.physical_columns, - ); - - // Updates physical table's metadata, and we don't need to touch per-region settings. - self.context - .table_metadata_manager - .update_table_info(physical_table_info, None, new_raw_table_info) - .await?; + ) + .await?; Ok(()) } pub(crate) async fn update_logical_tables_metadata(&mut self) -> Result<()> { let table_info_values = self.build_update_metadata()?; - let manager = &self.context.table_metadata_manager; - let chunk_size = manager.batch_update_table_info_value_chunk_size(); - if table_info_values.len() > chunk_size { - let chunks = table_info_values - .into_iter() - .chunks(chunk_size) - .into_iter() - .map(|check| check.collect::>()) - .collect::>(); - for chunk in chunks { - manager.batch_update_table_info_values(chunk).await?; - } - } else { - manager - .batch_update_table_info_values(table_info_values) - .await?; - } - - Ok(()) + batch_update_table_info_values(&self.context.table_metadata_manager, table_info_values) + .await } pub(crate) fn build_update_metadata( diff --git a/src/common/meta/src/ddl/alter_logical_tables/validator.rs b/src/common/meta/src/ddl/alter_logical_tables/validator.rs new file mode 100644 index 0000000000..a6407e8403 --- /dev/null +++ b/src/common/meta/src/ddl/alter_logical_tables/validator.rs @@ -0,0 +1,279 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashSet; + +use api::v1::alter_table_expr::Kind; +use api::v1::AlterTableExpr; +use snafu::{ensure, OptionExt}; +use store_api::storage::TableId; +use table::table_reference::TableReference; + +use crate::ddl::utils::table_id::get_all_table_ids_by_names; +use crate::ddl::utils::table_info::{ + all_logical_table_routes_have_same_physical_id, get_all_table_info_values_by_table_ids, +}; +use crate::error::{ + AlterLogicalTablesInvalidArgumentsSnafu, Result, TableInfoNotFoundSnafu, + TableRouteNotFoundSnafu, +}; +use crate::key::table_info::TableInfoValue; +use crate::key::table_route::{PhysicalTableRouteValue, TableRouteManager, TableRouteValue}; +use crate::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; + +/// [AlterLogicalTableValidator] validates the alter logical expressions. +pub struct AlterLogicalTableValidator<'a> { + physical_table_id: TableId, + alters: Vec<&'a AlterTableExpr>, +} + +impl<'a> AlterLogicalTableValidator<'a> { + pub fn new(physical_table_id: TableId, alters: Vec<&'a AlterTableExpr>) -> Self { + Self { + physical_table_id, + alters, + } + } + + /// Validates all alter table expressions have the same schema and catalog. + fn validate_schema(&self) -> Result<()> { + let is_same_schema = self.alters.windows(2).all(|pair| { + pair[0].catalog_name == pair[1].catalog_name + && pair[0].schema_name == pair[1].schema_name + }); + + ensure!( + is_same_schema, + AlterLogicalTablesInvalidArgumentsSnafu { + err_msg: "Schemas of the alter table expressions are not the same" + } + ); + + Ok(()) + } + + /// Validates that all alter table expressions are of the supported kind. + /// Currently only supports `AddColumns` operations. + fn validate_alter_kind(&self) -> Result<()> { + for alter in &self.alters { + let kind = alter + .kind + .as_ref() + .context(AlterLogicalTablesInvalidArgumentsSnafu { + err_msg: "Alter kind is missing", + })?; + + let Kind::AddColumns(_) = kind else { + return AlterLogicalTablesInvalidArgumentsSnafu { + err_msg: "Only support add columns operation", + } + .fail(); + }; + } + + Ok(()) + } + + fn table_names(&self) -> Vec { + self.alters + .iter() + .map(|alter| { + TableReference::full(&alter.catalog_name, &alter.schema_name, &alter.table_name) + }) + .collect() + } + + /// Validates that the physical table info and route exist. + /// + /// This method performs the following validations: + /// 1. Retrieves the full table info and route for the given physical table id + /// 2. Ensures the table info and table route exists + /// 3. Verifies that the table route is actually a physical table route, not a logical one + /// + /// Returns a tuple containing the validated table info and physical table route. + async fn validate_physical_table( + &self, + table_metadata_manager: &TableMetadataManagerRef, + ) -> Result<( + DeserializedValueWithBytes, + PhysicalTableRouteValue, + )> { + let (table_info, table_route) = table_metadata_manager + .get_full_table_info(self.physical_table_id) + .await?; + + let table_info = table_info.with_context(|| TableInfoNotFoundSnafu { + table: format!("table id - {}", self.physical_table_id), + })?; + + let physical_table_route = table_route + .context(TableRouteNotFoundSnafu { + table_id: self.physical_table_id, + })? 
+ .into_inner(); + + let TableRouteValue::Physical(table_route) = physical_table_route else { + return AlterLogicalTablesInvalidArgumentsSnafu { + err_msg: format!( + "expected a physical table but got a logical table: {:?}", + self.physical_table_id + ), + } + .fail(); + }; + + Ok((table_info, table_route)) + } + + /// Validates that all logical table routes have the same physical table id. + /// + /// This method performs the following validations: + /// 1. Retrieves table routes for all the given table ids. + /// 2. Ensures that all retrieved routes are logical table routes (not physical). + /// 3. Verifies that all logical table routes reference the same physical table id. + /// 4. Returns an error if any route is not logical or references a different physical table. + async fn validate_logical_table_routes( + &self, + table_route_manager: &TableRouteManager, + table_ids: &[TableId], + ) -> Result<()> { + let all_logical_table_routes_have_same_physical_id = + all_logical_table_routes_have_same_physical_id( + table_route_manager, + table_ids, + self.physical_table_id, + ) + .await?; + + ensure!( + all_logical_table_routes_have_same_physical_id, + AlterLogicalTablesInvalidArgumentsSnafu { + err_msg: "All the tasks should have the same physical table id" + } + ); + + Ok(()) + } + + /// Validates the alter logical expressions. + /// + /// This method performs the following validations: + /// 1. Validates that all alter table expressions have the same schema and catalog. + /// 2. Validates that all alter table expressions are of the supported kind. + /// 3. Validates that the physical table info and route exist. + /// 4. Validates that all logical table routes have the same physical table id. + /// + /// Returns a [ValidatorResult] containing the validation results. + pub async fn validate( + &self, + table_metadata_manager: &TableMetadataManagerRef, + ) -> Result<ValidatorResult> { + self.validate_schema()?; + self.validate_alter_kind()?; + let (physical_table_info, physical_table_route) = + self.validate_physical_table(table_metadata_manager).await?; + let table_names = self.table_names(); + let table_ids = + get_all_table_ids_by_names(table_metadata_manager.table_name_manager(), &table_names) + .await?; + let mut table_info_values = get_all_table_info_values_by_table_ids( + table_metadata_manager.table_info_manager(), + &table_ids, + &table_names, + ) + .await?; + self.validate_logical_table_routes( + table_metadata_manager.table_route_manager(), + &table_ids, + ) + .await?; + let skip_alter = self + .alters + .iter() + .zip(table_info_values.iter()) + .map(|(task, table)| skip_alter_logical_region(task, table)) + .collect::<Vec<_>>(); + retain_unskipped(&mut table_info_values, &skip_alter); + let num_skipped = skip_alter.iter().filter(|&&x| x).count(); + + Ok(ValidatorResult { + num_skipped, + skip_alter, + table_info_values, + physical_table_info, + physical_table_route, + }) + } +} + +/// The result of the validator. +pub(crate) struct ValidatorResult { + pub(crate) num_skipped: usize, + pub(crate) skip_alter: Vec<bool>, + pub(crate) table_info_values: Vec<DeserializedValueWithBytes<TableInfoValue>>, + pub(crate) physical_table_info: DeserializedValueWithBytes<TableInfoValue>, + pub(crate) physical_table_route: PhysicalTableRouteValue, +} + +/// Retains the elements that are not skipped. +pub(crate) fn retain_unskipped<T>(target: &mut Vec<T>, skipped: &[bool]) { + debug_assert_eq!(target.len(), skipped.len()); + let mut iter = skipped.iter(); + target.retain(|_| !iter.next().unwrap()); +} + +/// Returns true if the logical region does not need to be altered.
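+/// +/// The alteration is skipped only when every column in the `AddColumns` request already exists in the logical table's schema.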
+fn skip_alter_logical_region(alter: &AlterTableExpr, table: &TableInfoValue) -> bool { + let existing_columns = table + .table_info + .meta + .schema + .column_schemas + .iter() + .map(|c| &c.name) + .collect::>(); + + let Some(kind) = alter.kind.as_ref() else { + return true; // Never get here since we have checked it in `validate_alter_kind` + }; + let Kind::AddColumns(add_columns) = kind else { + return true; // Never get here since we have checked it in `validate_alter_kind` + }; + + // We only check that all columns have been finished. That is to say, + // if one part is finished but another part is not, it will be considered + // unfinished. + add_columns + .add_columns + .iter() + .map(|add_column| add_column.column_def.as_ref().map(|c| &c.name)) + .all(|column| { + column + .map(|c| existing_columns.contains(c)) + .unwrap_or(false) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_retain_unskipped() { + let mut target = vec![1, 2, 3, 4, 5]; + let skipped = vec![false, true, false, true, false]; + retain_unskipped(&mut target, &skipped); + assert_eq!(target, vec![1, 3, 5]); + } +} diff --git a/src/common/meta/src/ddl/alter_table.rs b/src/common/meta/src/ddl/alter_table.rs index bfa0a679ea..ac882cf9a9 100644 --- a/src/common/meta/src/ddl/alter_table.rs +++ b/src/common/meta/src/ddl/alter_table.rs @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod check; +mod executor; mod metadata; mod region_request; -mod update_metadata; use std::vec; @@ -29,30 +28,29 @@ use common_procedure::{ Context as ProcedureContext, ContextProvider, Error as ProcedureError, LockKey, PoisonKey, PoisonKeys, Procedure, ProcedureId, Status, StringKey, }; -use common_telemetry::{debug, error, info}; -use futures::future::{self}; +use common_telemetry::{error, info, warn}; use serde::{Deserialize, Serialize}; use snafu::{ensure, ResultExt}; -use store_api::storage::RegionId; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY; use strum::AsRefStr; use table::metadata::{RawTableInfo, TableId, TableInfo}; use table::table_reference::TableReference; -use crate::cache_invalidator::Context; +use crate::ddl::alter_table::executor::AlterTableExecutor; use crate::ddl::utils::{ - add_peer_context_if_needed, handle_multiple_results, map_to_procedure_error, + extract_column_metadatas, handle_multiple_results, map_to_procedure_error, sync_follower_regions, MultipleResults, }; use crate::ddl::DdlContext; use crate::error::{AbortProcedureSnafu, NoLeaderSnafu, PutPoisonSnafu, Result, RetryLaterSnafu}; -use crate::instruction::CacheIdent; use crate::key::table_info::TableInfoValue; use crate::key::{DeserializedValueWithBytes, RegionDistribution}; use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock}; use crate::metrics; use crate::poison_key::table_poison_key; use crate::rpc::ddl::AlterTableTask; -use crate::rpc::router::{find_leader_regions, find_leaders, region_distribution, RegionRoute}; +use crate::rpc::router::{find_leaders, region_distribution, RegionRoute}; /// The alter table procedure pub struct AlterTableProcedure { @@ -64,6 +62,24 @@ pub struct AlterTableProcedure { /// If we recover the procedure from json, then the table info value is not cached. /// But we already validated it in the prepare step. new_table_info: Option, + /// The alter table executor. 
+ executor: AlterTableExecutor, +} + +/// Builds the executor from the [`AlterTableData`]. +/// +/// # Panics +/// - If the alter kind is not set. +fn build_executor_from_alter_expr(alter_data: &AlterTableData) -> AlterTableExecutor { + let table_name = alter_data.table_ref().into(); + let table_id = alter_data.table_id; + let alter_kind = alter_data.task.alter_table.kind.as_ref().unwrap(); + let new_table_name = if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind { + Some(new_table_name.to_string()) + } else { + None + }; + AlterTableExecutor::new(table_name, table_id, new_table_name) } impl AlterTableProcedure { @@ -71,33 +87,42 @@ impl AlterTableProcedure { pub fn new(table_id: TableId, task: AlterTableTask, context: DdlContext) -> Result { task.validate()?; + let data = AlterTableData::new(task, table_id); + let executor = build_executor_from_alter_expr(&data); Ok(Self { context, - data: AlterTableData::new(task, table_id), + data, new_table_info: None, + executor, }) } pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult { let data: AlterTableData = serde_json::from_str(json).context(FromJsonSnafu)?; + let executor = build_executor_from_alter_expr(&data); + Ok(AlterTableProcedure { context, data, new_table_info: None, + executor, }) } // Checks whether the table exists. pub(crate) async fn on_prepare(&mut self) -> Result { - self.check_alter().await?; + self.executor + .on_prepare(&self.context.table_metadata_manager) + .await?; self.fill_table_info().await?; - // Validates the request and builds the new table info. - // We need to build the new table info here because we should ensure the alteration - // is valid in `UpdateMeta` state as we already altered the region. - // Safety: `fill_table_info()` already set it. + // Safety: filled in `fill_table_info`. let table_info_value = self.data.table_info_value.as_ref().unwrap(); - self.new_table_info = Some(self.build_new_table_info(&table_info_value.table_info)?); + let new_table_info = AlterTableExecutor::validate_alter_table_expr( + &table_info_value.table_info, + self.data.task.alter_table.clone(), + )?; + self.new_table_info = Some(new_table_info); // Safety: Checked in `AlterTableProcedure::new`. let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap(); @@ -140,9 +165,7 @@ impl AlterTableProcedure { self.data.region_distribution = Some(region_distribution(&physical_table_route.region_routes)); - let leaders = find_leaders(&physical_table_route.region_routes); - let mut alter_region_tasks = Vec::with_capacity(leaders.len()); let alter_kind = self.make_region_alter_kind()?; info!( @@ -155,31 +178,14 @@ impl AlterTableProcedure { ensure!(!leaders.is_empty(), NoLeaderSnafu { table_id }); // Puts the poison before submitting alter region requests to datanodes. 
self.put_poison(ctx_provider, procedure_id).await?; - for datanode in leaders { - let requester = self.context.node_manager.datanode(&datanode).await; - let regions = find_leader_regions(&physical_table_route.region_routes, &datanode); - - for region in regions { - let region_id = RegionId::new(table_id, region); - let request = self.make_alter_region_request(region_id, alter_kind.clone())?; - debug!("Submitting {request:?} to {datanode}"); - - let datanode = datanode.clone(); - let requester = requester.clone(); - - alter_region_tasks.push(async move { - requester - .handle(request) - .await - .map_err(add_peer_context_if_needed(datanode)) - }); - } - } - - let results = future::join_all(alter_region_tasks) - .await - .into_iter() - .collect::>(); + let results = self + .executor + .on_alter_regions( + &self.context.node_manager, + &physical_table_route.region_routes, + alter_kind, + ) + .await; match handle_multiple_results(results) { MultipleResults::PartialRetryable(error) => { @@ -202,9 +208,9 @@ impl AlterTableProcedure { }) } MultipleResults::Ok(results) => { - self.submit_sync_region_requests(results, &physical_table_route.region_routes) + self.submit_sync_region_requests(&results, &physical_table_route.region_routes) .await; - self.data.state = AlterTableState::UpdateMetadata; + self.handle_alter_region_response(results)?; Ok(Status::executing_with_clean_poisons(true)) } MultipleResults::AllNonRetryable(error) => { @@ -220,9 +226,21 @@ impl AlterTableProcedure { } } + fn handle_alter_region_response(&mut self, mut results: Vec) -> Result<()> { + if let Some(column_metadatas) = + extract_column_metadatas(&mut results, TABLE_COLUMN_METADATA_EXTENSION_KEY)? + { + self.data.column_metadatas = column_metadatas; + } else { + warn!("altering table result doesn't contains extension key `{TABLE_COLUMN_METADATA_EXTENSION_KEY}`,leaving the table's column metadata unchanged"); + } + self.data.state = AlterTableState::UpdateMetadata; + Ok(()) + } + async fn submit_sync_region_requests( &mut self, - results: Vec, + results: &[RegionResponse], region_routes: &[RegionRoute], ) { // Safety: filled in `prepare` step. @@ -244,39 +262,34 @@ impl AlterTableProcedure { pub(crate) async fn on_update_metadata(&mut self) -> Result { let table_id = self.data.table_id(); let table_ref = self.data.table_ref(); - // Safety: checked before. + // Safety: filled in `fill_table_info`. let table_info_value = self.data.table_info_value.as_ref().unwrap(); + // Safety: Checked in `AlterTableProcedure::new`. + let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap(); + // Gets the table info from the cache or builds it. - let new_info = match &self.new_table_info { + let new_info = match &self.new_table_info { Some(cached) => cached.clone(), - None => self.build_new_table_info(&table_info_value.table_info) + None => AlterTableExecutor::validate_alter_table_expr( + &table_info_value.table_info, + self.data.task.alter_table.clone(), + ) .inspect_err(|e| { // We already check the table info in the prepare step so this should not happen. error!(e; "Unable to build info for table {} in update metadata step, table_id: {}", table_ref, table_id); })?, }; - debug!( - "Starting update table: {} metadata, new table info {:?}", - table_ref.to_string(), - new_info - ); - - // Safety: Checked in `AlterTableProcedure::new`. 
- let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap(); - if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind { - self.on_update_metadata_for_rename(new_table_name.to_string(), table_info_value) - .await?; - } else { - // region distribution is set in submit_alter_region_requests - let region_distribution = self.data.region_distribution.as_ref().unwrap().clone(); - self.on_update_metadata_for_alter( - new_info.into(), - region_distribution, + // Safety: region distribution is set in `submit_alter_region_requests`. + self.executor + .on_alter_metadata( + &self.context.table_metadata_manager, table_info_value, + self.data.region_distribution.as_ref(), + new_info.into(), + &self.data.column_metadatas, ) .await?; - } info!("Updated table metadata for table {table_ref}, table_id: {table_id}, kind: {alter_kind:?}"); self.data.state = AlterTableState::InvalidateTableCache; @@ -285,18 +298,9 @@ impl AlterTableProcedure { /// Broadcasts the invalidating table cache instructions. async fn on_broadcast(&mut self) -> Result { - let cache_invalidator = &self.context.cache_invalidator; - - cache_invalidator - .invalidate( - &Context::default(), - &[ - CacheIdent::TableId(self.data.table_id()), - CacheIdent::TableName(self.data.table_ref().into()), - ], - ) + self.executor + .invalidate_table_cache(&self.context.cache_invalidator) .await?; - Ok(Status::done()) } @@ -318,6 +322,16 @@ impl AlterTableProcedure { lock_key } + + #[cfg(test)] + pub(crate) fn data(&self) -> &AlterTableData { + &self.data + } + + #[cfg(test)] + pub(crate) fn mut_data(&mut self) -> &mut AlterTableData { + &mut self.data + } } #[async_trait] @@ -380,6 +394,8 @@ pub struct AlterTableData { state: AlterTableState, task: AlterTableTask, table_id: TableId, + #[serde(default)] + column_metadatas: Vec, /// Table info value before alteration. table_info_value: Option>, /// Region distribution for table in case we need to update region options. @@ -392,6 +408,7 @@ impl AlterTableData { state: AlterTableState::Prepare, task, table_id, + column_metadatas: vec![], table_info_value: None, region_distribution: None, } @@ -410,4 +427,14 @@ impl AlterTableData { .as_ref() .map(|value| &value.table_info) } + + #[cfg(test)] + pub(crate) fn column_metadatas(&self) -> &[ColumnMetadata] { + &self.column_metadatas + } + + #[cfg(test)] + pub(crate) fn set_column_metadatas(&mut self, column_metadatas: Vec) { + self.column_metadatas = column_metadatas; + } } diff --git a/src/common/meta/src/ddl/alter_table/check.rs b/src/common/meta/src/ddl/alter_table/check.rs deleted file mode 100644 index 5be40ac3e2..0000000000 --- a/src/common/meta/src/ddl/alter_table/check.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use api::v1::alter_table_expr::Kind; -use api::v1::RenameTable; -use common_catalog::format_full_table_name; -use snafu::ensure; - -use crate::ddl::alter_table::AlterTableProcedure; -use crate::error::{self, Result}; -use crate::key::table_name::TableNameKey; - -impl AlterTableProcedure { - /// Checks: - /// - The new table name doesn't exist (rename). - /// - Table exists. - pub(crate) async fn check_alter(&self) -> Result<()> { - let alter_expr = &self.data.task.alter_table; - let catalog = &alter_expr.catalog_name; - let schema = &alter_expr.schema_name; - let table_name = &alter_expr.table_name; - // Safety: Checked in `AlterTableProcedure::new`. - let alter_kind = self.data.task.alter_table.kind.as_ref().unwrap(); - - let manager = &self.context.table_metadata_manager; - if let Kind::RenameTable(RenameTable { new_table_name }) = alter_kind { - let new_table_name_key = TableNameKey::new(catalog, schema, new_table_name); - let exists = manager - .table_name_manager() - .exists(new_table_name_key) - .await?; - ensure!( - !exists, - error::TableAlreadyExistsSnafu { - table_name: format_full_table_name(catalog, schema, new_table_name), - } - ) - } - - let table_name_key = TableNameKey::new(catalog, schema, table_name); - let exists = manager.table_name_manager().exists(table_name_key).await?; - ensure!( - exists, - error::TableNotFoundSnafu { - table_name: format_full_table_name(catalog, schema, &alter_expr.table_name), - } - ); - - Ok(()) - } -} diff --git a/src/common/meta/src/ddl/alter_table/executor.rs b/src/common/meta/src/ddl/alter_table/executor.rs new file mode 100644 index 0000000000..933d04337e --- /dev/null +++ b/src/common/meta/src/ddl/alter_table/executor.rs @@ -0,0 +1,308 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::collections::HashMap;
+
+use api::region::RegionResponse;
+use api::v1::region::region_request::Body;
+use api::v1::region::{alter_request, AlterRequest, RegionRequest, RegionRequestHeader};
+use api::v1::AlterTableExpr;
+use common_catalog::format_full_table_name;
+use common_grpc_expr::alter_expr_to_request;
+use common_telemetry::debug;
+use common_telemetry::tracing_context::TracingContext;
+use futures::future;
+use snafu::{ensure, ResultExt};
+use store_api::metadata::ColumnMetadata;
+use store_api::storage::{RegionId, TableId};
+use table::metadata::{RawTableInfo, TableInfo};
+use table::requests::AlterKind;
+use table::table_name::TableName;
+
+use crate::cache_invalidator::{CacheInvalidatorRef, Context};
+use crate::ddl::utils::{add_peer_context_if_needed, raw_table_info};
+use crate::error::{self, Result, UnexpectedSnafu};
+use crate::instruction::CacheIdent;
+use crate::key::table_info::TableInfoValue;
+use crate::key::table_name::TableNameKey;
+use crate::key::{DeserializedValueWithBytes, RegionDistribution, TableMetadataManagerRef};
+use crate::node_manager::NodeManagerRef;
+use crate::rpc::router::{find_leaders, region_distribution, RegionRoute};
+
+/// [AlterTableExecutor] performs:
+/// - Alters the metadata of the table.
+/// - Alters regions on the datanode nodes.
+pub struct AlterTableExecutor {
+    table: TableName,
+    table_id: TableId,
+    /// The new table name if the alter kind is rename table.
+    new_table_name: Option<String>,
+}
+
+impl AlterTableExecutor {
+    /// Creates a new [`AlterTableExecutor`].
+    pub fn new(table: TableName, table_id: TableId, new_table_name: Option<String>) -> Self {
+        Self {
+            table,
+            table_id,
+            new_table_name,
+        }
+    }
+
+    /// Prepares to alter the table.
+    ///
+    /// ## Checks:
+    /// - The new table name doesn't exist (rename).
+    /// - Table exists.
+    pub(crate) async fn on_prepare(
+        &self,
+        table_metadata_manager: &TableMetadataManagerRef,
+    ) -> Result<()> {
+        let catalog = &self.table.catalog_name;
+        let schema = &self.table.schema_name;
+        let table_name = &self.table.table_name;
+
+        let manager = table_metadata_manager;
+        if let Some(new_table_name) = &self.new_table_name {
+            let new_table_name_key = TableNameKey::new(catalog, schema, new_table_name);
+            let exists = manager
+                .table_name_manager()
+                .exists(new_table_name_key)
+                .await?;
+            ensure!(
+                !exists,
+                error::TableAlreadyExistsSnafu {
+                    table_name: format_full_table_name(catalog, schema, new_table_name),
+                }
+            )
+        }
+
+        let table_name_key = TableNameKey::new(catalog, schema, table_name);
+        let exists = manager.table_name_manager().exists(table_name_key).await?;
+        ensure!(
+            exists,
+            error::TableNotFoundSnafu {
+                table_name: format_full_table_name(catalog, schema, table_name),
+            }
+        );
+
+        Ok(())
+    }
+
+    /// Validates the alter table expression and builds the new table info.
+    ///
+    /// This validation is performed early to ensure the alteration is valid before
+    /// proceeding to the `on_alter_metadata` state, where regions have already been altered.
+    /// Building the new table info here allows us to catch any issues with the
+    /// alteration before committing metadata changes.
+    pub(crate) fn validate_alter_table_expr(
+        table_info: &RawTableInfo,
+        alter_table_expr: AlterTableExpr,
+    ) -> Result<TableInfo> {
+        build_new_table_info(table_info, alter_table_expr)
+    }
+
+    /// Updates table metadata for alter table operation.
+    pub(crate) async fn on_alter_metadata(
+        &self,
+        table_metadata_manager: &TableMetadataManagerRef,
+        current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
+        region_distribution: Option<&RegionDistribution>,
+        mut raw_table_info: RawTableInfo,
+        column_metadatas: &[ColumnMetadata],
+    ) -> Result<()> {
+        let table_ref = self.table.table_ref();
+        let table_id = self.table_id;
+
+        if let Some(new_table_name) = &self.new_table_name {
+            debug!(
+                "Starting update table: {} metadata, table_id: {}, new table info: {:?}, new table name: {}",
+                table_ref, table_id, raw_table_info, new_table_name
+            );
+
+            table_metadata_manager
+                .rename_table(current_table_info_value, new_table_name.to_string())
+                .await?;
+        } else {
+            debug!(
+                "Starting update table: {} metadata, table_id: {}, new table info: {:?}",
+                table_ref, table_id, raw_table_info
+            );
+
+            ensure!(
+                region_distribution.is_some(),
+                UnexpectedSnafu {
+                    err_msg: "region distribution is not set when updating table metadata",
+                }
+            );
+
+            if !column_metadatas.is_empty() {
+                raw_table_info::update_table_info_column_ids(&mut raw_table_info, column_metadatas);
+            }
+            table_metadata_manager
+                .update_table_info(
+                    current_table_info_value,
+                    region_distribution.cloned(),
+                    raw_table_info,
+                )
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    /// Alters regions on the datanode nodes.
+    pub(crate) async fn on_alter_regions(
+        &self,
+        node_manager: &NodeManagerRef,
+        region_routes: &[RegionRoute],
+        kind: Option<alter_request::Kind>,
+    ) -> Vec<Result<RegionResponse>> {
+        let region_distribution = region_distribution(region_routes);
+        let leaders = find_leaders(region_routes)
+            .into_iter()
+            .map(|p| (p.id, p))
+            .collect::<HashMap<_, _>>();
+        let total_num_region = region_distribution
+            .values()
+            .map(|r| r.leader_regions.len())
+            .sum::<usize>();
+        let mut alter_region_tasks = Vec::with_capacity(total_num_region);
+        for (datanode_id, region_role_set) in region_distribution {
+            if region_role_set.leader_regions.is_empty() {
+                continue;
+            }
+            // Safety: must exist.
+            let peer = leaders.get(&datanode_id).unwrap();
+            let requester = node_manager.datanode(peer).await;
+
+            for region_id in region_role_set.leader_regions {
+                let region_id = RegionId::new(self.table_id, region_id);
+                let request = make_alter_region_request(region_id, kind.clone());
+
+                let requester = requester.clone();
+                let peer = peer.clone();
+
+                alter_region_tasks.push(async move {
+                    requester
+                        .handle(request)
+                        .await
+                        .map_err(add_peer_context_if_needed(peer))
+                });
+            }
+        }
+
+        future::join_all(alter_region_tasks)
+            .await
+            .into_iter()
+            .collect::<Vec<_>>()
+    }
+
+    /// Invalidates cache for the table.
+    pub(crate) async fn invalidate_table_cache(
+        &self,
+        cache_invalidator: &CacheInvalidatorRef,
+    ) -> Result<()> {
+        let ctx = Context {
+            subject: Some(format!(
+                "Invalidate table cache by altering table {}, table_id: {}",
+                self.table.table_ref(),
+                self.table_id,
+            )),
+        };
+
+        cache_invalidator
+            .invalidate(
+                &ctx,
+                &[
+                    CacheIdent::TableName(self.table.clone()),
+                    CacheIdent::TableId(self.table_id),
+                ],
+            )
+            .await?;
+
+        Ok(())
+    }
+}
+
+/// Makes alter region request.
+pub(crate) fn make_alter_region_request(
+    region_id: RegionId,
+    kind: Option<alter_request::Kind>,
+) -> RegionRequest {
+    RegionRequest {
+        header: Some(RegionRequestHeader {
+            tracing_context: TracingContext::from_current_span().to_w3c(),
+            ..Default::default()
+        }),
+        body: Some(Body::Alter(AlterRequest {
+            region_id: region_id.as_u64(),
+            kind,
+            ..Default::default()
+        })),
+    }
+}
+
+/// Builds new table info after alteration.
+/// +/// This function creates a new table info by applying the alter table expression +/// to the existing table info. For add column operations, it increments the +/// `next_column_id` by the number of columns being added, which may result in gaps +/// in the column id sequence. +fn build_new_table_info( + table_info: &RawTableInfo, + alter_table_expr: AlterTableExpr, +) -> Result { + let table_info = + TableInfo::try_from(table_info.clone()).context(error::ConvertRawTableInfoSnafu)?; + let schema_name = &table_info.schema_name; + let catalog_name = &table_info.catalog_name; + let table_name = &table_info.name; + let table_id = table_info.ident.table_id; + let request = alter_expr_to_request(table_id, alter_table_expr) + .context(error::ConvertAlterTableRequestSnafu)?; + + let new_meta = table_info + .meta + .builder_with_alter_kind(table_name, &request.alter_kind) + .context(error::TableSnafu)? + .build() + .with_context(|_| error::BuildTableMetaSnafu { + table_name: format_full_table_name(catalog_name, schema_name, table_name), + })?; + + let mut new_info = table_info.clone(); + new_info.meta = new_meta; + new_info.ident.version = table_info.ident.version + 1; + match request.alter_kind { + AlterKind::AddColumns { columns } => { + // Bumps the column id for the new columns. + // It may bump more than the actual number of columns added if there are + // existing columns, but it's fine. + new_info.meta.next_column_id += columns.len() as u32; + } + AlterKind::RenameTable { new_table_name } => { + new_info.name = new_table_name.to_string(); + } + AlterKind::DropColumns { .. } + | AlterKind::ModifyColumnTypes { .. } + | AlterKind::SetTableOptions { .. } + | AlterKind::UnsetTableOptions { .. } + | AlterKind::SetIndexes { .. } + | AlterKind::UnsetIndexes { .. } + | AlterKind::DropDefaults { .. } => {} + } + + Ok(new_info) +} diff --git a/src/common/meta/src/ddl/alter_table/region_request.rs b/src/common/meta/src/ddl/alter_table/region_request.rs index cf8e0d57aa..dab63cc85a 100644 --- a/src/common/meta/src/ddl/alter_table/region_request.rs +++ b/src/common/meta/src/ddl/alter_table/region_request.rs @@ -15,43 +15,16 @@ use std::collections::HashSet; use api::v1::alter_table_expr::Kind; -use api::v1::region::region_request::Body; use api::v1::region::{ - alter_request, AddColumn, AddColumns, AlterRequest, DropColumn, DropColumns, RegionColumnDef, - RegionRequest, RegionRequestHeader, + alter_request, AddColumn, AddColumns, DropColumn, DropColumns, RegionColumnDef, }; -use common_telemetry::tracing_context::TracingContext; use snafu::OptionExt; -use store_api::storage::RegionId; use table::metadata::RawTableInfo; use crate::ddl::alter_table::AlterTableProcedure; use crate::error::{InvalidProtoMsgSnafu, Result}; impl AlterTableProcedure { - /// Makes alter region request from existing an alter kind. - /// Region alter request always add columns if not exist. - pub(crate) fn make_alter_region_request( - &self, - region_id: RegionId, - kind: Option, - ) -> Result { - // Safety: checked - let table_info = self.data.table_info().unwrap(); - - Ok(RegionRequest { - header: Some(RegionRequestHeader { - tracing_context: TracingContext::from_current_span().to_w3c(), - ..Default::default() - }), - body: Some(Body::Alter(AlterRequest { - region_id: region_id.as_u64(), - schema_version: table_info.ident.version, - kind, - })), - }) - } - /// Makes alter kind proto that all regions can reuse. /// Region alter request always add columns if not exist. 
pub(crate) fn make_region_alter_kind(&self) -> Result> { @@ -135,6 +108,8 @@ fn create_proto_alter_kind( Kind::UnsetTableOptions(v) => Ok(Some(alter_request::Kind::UnsetTableOptions(v.clone()))), Kind::SetIndex(v) => Ok(Some(alter_request::Kind::SetIndex(v.clone()))), Kind::UnsetIndex(v) => Ok(Some(alter_request::Kind::UnsetIndex(v.clone()))), + Kind::SetIndexes(v) => Ok(Some(alter_request::Kind::SetIndexes(v.clone()))), + Kind::UnsetIndexes(v) => Ok(Some(alter_request::Kind::UnsetIndexes(v.clone()))), Kind::DropDefaults(v) => Ok(Some(alter_request::Kind::DropDefaults(v.clone()))), } } @@ -155,6 +130,7 @@ mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use store_api::storage::{RegionId, TableId}; + use crate::ddl::alter_table::executor::make_alter_region_request; use crate::ddl::alter_table::AlterTableProcedure; use crate::ddl::test_util::columns::TestColumnDefBuilder; use crate::ddl::test_util::create_table::{ @@ -261,15 +237,13 @@ mod tests { let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context).unwrap(); procedure.on_prepare().await.unwrap(); let alter_kind = procedure.make_region_alter_kind().unwrap(); - let Some(Body::Alter(alter_region_request)) = procedure - .make_alter_region_request(region_id, alter_kind) - .unwrap() - .body + let Some(Body::Alter(alter_region_request)) = + make_alter_region_request(region_id, alter_kind).body else { unreachable!() }; assert_eq!(alter_region_request.region_id, region_id.as_u64()); - assert_eq!(alter_region_request.schema_version, 1); + assert_eq!(alter_region_request.schema_version, 0); assert_eq!( alter_region_request.kind, Some(region::alter_request::Kind::AddColumns( @@ -319,15 +293,13 @@ mod tests { let mut procedure = AlterTableProcedure::new(table_id, task, ddl_context).unwrap(); procedure.on_prepare().await.unwrap(); let alter_kind = procedure.make_region_alter_kind().unwrap(); - let Some(Body::Alter(alter_region_request)) = procedure - .make_alter_region_request(region_id, alter_kind) - .unwrap() - .body + let Some(Body::Alter(alter_region_request)) = + make_alter_region_request(region_id, alter_kind).body else { unreachable!() }; assert_eq!(alter_region_request.region_id, region_id.as_u64()); - assert_eq!(alter_region_request.schema_version, 1); + assert_eq!(alter_region_request.schema_version, 0); assert_eq!( alter_region_request.kind, Some(region::alter_request::Kind::ModifyColumnTypes( diff --git a/src/common/meta/src/ddl/alter_table/update_metadata.rs b/src/common/meta/src/ddl/alter_table/update_metadata.rs deleted file mode 100644 index 08ea63689e..0000000000 --- a/src/common/meta/src/ddl/alter_table/update_metadata.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use common_grpc_expr::alter_expr_to_request; -use snafu::ResultExt; -use table::metadata::{RawTableInfo, TableInfo}; -use table::requests::AlterKind; - -use crate::ddl::alter_table::AlterTableProcedure; -use crate::error::{self, Result}; -use crate::key::table_info::TableInfoValue; -use crate::key::{DeserializedValueWithBytes, RegionDistribution}; - -impl AlterTableProcedure { - /// Builds new table info after alteration. - /// It bumps the column id of the table by the number of the add column requests. - /// So there may be holes in the column id sequence. - pub(crate) fn build_new_table_info(&self, table_info: &RawTableInfo) -> Result { - let table_info = - TableInfo::try_from(table_info.clone()).context(error::ConvertRawTableInfoSnafu)?; - let table_ref = self.data.table_ref(); - let alter_expr = self.data.task.alter_table.clone(); - let request = alter_expr_to_request(self.data.table_id(), alter_expr) - .context(error::ConvertAlterTableRequestSnafu)?; - - let new_meta = table_info - .meta - .builder_with_alter_kind(table_ref.table, &request.alter_kind) - .context(error::TableSnafu)? - .build() - .with_context(|_| error::BuildTableMetaSnafu { - table_name: table_ref.table, - })?; - - let mut new_info = table_info.clone(); - new_info.meta = new_meta; - new_info.ident.version = table_info.ident.version + 1; - match request.alter_kind { - AlterKind::AddColumns { columns } => { - // Bumps the column id for the new columns. - // It may bump more than the actual number of columns added if there are - // existing columns, but it's fine. - new_info.meta.next_column_id += columns.len() as u32; - } - AlterKind::RenameTable { new_table_name } => { - new_info.name = new_table_name.to_string(); - } - AlterKind::DropColumns { .. } - | AlterKind::ModifyColumnTypes { .. } - | AlterKind::SetTableOptions { .. } - | AlterKind::UnsetTableOptions { .. } - | AlterKind::SetIndex { .. } - | AlterKind::UnsetIndex { .. } - | AlterKind::DropDefaults { .. } => {} - } - - Ok(new_info) - } - - /// Updates table metadata for rename table operation. - pub(crate) async fn on_update_metadata_for_rename( - &self, - new_table_name: String, - current_table_info_value: &DeserializedValueWithBytes, - ) -> Result<()> { - let table_metadata_manager = &self.context.table_metadata_manager; - table_metadata_manager - .rename_table(current_table_info_value, new_table_name) - .await?; - - Ok(()) - } - - /// Updates table metadata for alter table operation. 
- pub(crate) async fn on_update_metadata_for_alter( - &self, - new_table_info: RawTableInfo, - region_distribution: RegionDistribution, - current_table_info_value: &DeserializedValueWithBytes, - ) -> Result<()> { - let table_metadata_manager = &self.context.table_metadata_manager; - table_metadata_manager - .update_table_info( - current_table_info_value, - Some(region_distribution), - new_table_info, - ) - .await?; - - Ok(()) - } -} diff --git a/src/common/meta/src/ddl/create_logical_tables.rs b/src/common/meta/src/ddl/create_logical_tables.rs index d1f1ac37b6..3b0a091a14 100644 --- a/src/common/meta/src/ddl/create_logical_tables.rs +++ b/src/common/meta/src/ddl/create_logical_tables.rs @@ -27,7 +27,7 @@ use common_telemetry::{debug, error, warn}; use futures::future; pub use region_request::create_region_request_builder; use serde::{Deserialize, Serialize}; -use snafu::{ensure, ResultExt}; +use snafu::ResultExt; use store_api::metadata::ColumnMetadata; use store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY; use store_api::storage::{RegionId, RegionNumber}; @@ -35,10 +35,11 @@ use strum::AsRefStr; use table::metadata::{RawTableInfo, TableId}; use crate::ddl::utils::{ - add_peer_context_if_needed, map_to_procedure_error, sync_follower_regions, + add_peer_context_if_needed, extract_column_metadatas, map_to_procedure_error, + sync_follower_regions, }; use crate::ddl::DdlContext; -use crate::error::{DecodeJsonSnafu, MetadataCorruptionSnafu, Result}; +use crate::error::Result; use crate::key::table_route::TableRouteValue; use crate::lock_key::{CatalogLock, SchemaLock, TableLock, TableNameLock}; use crate::metrics; @@ -166,47 +167,23 @@ impl CreateLogicalTablesProcedure { .into_iter() .collect::>>()?; - // Collects response from datanodes. - let phy_raw_schemas = results - .iter_mut() - .map(|res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)) - .collect::>(); - - if phy_raw_schemas.is_empty() { - self.submit_sync_region_requests(results, region_routes) - .await; - self.data.state = CreateTablesState::CreateMetadata; - return Ok(Status::executing(false)); - } - - // Verify all the physical schemas are the same - // Safety: previous check ensures this vec is not empty - let first = phy_raw_schemas.first().unwrap(); - ensure!( - phy_raw_schemas.iter().all(|x| x == first), - MetadataCorruptionSnafu { - err_msg: "The physical schemas from datanodes are not the same." - } - ); - - // Decodes the physical raw schemas - if let Some(phy_raw_schemas) = first { - self.data.physical_columns = - ColumnMetadata::decode_list(phy_raw_schemas).context(DecodeJsonSnafu)?; + if let Some(column_metadatas) = + extract_column_metadatas(&mut results, ALTER_PHYSICAL_EXTENSION_KEY)? 
+ { + self.data.physical_columns = column_metadatas; } else { warn!("creating logical table result doesn't contains extension key `{ALTER_PHYSICAL_EXTENSION_KEY}`,leaving the physical table's schema unchanged"); } - self.submit_sync_region_requests(results, region_routes) + self.submit_sync_region_requests(&results, region_routes) .await; self.data.state = CreateTablesState::CreateMetadata; - Ok(Status::executing(true)) } async fn submit_sync_region_requests( &self, - results: Vec, + results: &[RegionResponse], region_routes: &[RegionRoute], ) { if let Err(err) = sync_follower_regions( diff --git a/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs b/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs index 75d235812f..c6e50b01d8 100644 --- a/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs +++ b/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs @@ -22,7 +22,7 @@ use table::table_name::TableName; use crate::cache_invalidator::Context; use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure; -use crate::ddl::physical_table_metadata; +use crate::ddl::utils::raw_table_info; use crate::error::{Result, TableInfoNotFoundSnafu}; use crate::instruction::CacheIdent; @@ -47,7 +47,7 @@ impl CreateLogicalTablesProcedure { // Generates new table info let raw_table_info = physical_table_info.deref().table_info.clone(); - let new_table_info = physical_table_metadata::build_new_physical_table_info( + let new_table_info = raw_table_info::build_new_physical_table_info( raw_table_info, &self.data.physical_columns, ); diff --git a/src/common/meta/src/ddl/create_table.rs b/src/common/meta/src/ddl/create_table.rs index 1a62c8e716..45b7336229 100644 --- a/src/common/meta/src/ddl/create_table.rs +++ b/src/common/meta/src/ddl/create_table.rs @@ -21,21 +21,24 @@ use common_error::ext::BoxedError; use common_procedure::error::{ ExternalSnafu, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu, }; -use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status}; -use common_telemetry::info; +use common_procedure::{Context as ProcedureContext, LockKey, Procedure, ProcedureId, Status}; use common_telemetry::tracing_context::TracingContext; +use common_telemetry::{info, warn}; use futures::future::join_all; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt}; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY; use store_api::storage::{RegionId, RegionNumber}; use strum::AsRefStr; use table::metadata::{RawTableInfo, TableId}; use table::table_reference::TableReference; use crate::ddl::create_table_template::{build_template, CreateRequestBuilder}; +use crate::ddl::utils::raw_table_info::update_table_info_column_ids; use crate::ddl::utils::{ - add_peer_context_if_needed, convert_region_routes_to_detecting_regions, map_to_procedure_error, - region_storage_path, + add_peer_context_if_needed, convert_region_routes_to_detecting_regions, + extract_column_metadatas, map_to_procedure_error, region_storage_path, }; use crate::ddl::{DdlContext, TableMetadata}; use crate::error::{self, Result}; @@ -243,14 +246,20 @@ impl CreateTableProcedure { } } - join_all(create_region_tasks) + let mut results = join_all(create_region_tasks) .await .into_iter() .collect::>>()?; - self.creator.data.state = CreateTableState::CreateMetadata; + if let Some(column_metadatas) = + extract_column_metadatas(&mut results, TABLE_COLUMN_METADATA_EXTENSION_KEY)? 
+ { + self.creator.data.column_metadatas = column_metadatas; + } else { + warn!("creating table result doesn't contains extension key `{TABLE_COLUMN_METADATA_EXTENSION_KEY}`,leaving the table's column metadata unchanged"); + } - // TODO(weny): Add more tests. + self.creator.data.state = CreateTableState::CreateMetadata; Ok(Status::executing(true)) } @@ -258,11 +267,15 @@ impl CreateTableProcedure { /// /// Abort(not-retry): /// - Failed to create table metadata. - async fn on_create_metadata(&mut self) -> Result { + async fn on_create_metadata(&mut self, pid: ProcedureId) -> Result { let table_id = self.table_id(); + let table_ref = self.creator.data.table_ref(); let manager = &self.context.table_metadata_manager; - let raw_table_info = self.table_info().clone(); + let mut raw_table_info = self.table_info().clone(); + if !self.creator.data.column_metadatas.is_empty() { + update_table_info_column_ids(&mut raw_table_info, &self.creator.data.column_metadatas); + } // Safety: the region_wal_options must be allocated. let region_wal_options = self.region_wal_options()?.clone(); // Safety: the table_route must be allocated. @@ -276,7 +289,10 @@ impl CreateTableProcedure { self.context .register_failure_detectors(detecting_regions) .await; - info!("Created table metadata for table {table_id}"); + info!( + "Successfully created table: {}, table_id: {}, procedure_id: {}", + table_ref, table_id, pid + ); self.creator.opening_regions.clear(); Ok(Status::done_with_output(table_id)) @@ -304,7 +320,7 @@ impl Procedure for CreateTableProcedure { Ok(()) } - async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + async fn execute(&mut self, ctx: &ProcedureContext) -> ProcedureResult { let state = &self.creator.data.state; let _timer = metrics::METRIC_META_PROCEDURE_CREATE_TABLE @@ -314,7 +330,7 @@ impl Procedure for CreateTableProcedure { match state { CreateTableState::Prepare => self.on_prepare().await, CreateTableState::DatanodeCreateRegions => self.on_datanode_create_regions().await, - CreateTableState::CreateMetadata => self.on_create_metadata().await, + CreateTableState::CreateMetadata => self.on_create_metadata(ctx.procedure_id).await, } .map_err(map_to_procedure_error) } @@ -346,6 +362,7 @@ impl TableCreator { Self { data: CreateTableData { state: CreateTableState::Prepare, + column_metadatas: vec![], task, table_route: None, region_wal_options: None, @@ -407,6 +424,8 @@ pub enum CreateTableState { pub struct CreateTableData { pub state: CreateTableState, pub task: CreateTableTask, + #[serde(default)] + pub column_metadatas: Vec, /// None stands for not allocated yet. table_route: Option, /// None stands for not allocated yet. 
diff --git a/src/common/meta/src/ddl/create_table_template.rs b/src/common/meta/src/ddl/create_table_template.rs index 290fc33308..55adc2cded 100644 --- a/src/common/meta/src/ddl/create_table_template.rs +++ b/src/common/meta/src/ddl/create_table_template.rs @@ -14,17 +14,57 @@ use std::collections::HashMap; +use api::v1::column_def::try_as_column_def; use api::v1::region::{CreateRequest, RegionColumnDef}; use api::v1::{ColumnDef, CreateTableExpr, SemanticType}; -use snafu::OptionExt; -use store_api::metric_engine_consts::LOGICAL_TABLE_METADATA_KEY; +use snafu::{OptionExt, ResultExt}; +use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME}; use store_api::storage::{RegionId, RegionNumber}; -use table::metadata::TableId; +use table::metadata::{RawTableInfo, TableId}; -use crate::error; -use crate::error::Result; +use crate::error::{self, Result}; use crate::wal_options_allocator::prepare_wal_options; +/// Builds a [CreateRequest] from a [RawTableInfo]. +/// +/// Note: **This method is only used for creating logical tables.** +pub(crate) fn build_template_from_raw_table_info( + raw_table_info: &RawTableInfo, +) -> Result { + let primary_key_indices = &raw_table_info.meta.primary_key_indices; + let column_defs = raw_table_info + .meta + .schema + .column_schemas + .iter() + .enumerate() + .map(|(i, c)| { + let is_primary_key = primary_key_indices.contains(&i); + let column_def = try_as_column_def(c, is_primary_key) + .context(error::ConvertColumnDefSnafu { column: &c.name })?; + + Ok(RegionColumnDef { + column_def: Some(column_def), + // The column id will be overridden by the metric engine. + // So we just use the index as the column id. + column_id: i as u32, + }) + }) + .collect::>>()?; + + let options = HashMap::from(&raw_table_info.meta.options); + let template = CreateRequest { + region_id: 0, + engine: METRIC_ENGINE_NAME.to_string(), + column_defs, + primary_key: primary_key_indices.iter().map(|i| *i as u32).collect(), + path: String::new(), + options, + }; + + Ok(template) +} + pub(crate) fn build_template(create_table_expr: &CreateTableExpr) -> Result { let column_defs = create_table_expr .column_defs diff --git a/src/common/meta/src/ddl/drop_table/executor.rs b/src/common/meta/src/ddl/drop_table/executor.rs index 7aae31b13a..7cc6589f6a 100644 --- a/src/common/meta/src/ddl/drop_table/executor.rs +++ b/src/common/meta/src/ddl/drop_table/executor.rs @@ -185,11 +185,15 @@ impl DropTableExecutor { .await } - /// Invalidates frontend caches + /// Invalidates caches for the table. pub async fn invalidate_table_cache(&self, ctx: &DdlContext) -> Result<()> { let cache_invalidator = &ctx.cache_invalidator; let ctx = Context { - subject: Some("Invalidate table cache by dropping table".to_string()), + subject: Some(format!( + "Invalidate table cache by dropping table {}, table_id: {}", + self.table.table_ref(), + self.table_id, + )), }; cache_invalidator diff --git a/src/common/meta/src/ddl/physical_table_metadata.rs b/src/common/meta/src/ddl/physical_table_metadata.rs deleted file mode 100644 index 376a143133..0000000000 --- a/src/common/meta/src/ddl/physical_table_metadata.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashSet; - -use api::v1::SemanticType; -use store_api::metadata::ColumnMetadata; -use table::metadata::RawTableInfo; - -/// Generate the new physical table info. -pub(crate) fn build_new_physical_table_info( - mut raw_table_info: RawTableInfo, - physical_columns: &[ColumnMetadata], -) -> RawTableInfo { - let existing_columns = raw_table_info - .meta - .schema - .column_schemas - .iter() - .map(|col| col.name.clone()) - .collect::>(); - let primary_key_indices = &mut raw_table_info.meta.primary_key_indices; - let value_indices = &mut raw_table_info.meta.value_indices; - value_indices.clear(); - let time_index = &mut raw_table_info.meta.schema.timestamp_index; - let columns = &mut raw_table_info.meta.schema.column_schemas; - columns.clear(); - - for (idx, col) in physical_columns.iter().enumerate() { - match col.semantic_type { - SemanticType::Tag => { - // push new primary key to the end. - if !existing_columns.contains(&col.column_schema.name) { - primary_key_indices.push(idx); - } - } - SemanticType::Field => value_indices.push(idx), - SemanticType::Timestamp => *time_index = Some(idx), - } - - columns.push(col.column_schema.clone()); - } - - if let Some(time_index) = *time_index { - raw_table_info.meta.schema.column_schemas[time_index].set_time_index(); - } - - raw_table_info -} diff --git a/src/common/meta/src/ddl/table_meta.rs b/src/common/meta/src/ddl/table_meta.rs index b2755df2c1..e2e2ce36c4 100644 --- a/src/common/meta/src/ddl/table_meta.rs +++ b/src/common/meta/src/ddl/table_meta.rs @@ -122,6 +122,7 @@ impl TableMetadataAllocator { ); let peers = self.peer_allocator.alloc(regions).await?; + debug!("Allocated peers {:?} for table {}", peers, table_id); let region_routes = task .partitions .iter() @@ -174,6 +175,10 @@ impl TableMetadataAllocator { region_wal_options, }) } + + pub fn table_id_sequence(&self) -> SequenceRef { + self.table_id_sequence.clone() + } } pub type PeerAllocatorRef = Arc; diff --git a/src/common/meta/src/ddl/test_util.rs b/src/common/meta/src/ddl/test_util.rs index c440e517d5..505abf870b 100644 --- a/src/common/meta/src/ddl/test_util.rs +++ b/src/common/meta/src/ddl/test_util.rs @@ -17,6 +17,7 @@ pub mod columns; pub mod create_table; pub mod datanode_handler; pub mod flownode_handler; +pub mod region_metadata; use std::assert_matches::assert_matches; use std::collections::HashMap; @@ -24,7 +25,14 @@ use std::collections::HashMap; use api::v1::meta::Partition; use api::v1::{ColumnDataType, SemanticType}; use common_procedure::Status; -use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME}; +use datatypes::prelude::ConcreteDataType; +use datatypes::schema::ColumnSchema; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::{ + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY, + METRIC_ENGINE_NAME, +}; +use store_api::storage::consts::ReservedColumnId; use table::metadata::{RawTableInfo, TableId}; use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure; @@ -146,6 +154,7 @@ pub fn 
test_create_logical_table_task(name: &str) -> CreateTableTask { } } +/// Creates a physical table task with a single region. pub fn test_create_physical_table_task(name: &str) -> CreateTableTask { let create_table = TestCreateTableExprBuilder::default() .column_defs([ @@ -182,3 +191,95 @@ pub fn test_create_physical_table_task(name: &str) -> CreateTableTask { table_info, } } + +/// Creates a column metadata list with tag fields. +pub fn test_column_metadatas(tag_fields: &[&str]) -> Vec { + let mut output = Vec::with_capacity(tag_fields.len() + 4); + output.extend([ + ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 0, + }, + ColumnMetadata { + column_schema: ColumnSchema::new("value", ConcreteDataType::float64_datatype(), false), + semantic_type: SemanticType::Field, + column_id: 1, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + DATA_SCHEMA_TABLE_ID_COLUMN_NAME, + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: ReservedColumnId::table_id(), + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + DATA_SCHEMA_TSID_COLUMN_NAME, + ConcreteDataType::float64_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: ReservedColumnId::tsid(), + }, + ]); + + for (i, name) in tag_fields.iter().enumerate() { + output.push(ColumnMetadata { + column_schema: ColumnSchema::new( + name.to_string(), + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: (i + 2) as u32, + }); + } + + output +} + +/// Asserts the column names. +pub fn assert_column_name(table_info: &RawTableInfo, expected_column_names: &[&str]) { + assert_eq!( + table_info + .meta + .schema + .column_schemas + .iter() + .map(|c| c.name.to_string()) + .collect::>(), + expected_column_names + ); +} + +/// Asserts the column metadatas +pub fn assert_column_name_and_id(column_metadatas: &[ColumnMetadata], expected: &[(&str, u32)]) { + assert_eq!(expected.len(), column_metadatas.len()); + for (name, id) in expected { + let column_metadata = column_metadatas + .iter() + .find(|c| c.column_id == *id) + .unwrap(); + assert_eq!(column_metadata.column_schema.name, *name); + } +} + +/// Gets the raw table info. 
+pub async fn get_raw_table_info(ddl_context: &DdlContext, table_id: TableId) -> RawTableInfo { + ddl_context + .table_metadata_manager + .table_info_manager() + .get(table_id) + .await + .unwrap() + .unwrap() + .into_inner() + .table_info +} diff --git a/src/common/meta/src/ddl/test_util/create_table.rs b/src/common/meta/src/ddl/test_util/create_table.rs index 9d99bbf5c6..b0b7b3a5c9 100644 --- a/src/common/meta/src/ddl/test_util/create_table.rs +++ b/src/common/meta/src/ddl/test_util/create_table.rs @@ -132,6 +132,7 @@ pub fn build_raw_table_info_from_expr(expr: &CreateTableExpr) -> RawTableInfo { options: TableOptions::try_from_iter(&expr.table_options).unwrap(), created_on: DateTime::default(), partition_key_indices: vec![], + column_ids: vec![], }, table_type: TableType::Base, } diff --git a/src/common/meta/src/ddl/test_util/datanode_handler.rs b/src/common/meta/src/ddl/test_util/datanode_handler.rs index 775fc644f7..cd8005510b 100644 --- a/src/common/meta/src/ddl/test_util/datanode_handler.rs +++ b/src/common/meta/src/ddl/test_util/datanode_handler.rs @@ -12,7 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; +use std::sync::Arc; + use api::region::RegionResponse; +use api::v1::region::region_request::Body; use api::v1::region::RegionRequest; use common_error::ext::{BoxedError, ErrorExt, StackError}; use common_error::status_code::StatusCode; @@ -20,6 +24,8 @@ use common_query::request::QueryRequest; use common_recordbatch::SendableRecordBatchStream; use common_telemetry::debug; use snafu::{ResultExt, Snafu}; +use store_api::metadata::RegionMetadata; +use store_api::storage::RegionId; use tokio::sync::mpsc; use crate::error::{self, Error, Result}; @@ -32,6 +38,7 @@ impl MockDatanodeHandler for () { Ok(RegionResponse { affected_rows: 0, extensions: Default::default(), + metadata: Vec::new(), }) } @@ -44,10 +51,13 @@ impl MockDatanodeHandler for () { } } +type RegionRequestHandler = + Arc Result + Send + Sync>; + #[derive(Clone)] pub struct DatanodeWatcher { sender: mpsc::Sender<(Peer, RegionRequest)>, - handler: Option Result>, + handler: Option, } impl DatanodeWatcher { @@ -60,9 +70,9 @@ impl DatanodeWatcher { pub fn with_handler( mut self, - user_handler: fn(Peer, RegionRequest) -> Result, + user_handler: impl Fn(Peer, RegionRequest) -> Result + Send + Sync + 'static, ) -> Self { - self.handler = Some(user_handler); + self.handler = Some(Arc::new(user_handler)); self } } @@ -75,7 +85,7 @@ impl MockDatanodeHandler for DatanodeWatcher { .send((peer.clone(), request.clone())) .await .unwrap(); - if let Some(handler) = self.handler { + if let Some(handler) = self.handler.as_ref() { handler(peer.clone(), request) } else { Ok(RegionResponse::new(0)) @@ -272,3 +282,47 @@ impl MockDatanodeHandler for AllFailureDatanodeHandler { unreachable!() } } + +#[derive(Clone)] +pub struct ListMetadataDatanodeHandler { + pub region_metadatas: HashMap>, +} + +impl ListMetadataDatanodeHandler { + pub fn new(region_metadatas: HashMap>) -> Self { + Self { region_metadatas } + } +} + +#[async_trait::async_trait] +impl MockDatanodeHandler for ListMetadataDatanodeHandler { + async fn handle(&self, _peer: &Peer, request: RegionRequest) -> Result { + let Some(Body::ListMetadata(req)) = request.body else { + unreachable!() + }; + let mut response = RegionResponse::new(0); + + let mut output = Vec::with_capacity(req.region_ids.len()); + for region_id in req.region_ids { + match 
self.region_metadatas.get(&RegionId::from_u64(region_id)) { + Some(metadata) => { + output.push(metadata.clone()); + } + None => { + output.push(None); + } + } + } + + response.metadata = serde_json::to_vec(&output).unwrap(); + Ok(response) + } + + async fn handle_query( + &self, + _peer: &Peer, + _request: QueryRequest, + ) -> Result { + unreachable!() + } +} diff --git a/src/common/meta/src/ddl/test_util/region_metadata.rs b/src/common/meta/src/ddl/test_util/region_metadata.rs new file mode 100644 index 0000000000..1d84f3549c --- /dev/null +++ b/src/common/meta/src/ddl/test_util/region_metadata.rs @@ -0,0 +1,34 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::SemanticType; +use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; +use store_api::storage::RegionId; + +/// Builds a region metadata with the given column metadatas. +pub fn build_region_metadata( + region_id: RegionId, + column_metadatas: &[ColumnMetadata], +) -> RegionMetadata { + let mut builder = RegionMetadataBuilder::new(region_id); + let mut primary_key = vec![]; + for column_metadata in column_metadatas { + builder.push_column_metadata(column_metadata.clone()); + if column_metadata.semantic_type == SemanticType::Tag { + primary_key.push(column_metadata.column_id); + } + } + builder.primary_key(primary_key); + builder.build().unwrap() +} diff --git a/src/common/meta/src/ddl/tests/alter_logical_tables.rs b/src/common/meta/src/ddl/tests/alter_logical_tables.rs index 01ab8e513c..20733bec03 100644 --- a/src/common/meta/src/ddl/tests/alter_logical_tables.rs +++ b/src/common/meta/src/ddl/tests/alter_logical_tables.rs @@ -23,17 +23,20 @@ use api::v1::{ColumnDataType, SemanticType}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_procedure::{Procedure, ProcedureId, Status}; use common_procedure_test::MockContextProvider; -use store_api::metric_engine_consts::MANIFEST_INFO_EXTENSION_KEY; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::{ALTER_PHYSICAL_EXTENSION_KEY, MANIFEST_INFO_EXTENSION_KEY}; use store_api::region_engine::RegionManifestInfo; +use store_api::storage::consts::ReservedColumnId; use store_api::storage::RegionId; use tokio::sync::mpsc; use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure; use crate::ddl::test_util::alter_table::TestAlterTableExprBuilder; use crate::ddl::test_util::columns::TestColumnDefBuilder; -use crate::ddl::test_util::datanode_handler::{DatanodeWatcher, NaiveDatanodeHandler}; +use crate::ddl::test_util::datanode_handler::DatanodeWatcher; use crate::ddl::test_util::{ - create_logical_table, create_physical_table, create_physical_table_metadata, + assert_column_name, create_logical_table, create_physical_table, + create_physical_table_metadata, get_raw_table_info, test_column_metadatas, test_create_physical_table_task, }; use crate::error::Error::{AlterLogicalTablesInvalidArguments, TableNotFound}; @@ -96,6 +99,52 @@ fn 
make_alter_logical_table_rename_task( } } +fn make_alters_request_handler( + column_metadatas: Vec, +) -> impl Fn(Peer, RegionRequest) -> Result { + move |_peer: Peer, request: RegionRequest| { + if let region_request::Body::Alters(_) = request.body.unwrap() { + let mut response = RegionResponse::new(0); + // Default region id for physical table. + let region_id = RegionId::new(1000, 1); + response.extensions.insert( + MANIFEST_INFO_EXTENSION_KEY.to_string(), + RegionManifestInfo::encode_list(&[( + region_id, + RegionManifestInfo::metric(1, 0, 2, 0), + )]) + .unwrap(), + ); + response.extensions.insert( + ALTER_PHYSICAL_EXTENSION_KEY.to_string(), + ColumnMetadata::encode_list(&column_metadatas).unwrap(), + ); + return Ok(response); + } + Ok(RegionResponse::new(0)) + } +} + +fn assert_alters_request( + peer: Peer, + request: RegionRequest, + expected_peer_id: u64, + expected_region_ids: &[RegionId], +) { + assert_eq!(peer.id, expected_peer_id,); + let Some(region_request::Body::Alters(req)) = request.body else { + unreachable!(); + }; + for (i, region_id) in expected_region_ids.iter().enumerate() { + assert_eq!( + req.requests[i].region_id, + *region_id, + "actual region id: {}", + RegionId::from_u64(req.requests[i].region_id) + ); + } +} + #[tokio::test] async fn test_on_prepare_check_schema() { let node_manager = Arc::new(MockDatanodeManager::new(())); @@ -205,15 +254,20 @@ async fn test_on_prepare() { #[tokio::test] async fn test_on_update_metadata() { - let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); + common_telemetry::init_default_ut_logging(); + let (tx, mut rx) = mpsc::channel(8); + let test_column_metadatas = test_column_metadatas(&["new_col", "mew_col"]); + let datanode_handler = + DatanodeWatcher::new(tx).with_handler(make_alters_request_handler(test_column_metadatas)); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); let ddl_context = new_ddl_context(node_manager); // Creates physical table let phy_id = create_physical_table(&ddl_context, "phy").await; // Creates 3 logical tables - create_logical_table(ddl_context.clone(), phy_id, "table1").await; - create_logical_table(ddl_context.clone(), phy_id, "table2").await; - create_logical_table(ddl_context.clone(), phy_id, "table3").await; + let logical_table1_id = create_logical_table(ddl_context.clone(), phy_id, "table1").await; + let logical_table2_id = create_logical_table(ddl_context.clone(), phy_id, "table2").await; + let logical_table3_id = create_logical_table(ddl_context.clone(), phy_id, "table3").await; create_logical_table(ddl_context.clone(), phy_id, "table4").await; create_logical_table(ddl_context.clone(), phy_id, "table5").await; @@ -223,7 +277,7 @@ async fn test_on_update_metadata() { make_alter_logical_table_add_column_task(None, "table3", vec!["new_col".to_string()]), ]; - let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context); + let mut procedure = AlterLogicalTablesProcedure::new(tasks, phy_id, ddl_context.clone()); let mut status = procedure.on_prepare().await.unwrap(); assert_matches!( status, @@ -255,18 +309,52 @@ async fn test_on_update_metadata() { clean_poisons: false } ); + let (peer, request) = rx.try_recv().unwrap(); + rx.try_recv().unwrap_err(); + assert_alters_request( + peer, + request, + 0, + &[ + RegionId::new(logical_table1_id, 0), + RegionId::new(logical_table2_id, 0), + RegionId::new(logical_table3_id, 0), + ], + ); + + let table_info = get_raw_table_info(&ddl_context, phy_id).await; + assert_column_name( + &table_info, 
+ &["ts", "value", "__table_id", "__tsid", "new_col", "mew_col"], + ); + assert_eq!( + table_info.meta.column_ids, + vec![ + 0, + 1, + ReservedColumnId::table_id(), + ReservedColumnId::tsid(), + 2, + 3 + ] + ); } #[tokio::test] async fn test_on_part_duplicate_alter_request() { - let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); - let ddl_context = new_ddl_context(node_manager); + common_telemetry::init_default_ut_logging(); + let (tx, mut rx) = mpsc::channel(8); + let column_metadatas = test_column_metadatas(&["col_0"]); + let handler = + DatanodeWatcher::new(tx).with_handler(make_alters_request_handler(column_metadatas)); + let node_manager = Arc::new(MockDatanodeManager::new(handler)); + let mut ddl_context = new_ddl_context(node_manager); // Creates physical table let phy_id = create_physical_table(&ddl_context, "phy").await; // Creates 3 logical tables - create_logical_table(ddl_context.clone(), phy_id, "table1").await; - create_logical_table(ddl_context.clone(), phy_id, "table2").await; + let logical_table1_id = create_logical_table(ddl_context.clone(), phy_id, "table1").await; + let logical_table2_id = create_logical_table(ddl_context.clone(), phy_id, "table2").await; let tasks = vec![ make_alter_logical_table_add_column_task(None, "table1", vec!["col_0".to_string()]), @@ -305,6 +393,40 @@ async fn test_on_part_duplicate_alter_request() { clean_poisons: false } ); + let (peer, request) = rx.try_recv().unwrap(); + rx.try_recv().unwrap_err(); + assert_alters_request( + peer, + request, + 0, + &[ + RegionId::new(logical_table1_id, 0), + RegionId::new(logical_table2_id, 0), + ], + ); + + let table_info = get_raw_table_info(&ddl_context, phy_id).await; + assert_column_name( + &table_info, + &["ts", "value", "__table_id", "__tsid", "col_0"], + ); + assert_eq!( + table_info.meta.column_ids, + vec![ + 0, + 1, + ReservedColumnId::table_id(), + ReservedColumnId::tsid(), + 2 + ] + ); + + let (tx, mut rx) = mpsc::channel(8); + let column_metadatas = test_column_metadatas(&["col_0", "new_col_1", "new_col_2"]); + let handler = + DatanodeWatcher::new(tx).with_handler(make_alters_request_handler(column_metadatas)); + let node_manager = Arc::new(MockDatanodeManager::new(handler)); + ddl_context.node_manager = node_manager; // re-alter let tasks = vec![ @@ -357,6 +479,44 @@ async fn test_on_part_duplicate_alter_request() { } ); + let (peer, request) = rx.try_recv().unwrap(); + rx.try_recv().unwrap_err(); + assert_alters_request( + peer, + request, + 0, + &[ + RegionId::new(logical_table1_id, 0), + RegionId::new(logical_table2_id, 0), + ], + ); + + let table_info = get_raw_table_info(&ddl_context, phy_id).await; + assert_column_name( + &table_info, + &[ + "ts", + "value", + "__table_id", + "__tsid", + "col_0", + "new_col_1", + "new_col_2", + ], + ); + assert_eq!( + table_info.meta.column_ids, + vec![ + 0, + 1, + ReservedColumnId::table_id(), + ReservedColumnId::tsid(), + 2, + 3, + 4, + ] + ); + let table_name_keys = vec![ TableNameKey::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "table1"), TableNameKey::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "table2"), @@ -422,27 +582,13 @@ async fn test_on_part_duplicate_alter_request() { ); } -fn alters_request_handler(_peer: Peer, request: RegionRequest) -> Result { - if let region_request::Body::Alters(_) = request.body.unwrap() { - let mut response = RegionResponse::new(0); - // Default region id for physical table. 
- let region_id = RegionId::new(1000, 1); - response.extensions.insert( - MANIFEST_INFO_EXTENSION_KEY.to_string(), - RegionManifestInfo::encode_list(&[(region_id, RegionManifestInfo::metric(1, 0, 2, 0))]) - .unwrap(), - ); - return Ok(response); - } - - Ok(RegionResponse::new(0)) -} - #[tokio::test] async fn test_on_submit_alter_region_request() { common_telemetry::init_default_ut_logging(); let (tx, mut rx) = mpsc::channel(8); - let handler = DatanodeWatcher::new(tx).with_handler(alters_request_handler); + let column_metadatas = test_column_metadatas(&["new_col", "mew_col"]); + let handler = + DatanodeWatcher::new(tx).with_handler(make_alters_request_handler(column_metadatas)); let node_manager = Arc::new(MockDatanodeManager::new(handler)); let ddl_context = new_ddl_context(node_manager); diff --git a/src/common/meta/src/ddl/tests/alter_table.rs b/src/common/meta/src/ddl/tests/alter_table.rs index 26e07117c8..08e39ced45 100644 --- a/src/common/meta/src/ddl/tests/alter_table.rs +++ b/src/common/meta/src/ddl/tests/alter_table.rs @@ -30,7 +30,12 @@ use common_error::status_code::StatusCode; use common_procedure::store::poison_store::PoisonStore; use common_procedure::{ProcedureId, Status}; use common_procedure_test::MockContextProvider; -use store_api::metric_engine_consts::MANIFEST_INFO_EXTENSION_KEY; +use datatypes::prelude::ConcreteDataType; +use datatypes::schema::ColumnSchema; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::{ + MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY, +}; use store_api::region_engine::RegionManifestInfo; use store_api::storage::RegionId; use table::requests::TTL_KEY; @@ -43,6 +48,7 @@ use crate::ddl::test_util::datanode_handler::{ AllFailureDatanodeHandler, DatanodeWatcher, PartialSuccessDatanodeHandler, RequestOutdatedErrorDatanodeHandler, }; +use crate::ddl::test_util::{assert_column_name, assert_column_name_and_id}; use crate::error::{Error, Result}; use crate::key::datanode_table::DatanodeTableKey; use crate::key::table_name::TableNameKey; @@ -179,6 +185,30 @@ fn alter_request_handler(_peer: Peer, request: RegionRequest) -> Result Result, +) -> impl Fn(Peer, RegionRequest) -> Result { + move |_peer, request| { + let _ = _peer; + if let region_request::Body::Creates(_) = request.body.unwrap() { + let mut response = RegionResponse::new(0); + // Default region id for physical table. 
+ let region_id = RegionId::new(1024, 1); + response.extensions.insert( + MANIFEST_INFO_EXTENSION_KEY.to_string(), + RegionManifestInfo::encode_list(&[( + region_id, + RegionManifestInfo::metric(1, 0, 2, 0), + )]) + .unwrap(), + ); + response.extensions.insert( + ALTER_PHYSICAL_EXTENSION_KEY.to_string(), + ColumnMetadata::encode_list(&column_metadatas).unwrap(), + ); + return Ok(response); + } + + Ok(RegionResponse::new(0)) + } +} + +fn assert_creates_request( + peer: Peer, + request: RegionRequest, + expected_peer_id: u64, + expected_region_ids: &[RegionId], +) { + assert_eq!(peer.id, expected_peer_id,); + let Some(region_request::Body::Creates(req)) = request.body else { + unreachable!(); + }; + for (i, region_id) in expected_region_ids.iter().enumerate() { + assert_eq!( + req.requests[i].region_id, + *region_id, + "actual region id: {}", + RegionId::from_u64(req.requests[i].region_id) + ); + } +} + #[tokio::test] async fn test_on_prepare_physical_table_not_found() { let node_manager = Arc::new(MockDatanodeManager::new(())); @@ -227,7 +278,12 @@ async fn test_on_prepare_part_logical_tables_exist() { #[tokio::test] async fn test_on_create_metadata() { - let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); + common_telemetry::init_default_ut_logging(); + let (tx, mut rx) = mpsc::channel(8); + let column_metadatas = test_column_metadatas(&["host", "cpu"]); + let datanode_handler = + DatanodeWatcher::new(tx).with_handler(make_creates_request_handler(column_metadatas)); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); let ddl_context = new_ddl_context(node_manager); // Prepares physical table metadata. let mut create_physical_table_task = test_create_physical_table_task("phy_table"); @@ -255,7 +311,7 @@ async fn test_on_create_metadata() { let mut procedure = CreateLogicalTablesProcedure::new( vec![task, yet_another_task], physical_table_id, - ddl_context, + ddl_context.clone(), ); let status = procedure.on_prepare().await.unwrap(); assert_matches!( @@ -274,11 +330,42 @@ async fn test_on_create_metadata() { let status = procedure.execute(&ctx).await.unwrap(); let table_ids = status.downcast_output_ref::>().unwrap(); assert_eq!(*table_ids, vec![1025, 1026]); + + let (peer, request) = rx.try_recv().unwrap(); + rx.try_recv().unwrap_err(); + assert_creates_request( + peer, + request, + 0, + &[RegionId::new(1025, 0), RegionId::new(1026, 0)], + ); + + let table_info = get_raw_table_info(&ddl_context, table_id).await; + assert_column_name( + &table_info, + &["ts", "value", "__table_id", "__tsid", "host", "cpu"], + ); + assert_eq!( + table_info.meta.column_ids, + vec![ + 0, + 1, + ReservedColumnId::table_id(), + ReservedColumnId::tsid(), + 2, + 3 + ] + ); } #[tokio::test] async fn test_on_create_metadata_part_logical_tables_exist() { - let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); + common_telemetry::init_default_ut_logging(); + let (tx, mut rx) = mpsc::channel(8); + let column_metadatas = test_column_metadatas(&["host", "cpu"]); + let datanode_handler = + DatanodeWatcher::new(tx).with_handler(make_creates_request_handler(column_metadatas)); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); let ddl_context = new_ddl_context(node_manager); // Prepares physical table metadata. 
let mut create_physical_table_task = test_create_physical_table_task("phy_table"); @@ -317,7 +404,7 @@ async fn test_on_create_metadata_part_logical_tables_exist() { let mut procedure = CreateLogicalTablesProcedure::new( vec![task, non_exist_task], physical_table_id, - ddl_context, + ddl_context.clone(), ); let status = procedure.on_prepare().await.unwrap(); assert_matches!( @@ -336,6 +423,27 @@ async fn test_on_create_metadata_part_logical_tables_exist() { let status = procedure.execute(&ctx).await.unwrap(); let table_ids = status.downcast_output_ref::>().unwrap(); assert_eq!(*table_ids, vec![8192, 1025]); + + let (peer, request) = rx.try_recv().unwrap(); + rx.try_recv().unwrap_err(); + assert_creates_request(peer, request, 0, &[RegionId::new(1025, 0)]); + + let table_info = get_raw_table_info(&ddl_context, table_id).await; + assert_column_name( + &table_info, + &["ts", "value", "__table_id", "__tsid", "host", "cpu"], + ); + assert_eq!( + table_info.meta.column_ids, + vec![ + 0, + 1, + ReservedColumnId::table_id(), + ReservedColumnId::tsid(), + 2, + 3 + ] + ); } #[tokio::test] @@ -399,27 +507,13 @@ async fn test_on_create_metadata_err() { assert!(!error.is_retry_later()); } -fn creates_request_handler(_peer: Peer, request: RegionRequest) -> Result { - if let region_request::Body::Creates(_) = request.body.unwrap() { - let mut response = RegionResponse::new(0); - // Default region id for physical table. - let region_id = RegionId::new(1024, 1); - response.extensions.insert( - MANIFEST_INFO_EXTENSION_KEY.to_string(), - RegionManifestInfo::encode_list(&[(region_id, RegionManifestInfo::metric(1, 0, 2, 0))]) - .unwrap(), - ); - return Ok(response); - } - - Ok(RegionResponse::new(0)) -} - #[tokio::test] async fn test_on_submit_create_request() { common_telemetry::init_default_ut_logging(); let (tx, mut rx) = mpsc::channel(8); - let handler = DatanodeWatcher::new(tx).with_handler(creates_request_handler); + let column_metadatas = test_column_metadatas(&["host", "cpu"]); + let handler = + DatanodeWatcher::new(tx).with_handler(make_creates_request_handler(column_metadatas)); let node_manager = Arc::new(MockDatanodeManager::new(handler)); let ddl_context = new_ddl_context(node_manager); let mut create_physical_table_task = test_create_physical_table_task("phy_table"); diff --git a/src/common/meta/src/ddl/tests/create_table.rs b/src/common/meta/src/ddl/tests/create_table.rs index c4cae2233d..8e8d70957d 100644 --- a/src/common/meta/src/ddl/tests/create_table.rs +++ b/src/common/meta/src/ddl/tests/create_table.rs @@ -16,7 +16,9 @@ use std::assert_matches::assert_matches; use std::collections::HashMap; use std::sync::Arc; -use api::v1::meta::Partition; +use api::region::RegionResponse; +use api::v1::meta::{Partition, Peer}; +use api::v1::region::{region_request, RegionRequest}; use api::v1::{ColumnDataType, SemanticType}; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; @@ -24,7 +26,12 @@ use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId, Stat use common_procedure_test::{ execute_procedure_until, execute_procedure_until_done, MockContextProvider, }; +use datatypes::prelude::ConcreteDataType; +use datatypes::schema::ColumnSchema; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY; use store_api::storage::RegionId; +use tokio::sync::mpsc; use crate::ddl::create_table::{CreateTableProcedure, CreateTableState}; use crate::ddl::test_util::columns::TestColumnDefBuilder; @@ -32,14 +39,73 @@ use 
crate::ddl::test_util::create_table::{ build_raw_table_info_from_expr, TestCreateTableExprBuilder, }; use crate::ddl::test_util::datanode_handler::{ - NaiveDatanodeHandler, RetryErrorDatanodeHandler, UnexpectedErrorDatanodeHandler, + DatanodeWatcher, NaiveDatanodeHandler, RetryErrorDatanodeHandler, + UnexpectedErrorDatanodeHandler, }; -use crate::error::Error; +use crate::ddl::test_util::{assert_column_name, get_raw_table_info}; +use crate::error::{Error, Result}; use crate::key::table_route::TableRouteValue; use crate::kv_backend::memory::MemoryKvBackend; use crate::rpc::ddl::CreateTableTask; use crate::test_util::{new_ddl_context, new_ddl_context_with_kv_backend, MockDatanodeManager}; +fn create_request_handler(_peer: Peer, request: RegionRequest) -> Result { + let _ = _peer; + if let region_request::Body::Create(_) = request.body.unwrap() { + let mut response = RegionResponse::new(0); + + response.extensions.insert( + TABLE_COLUMN_METADATA_EXTENSION_KEY.to_string(), + ColumnMetadata::encode_list(&[ + ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 0, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "host", + ConcreteDataType::float64_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "cpu", + ConcreteDataType::float64_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 2, + }, + ]) + .unwrap(), + ); + return Ok(response); + } + + Ok(RegionResponse::new(0)) +} + +fn assert_create_request( + peer: Peer, + request: RegionRequest, + expected_peer_id: u64, + expected_region_id: RegionId, +) { + assert_eq!(peer.id, expected_peer_id); + let Some(region_request::Body::Create(req)) = request.body else { + unreachable!(); + }; + assert_eq!(req.region_id, expected_region_id); +} + pub(crate) fn test_create_table_task(name: &str) -> CreateTableTask { let create_table = TestCreateTableExprBuilder::default() .column_defs([ @@ -230,11 +296,13 @@ async fn test_on_create_metadata_error() { #[tokio::test] async fn test_on_create_metadata() { common_telemetry::init_default_ut_logging(); - let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler)); + let (tx, mut rx) = mpsc::channel(8); + let datanode_handler = DatanodeWatcher::new(tx).with_handler(create_request_handler); + let node_manager = Arc::new(MockDatanodeManager::new(datanode_handler)); let ddl_context = new_ddl_context(node_manager); let task = test_create_table_task("foo"); assert!(!task.create_table.create_if_not_exists); - let mut procedure = CreateTableProcedure::new(task, ddl_context); + let mut procedure = CreateTableProcedure::new(task, ddl_context.clone()); procedure.on_prepare().await.unwrap(); let ctx = ProcedureContext { procedure_id: ProcedureId::random(), @@ -243,8 +311,16 @@ async fn test_on_create_metadata() { procedure.execute(&ctx).await.unwrap(); // Triggers procedure to create table metadata let status = procedure.execute(&ctx).await.unwrap(); - let table_id = status.downcast_output_ref::().unwrap(); - assert_eq!(*table_id, 1024); + let table_id = *status.downcast_output_ref::().unwrap(); + assert_eq!(table_id, 1024); + + let (peer, request) = rx.try_recv().unwrap(); + rx.try_recv().unwrap_err(); + assert_create_request(peer, request, 0, RegionId::new(table_id, 0)); + + let table_info = get_raw_table_info(&ddl_context, table_id).await; + 
assert_column_name(&table_info, &["ts", "host", "cpu"]); + assert_eq!(table_info.meta.column_ids, vec![0, 1, 2]); } #[tokio::test] diff --git a/src/common/meta/src/ddl/utils.rs b/src/common/meta/src/ddl/utils.rs index eb1299334f..ea6d8512c9 100644 --- a/src/common/meta/src/ddl/utils.rs +++ b/src/common/meta/src/ddl/utils.rs @@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub(crate) mod raw_table_info; +#[allow(dead_code)] +pub(crate) mod region_metadata_lister; +pub(crate) mod table_id; +pub(crate) mod table_info; + use std::collections::HashMap; use std::fmt::Debug; @@ -29,6 +35,7 @@ use common_telemetry::{error, info, warn}; use common_wal::options::WalOptions; use futures::future::join_all; use snafu::{ensure, OptionExt, ResultExt}; +use store_api::metadata::ColumnMetadata; use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, MANIFEST_INFO_EXTENSION_KEY}; use store_api::region_engine::RegionManifestInfo; use store_api::storage::{RegionId, RegionNumber}; @@ -37,8 +44,8 @@ use table::table_reference::TableReference; use crate::ddl::{DdlContext, DetectingRegion}; use crate::error::{ - self, Error, OperateDatanodeSnafu, ParseWalOptionsSnafu, Result, TableNotFoundSnafu, - UnsupportedSnafu, + self, DecodeJsonSnafu, Error, MetadataCorruptionSnafu, OperateDatanodeSnafu, + ParseWalOptionsSnafu, Result, TableNotFoundSnafu, UnsupportedSnafu, }; use crate::key::datanode_table::DatanodeTableValue; use crate::key::table_name::TableNameKey; @@ -314,11 +321,23 @@ pub fn parse_manifest_infos_from_extensions( Ok(data_manifest_version) } +/// Parses column metadatas from extensions. +pub fn parse_column_metadatas( + extensions: &HashMap>, + key: &str, +) -> Result> { + let value = extensions.get(key).context(error::UnexpectedSnafu { + err_msg: format!("column metadata extension not found: {}", key), + })?; + let column_metadatas = ColumnMetadata::decode_list(value).context(error::SerdeJsonSnafu {})?; + Ok(column_metadatas) +} + /// Sync follower regions on datanodes. pub async fn sync_follower_regions( context: &DdlContext, table_id: TableId, - results: Vec, + results: &[RegionResponse], region_routes: &[RegionRoute], engine: &str, ) -> Result<()> { @@ -331,7 +350,7 @@ pub async fn sync_follower_regions( } let results = results - .into_iter() + .iter() .map(|response| parse_manifest_infos_from_extensions(&response.extensions)) .collect::>>()? .into_iter() @@ -418,6 +437,39 @@ pub async fn sync_follower_regions( Ok(()) } +/// Extracts column metadatas from extensions. +pub fn extract_column_metadatas( + results: &mut [RegionResponse], + key: &str, +) -> Result>> { + let schemas = results + .iter_mut() + .map(|r| r.extensions.remove(key)) + .collect::>(); + + if schemas.is_empty() { + warn!("extract_column_metadatas: no extension key `{key}` found in results"); + return Ok(None); + } + + // Verify all the physical schemas are the same + // Safety: previous check ensures this vec is not empty + let first = schemas.first().unwrap(); + ensure!( + schemas.iter().all(|x| x == first), + MetadataCorruptionSnafu { + err_msg: "The table column metadata schemas from datanodes are not the same." 
+ } + ); + + if let Some(first) = first { + let column_metadatas = ColumnMetadata::decode_list(first).context(DecodeJsonSnafu)?; + Ok(Some(column_metadatas)) + } else { + Ok(None) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/common/meta/src/ddl/utils/raw_table_info.rs b/src/common/meta/src/ddl/utils/raw_table_info.rs new file mode 100644 index 0000000000..812f9185c2 --- /dev/null +++ b/src/common/meta/src/ddl/utils/raw_table_info.rs @@ -0,0 +1,123 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; + +use api::v1::SemanticType; +use common_telemetry::debug; +use common_telemetry::tracing::warn; +use store_api::metadata::ColumnMetadata; +use table::metadata::RawTableInfo; + +/// Generate the new physical table info. +pub(crate) fn build_new_physical_table_info( + mut raw_table_info: RawTableInfo, + physical_columns: &[ColumnMetadata], +) -> RawTableInfo { + debug!( + "building new physical table info for table: {}, table_id: {}", + raw_table_info.name, raw_table_info.ident.table_id + ); + let existing_columns = raw_table_info + .meta + .schema + .column_schemas + .iter() + .map(|col| col.name.clone()) + .collect::>(); + let primary_key_indices = &mut raw_table_info.meta.primary_key_indices; + let value_indices = &mut raw_table_info.meta.value_indices; + value_indices.clear(); + let time_index = &mut raw_table_info.meta.schema.timestamp_index; + let columns = &mut raw_table_info.meta.schema.column_schemas; + columns.clear(); + let column_ids = &mut raw_table_info.meta.column_ids; + column_ids.clear(); + + for (idx, col) in physical_columns.iter().enumerate() { + match col.semantic_type { + SemanticType::Tag => { + // push new primary key to the end. + if !existing_columns.contains(&col.column_schema.name) { + primary_key_indices.push(idx); + } + } + SemanticType::Field => value_indices.push(idx), + SemanticType::Timestamp => { + value_indices.push(idx); + *time_index = Some(idx); + } + } + + columns.push(col.column_schema.clone()); + column_ids.push(col.column_id); + } + + if let Some(time_index) = *time_index { + raw_table_info.meta.schema.column_schemas[time_index].set_time_index(); + } + + raw_table_info +} + +/// Updates the column IDs in the table info based on the provided column metadata. +/// +/// This function validates that the column metadata matches the existing table schema +/// before updating the column ids. If the column metadata doesn't match the table schema, +/// the table info remains unchanged. 
+pub(crate) fn update_table_info_column_ids( + raw_table_info: &mut RawTableInfo, + column_metadatas: &[ColumnMetadata], +) { + let mut table_column_names = raw_table_info + .meta + .schema + .column_schemas + .iter() + .map(|c| c.name.as_str()) + .collect::>(); + table_column_names.sort_unstable(); + + let mut column_names = column_metadatas + .iter() + .map(|c| c.column_schema.name.as_str()) + .collect::>(); + column_names.sort_unstable(); + + if table_column_names != column_names { + warn!( + "Column metadata doesn't match the table schema for table {}, table_id: {}, column in table: {:?}, column in metadata: {:?}", + raw_table_info.name, + raw_table_info.ident.table_id, + table_column_names, + column_names, + ); + return; + } + + let name_to_id = column_metadatas + .iter() + .map(|c| (c.column_schema.name.clone(), c.column_id)) + .collect::>(); + + let schema = &raw_table_info.meta.schema.column_schemas; + let mut column_ids = Vec::with_capacity(schema.len()); + for column_schema in schema { + if let Some(id) = name_to_id.get(&column_schema.name) { + column_ids.push(*id); + } + } + + raw_table_info.meta.column_ids = column_ids; +} diff --git a/src/common/meta/src/ddl/utils/region_metadata_lister.rs b/src/common/meta/src/ddl/utils/region_metadata_lister.rs new file mode 100644 index 0000000000..30bacd04e7 --- /dev/null +++ b/src/common/meta/src/ddl/utils/region_metadata_lister.rs @@ -0,0 +1,240 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use api::v1::region::region_request::Body as PbRegionRequest; +use api::v1::region::{ListMetadataRequest, RegionRequest, RegionRequestHeader}; +use common_telemetry::tracing_context::TracingContext; +use futures::future::join_all; +use snafu::ResultExt; +use store_api::metadata::RegionMetadata; +use store_api::storage::{RegionId, TableId}; + +use crate::ddl::utils::add_peer_context_if_needed; +use crate::error::{DecodeJsonSnafu, Result}; +use crate::node_manager::NodeManagerRef; +use crate::rpc::router::{find_leaders, region_distribution, RegionRoute}; + +/// Collects the region metadata from the datanodes. +pub struct RegionMetadataLister { + node_manager: NodeManagerRef, +} + +impl RegionMetadataLister { + /// Creates a new [`RegionMetadataLister`] with the given [`NodeManagerRef`]. + pub fn new(node_manager: NodeManagerRef) -> Self { + Self { node_manager } + } + + /// Collects the region metadata from the datanodes. + pub async fn list( + &self, + table_id: TableId, + region_routes: &[RegionRoute], + ) -> Result>> { + let region_distribution = region_distribution(region_routes); + let leaders = find_leaders(region_routes) + .into_iter() + .map(|p| (p.id, p)) + .collect::>(); + + let total_num_region = region_distribution + .values() + .map(|r| r.leader_regions.len()) + .sum::(); + + let mut list_metadata_tasks = Vec::with_capacity(leaders.len()); + + // Build requests. 
+ for (datanode_id, region_role_set) in region_distribution { + if region_role_set.leader_regions.is_empty() { + continue; + } + // Safety: must exists. + let peer = leaders.get(&datanode_id).unwrap(); + let requester = self.node_manager.datanode(peer).await; + let region_ids = region_role_set + .leader_regions + .iter() + .map(|r| RegionId::new(table_id, *r).as_u64()) + .collect(); + let request = Self::build_list_metadata_request(region_ids); + + let peer = peer.clone(); + list_metadata_tasks.push(async move { + requester + .handle(request) + .await + .map_err(add_peer_context_if_needed(peer)) + }); + } + + let results = join_all(list_metadata_tasks) + .await + .into_iter() + .collect::>>()? + .into_iter() + .map(|r| r.metadata); + + let mut output = Vec::with_capacity(total_num_region); + for result in results { + let region_metadatas: Vec> = + serde_json::from_slice(&result).context(DecodeJsonSnafu)?; + output.extend(region_metadatas); + } + + Ok(output) + } + + fn build_list_metadata_request(region_ids: Vec) -> RegionRequest { + RegionRequest { + header: Some(RegionRequestHeader { + tracing_context: TracingContext::from_current_span().to_w3c(), + ..Default::default() + }), + body: Some(PbRegionRequest::ListMetadata(ListMetadataRequest { + region_ids, + })), + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use api::region::RegionResponse; + use api::v1::meta::Peer; + use api::v1::region::region_request::Body; + use api::v1::region::RegionRequest; + use store_api::metadata::RegionMetadata; + use store_api::storage::RegionId; + use tokio::sync::mpsc; + + use crate::ddl::test_util::datanode_handler::{DatanodeWatcher, ListMetadataDatanodeHandler}; + use crate::ddl::test_util::region_metadata::build_region_metadata; + use crate::ddl::test_util::test_column_metadatas; + use crate::ddl::utils::region_metadata_lister::RegionMetadataLister; + use crate::error::Result; + use crate::rpc::router::{Region, RegionRoute}; + use crate::test_util::MockDatanodeManager; + + fn assert_list_metadata_request(req: RegionRequest, expected_region_ids: &[RegionId]) { + let Some(Body::ListMetadata(req)) = req.body else { + unreachable!() + }; + + assert_eq!(req.region_ids.len(), expected_region_ids.len()); + for region_id in expected_region_ids { + assert!(req.region_ids.contains(®ion_id.as_u64())); + } + } + + fn empty_list_metadata_handler(_peer: Peer, request: RegionRequest) -> Result { + let Some(Body::ListMetadata(req)) = request.body else { + unreachable!() + }; + + let mut output: Vec> = Vec::with_capacity(req.region_ids.len()); + for _region_id in req.region_ids { + output.push(None); + } + + Ok(RegionResponse::from_metadata( + serde_json::to_vec(&output).unwrap(), + )) + } + + #[tokio::test] + async fn test_list_request() { + let (tx, mut rx) = mpsc::channel(8); + let handler = DatanodeWatcher::new(tx).with_handler(empty_list_metadata_handler); + let node_manager = Arc::new(MockDatanodeManager::new(handler)); + let lister = RegionMetadataLister::new(node_manager); + let region_routes = vec![ + RegionRoute { + region: Region::new_test(RegionId::new(1024, 1)), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![Peer::empty(5)], + leader_state: None, + leader_down_since: None, + }, + RegionRoute { + region: Region::new_test(RegionId::new(1024, 2)), + leader_peer: Some(Peer::empty(3)), + follower_peers: vec![Peer::empty(4)], + leader_state: None, + leader_down_since: None, + }, + RegionRoute { + region: Region::new_test(RegionId::new(1024, 3)), + 
leader_peer: Some(Peer::empty(3)), + follower_peers: vec![Peer::empty(4)], + leader_state: None, + leader_down_since: None, + }, + ]; + let region_metadatas = lister.list(1024, ®ion_routes).await.unwrap(); + assert_eq!(region_metadatas.len(), 3); + + let mut requests = vec![]; + for _ in 0..2 { + let (peer, request) = rx.try_recv().unwrap(); + requests.push((peer, request)); + } + rx.try_recv().unwrap_err(); + + let (peer, request) = requests.remove(0); + assert_eq!(peer.id, 1); + assert_list_metadata_request(request, &[RegionId::new(1024, 1)]); + let (peer, request) = requests.remove(0); + assert_eq!(peer.id, 3); + assert_list_metadata_request(request, &[RegionId::new(1024, 2), RegionId::new(1024, 3)]); + } + + #[tokio::test] + async fn test_list_region_metadata() { + let region_metadata = + build_region_metadata(RegionId::new(1024, 1), &test_column_metadatas(&["tag_0"])); + let region_metadatas = HashMap::from([ + (RegionId::new(1024, 0), None), + (RegionId::new(1024, 1), Some(region_metadata.clone())), + ]); + let handler = ListMetadataDatanodeHandler::new(region_metadatas); + let node_manager = Arc::new(MockDatanodeManager::new(handler)); + let lister = RegionMetadataLister::new(node_manager); + let region_routes = vec![ + RegionRoute { + region: Region::new_test(RegionId::new(1024, 0)), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + RegionRoute { + region: Region::new_test(RegionId::new(1024, 1)), + leader_peer: Some(Peer::empty(3)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + ]; + let region_metadatas = lister.list(1024, ®ion_routes).await.unwrap(); + assert_eq!(region_metadatas.len(), 2); + assert_eq!(region_metadatas[0], None); + assert_eq!(region_metadatas[1], Some(region_metadata)); + } +} diff --git a/src/common/meta/src/ddl/utils/table_id.rs b/src/common/meta/src/ddl/utils/table_id.rs new file mode 100644 index 0000000000..e0f62de818 --- /dev/null +++ b/src/common/meta/src/ddl/utils/table_id.rs @@ -0,0 +1,46 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use snafu::OptionExt; +use store_api::storage::TableId; +use table::table_reference::TableReference; + +use crate::error::{Result, TableNotFoundSnafu}; +use crate::key::table_name::{TableNameKey, TableNameManager}; + +/// Get all the table ids from the table names. +/// +/// Returns an error if any table does not exist. 
+pub(crate) async fn get_all_table_ids_by_names<'a>( + table_name_manager: &TableNameManager, + table_names: &[TableReference<'a>], +) -> Result> { + let table_name_keys = table_names + .iter() + .map(TableNameKey::from) + .collect::>(); + let table_name_values = table_name_manager.batch_get(table_name_keys).await?; + let mut table_ids = Vec::with_capacity(table_name_values.len()); + for (value, table_name) in table_name_values.into_iter().zip(table_names) { + let value = value + .with_context(|| TableNotFoundSnafu { + table_name: table_name.to_string(), + })? + .table_id(); + + table_ids.push(value); + } + + Ok(table_ids) +} diff --git a/src/common/meta/src/ddl/utils/table_info.rs b/src/common/meta/src/ddl/utils/table_info.rs new file mode 100644 index 0000000000..7f42a383fe --- /dev/null +++ b/src/common/meta/src/ddl/utils/table_info.rs @@ -0,0 +1,100 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use itertools::Itertools; +use snafu::OptionExt; +use store_api::storage::TableId; +use table::metadata::RawTableInfo; +use table::table_reference::TableReference; + +use crate::error::{Result, TableInfoNotFoundSnafu}; +use crate::key::table_info::{TableInfoManager, TableInfoValue}; +use crate::key::table_route::{TableRouteManager, TableRouteValue}; +use crate::key::{DeserializedValueWithBytes, TableMetadataManager}; + +/// Get all table info values by table ids. +/// +/// Returns an error if any table does not exist. +pub(crate) async fn get_all_table_info_values_by_table_ids<'a>( + table_info_manager: &TableInfoManager, + table_ids: &[TableId], + table_names: &[TableReference<'a>], +) -> Result>> { + let mut table_info_map = table_info_manager.batch_get_raw(table_ids).await?; + let mut table_info_values = Vec::with_capacity(table_ids.len()); + for (table_id, table_name) in table_ids.iter().zip(table_names) { + let table_info_value = + table_info_map + .remove(table_id) + .with_context(|| TableInfoNotFoundSnafu { + table: table_name.to_string(), + })?; + table_info_values.push(table_info_value); + } + + Ok(table_info_values) +} + +/// Checks if all the logical table routes have the same physical table id. +pub(crate) async fn all_logical_table_routes_have_same_physical_id( + table_route_manager: &TableRouteManager, + table_ids: &[TableId], + physical_table_id: TableId, +) -> Result { + let table_routes = table_route_manager + .table_route_storage() + .batch_get(table_ids) + .await?; + + let is_same_physical_table = table_routes.iter().all(|r| { + if let Some(TableRouteValue::Logical(r)) = r { + r.physical_table_id() == physical_table_id + } else { + false + } + }); + + Ok(is_same_physical_table) +} + +/// Batch updates the table info values. +/// +/// The table info values are grouped into chunks, and each chunk is updated in a single transaction. +/// +/// Returns an error if any table info value fails to update. 
+pub(crate) async fn batch_update_table_info_values( + table_metadata_manager: &TableMetadataManager, + table_info_values: Vec<(DeserializedValueWithBytes, RawTableInfo)>, +) -> Result<()> { + let chunk_size = table_metadata_manager.batch_update_table_info_value_chunk_size(); + if table_info_values.len() > chunk_size { + let chunks = table_info_values + .into_iter() + .chunks(chunk_size) + .into_iter() + .map(|check| check.collect::>()) + .collect::>(); + for chunk in chunks { + table_metadata_manager + .batch_update_table_info_values(chunk) + .await?; + } + } else { + table_metadata_manager + .batch_update_table_info_values(table_info_values) + .await?; + } + + Ok(()) +} diff --git a/src/common/meta/src/ddl_manager.rs b/src/common/meta/src/ddl_manager.rs index f665796713..8648338080 100644 --- a/src/common/meta/src/ddl_manager.rs +++ b/src/common/meta/src/ddl_manager.rs @@ -14,7 +14,6 @@ use std::sync::Arc; -use api::v1::meta::ProcedureDetailResponse; use common_procedure::{ watcher, BoxedProcedureLoader, Output, ProcedureId, ProcedureManagerRef, ProcedureWithId, }; @@ -37,16 +36,16 @@ use crate::ddl::drop_flow::DropFlowProcedure; use crate::ddl::drop_table::DropTableProcedure; use crate::ddl::drop_view::DropViewProcedure; use crate::ddl::truncate_table::TruncateTableProcedure; -use crate::ddl::{utils, DdlContext, ExecutorContext, ProcedureExecutor}; +use crate::ddl::{utils, DdlContext}; use crate::error::{ - EmptyDdlTasksSnafu, ParseProcedureIdSnafu, ProcedureNotFoundSnafu, ProcedureOutputSnafu, - QueryProcedureSnafu, RegisterProcedureLoaderSnafu, Result, SubmitProcedureSnafu, - TableInfoNotFoundSnafu, TableNotFoundSnafu, TableRouteNotFoundSnafu, - UnexpectedLogicalRouteTableSnafu, UnsupportedSnafu, WaitProcedureSnafu, + EmptyDdlTasksSnafu, ProcedureOutputSnafu, RegisterProcedureLoaderSnafu, Result, + SubmitProcedureSnafu, TableInfoNotFoundSnafu, TableNotFoundSnafu, TableRouteNotFoundSnafu, + UnexpectedLogicalRouteTableSnafu, WaitProcedureSnafu, }; use crate::key::table_info::TableInfoValue; use crate::key::table_name::TableNameKey; use crate::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; +use crate::procedure_executor::ExecutorContext; #[cfg(feature = "enterprise")] use crate::rpc::ddl::trigger::CreateTriggerTask; #[cfg(feature = "enterprise")] @@ -61,8 +60,6 @@ use crate::rpc::ddl::{ CreateViewTask, DropDatabaseTask, DropFlowTask, DropTableTask, DropViewTask, QueryContext, SubmitDdlTaskRequest, SubmitDdlTaskResponse, TruncateTableTask, }; -use crate::rpc::procedure; -use crate::rpc::procedure::{MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse}; use crate::rpc::router::RegionRoute; pub type DdlManagerRef = Arc; @@ -406,6 +403,70 @@ impl DdlManager { Ok((procedure_id, output)) } + + pub async fn submit_ddl_task( + &self, + ctx: &ExecutorContext, + request: SubmitDdlTaskRequest, + ) -> Result { + let span = ctx + .tracing_context + .as_ref() + .map(TracingContext::from_w3c) + .unwrap_or_else(TracingContext::from_current_span) + .attach(tracing::info_span!("DdlManager::submit_ddl_task")); + async move { + debug!("Submitting Ddl task: {:?}", request.task); + match request.task { + CreateTable(create_table_task) => { + handle_create_table_task(self, create_table_task).await + } + DropTable(drop_table_task) => handle_drop_table_task(self, drop_table_task).await, + AlterTable(alter_table_task) => { + handle_alter_table_task(self, alter_table_task).await + } + TruncateTable(truncate_table_task) => { + handle_truncate_table_task(self, truncate_table_task).await + 
} + CreateLogicalTables(create_table_tasks) => { + handle_create_logical_table_tasks(self, create_table_tasks).await + } + AlterLogicalTables(alter_table_tasks) => { + handle_alter_logical_table_tasks(self, alter_table_tasks).await + } + DropLogicalTables(_) => todo!(), + CreateDatabase(create_database_task) => { + handle_create_database_task(self, create_database_task).await + } + DropDatabase(drop_database_task) => { + handle_drop_database_task(self, drop_database_task).await + } + AlterDatabase(alter_database_task) => { + handle_alter_database_task(self, alter_database_task).await + } + CreateFlow(create_flow_task) => { + handle_create_flow_task(self, create_flow_task, request.query_context.into()) + .await + } + DropFlow(drop_flow_task) => handle_drop_flow_task(self, drop_flow_task).await, + CreateView(create_view_task) => { + handle_create_view_task(self, create_view_task).await + } + DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await, + #[cfg(feature = "enterprise")] + CreateTrigger(create_trigger_task) => { + handle_create_trigger_task( + self, + create_trigger_task, + request.query_context.into(), + ) + .await + } + } + } + .trace(span) + .await + } } async fn handle_truncate_table_task( @@ -712,6 +773,8 @@ async fn handle_create_trigger_task( query_context: QueryContext, ) -> Result { let Some(m) = ddl_manager.trigger_ddl_manager.as_ref() else { + use crate::error::UnsupportedSnafu; + return UnsupportedSnafu { operation: "create trigger", } @@ -788,114 +851,6 @@ async fn handle_create_view_task( }) } -/// TODO(dennis): let [`DdlManager`] implement [`ProcedureExecutor`] looks weird, find some way to refactor it. -#[async_trait::async_trait] -impl ProcedureExecutor for DdlManager { - async fn submit_ddl_task( - &self, - ctx: &ExecutorContext, - request: SubmitDdlTaskRequest, - ) -> Result { - let span = ctx - .tracing_context - .as_ref() - .map(TracingContext::from_w3c) - .unwrap_or(TracingContext::from_current_span()) - .attach(tracing::info_span!("DdlManager::submit_ddl_task")); - async move { - debug!("Submitting Ddl task: {:?}", request.task); - match request.task { - CreateTable(create_table_task) => { - handle_create_table_task(self, create_table_task).await - } - DropTable(drop_table_task) => handle_drop_table_task(self, drop_table_task).await, - AlterTable(alter_table_task) => { - handle_alter_table_task(self, alter_table_task).await - } - TruncateTable(truncate_table_task) => { - handle_truncate_table_task(self, truncate_table_task).await - } - CreateLogicalTables(create_table_tasks) => { - handle_create_logical_table_tasks(self, create_table_tasks).await - } - AlterLogicalTables(alter_table_tasks) => { - handle_alter_logical_table_tasks(self, alter_table_tasks).await - } - DropLogicalTables(_) => todo!(), - CreateDatabase(create_database_task) => { - handle_create_database_task(self, create_database_task).await - } - DropDatabase(drop_database_task) => { - handle_drop_database_task(self, drop_database_task).await - } - AlterDatabase(alter_database_task) => { - handle_alter_database_task(self, alter_database_task).await - } - CreateFlow(create_flow_task) => { - handle_create_flow_task(self, create_flow_task, request.query_context.into()) - .await - } - #[cfg(feature = "enterprise")] - CreateTrigger(create_trigger_task) => { - handle_create_trigger_task( - self, - create_trigger_task, - request.query_context.into(), - ) - .await - } - DropFlow(drop_flow_task) => handle_drop_flow_task(self, drop_flow_task).await, - CreateView(create_view_task) => { 
- handle_create_view_task(self, create_view_task).await - } - DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await, - } - } - .trace(span) - .await - } - - async fn migrate_region( - &self, - _ctx: &ExecutorContext, - _request: MigrateRegionRequest, - ) -> Result { - UnsupportedSnafu { - operation: "migrate_region", - } - .fail() - } - - async fn query_procedure_state( - &self, - _ctx: &ExecutorContext, - pid: &str, - ) -> Result { - let pid = - ProcedureId::parse_str(pid).with_context(|_| ParseProcedureIdSnafu { key: pid })?; - - let state = self - .procedure_manager - .procedure_state(pid) - .await - .context(QueryProcedureSnafu)? - .context(ProcedureNotFoundSnafu { - pid: pid.to_string(), - })?; - - Ok(procedure::procedure_state_to_pb_response(&state)) - } - - async fn list_procedures(&self, _ctx: &ExecutorContext) -> Result { - let metas = self - .procedure_manager - .list_procedures() - .await - .context(QueryProcedureSnafu)?; - Ok(procedure::procedure_details_to_pb_response(metas)) - } -} - #[cfg(test)] mod tests { use std::sync::Arc; @@ -956,6 +911,7 @@ mod tests { Default::default(), state_store, poison_manager, + None, )); let _ = DdlManager::try_new( diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs index 8abd4982af..add64e877f 100644 --- a/src/common/meta/src/error.rs +++ b/src/common/meta/src/error.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; +use common_procedure::ProcedureId; use common_wal::options::WalOptions; use serde_json::error::Error as JsonError; use snafu::{Location, Snafu}; @@ -140,6 +141,21 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to get procedure state receiver, procedure id: {procedure_id}"))] + ProcedureStateReceiver { + procedure_id: ProcedureId, + #[snafu(implicit)] + location: Location, + source: common_procedure::Error, + }, + + #[snafu(display("Procedure state receiver not found: {procedure_id}"))] + ProcedureStateReceiverNotFound { + procedure_id: ProcedureId, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to wait procedure done"))] WaitProcedure { #[snafu(implicit)] @@ -387,6 +403,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Catalog not found, catalog: {}", catalog))] + CatalogNotFound { + catalog: String, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Invalid metadata, err: {}", err_msg))] InvalidMetadata { err_msg: String, @@ -877,6 +900,93 @@ pub enum Error { #[snafu(source)] error: object_store::Error, }, + + #[snafu(display("Missing column ids"))] + MissingColumnIds { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Missing column in column metadata: {}, table: {}, table_id: {}", + column_name, + table_name, + table_id, + ))] + MissingColumnInColumnMetadata { + column_name: String, + #[snafu(implicit)] + location: Location, + table_name: String, + table_id: TableId, + }, + + #[snafu(display( + "Mismatch column id: column_name: {}, column_id: {}, table: {}, table_id: {}", + column_name, + column_id, + table_name, + table_id, + ))] + MismatchColumnId { + column_name: String, + column_id: u32, + #[snafu(implicit)] + location: Location, + table_name: String, + table_id: TableId, + }, + + #[snafu(display("Failed to convert column def, column: {}", column))] + ConvertColumnDef { + column: String, + #[snafu(implicit)] + location: Location, + source: api::error::Error, + }, 
+ + #[snafu(display( + "Column metadata inconsistencies found in table: {}, table_id: {}", + table_name, + table_id + ))] + ColumnMetadataConflicts { + table_name: String, + table_id: TableId, + }, + + #[snafu(display( + "Column not found in column metadata, column_name: {}, column_id: {}", + column_name, + column_id + ))] + ColumnNotFound { column_name: String, column_id: u32 }, + + #[snafu(display( + "Column id mismatch, column_name: {}, expected column_id: {}, actual column_id: {}", + column_name, + expected_column_id, + actual_column_id + ))] + ColumnIdMismatch { + column_name: String, + expected_column_id: u32, + actual_column_id: u32, + }, + + #[snafu(display( + "Timestamp column mismatch, expected column_name: {}, expected column_id: {}, actual column_name: {}, actual column_id: {}", + expected_column_name, + expected_column_id, + actual_column_name, + actual_column_id, + ))] + TimestampMismatch { + expected_column_name: String, + expected_column_id: u32, + actual_column_name: String, + actual_column_id: u32, + }, } pub type Result = std::result::Result; @@ -896,7 +1006,16 @@ impl ErrorExt for Error { | DeserializeFromJson { .. } => StatusCode::Internal, NoLeader { .. } => StatusCode::TableUnavailable, - ValueNotExist { .. } | ProcedurePoisonConflict { .. } => StatusCode::Unexpected, + ValueNotExist { .. } + | ProcedurePoisonConflict { .. } + | ProcedureStateReceiverNotFound { .. } + | MissingColumnIds { .. } + | MissingColumnInColumnMetadata { .. } + | MismatchColumnId { .. } + | ColumnMetadataConflicts { .. } + | ColumnNotFound { .. } + | ColumnIdMismatch { .. } + | TimestampMismatch { .. } => StatusCode::Unexpected, Unsupported { .. } => StatusCode::Unsupported, WriteObject { .. } | ReadObject { .. } => StatusCode::StorageUnavailable, @@ -980,10 +1099,13 @@ impl ErrorExt for Error { AbortProcedure { source, .. } => source.status_code(), ConvertAlterTableRequest { source, .. } => source.status_code(), PutPoison { source, .. } => source.status_code(), + ConvertColumnDef { source, .. } => source.status_code(), + ProcedureStateReceiver { source, .. } => source.status_code(), ParseProcedureId { .. } | InvalidNumTopics { .. } | SchemaNotFound { .. } + | CatalogNotFound { .. } | InvalidNodeInfoKey { .. } | InvalidStatKey { .. } | ParseNum { .. } diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index 6a50baf381..8621d772d5 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -100,8 +100,8 @@ pub mod catalog_name; pub mod datanode_table; pub mod flow; -pub mod maintenance; pub mod node_address; +pub mod runtime_switch; mod schema_metadata_manager; pub mod schema_name; pub mod table_info; @@ -164,7 +164,10 @@ use crate::state_store::PoisonValue; use crate::DatanodeId; pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.@#]*"; -pub const MAINTENANCE_KEY: &str = "__maintenance"; +pub const LEGACY_MAINTENANCE_KEY: &str = "__maintenance"; +pub const MAINTENANCE_KEY: &str = "__switches/maintenance"; +pub const PAUSE_PROCEDURE_KEY: &str = "__switches/pause_procedure"; +pub const RECOVERY_MODE_KEY: &str = "__switches/recovery"; pub const DATANODE_TABLE_KEY_PREFIX: &str = "__dn_table"; pub const TABLE_INFO_KEY_PREFIX: &str = "__table_info"; @@ -179,6 +182,11 @@ pub const KAFKA_TOPIC_KEY_PREFIX: &str = "__topic_name/kafka"; pub const LEGACY_TOPIC_KEY_PREFIX: &str = "__created_wal_topics/kafka"; pub const TOPIC_REGION_PREFIX: &str = "__topic_region"; +/// The election key. 
+pub const ELECTION_KEY: &str = "__metasrv_election"; +/// The root key of metasrv election candidates. +pub const CANDIDATES_ROOT: &str = "__metasrv_election_candidates/"; + /// The keys with these prefixes will be loaded into the cache when the leader starts. pub const CACHE_KEY_PREFIXES: [&str; 5] = [ TABLE_NAME_KEY_PREFIX, diff --git a/src/common/meta/src/key/maintenance.rs b/src/common/meta/src/key/maintenance.rs deleted file mode 100644 index c1cb93d76e..0000000000 --- a/src/common/meta/src/key/maintenance.rs +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use crate::error::Result; -use crate::key::MAINTENANCE_KEY; -use crate::kv_backend::KvBackendRef; -use crate::rpc::store::PutRequest; - -pub type MaintenanceModeManagerRef = Arc; - -/// The maintenance mode manager. -/// -/// Used to enable or disable maintenance mode. -#[derive(Clone)] -pub struct MaintenanceModeManager { - kv_backend: KvBackendRef, -} - -impl MaintenanceModeManager { - pub fn new(kv_backend: KvBackendRef) -> Self { - Self { kv_backend } - } - - /// Enables maintenance mode. - pub async fn set_maintenance_mode(&self) -> Result<()> { - let req = PutRequest { - key: Vec::from(MAINTENANCE_KEY), - value: vec![], - prev_kv: false, - }; - self.kv_backend.put(req).await?; - Ok(()) - } - - /// Unsets maintenance mode. - pub async fn unset_maintenance_mode(&self) -> Result<()> { - self.kv_backend - .delete(MAINTENANCE_KEY.as_bytes(), false) - .await?; - Ok(()) - } - - /// Returns true if maintenance mode is enabled. - pub async fn maintenance_mode(&self) -> Result { - self.kv_backend.exists(MAINTENANCE_KEY.as_bytes()).await - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use crate::key::maintenance::MaintenanceModeManager; - use crate::kv_backend::memory::MemoryKvBackend; - - #[tokio::test] - async fn test_maintenance_mode_manager() { - let maintenance_mode_manager = Arc::new(MaintenanceModeManager::new(Arc::new( - MemoryKvBackend::new(), - ))); - assert!(!maintenance_mode_manager.maintenance_mode().await.unwrap()); - maintenance_mode_manager - .set_maintenance_mode() - .await - .unwrap(); - assert!(maintenance_mode_manager.maintenance_mode().await.unwrap()); - maintenance_mode_manager - .unset_maintenance_mode() - .await - .unwrap(); - assert!(!maintenance_mode_manager.maintenance_mode().await.unwrap()); - } -} diff --git a/src/common/meta/src/key/runtime_switch.rs b/src/common/meta/src/key/runtime_switch.rs new file mode 100644 index 0000000000..f5eb9b058c --- /dev/null +++ b/src/common/meta/src/key/runtime_switch.rs @@ -0,0 +1,250 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use common_error::ext::BoxedError; +use common_procedure::local::PauseAware; +use moka::future::Cache; +use snafu::ResultExt; + +use crate::error::{GetCacheSnafu, Result}; +use crate::key::{LEGACY_MAINTENANCE_KEY, MAINTENANCE_KEY, PAUSE_PROCEDURE_KEY, RECOVERY_MODE_KEY}; +use crate::kv_backend::KvBackendRef; +use crate::rpc::store::{BatchDeleteRequest, PutRequest}; + +pub type RuntimeSwitchManagerRef = Arc; + +/// The runtime switch manager. +/// +/// Used to enable or disable runtime switches. +#[derive(Clone)] +pub struct RuntimeSwitchManager { + kv_backend: KvBackendRef, + cache: Cache, Option>>, +} + +#[async_trait::async_trait] +impl PauseAware for RuntimeSwitchManager { + async fn is_paused(&self) -> std::result::Result { + self.is_procedure_paused().await.map_err(BoxedError::new) + } +} + +const CACHE_TTL: Duration = Duration::from_secs(10); +const MAX_CAPACITY: u64 = 32; + +impl RuntimeSwitchManager { + pub fn new(kv_backend: KvBackendRef) -> Self { + let cache = Cache::builder() + .time_to_live(CACHE_TTL) + .max_capacity(MAX_CAPACITY) + .build(); + Self { kv_backend, cache } + } + + async fn put_key(&self, key: &str) -> Result<()> { + let req = PutRequest { + key: Vec::from(key), + value: vec![], + prev_kv: false, + }; + self.kv_backend.put(req).await?; + self.cache.invalidate(key.as_bytes()).await; + Ok(()) + } + + async fn delete_keys(&self, keys: &[&str]) -> Result<()> { + let req = BatchDeleteRequest::new() + .with_keys(keys.iter().map(|x| x.as_bytes().to_vec()).collect()); + self.kv_backend.batch_delete(req).await?; + for key in keys { + self.cache.invalidate(key.as_bytes()).await; + } + Ok(()) + } + + /// Returns true if the key exists. + async fn exists(&self, key: &str) -> Result { + let key = key.as_bytes().to_vec(); + let kv_backend = self.kv_backend.clone(); + let value = self + .cache + .try_get_with(key.clone(), async move { + kv_backend.get(&key).await.map(|v| v.map(|v| v.value)) + }) + .await + .context(GetCacheSnafu)?; + + Ok(value.is_some()) + } + + /// Enables maintenance mode. + pub async fn set_maintenance_mode(&self) -> Result<()> { + self.put_key(MAINTENANCE_KEY).await + } + + /// Unsets maintenance mode. + pub async fn unset_maintenance_mode(&self) -> Result<()> { + self.delete_keys(&[MAINTENANCE_KEY, LEGACY_MAINTENANCE_KEY]) + .await + } + + /// Returns true if maintenance mode is enabled. + pub async fn maintenance_mode(&self) -> Result { + let exists = self.exists(MAINTENANCE_KEY).await?; + if exists { + return Ok(true); + } + + let exists = self.exists(LEGACY_MAINTENANCE_KEY).await?; + if exists { + return Ok(true); + } + + Ok(false) + } + + // Pauses handling of incoming procedure requests. + pub async fn pasue_procedure(&self) -> Result<()> { + self.put_key(PAUSE_PROCEDURE_KEY).await + } + + /// Resumes processing of incoming procedure requests. + pub async fn resume_procedure(&self) -> Result<()> { + self.delete_keys(&[PAUSE_PROCEDURE_KEY]).await + } + + /// Returns true if the system is currently pausing incoming procedure requests. 
+ pub async fn is_procedure_paused(&self) -> Result { + self.exists(PAUSE_PROCEDURE_KEY).await + } + + /// Enables recovery mode. + pub async fn set_recovery_mode(&self) -> Result<()> { + self.put_key(RECOVERY_MODE_KEY).await + } + + /// Unsets recovery mode. + pub async fn unset_recovery_mode(&self) -> Result<()> { + self.delete_keys(&[RECOVERY_MODE_KEY]).await + } + + /// Returns true if the system is currently in recovery mode. + pub async fn recovery_mode(&self) -> Result { + self.exists(RECOVERY_MODE_KEY).await + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::key::runtime_switch::RuntimeSwitchManager; + use crate::key::{LEGACY_MAINTENANCE_KEY, MAINTENANCE_KEY}; + use crate::kv_backend::memory::MemoryKvBackend; + use crate::kv_backend::KvBackend; + use crate::rpc::store::PutRequest; + + #[tokio::test] + async fn test_runtime_switch_manager_basic() { + let runtime_switch_manager = + Arc::new(RuntimeSwitchManager::new(Arc::new(MemoryKvBackend::new()))); + runtime_switch_manager + .put_key(MAINTENANCE_KEY) + .await + .unwrap(); + let v = runtime_switch_manager + .cache + .get(MAINTENANCE_KEY.as_bytes()) + .await; + assert!(v.is_none()); + runtime_switch_manager + .exists(MAINTENANCE_KEY) + .await + .unwrap(); + let v = runtime_switch_manager + .cache + .get(MAINTENANCE_KEY.as_bytes()) + .await; + assert!(v.is_some()); + runtime_switch_manager + .delete_keys(&[MAINTENANCE_KEY]) + .await + .unwrap(); + let v = runtime_switch_manager + .cache + .get(MAINTENANCE_KEY.as_bytes()) + .await; + assert!(v.is_none()); + } + + #[tokio::test] + async fn test_runtime_switch_manager() { + let runtime_switch_manager = + Arc::new(RuntimeSwitchManager::new(Arc::new(MemoryKvBackend::new()))); + assert!(!runtime_switch_manager.maintenance_mode().await.unwrap()); + runtime_switch_manager.set_maintenance_mode().await.unwrap(); + assert!(runtime_switch_manager.maintenance_mode().await.unwrap()); + runtime_switch_manager + .unset_maintenance_mode() + .await + .unwrap(); + assert!(!runtime_switch_manager.maintenance_mode().await.unwrap()); + } + + #[tokio::test] + async fn test_runtime_switch_manager_with_legacy_key() { + let kv_backend = Arc::new(MemoryKvBackend::new()); + kv_backend + .put(PutRequest { + key: Vec::from(LEGACY_MAINTENANCE_KEY), + value: vec![], + prev_kv: false, + }) + .await + .unwrap(); + let runtime_switch_manager = Arc::new(RuntimeSwitchManager::new(kv_backend)); + assert!(runtime_switch_manager.maintenance_mode().await.unwrap()); + runtime_switch_manager + .unset_maintenance_mode() + .await + .unwrap(); + assert!(!runtime_switch_manager.maintenance_mode().await.unwrap()); + runtime_switch_manager.set_maintenance_mode().await.unwrap(); + assert!(runtime_switch_manager.maintenance_mode().await.unwrap()); + } + + #[tokio::test] + async fn test_pasue_procedure() { + let runtime_switch_manager = + Arc::new(RuntimeSwitchManager::new(Arc::new(MemoryKvBackend::new()))); + runtime_switch_manager.pasue_procedure().await.unwrap(); + assert!(runtime_switch_manager.is_procedure_paused().await.unwrap()); + runtime_switch_manager.resume_procedure().await.unwrap(); + assert!(!runtime_switch_manager.is_procedure_paused().await.unwrap()); + } + + #[tokio::test] + async fn test_recovery_mode() { + let runtime_switch_manager = + Arc::new(RuntimeSwitchManager::new(Arc::new(MemoryKvBackend::new()))); + assert!(!runtime_switch_manager.recovery_mode().await.unwrap()); + runtime_switch_manager.set_recovery_mode().await.unwrap(); + 
assert!(runtime_switch_manager.recovery_mode().await.unwrap()); + runtime_switch_manager.unset_recovery_mode().await.unwrap(); + assert!(!runtime_switch_manager.recovery_mode().await.unwrap()); + } +} diff --git a/src/common/meta/src/key/table_info.rs b/src/common/meta/src/key/table_info.rs index 3249bf686b..2de5abd764 100644 --- a/src/common/meta/src/key/table_info.rs +++ b/src/common/meta/src/key/table_info.rs @@ -334,6 +334,7 @@ mod tests { options: Default::default(), region_numbers: vec![1], partition_key_indices: vec![], + column_ids: vec![], }; RawTableInfo { diff --git a/src/common/meta/src/key/table_name.rs b/src/common/meta/src/key/table_name.rs index 9dbe405aec..2cb8586293 100644 --- a/src/common/meta/src/key/table_name.rs +++ b/src/common/meta/src/key/table_name.rs @@ -103,6 +103,26 @@ pub fn table_decoder(kv: KeyValue) -> Result<(String, TableNameValue)> { Ok((table_name_key.table.to_string(), table_name_value)) } +impl<'a> From<&TableReference<'a>> for TableNameKey<'a> { + fn from(value: &TableReference<'a>) -> Self { + Self { + catalog: value.catalog, + schema: value.schema, + table: value.table, + } + } +} + +impl<'a> From> for TableNameKey<'a> { + fn from(value: TableReference<'a>) -> Self { + Self { + catalog: value.catalog, + schema: value.schema, + table: value.table, + } + } +} + impl<'a> From<&'a TableName> for TableNameKey<'a> { fn from(value: &'a TableName) -> Self { Self { diff --git a/src/common/meta/src/key/table_route.rs b/src/common/meta/src/key/table_route.rs index 94d2a0bf07..27097d52c2 100644 --- a/src/common/meta/src/key/table_route.rs +++ b/src/common/meta/src/key/table_route.rs @@ -184,6 +184,17 @@ impl TableRouteValue { } } + /// Converts to [`LogicalTableRouteValue`]. + /// + /// # Panic + /// If it is not the [`LogicalTableRouteValue`]. + pub fn into_logical_table_route(self) -> LogicalTableRouteValue { + match self { + TableRouteValue::Logical(x) => x, + _ => unreachable!("Mistakenly been treated as a Logical TableRoute: {self:?}"), + } + } + pub fn region_numbers(&self) -> Vec { match self { TableRouteValue::Physical(x) => x diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs index 50d3cdc8d3..fa53ab0a6c 100644 --- a/src/common/meta/src/lib.rs +++ b/src/common/meta/src/lib.rs @@ -37,7 +37,9 @@ pub mod node_expiry_listener; pub mod node_manager; pub mod peer; pub mod poison_key; +pub mod procedure_executor; pub mod range_stream; +pub mod reconciliation; pub mod region_keeper; pub mod region_registry; pub mod rpc; diff --git a/src/common/meta/src/metrics.rs b/src/common/meta/src/metrics.rs index 2df82c8aba..a977c3ed1c 100644 --- a/src/common/meta/src/metrics.rs +++ b/src/common/meta/src/metrics.rs @@ -15,6 +15,13 @@ use lazy_static::lazy_static; use prometheus::*; +pub const TABLE_TYPE_PHYSICAL: &str = "physical"; +pub const TABLE_TYPE_LOGICAL: &str = "logical"; +pub const ERROR_TYPE_RETRYABLE: &str = "retryable"; +pub const ERROR_TYPE_EXTERNAL: &str = "external"; +pub const STATS_TYPE_NO_REGION_METADATA: &str = "no_region_metadata"; +pub const STATS_TYPE_REGION_NOT_OPEN: &str = "region_not_open"; + lazy_static! { pub static ref METRIC_META_TXN_REQUEST: HistogramVec = register_histogram_vec!( "greptime_meta_txn_request", @@ -114,4 +121,39 @@ lazy_static! 
{ &["backend", "result", "op", "type"] ) .unwrap(); + pub static ref METRIC_META_RECONCILIATION_LIST_REGION_METADATA_DURATION: HistogramVec = + register_histogram_vec!( + "greptime_meta_reconciliation_list_region_metadata_duration", + "reconciliation list region metadata duration", + &["table_type"] + ) + .unwrap(); + pub static ref METRIC_META_RECONCILIATION_RESOLVED_COLUMN_METADATA: IntCounterVec = + register_int_counter_vec!( + "greptime_meta_reconciliation_resolved_column_metadata", + "reconciliation resolved column metadata", + &["strategy"] + ) + .unwrap(); + pub static ref METRIC_META_RECONCILIATION_STATS: IntCounterVec = + register_int_counter_vec!( + "greptime_meta_reconciliation_stats", + "reconciliation stats", + &["procedure_name", "table_type", "type"] + ) + .unwrap(); + pub static ref METRIC_META_RECONCILIATION_PROCEDURE: HistogramVec = + register_histogram_vec!( + "greptime_meta_reconciliation_procedure", + "reconcile table procedure", + &["procedure_name", "step"] + ) + .unwrap(); + pub static ref METRIC_META_RECONCILIATION_PROCEDURE_ERROR: IntCounterVec = + register_int_counter_vec!( + "greptime_meta_reconciliation_procedure_error", + "reconciliation procedure error", + &["procedure_name", "step", "error_type"] + ) + .unwrap(); } diff --git a/src/common/meta/src/procedure_executor.rs b/src/common/meta/src/procedure_executor.rs new file mode 100644 index 0000000000..41567a02ca --- /dev/null +++ b/src/common/meta/src/procedure_executor.rs @@ -0,0 +1,173 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse}; +use common_procedure::{ProcedureId, ProcedureManagerRef}; +use common_telemetry::tracing_context::W3cTrace; +use snafu::{OptionExt, ResultExt}; + +use crate::ddl_manager::DdlManagerRef; +use crate::error::{ + ParseProcedureIdSnafu, ProcedureNotFoundSnafu, QueryProcedureSnafu, Result, UnsupportedSnafu, +}; +use crate::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse}; +use crate::rpc::procedure::{ + self, AddRegionFollowerRequest, MigrateRegionRequest, MigrateRegionResponse, + ProcedureStateResponse, RemoveRegionFollowerRequest, +}; + +/// The context of procedure executor. +#[derive(Debug, Default)] +pub struct ExecutorContext { + pub tracing_context: Option, +} + +/// The procedure executor that accepts ddl, region migration task etc. 
+#[async_trait::async_trait]
+pub trait ProcedureExecutor: Send + Sync {
+    /// Submit a ddl task
+    async fn submit_ddl_task(
+        &self,
+        ctx: &ExecutorContext,
+        request: SubmitDdlTaskRequest,
+    ) -> Result<SubmitDdlTaskResponse>;
+
+    /// Add a region follower
+    async fn add_region_follower(
+        &self,
+        _ctx: &ExecutorContext,
+        _request: AddRegionFollowerRequest,
+    ) -> Result<()> {
+        UnsupportedSnafu {
+            operation: "add_region_follower",
+        }
+        .fail()
+    }
+
+    /// Remove a region follower
+    async fn remove_region_follower(
+        &self,
+        _ctx: &ExecutorContext,
+        _request: RemoveRegionFollowerRequest,
+    ) -> Result<()> {
+        UnsupportedSnafu {
+            operation: "remove_region_follower",
+        }
+        .fail()
+    }
+
+    /// Submit a region migration task
+    async fn migrate_region(
+        &self,
+        ctx: &ExecutorContext,
+        request: MigrateRegionRequest,
+    ) -> Result<MigrateRegionResponse>;
+
+    /// Submit a reconcile task.
+    async fn reconcile(
+        &self,
+        _ctx: &ExecutorContext,
+        request: ReconcileRequest,
+    ) -> Result<ReconcileResponse>;
+
+    /// Query the procedure state by its id
+    async fn query_procedure_state(
+        &self,
+        ctx: &ExecutorContext,
+        pid: &str,
+    ) -> Result<ProcedureStateResponse>;
+
+    async fn list_procedures(&self, ctx: &ExecutorContext) -> Result<ProcedureDetailResponse>;
+}
+
+pub type ProcedureExecutorRef = Arc<dyn ProcedureExecutor>;
+
+/// The local procedure executor that accepts ddl, region migration task etc.
+pub struct LocalProcedureExecutor {
+    pub ddl_manager: DdlManagerRef,
+    pub procedure_manager: ProcedureManagerRef,
+}
+
+impl LocalProcedureExecutor {
+    pub fn new(ddl_manager: DdlManagerRef, procedure_manager: ProcedureManagerRef) -> Self {
+        Self {
+            ddl_manager,
+            procedure_manager,
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ProcedureExecutor for LocalProcedureExecutor {
+    async fn submit_ddl_task(
+        &self,
+        ctx: &ExecutorContext,
+        request: SubmitDdlTaskRequest,
+    ) -> Result<SubmitDdlTaskResponse> {
+        self.ddl_manager.submit_ddl_task(ctx, request).await
+    }
+
+    async fn migrate_region(
+        &self,
+        _ctx: &ExecutorContext,
+        _request: MigrateRegionRequest,
+    ) -> Result<MigrateRegionResponse> {
+        UnsupportedSnafu {
+            operation: "migrate_region",
+        }
+        .fail()
+    }
+
+    async fn reconcile(
+        &self,
+        _ctx: &ExecutorContext,
+        _request: ReconcileRequest,
+    ) -> Result<ReconcileResponse> {
+        UnsupportedSnafu {
+            operation: "reconcile",
+        }
+        .fail()
+    }
+
+    async fn query_procedure_state(
+        &self,
+        _ctx: &ExecutorContext,
+        pid: &str,
+    ) -> Result<ProcedureStateResponse> {
+        let pid =
+            ProcedureId::parse_str(pid).with_context(|_| ParseProcedureIdSnafu { key: pid })?;
+
+        let state = self
+            .procedure_manager
+            .procedure_state(pid)
+            .await
+            .context(QueryProcedureSnafu)?
+            .with_context(|| ProcedureNotFoundSnafu {
+                pid: pid.to_string(),
+            })?;
+
+        Ok(procedure::procedure_state_to_pb_response(&state))
+    }
+
+    async fn list_procedures(&self, _ctx: &ExecutorContext) -> Result<ProcedureDetailResponse> {
+        let metas = self
+            .procedure_manager
+            .list_procedures()
+            .await
+            .context(QueryProcedureSnafu)?;
+        Ok(procedure::procedure_details_to_pb_response(metas))
+    }
+}
diff --git a/src/common/meta/src/reconciliation.rs b/src/common/meta/src/reconciliation.rs
new file mode 100644
index 0000000000..3f851d0163
--- /dev/null
+++ b/src/common/meta/src/reconciliation.rs
@@ -0,0 +1,20 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
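The `ProcedureExecutor` trait introduced above gives callers one handle for submitting DDL, region migration, and reconcile tasks and for querying procedure state, without naming a concrete executor. A minimal sketch of driving it from inside the crate, assuming only the trait surface in this patch; the helper name and the way the executor reference is obtained are illustrative:

use crate::error::Result;
use crate::procedure_executor::{ExecutorContext, ProcedureExecutorRef};

// Hypothetical helper: look up and print a procedure's reported state.
async fn print_procedure_state(executor: &ProcedureExecutorRef, pid: &str) -> Result<()> {
    // A default context carries no tracing information.
    let ctx = ExecutorContext::default();
    let state = executor.query_procedure_state(&ctx, pid).await?;
    println!("procedure {pid}: {state:?}");
    Ok(())
}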
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod manager; +pub(crate) mod reconcile_catalog; +pub(crate) mod reconcile_database; +pub(crate) mod reconcile_logical_tables; +pub(crate) mod reconcile_table; +pub(crate) mod utils; diff --git a/src/common/meta/src/reconciliation/manager.rs b/src/common/meta/src/reconciliation/manager.rs new file mode 100644 index 0000000000..29e15b4692 --- /dev/null +++ b/src/common/meta/src/reconciliation/manager.rs @@ -0,0 +1,246 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use common_procedure::{ + watcher, BoxedProcedure, ProcedureId, ProcedureManagerRef, ProcedureWithId, +}; +use common_telemetry::{error, info, warn}; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::TableId; +use table::table_name::TableName; +use table::table_reference::TableReference; + +use crate::cache_invalidator::CacheInvalidatorRef; +use crate::error::{self, Result, TableNotFoundSnafu}; +use crate::key::table_name::TableNameKey; +use crate::key::TableMetadataManagerRef; +use crate::node_manager::NodeManagerRef; +use crate::reconciliation::reconcile_catalog::ReconcileCatalogProcedure; +use crate::reconciliation::reconcile_database::{ReconcileDatabaseProcedure, DEFAULT_PARALLELISM}; +use crate::reconciliation::reconcile_logical_tables::ReconcileLogicalTablesProcedure; +use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveStrategy; +use crate::reconciliation::reconcile_table::ReconcileTableProcedure; +use crate::reconciliation::utils::Context; + +pub type ReconciliationManagerRef = Arc; + +/// The manager for reconciliation procedures. +pub struct ReconciliationManager { + procedure_manager: ProcedureManagerRef, + context: Context, +} + +macro_rules! register_reconcile_loader { + ($self:ident, $procedure:ty) => {{ + let context = $self.context.clone(); + $self + .procedure_manager + .register_loader( + <$procedure>::TYPE_NAME, + Box::new(move |json| { + let context = context.clone(); + let procedure = <$procedure>::from_json(context, json)?; + Ok(Box::new(procedure)) + }), + ) + .context(error::RegisterProcedureLoaderSnafu { + type_name: <$procedure>::TYPE_NAME, + })?; + }}; +} + +impl ReconciliationManager { + pub fn new( + node_manager: NodeManagerRef, + table_metadata_manager: TableMetadataManagerRef, + cache_invalidator: CacheInvalidatorRef, + procedure_manager: ProcedureManagerRef, + ) -> Self { + Self { + procedure_manager, + context: Context { + node_manager, + table_metadata_manager, + cache_invalidator, + }, + } + } + + /// Try to start the reconciliation manager. 
+ /// + /// This function will register the procedure loaders for the reconciliation procedures. + /// Returns an error if the procedure loaders are already registered. + pub fn try_start(&self) -> Result<()> { + register_reconcile_loader!(self, ReconcileLogicalTablesProcedure); + register_reconcile_loader!(self, ReconcileTableProcedure); + register_reconcile_loader!(self, ReconcileDatabaseProcedure); + register_reconcile_loader!(self, ReconcileCatalogProcedure); + + Ok(()) + } + + /// Reconcile a table. + /// + /// Returns the procedure id of the reconciliation procedure. + pub async fn reconcile_table( + &self, + table_ref: TableReference<'_>, + resolve_strategy: ResolveStrategy, + ) -> Result { + let table_name_key = + TableNameKey::new(table_ref.catalog, table_ref.schema, table_ref.table); + let table_metadata_manager = &self.context.table_metadata_manager; + let table_id = table_metadata_manager + .table_name_manager() + .get(table_name_key) + .await? + .with_context(|| TableNotFoundSnafu { + table_name: table_ref.to_string(), + })? + .table_id(); + let (physical_table_id, _) = table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await?; + + if physical_table_id == table_id { + Ok(self.reconcile_physical_table(table_id, table_ref.into(), resolve_strategy)) + } else { + let physical_table_info = table_metadata_manager + .table_info_manager() + .get(physical_table_id) + .await? + .with_context(|| TableNotFoundSnafu { + table_name: format!("table_id: {}", physical_table_id), + })?; + + Ok(self.reconcile_logical_tables( + physical_table_id, + physical_table_info.table_name(), + vec![(table_id, table_ref.into())], + )) + } + } + + /// Reconcile a database. + /// + /// Returns the procedure id of the reconciliation procedure. + pub fn reconcile_database( + &self, + catalog: String, + schema: String, + resolve_strategy: ResolveStrategy, + parallelism: usize, + ) -> ProcedureId { + let parallelism = normalize_parallelism(parallelism); + let procedure = ReconcileDatabaseProcedure::new( + self.context.clone(), + catalog, + schema, + false, + parallelism, + resolve_strategy, + false, + ); + self.spawn_procedure(Box::new(procedure)) + } + + fn reconcile_physical_table( + &self, + table_id: TableId, + table_name: TableName, + resolve_strategy: ResolveStrategy, + ) -> ProcedureId { + let procedure = ReconcileTableProcedure::new( + self.context.clone(), + table_id, + table_name, + resolve_strategy, + false, + ); + self.spawn_procedure(Box::new(procedure)) + } + + fn reconcile_logical_tables( + &self, + physical_table_id: TableId, + physical_table_name: TableName, + logical_tables: Vec<(TableId, TableName)>, + ) -> ProcedureId { + let procedure = ReconcileLogicalTablesProcedure::new( + self.context.clone(), + physical_table_id, + physical_table_name, + logical_tables, + false, + ); + self.spawn_procedure(Box::new(procedure)) + } + + /// Reconcile a catalog. + /// + /// Returns the procedure id of the reconciliation procedure. 
+ pub fn reconcile_catalog( + &self, + catalog: String, + resolve_strategy: ResolveStrategy, + parallelism: usize, + ) -> ProcedureId { + let parallelism = normalize_parallelism(parallelism); + let procedure = ReconcileCatalogProcedure::new( + self.context.clone(), + catalog, + false, + resolve_strategy, + parallelism, + ); + self.spawn_procedure(Box::new(procedure)) + } + + fn spawn_procedure(&self, procedure: BoxedProcedure) -> ProcedureId { + let procedure_manager = self.procedure_manager.clone(); + let procedure_with_id = ProcedureWithId::with_random_id(procedure); + let procedure_id = procedure_with_id.id; + common_runtime::spawn_global(async move { + let watcher = &mut match procedure_manager.submit(procedure_with_id).await { + Ok(watcher) => watcher, + Err(e) => { + error!(e; "Failed to submit reconciliation procedure {procedure_id}"); + return; + } + }; + if let Err(e) = watcher::wait(watcher).await { + error!(e; "Failed to wait reconciliation procedure {procedure_id}"); + return; + } + + info!("Reconciliation procedure {procedure_id} is finished successfully!"); + }); + procedure_id + } +} + +fn normalize_parallelism(parallelism: usize) -> usize { + if parallelism == 0 { + warn!( + "Parallelism is 0, using default parallelism: {}", + DEFAULT_PARALLELISM + ); + DEFAULT_PARALLELISM + } else { + parallelism + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_catalog.rs b/src/common/meta/src/reconciliation/reconcile_catalog.rs new file mode 100644 index 0000000000..341ffb3fdd --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_catalog.rs @@ -0,0 +1,237 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
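Taken together, the manager is used in two steps: `try_start` registers the procedure loaders once, then each `reconcile_*` call spawns a detached procedure and hands back its `ProcedureId`. A sketch of the intended call pattern from inside the crate, assuming an already constructed manager and a caller-supplied `ResolveStrategy`; the catalog, schema, and table names are examples:

use common_procedure::ProcedureId;
use table::table_reference::TableReference;

use crate::error::Result;
use crate::reconciliation::manager::ReconciliationManagerRef;
use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveStrategy;

// Hypothetical driver: reconcile one table, then its whole database.
async fn trigger_reconciliation(
    manager: &ReconciliationManagerRef,
    strategy: ResolveStrategy,
) -> Result<(ProcedureId, ProcedureId)> {
    let table = TableReference::full("greptime", "public", "my_table");
    // Logical tables are routed to the logical-table procedure automatically.
    let table_pid = manager.reconcile_table(table, strategy).await?;
    // A parallelism of 0 is normalized to DEFAULT_PARALLELISM (64).
    let db_pid = manager.reconcile_database(
        "greptime".to_string(),
        "public".to_string(),
        strategy,
        0,
    );
    Ok((table_pid, db_pid))
}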
+ +use std::any::Any; +use std::fmt::Debug; +use std::time::Instant; + +use common_procedure::error::FromJsonSnafu; +use common_procedure::{ + Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, + Result as ProcedureResult, Status, +}; +use futures::stream::BoxStream; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; + +use crate::cache_invalidator::CacheInvalidatorRef; +use crate::error::Result; +use crate::key::TableMetadataManagerRef; +use crate::lock_key::CatalogLock; +use crate::metrics; +use crate::node_manager::NodeManagerRef; +use crate::reconciliation::reconcile_catalog::start::ReconcileCatalogStart; +use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveStrategy; +use crate::reconciliation::utils::{ + wait_for_inflight_subprocedures, Context, ReconcileCatalogMetrics, SubprocedureMeta, +}; + +pub(crate) mod end; +pub(crate) mod reconcile_databases; +pub(crate) mod start; + +pub(crate) struct ReconcileCatalogContext { + pub node_manager: NodeManagerRef, + pub table_metadata_manager: TableMetadataManagerRef, + pub cache_invalidator: CacheInvalidatorRef, + persistent_ctx: PersistentContext, + volatile_ctx: VolatileContext, +} + +impl ReconcileCatalogContext { + pub fn new(ctx: Context, persistent_ctx: PersistentContext) -> Self { + Self { + node_manager: ctx.node_manager, + table_metadata_manager: ctx.table_metadata_manager, + cache_invalidator: ctx.cache_invalidator, + persistent_ctx, + volatile_ctx: VolatileContext::default(), + } + } + + pub(crate) async fn wait_for_inflight_subprocedure( + &mut self, + procedure_ctx: &ProcedureContext, + ) -> Result<()> { + if let Some(subprocedure) = self.volatile_ctx.inflight_subprocedure.take() { + let subprocedures = [subprocedure]; + let result = wait_for_inflight_subprocedures( + procedure_ctx, + &subprocedures, + self.persistent_ctx.fast_fail, + ) + .await?; + self.volatile_ctx.metrics += result.into(); + } + Ok(()) + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct PersistentContext { + catalog: String, + fast_fail: bool, + resolve_strategy: ResolveStrategy, + parallelism: usize, +} + +impl PersistentContext { + pub fn new( + catalog: String, + fast_fail: bool, + resolve_strategy: ResolveStrategy, + parallelism: usize, + ) -> Self { + Self { + catalog, + fast_fail, + resolve_strategy, + parallelism, + } + } +} + +pub(crate) struct VolatileContext { + /// Stores the stream of catalogs. + schemas: Option>>, + /// Stores the inflight subprocedure. + inflight_subprocedure: Option, + /// Stores the metrics of reconciling catalog. + metrics: ReconcileCatalogMetrics, + /// The start time of the reconciliation. 
+ start_time: Instant, +} + +impl Default for VolatileContext { + fn default() -> Self { + Self { + schemas: None, + inflight_subprocedure: None, + metrics: Default::default(), + start_time: Instant::now(), + } + } +} + +pub struct ReconcileCatalogProcedure { + pub context: ReconcileCatalogContext, + state: Box, +} + +impl ReconcileCatalogProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::ReconcileCatalog"; + + pub fn new( + ctx: Context, + catalog: String, + fast_fail: bool, + resolve_strategy: ResolveStrategy, + parallelism: usize, + ) -> Self { + let persistent_ctx = + PersistentContext::new(catalog, fast_fail, resolve_strategy, parallelism); + let context = ReconcileCatalogContext::new(ctx, persistent_ctx); + let state = Box::new(ReconcileCatalogStart); + Self { context, state } + } + + pub(crate) fn from_json(ctx: Context, json: &str) -> ProcedureResult { + let ProcedureDataOwned { + state, + persistent_ctx, + } = serde_json::from_str(json).context(FromJsonSnafu)?; + let context = ReconcileCatalogContext::new(ctx, persistent_ctx); + Ok(Self { context, state }) + } +} + +#[derive(Debug, Serialize)] +struct ProcedureData<'a> { + state: &'a dyn State, + persistent_ctx: &'a PersistentContext, +} + +#[derive(Debug, Deserialize)] +struct ProcedureDataOwned { + state: Box, + persistent_ctx: PersistentContext, +} + +#[async_trait::async_trait] +impl Procedure for ReconcileCatalogProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + let state = &mut self.state; + + let procedure_name = Self::TYPE_NAME; + let step = state.name(); + let _timer = metrics::METRIC_META_RECONCILIATION_PROCEDURE + .with_label_values(&[procedure_name, step]) + .start_timer(); + match state.next(&mut self.context, _ctx).await { + Ok((next, status)) => { + *state = next; + Ok(status) + } + Err(e) => { + if e.is_retry_later() { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_RETRYABLE]) + .inc(); + Err(ProcedureError::retry_later(e)) + } else { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_EXTERNAL]) + .inc(); + Err(ProcedureError::external(e)) + } + } + } + } + + fn dump(&self) -> ProcedureResult { + let data = ProcedureData { + state: self.state.as_ref(), + persistent_ctx: &self.context.persistent_ctx, + }; + serde_json::to_string(&data).context(FromJsonSnafu) + } + + fn lock_key(&self) -> LockKey { + let catalog = &self.context.persistent_ctx.catalog; + + LockKey::new(vec![CatalogLock::Write(catalog).into()]) + } +} + +#[async_trait::async_trait] +#[typetag::serde(tag = "reconcile_catalog_state")] +pub(crate) trait State: Sync + Send + Debug { + fn name(&self) -> &'static str { + let type_name = std::any::type_name::(); + // short name + type_name.split("::").last().unwrap_or(type_name) + } + + async fn next( + &mut self, + ctx: &mut ReconcileCatalogContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)>; + + fn as_any(&self) -> &dyn Any; +} diff --git a/src/common/meta/src/reconciliation/reconcile_catalog/end.rs b/src/common/meta/src/reconciliation/reconcile_catalog/end.rs new file mode 100644 index 0000000000..964338c26e --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_catalog/end.rs @@ -0,0 +1,48 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
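Every reconcile procedure times each step with `METRIC_META_RECONCILIATION_PROCEDURE` and, on failure, bumps `METRIC_META_RECONCILIATION_PROCEDURE_ERROR` with a retryable or external label before converting the error. That classification is repeated per procedure; condensed into one hypothetical helper it looks roughly like this:

use common_procedure::Error as ProcedureError;

use crate::error::Error;
use crate::metrics;

// Illustrative helper mirroring the per-step error handling in the
// reconcile procedures: count the failure, then map it for the framework.
fn record_step_error(procedure_name: &str, step: &str, e: Error) -> ProcedureError {
    if e.is_retry_later() {
        metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR
            .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_RETRYABLE])
            .inc();
        ProcedureError::retry_later(e)
    } else {
        metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR
            .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_EXTERNAL])
            .inc();
        ProcedureError::external(e)
    }
}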
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; + +use crate::error::Result; +use crate::reconciliation::reconcile_catalog::{ReconcileCatalogContext, State}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileCatalogEnd; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileCatalogEnd { + async fn next( + &mut self, + ctx: &mut ReconcileCatalogContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + info!( + "Catalog reconciliation completed. catalog: {}, procedure_id: {}, metrics: {}, elapsed: {:?}", + ctx.persistent_ctx.catalog, + procedure_ctx.procedure_id, + ctx.volatile_ctx.metrics, + ctx.volatile_ctx.start_time.elapsed() + ); + Ok((Box::new(ReconcileCatalogEnd), Status::done())) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_catalog/reconcile_databases.rs b/src/common/meta/src/reconciliation/reconcile_catalog/reconcile_databases.rs new file mode 100644 index 0000000000..9e30571029 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_catalog/reconcile_databases.rs @@ -0,0 +1,104 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, ProcedureWithId, Status}; +use common_telemetry::info; +use futures::TryStreamExt; +use serde::{Deserialize, Serialize}; + +use crate::error::Result; +use crate::reconciliation::reconcile_catalog::end::ReconcileCatalogEnd; +use crate::reconciliation::reconcile_catalog::{ReconcileCatalogContext, State}; +use crate::reconciliation::reconcile_database::ReconcileDatabaseProcedure; +use crate::reconciliation::utils::{Context, SubprocedureMeta}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileDatabases; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileDatabases { + async fn next( + &mut self, + ctx: &mut ReconcileCatalogContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + // Waits for inflight subprocedure first. 
+ ctx.wait_for_inflight_subprocedure(procedure_ctx).await?; + + if ctx.volatile_ctx.schemas.as_deref().is_none() { + let schemas = ctx + .table_metadata_manager + .schema_manager() + .schema_names(&ctx.persistent_ctx.catalog); + ctx.volatile_ctx.schemas = Some(schemas); + } + + if let Some(catalog) = ctx + .volatile_ctx + .schemas + .as_mut() + .unwrap() + .try_next() + .await? + { + return Self::schedule_reconcile_database(ctx, catalog); + } + + Ok((Box::new(ReconcileCatalogEnd), Status::executing(false))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl ReconcileDatabases { + fn schedule_reconcile_database( + ctx: &mut ReconcileCatalogContext, + schema: String, + ) -> Result<(Box, Status)> { + let context = Context { + node_manager: ctx.node_manager.clone(), + table_metadata_manager: ctx.table_metadata_manager.clone(), + cache_invalidator: ctx.cache_invalidator.clone(), + }; + info!( + "Scheduling reconcile database: {}, catalog: {}", + schema, ctx.persistent_ctx.catalog + ); + let procedure = ReconcileDatabaseProcedure::new( + context, + ctx.persistent_ctx.catalog.clone(), + schema.clone(), + ctx.persistent_ctx.fast_fail, + ctx.persistent_ctx.parallelism, + ctx.persistent_ctx.resolve_strategy, + true, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + ctx.volatile_ctx.inflight_subprocedure = Some(SubprocedureMeta::new_reconcile_database( + procedure_with_id.id, + ctx.persistent_ctx.catalog.clone(), + schema, + )); + + Ok(( + Box::new(ReconcileDatabases), + Status::suspended(vec![procedure_with_id], false), + )) + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_catalog/start.rs b/src/common/meta/src/reconciliation/reconcile_catalog/start.rs new file mode 100644 index 0000000000..03fd4cda04 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_catalog/start.rs @@ -0,0 +1,58 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
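`ReconcileDatabases` above shows the suspend-and-wait pattern every composite state uses: build the child procedure, stash its id in the volatile context, and return `Status::suspended` so the framework runs the child before polling the parent again. Stripped to its core, the handshake is roughly the following sketch:

use common_procedure::{BoxedProcedure, ProcedureId, ProcedureWithId, Status};

// Illustrative condensation: wrap the child procedure, remember its id so
// the next state can wait on it, and suspend the parent.
fn suspend_on_child(child: BoxedProcedure) -> (ProcedureId, Status) {
    let child = ProcedureWithId::with_random_id(child);
    let child_id = child.id;
    // The boolean mirrors the `persist` flag passed by the states in this patch.
    (child_id, Status::suspended(vec![child], false))
}

On the next poll the parent finds the stored id in its volatile context and calls its wait helper before scheduling further work.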
+ +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use crate::error::{self, Result}; +use crate::key::catalog_name::CatalogNameKey; +use crate::reconciliation::reconcile_catalog::reconcile_databases::ReconcileDatabases; +use crate::reconciliation::reconcile_catalog::{ReconcileCatalogContext, State}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileCatalogStart; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileCatalogStart { + async fn next( + &mut self, + ctx: &mut ReconcileCatalogContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let exists = ctx + .table_metadata_manager + .catalog_manager() + .exists(CatalogNameKey { + catalog: &ctx.persistent_ctx.catalog, + }) + .await?; + + ensure!( + exists, + error::CatalogNotFoundSnafu { + catalog: &ctx.persistent_ctx.catalog + }, + ); + + Ok((Box::new(ReconcileDatabases), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_database.rs b/src/common/meta/src/reconciliation/reconcile_database.rs new file mode 100644 index 0000000000..f4beffa973 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_database.rs @@ -0,0 +1,285 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
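`ReconcileCatalogStart` only has to prove the catalog exists before handing control to `ReconcileDatabases`; the database start state reuses the same guard shape for schemas. As a standalone sketch whose calls mirror the state above (the helper name is illustrative):

use snafu::ensure;

use crate::error::{self, Result};
use crate::key::catalog_name::CatalogNameKey;
use crate::key::TableMetadataManagerRef;

// Hypothetical standalone version of the existence check done by the start state.
async fn ensure_catalog_exists(
    table_metadata_manager: &TableMetadataManagerRef,
    catalog: &str,
) -> Result<()> {
    let exists = table_metadata_manager
        .catalog_manager()
        .exists(CatalogNameKey { catalog })
        .await?;
    ensure!(exists, error::CatalogNotFoundSnafu { catalog });
    Ok(())
}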
+ +pub(crate) mod end; +pub(crate) mod reconcile_logical_tables; +pub(crate) mod reconcile_tables; +pub(crate) mod start; + +use std::any::Any; +use std::collections::HashMap; +use std::fmt::Debug; +use std::time::Instant; + +use async_trait::async_trait; +use common_procedure::error::{FromJsonSnafu, ToJsonSnafu}; +use common_procedure::{ + Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, + Result as ProcedureResult, Status, +}; +use futures::stream::BoxStream; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; +use store_api::storage::TableId; +use table::table_name::TableName; + +use crate::cache_invalidator::CacheInvalidatorRef; +use crate::error::Result; +use crate::key::table_name::TableNameValue; +use crate::key::TableMetadataManagerRef; +use crate::lock_key::{CatalogLock, SchemaLock}; +use crate::metrics; +use crate::node_manager::NodeManagerRef; +use crate::reconciliation::reconcile_database::start::ReconcileDatabaseStart; +use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveStrategy; +use crate::reconciliation::utils::{ + wait_for_inflight_subprocedures, Context, ReconcileDatabaseMetrics, SubprocedureMeta, +}; +pub(crate) const DEFAULT_PARALLELISM: usize = 64; + +pub(crate) struct ReconcileDatabaseContext { + pub node_manager: NodeManagerRef, + pub table_metadata_manager: TableMetadataManagerRef, + pub cache_invalidator: CacheInvalidatorRef, + persistent_ctx: PersistentContext, + volatile_ctx: VolatileContext, +} + +impl ReconcileDatabaseContext { + pub fn new(ctx: Context, persistent_ctx: PersistentContext) -> Self { + Self { + node_manager: ctx.node_manager, + table_metadata_manager: ctx.table_metadata_manager, + cache_invalidator: ctx.cache_invalidator, + persistent_ctx, + volatile_ctx: VolatileContext::default(), + } + } + + /// Waits for inflight subprocedures to complete. + pub(crate) async fn wait_for_inflight_subprocedures( + &mut self, + procedure_ctx: &ProcedureContext, + ) -> Result<()> { + if !self.volatile_ctx.inflight_subprocedures.is_empty() { + let result = wait_for_inflight_subprocedures( + procedure_ctx, + &self.volatile_ctx.inflight_subprocedures, + self.persistent_ctx.fail_fast, + ) + .await?; + + // Collects result into metrics + let metrics = result.into(); + self.volatile_ctx.inflight_subprocedures.clear(); + self.volatile_ctx.metrics += metrics; + } + + Ok(()) + } + + /// Returns the immutable metrics. + pub(crate) fn metrics(&self) -> &ReconcileDatabaseMetrics { + &self.volatile_ctx.metrics + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct PersistentContext { + catalog: String, + schema: String, + fail_fast: bool, + parallelism: usize, + resolve_strategy: ResolveStrategy, + is_subprocedure: bool, +} + +impl PersistentContext { + pub fn new( + catalog: String, + schema: String, + fail_fast: bool, + parallelism: usize, + resolve_strategy: ResolveStrategy, + is_subprocedure: bool, + ) -> Self { + Self { + catalog, + schema, + fail_fast, + parallelism, + resolve_strategy, + is_subprocedure, + } + } +} + +pub(crate) struct VolatileContext { + /// Stores pending physical tables. + pending_tables: Vec<(TableId, TableName)>, + /// Stores pending logical tables associated with each physical table. + /// + /// - Key: Table ID of the physical table. + /// - Value: Vector of (TableId, TableName) tuples representing logical tables belonging to the physical table. + pending_logical_tables: HashMap>, + /// Stores inflight subprocedures. 
+ inflight_subprocedures: Vec, + /// Stores the stream of tables. + tables: Option>>, + /// The metrics of reconciling database. + metrics: ReconcileDatabaseMetrics, + /// The start time of the reconciliation. + start_time: Instant, +} + +impl Default for VolatileContext { + fn default() -> Self { + Self { + pending_tables: vec![], + pending_logical_tables: HashMap::new(), + inflight_subprocedures: vec![], + tables: None, + metrics: ReconcileDatabaseMetrics::default(), + start_time: Instant::now(), + } + } +} + +pub struct ReconcileDatabaseProcedure { + pub context: ReconcileDatabaseContext, + state: Box, +} + +impl ReconcileDatabaseProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::ReconcileDatabase"; + + pub fn new( + ctx: Context, + catalog: String, + schema: String, + fail_fast: bool, + parallelism: usize, + resolve_strategy: ResolveStrategy, + is_subprocedure: bool, + ) -> Self { + let persistent_ctx = PersistentContext::new( + catalog, + schema, + fail_fast, + parallelism, + resolve_strategy, + is_subprocedure, + ); + let context = ReconcileDatabaseContext::new(ctx, persistent_ctx); + let state = Box::new(ReconcileDatabaseStart); + Self { context, state } + } + + pub(crate) fn from_json(ctx: Context, json: &str) -> ProcedureResult { + let ProcedureDataOwned { + state, + persistent_ctx, + } = serde_json::from_str(json).context(FromJsonSnafu)?; + let context = ReconcileDatabaseContext::new(ctx, persistent_ctx); + Ok(Self { context, state }) + } +} + +#[derive(Debug, Serialize)] +struct ProcedureData<'a> { + state: &'a dyn State, + persistent_ctx: &'a PersistentContext, +} + +#[derive(Debug, Deserialize)] +struct ProcedureDataOwned { + state: Box, + persistent_ctx: PersistentContext, +} + +#[async_trait] +impl Procedure for ReconcileDatabaseProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + let state = &mut self.state; + + let procedure_name = Self::TYPE_NAME; + let step = state.name(); + let _timer = metrics::METRIC_META_RECONCILIATION_PROCEDURE + .with_label_values(&[procedure_name, step]) + .start_timer(); + match state.next(&mut self.context, _ctx).await { + Ok((next, status)) => { + *state = next; + Ok(status) + } + Err(e) => { + if e.is_retry_later() { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_RETRYABLE]) + .inc(); + Err(ProcedureError::retry_later(e)) + } else { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_EXTERNAL]) + .inc(); + Err(ProcedureError::external(e)) + } + } + } + } + + fn dump(&self) -> ProcedureResult { + let data = ProcedureData { + state: self.state.as_ref(), + persistent_ctx: &self.context.persistent_ctx, + }; + serde_json::to_string(&data).context(ToJsonSnafu) + } + + fn lock_key(&self) -> LockKey { + let catalog = &self.context.persistent_ctx.catalog; + let schema = &self.context.persistent_ctx.schema; + // If the procedure is a subprocedure, only lock the schema. 
+ if self.context.persistent_ctx.is_subprocedure { + return LockKey::new(vec![SchemaLock::write(catalog, schema).into()]); + } + + LockKey::new(vec![ + CatalogLock::Read(catalog).into(), + SchemaLock::write(catalog, schema).into(), + ]) + } +} + +#[async_trait::async_trait] +#[typetag::serde(tag = "reconcile_database_state")] +pub(crate) trait State: Sync + Send + Debug { + fn name(&self) -> &'static str { + let type_name = std::any::type_name::(); + // short name + type_name.split("::").last().unwrap_or(type_name) + } + + async fn next( + &mut self, + ctx: &mut ReconcileDatabaseContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)>; + + fn as_any(&self) -> &dyn Any; +} diff --git a/src/common/meta/src/reconciliation/reconcile_database/end.rs b/src/common/meta/src/reconciliation/reconcile_database/end.rs new file mode 100644 index 0000000000..cbbd7bdbac --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_database/end.rs @@ -0,0 +1,49 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; + +use crate::error::Result; +use crate::reconciliation::reconcile_database::{ReconcileDatabaseContext, State}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileDatabaseEnd; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileDatabaseEnd { + async fn next( + &mut self, + ctx: &mut ReconcileDatabaseContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + info!( + "Database reconciliation completed. schema: {}, catalog: {}, procedure_id: {}, metrics: {}, elapsed: {:?}", + ctx.persistent_ctx.schema, + ctx.persistent_ctx.catalog, + procedure_ctx.procedure_id, + ctx.metrics(), + ctx.volatile_ctx.start_time.elapsed(), + ); + Ok((Box::new(ReconcileDatabaseEnd), Status::done())) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_database/reconcile_logical_tables.rs b/src/common/meta/src/reconciliation/reconcile_database/reconcile_logical_tables.rs new file mode 100644 index 0000000000..2bf0457aeb --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_database/reconcile_logical_tables.rs @@ -0,0 +1,248 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
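That lock choice is what lets a catalog reconciliation fan databases out concurrently: a standalone run takes a catalog read lock plus the schema write lock, while a subprocedure takes only the schema write lock because its parent already holds the catalog lock. The two shapes side by side, as an illustrative helper:

use common_procedure::LockKey;

use crate::lock_key::{CatalogLock, SchemaLock};

// Illustrative helper showing the two lock-key shapes used by the
// reconcile database procedure.
fn database_lock_key(catalog: &str, schema: &str, is_subprocedure: bool) -> LockKey {
    if is_subprocedure {
        // The parent (reconcile catalog) already holds the catalog lock.
        LockKey::new(vec![SchemaLock::write(catalog, schema).into()])
    } else {
        LockKey::new(vec![
            CatalogLock::Read(catalog).into(),
            SchemaLock::write(catalog, schema).into(),
        ])
    }
}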
+ +use std::any::Any; +use std::collections::HashMap; + +use common_procedure::{Context as ProcedureContext, ProcedureWithId, Status}; +use common_telemetry::info; +use futures::TryStreamExt; +use serde::{Deserialize, Serialize}; +use snafu::OptionExt; +use table::metadata::TableId; +use table::table_name::TableName; +use table::table_reference::TableReference; + +use crate::error::{Result, TableInfoNotFoundSnafu}; +use crate::key::table_route::TableRouteValue; +use crate::reconciliation::reconcile_database::end::ReconcileDatabaseEnd; +use crate::reconciliation::reconcile_database::{ReconcileDatabaseContext, State}; +use crate::reconciliation::reconcile_logical_tables::ReconcileLogicalTablesProcedure; +use crate::reconciliation::utils::{Context, SubprocedureMeta}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileLogicalTables; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileLogicalTables { + async fn next( + &mut self, + ctx: &mut ReconcileDatabaseContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + info!( + "Reconcile logical tables in database: {}, catalog: {}, inflight_subprocedures: {}", + ctx.persistent_ctx.schema, + ctx.persistent_ctx.catalog, + ctx.volatile_ctx.inflight_subprocedures.len() + ); + // Waits for inflight subprocedures first. + ctx.wait_for_inflight_subprocedures(procedure_ctx).await?; + + let catalog = &ctx.persistent_ctx.catalog; + let schema = &ctx.persistent_ctx.schema; + let parallelism = ctx.persistent_ctx.parallelism; + if ctx.volatile_ctx.tables.as_deref().is_none() { + let tables = ctx + .table_metadata_manager + .table_name_manager() + .tables(catalog, schema); + ctx.volatile_ctx.tables = Some(tables); + } + + let pending_logical_tables = &mut ctx.volatile_ctx.pending_logical_tables; + let mut pending_procedures = Vec::with_capacity(parallelism); + let context = Context { + node_manager: ctx.node_manager.clone(), + table_metadata_manager: ctx.table_metadata_manager.clone(), + cache_invalidator: ctx.cache_invalidator.clone(), + }; + // Safety: initialized above. + while let Some((table_name, table_name_value)) = + ctx.volatile_ctx.tables.as_mut().unwrap().try_next().await? + { + let table_id = table_name_value.table_id(); + let Some(table_route) = ctx + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get(table_id) + .await? + else { + continue; + }; + + let table_ref = TableReference::full(catalog, schema, &table_name); + Self::enqueue_logical_table(pending_logical_tables, table_id, table_ref, table_route); + // Try to build reconcile logical tables procedure. + if let Some(procedure) = Self::try_build_reconcile_logical_tables_procedure( + &context, + pending_logical_tables, + parallelism, + ) + .await? + { + pending_procedures.push(procedure); + } + // Schedule reconcile logical tables procedures if the number of pending procedures + // is greater than or equal to parallelism. + if Self::should_schedule_reconcile_logical_tables(&pending_procedures, parallelism) { + return Self::schedule_reconcile_logical_tables(ctx, &mut pending_procedures); + } + } + + // Build remaining procedures. + Self::build_remaining_procedures( + &context, + pending_logical_tables, + &mut pending_procedures, + parallelism, + ) + .await?; + // If there are remaining procedures, schedule reconcile logical tables procedures. 
+ if !pending_procedures.is_empty() { + return Self::schedule_reconcile_logical_tables(ctx, &mut pending_procedures); + } + + ctx.volatile_ctx.tables.take(); + Ok((Box::new(ReconcileDatabaseEnd), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl ReconcileLogicalTables { + fn schedule_reconcile_logical_tables( + ctx: &mut ReconcileDatabaseContext, + buffer: &mut Vec<(ProcedureWithId, SubprocedureMeta)>, + ) -> Result<(Box, Status)> { + let buffer = std::mem::take(buffer); + let (procedures, meta): (Vec<_>, Vec<_>) = buffer.into_iter().unzip(); + + ctx.volatile_ctx.inflight_subprocedures.extend(meta); + Ok(( + Box::new(ReconcileLogicalTables), + Status::suspended(procedures, false), + )) + } + + fn should_schedule_reconcile_logical_tables( + buffer: &[(ProcedureWithId, SubprocedureMeta)], + parallelism: usize, + ) -> bool { + buffer.len() >= parallelism + } + + async fn try_build_reconcile_logical_tables_procedure( + ctx: &Context, + pending_logical_tables: &mut HashMap>, + parallelism: usize, + ) -> Result> { + let mut physical_table_id = None; + for (table_id, tables) in pending_logical_tables.iter() { + if tables.len() >= parallelism { + physical_table_id = Some(*table_id); + break; + } + } + + if let Some(physical_table_id) = physical_table_id { + // Safety: Checked above. + let tables = pending_logical_tables.remove(&physical_table_id).unwrap(); + return Ok(Some( + Self::build_reconcile_logical_tables_procedure(ctx, physical_table_id, tables) + .await?, + )); + } + + Ok(None) + } + + async fn build_remaining_procedures( + ctx: &Context, + pending_logical_tables: &mut HashMap>, + pending_procedures: &mut Vec<(ProcedureWithId, SubprocedureMeta)>, + parallelism: usize, + ) -> Result<()> { + if pending_logical_tables.is_empty() { + return Ok(()); + } + + while let Some(physical_table_id) = pending_logical_tables.keys().next().cloned() { + if pending_procedures.len() >= parallelism { + return Ok(()); + } + + // Safety: Checked above. + let tables = pending_logical_tables.remove(&physical_table_id).unwrap(); + pending_procedures.push( + Self::build_reconcile_logical_tables_procedure(ctx, physical_table_id, tables) + .await?, + ); + } + + Ok(()) + } + + async fn build_reconcile_logical_tables_procedure( + ctx: &Context, + physical_table_id: TableId, + logical_tables: Vec<(TableId, TableName)>, + ) -> Result<(ProcedureWithId, SubprocedureMeta)> { + let table_info = ctx + .table_metadata_manager + .table_info_manager() + .get(physical_table_id) + .await? 
+ .context(TableInfoNotFoundSnafu { + table: format!("table_id: {}", physical_table_id), + })?; + + let physical_table_name = table_info.table_name(); + let procedure = ReconcileLogicalTablesProcedure::new( + ctx.clone(), + physical_table_id, + physical_table_name.clone(), + logical_tables.clone(), + true, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + let subprocedure_meta = SubprocedureMeta::new_logical_table( + procedure_with_id.id, + physical_table_id, + physical_table_name, + logical_tables, + ); + Ok((procedure_with_id, subprocedure_meta)) + } + + fn enqueue_logical_table( + tables: &mut HashMap>, + table_id: TableId, + table_ref: TableReference<'_>, + table_route: TableRouteValue, + ) { + if !table_route.is_physical() { + let logical_table_route = table_route.into_logical_table_route(); + let physical_table_id = logical_table_route.physical_table_id(); + tables + .entry(physical_table_id) + .or_default() + .push((table_id, table_ref.into())); + } + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_database/reconcile_tables.rs b/src/common/meta/src/reconciliation/reconcile_database/reconcile_tables.rs new file mode 100644 index 0000000000..93f24fcf2f --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_database/reconcile_tables.rs @@ -0,0 +1,166 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, ProcedureWithId, Status}; +use common_telemetry::info; +use futures::TryStreamExt; +use serde::{Deserialize, Serialize}; +use store_api::storage::TableId; +use table::table_name::TableName; +use table::table_reference::TableReference; + +use crate::error::Result; +use crate::key::table_route::TableRouteValue; +use crate::reconciliation::reconcile_database::reconcile_logical_tables::ReconcileLogicalTables; +use crate::reconciliation::reconcile_database::{ReconcileDatabaseContext, State}; +use crate::reconciliation::reconcile_table::ReconcileTableProcedure; +use crate::reconciliation::utils::{Context, SubprocedureMeta}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileTables; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileTables { + async fn next( + &mut self, + ctx: &mut ReconcileDatabaseContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + info!( + "Reconcile tables in database: {}, catalog: {}, inflight_subprocedures: {}", + ctx.persistent_ctx.schema, + ctx.persistent_ctx.catalog, + ctx.volatile_ctx.inflight_subprocedures.len() + ); + // Waits for inflight subprocedures first. 
+ ctx.wait_for_inflight_subprocedures(procedure_ctx).await?; + + let catalog = &ctx.persistent_ctx.catalog; + let schema = &ctx.persistent_ctx.schema; + let parallelism = ctx.persistent_ctx.parallelism; + if ctx.volatile_ctx.tables.as_deref().is_none() { + let tables = ctx + .table_metadata_manager + .table_name_manager() + .tables(catalog, schema); + ctx.volatile_ctx.tables = Some(tables); + } + + let pending_tables = &mut ctx.volatile_ctx.pending_tables; + // Safety: must exists. + while let Some((table_name, table_name_value)) = + ctx.volatile_ctx.tables.as_mut().unwrap().try_next().await? + { + let table_id = table_name_value.table_id(); + let Some(table_route) = ctx + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get(table_id) + .await? + else { + continue; + }; + + let table_ref = TableReference::full(catalog, schema, &table_name); + // Enqueue table. + Self::enqueue_table(pending_tables, table_id, table_ref, table_route); + // Schedule reconcile table procedures if the number of pending procedures + // is greater than or equal to parallelism. + if Self::should_schedule_reconcile_tables(pending_tables, parallelism) { + return Self::schedule_reconcile_tables(ctx); + } + } + + // If there are remaining tables, schedule reconcile table procedures. + if !pending_tables.is_empty() { + return Self::schedule_reconcile_tables(ctx); + } + ctx.volatile_ctx.tables.take(); + Ok((Box::new(ReconcileLogicalTables), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl ReconcileTables { + fn schedule_reconcile_tables( + ctx: &mut ReconcileDatabaseContext, + ) -> Result<(Box, Status)> { + let tables = std::mem::take(&mut ctx.volatile_ctx.pending_tables); + let (procedures, meta): (Vec<_>, Vec<_>) = + Self::build_reconcile_table_procedures(ctx, tables) + .into_iter() + .unzip(); + ctx.volatile_ctx.inflight_subprocedures.extend(meta); + Ok(( + Box::new(ReconcileTables), + Status::suspended(procedures, false), + )) + } + + fn should_schedule_reconcile_tables( + pending_tables: &[(TableId, TableName)], + parallelism: usize, + ) -> bool { + pending_tables.len() >= parallelism + } + + fn build_reconcile_table_procedures( + ctx: &ReconcileDatabaseContext, + tables: Vec<(TableId, TableName)>, + ) -> Vec<(ProcedureWithId, SubprocedureMeta)> { + let mut procedures = Vec::with_capacity(tables.len()); + for (table_id, table_name) in tables { + let context = Context { + node_manager: ctx.node_manager.clone(), + table_metadata_manager: ctx.table_metadata_manager.clone(), + cache_invalidator: ctx.cache_invalidator.clone(), + }; + let procedure = ReconcileTableProcedure::new( + context, + table_id, + table_name.clone(), + ctx.persistent_ctx.resolve_strategy, + true, + ); + let procedure = ProcedureWithId::with_random_id(Box::new(procedure)); + let meta = + SubprocedureMeta::new_physical_table(procedure.id, table_id, table_name.clone()); + info!( + "Reconcile table: {}, table_id: {}, procedure_id: {}", + table_name, table_id, procedure.id + ); + procedures.push((procedure, meta)); + } + + procedures + } + + fn enqueue_table( + tables: &mut Vec<(TableId, TableName)>, + table_id: TableId, + table_ref: TableReference<'_>, + table_route: TableRouteValue, + ) { + if table_route.is_physical() { + tables.push((table_id, table_ref.into())); + } + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_database/start.rs b/src/common/meta/src/reconciliation/reconcile_database/start.rs new file mode 100644 index 0000000000..73fed9c0bb --- /dev/null +++ 
b/src/common/meta/src/reconciliation/reconcile_database/start.rs @@ -0,0 +1,63 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use crate::error::{self, Result}; +use crate::key::schema_name::SchemaNameKey; +use crate::reconciliation::reconcile_database::reconcile_tables::ReconcileTables; +use crate::reconciliation::reconcile_database::{ReconcileDatabaseContext, State}; + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ReconcileDatabaseStart; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileDatabaseStart { + async fn next( + &mut self, + ctx: &mut ReconcileDatabaseContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let exists = ctx + .table_metadata_manager + .schema_manager() + .exists(SchemaNameKey { + catalog: &ctx.persistent_ctx.catalog, + schema: &ctx.persistent_ctx.schema, + }) + .await?; + + ensure!( + exists, + error::SchemaNotFoundSnafu { + table_schema: &ctx.persistent_ctx.schema, + }, + ); + info!( + "Reconcile database: {}, catalog: {}, procedure_id: {}", + ctx.persistent_ctx.schema, ctx.persistent_ctx.catalog, procedure_ctx.procedure_id, + ); + Ok((Box::new(ReconcileTables), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables.rs new file mode 100644 index 0000000000..a067767c72 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables.rs @@ -0,0 +1,272 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub(crate) mod reconcile_regions; +pub(crate) mod reconciliation_end; +pub(crate) mod reconciliation_start; +pub(crate) mod resolve_table_metadatas; +pub(crate) mod update_table_infos; + +use std::any::Any; +use std::fmt::Debug; + +use async_trait::async_trait; +use common_procedure::error::{FromJsonSnafu, ToJsonSnafu}; +use common_procedure::{ + Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, + Result as ProcedureResult, Status, +}; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; +use store_api::metadata::ColumnMetadata; +use store_api::storage::TableId; +use table::metadata::RawTableInfo; +use table::table_name::TableName; + +use crate::cache_invalidator::CacheInvalidatorRef; +use crate::error::Result; +use crate::key::table_info::TableInfoValue; +use crate::key::table_route::PhysicalTableRouteValue; +use crate::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; +use crate::lock_key::{CatalogLock, SchemaLock, TableLock}; +use crate::metrics; +use crate::node_manager::NodeManagerRef; +use crate::reconciliation::reconcile_logical_tables::reconciliation_start::ReconciliationStart; +use crate::reconciliation::utils::{Context, ReconcileLogicalTableMetrics}; + +pub struct ReconcileLogicalTablesContext { + pub node_manager: NodeManagerRef, + pub table_metadata_manager: TableMetadataManagerRef, + pub cache_invalidator: CacheInvalidatorRef, + pub persistent_ctx: PersistentContext, + pub volatile_ctx: VolatileContext, +} + +impl ReconcileLogicalTablesContext { + /// Creates a new [`ReconcileLogicalTablesContext`] with the given [`Context`] and [`PersistentContext`]. + pub fn new(ctx: Context, persistent_ctx: PersistentContext) -> Self { + Self { + node_manager: ctx.node_manager, + table_metadata_manager: ctx.table_metadata_manager, + cache_invalidator: ctx.cache_invalidator, + persistent_ctx, + volatile_ctx: VolatileContext::default(), + } + } + + /// Returns the physical table name. + pub(crate) fn table_name(&self) -> &TableName { + &self.persistent_ctx.table_name + } + + /// Returns the physical table id. + pub(crate) fn table_id(&self) -> TableId { + self.persistent_ctx.table_id + } + + /// Returns a mutable reference to the metrics. + pub(crate) fn mut_metrics(&mut self) -> &mut ReconcileLogicalTableMetrics { + &mut self.volatile_ctx.metrics + } + + /// Returns a reference to the metrics. + pub(crate) fn metrics(&self) -> &ReconcileLogicalTableMetrics { + &self.volatile_ctx.metrics + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct PersistentContext { + pub(crate) table_id: TableId, + pub(crate) table_name: TableName, + // The logical tables that need to be reconciled. + // The logical tables belong to the physical table. + pub(crate) logical_tables: Vec<TableName>, + // The logical table ids. + // The value will be set in `ReconciliationStart` state. + pub(crate) logical_table_ids: Vec<TableId>, + /// The table info value. + /// The value will be set in `ReconciliationStart` state. + pub(crate) table_info_value: Option<DeserializedValueWithBytes<TableInfoValue>>, + // The physical table route. + // The value will be set in `ReconciliationStart` state. + pub(crate) physical_table_route: Option<PhysicalTableRouteValue>, + // The table infos to be updated. + // The value will be set in `ResolveTableMetadatas` state. + pub(crate) update_table_infos: Vec<(TableId, Vec<ColumnMetadata>)>, + // The table infos to be created. + // The value will be set in `ResolveTableMetadatas` state. + pub(crate) create_tables: Vec<(TableId, RawTableInfo)>, + // Whether the procedure is a subprocedure.
+ pub(crate) is_subprocedure: bool, +} + +impl PersistentContext { + pub(crate) fn new( + table_id: TableId, + table_name: TableName, + logical_tables: Vec<(TableId, TableName)>, + is_subprocedure: bool, + ) -> Self { + let (logical_table_ids, logical_tables) = logical_tables.into_iter().unzip(); + + Self { + table_id, + table_name, + logical_tables, + logical_table_ids, + table_info_value: None, + physical_table_route: None, + update_table_infos: vec![], + create_tables: vec![], + is_subprocedure, + } + } +} + +#[derive(Default)] +pub(crate) struct VolatileContext { + pub(crate) metrics: ReconcileLogicalTableMetrics, +} + +pub struct ReconcileLogicalTablesProcedure { + pub context: ReconcileLogicalTablesContext, + state: Box<dyn State>, +} + +#[derive(Debug, Serialize)] +struct ProcedureData<'a> { + state: &'a dyn State, + persistent_ctx: &'a PersistentContext, +} + +#[derive(Debug, Deserialize)] +struct ProcedureDataOwned { + state: Box<dyn State>, + persistent_ctx: PersistentContext, +} + +impl ReconcileLogicalTablesProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::ReconcileLogicalTables"; + + pub fn new( + ctx: Context, + table_id: TableId, + table_name: TableName, + logical_tables: Vec<(TableId, TableName)>, + is_subprocedure: bool, + ) -> Self { + let persistent_ctx = + PersistentContext::new(table_id, table_name, logical_tables, is_subprocedure); + let context = ReconcileLogicalTablesContext::new(ctx, persistent_ctx); + let state = Box::new(ReconciliationStart); + Self { context, state } + } + + pub(crate) fn from_json(ctx: Context, json: &str) -> ProcedureResult<Self> { + let ProcedureDataOwned { + state, + persistent_ctx, + } = serde_json::from_str(json).context(FromJsonSnafu)?; + let context = ReconcileLogicalTablesContext::new(ctx, persistent_ctx); + Ok(Self { context, state }) + } +} + +#[async_trait] +impl Procedure for ReconcileLogicalTablesProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> { + let state = &mut self.state; + + let procedure_name = Self::TYPE_NAME; + let step = state.name(); + let _timer = metrics::METRIC_META_RECONCILIATION_PROCEDURE + .with_label_values(&[procedure_name, step]) + .start_timer(); + match state.next(&mut self.context, _ctx).await { + Ok((next, status)) => { + *state = next; + Ok(status) + } + Err(e) => { + if e.is_retry_later() { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_RETRYABLE]) + .inc(); + Err(ProcedureError::retry_later(e)) + } else { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_EXTERNAL]) + .inc(); + Err(ProcedureError::external(e)) + } + } + } + } + + fn dump(&self) -> ProcedureResult<String> { + let data = ProcedureData { + state: self.state.as_ref(), + persistent_ctx: &self.context.persistent_ctx, + }; + serde_json::to_string(&data).context(ToJsonSnafu) + } + + fn lock_key(&self) -> LockKey { + let table_ref = &self.context.table_name().table_ref(); + + let mut table_ids = self + .context + .persistent_ctx + .logical_table_ids + .iter() + .map(|t| TableLock::Write(*t).into()) + .collect::<Vec<_>>(); + table_ids.sort_unstable(); + table_ids.push(TableLock::Read(self.context.table_id()).into()); + if self.context.persistent_ctx.is_subprocedure { + // The catalog and schema are already locked by the parent procedure. + // Only take the table locks.
+ return LockKey::new(table_ids); + } + let mut keys = vec![ + CatalogLock::Read(table_ref.catalog).into(), + SchemaLock::read(table_ref.catalog, table_ref.schema).into(), + ]; + keys.extend(table_ids); + LockKey::new(keys) + } +} + +#[async_trait::async_trait] +#[typetag::serde(tag = "reconcile_logical_tables_state")] +pub(crate) trait State: Sync + Send + Debug { + fn name(&self) -> &'static str { + let type_name = std::any::type_name::(); + // short name + type_name.split("::").last().unwrap_or(type_name) + } + + async fn next( + &mut self, + ctx: &mut ReconcileLogicalTablesContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)>; + + fn as_any(&self) -> &dyn Any; +} diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables/reconcile_regions.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables/reconcile_regions.rs new file mode 100644 index 0000000000..98e8290425 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables/reconcile_regions.rs @@ -0,0 +1,146 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::HashMap; + +use api::v1::region::{region_request, CreateRequests, RegionRequest, RegionRequestHeader}; +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use common_telemetry::tracing_context::TracingContext; +use futures::future; +use serde::{Deserialize, Serialize}; +use store_api::storage::{RegionId, TableId}; +use table::metadata::RawTableInfo; + +use crate::ddl::utils::{add_peer_context_if_needed, region_storage_path}; +use crate::ddl::{build_template_from_raw_table_info, CreateRequestBuilder}; +use crate::error::Result; +use crate::reconciliation::reconcile_logical_tables::update_table_infos::UpdateTableInfos; +use crate::reconciliation::reconcile_logical_tables::{ReconcileLogicalTablesContext, State}; +use crate::rpc::router::{find_leaders, region_distribution}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct ReconcileRegions; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconcileRegions { + async fn next( + &mut self, + ctx: &mut ReconcileLogicalTablesContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + if ctx.persistent_ctx.create_tables.is_empty() { + return Ok((Box::new(UpdateTableInfos), Status::executing(false))); + } + + // Safety: previous steps ensure the physical table route is set. + let region_routes = &ctx + .persistent_ctx + .physical_table_route + .as_ref() + .unwrap() + .region_routes; + + let region_distribution = region_distribution(region_routes); + let leaders = find_leaders(region_routes) + .into_iter() + .map(|p| (p.id, p)) + .collect::>(); + let mut create_table_tasks = Vec::with_capacity(leaders.len()); + for (datanode_id, region_role_set) in region_distribution { + if region_role_set.leader_regions.is_empty() { + continue; + } + // Safety: It contains all leaders in the region routes. 
+ let peer = leaders.get(&datanode_id).unwrap().clone(); + let request = self.make_request(&region_role_set.leader_regions, ctx)?; + let requester = ctx.node_manager.datanode(&peer).await; + create_table_tasks.push(async move { + requester + .handle(request) + .await + .map_err(add_peer_context_if_needed(peer)) + }); + } + + future::join_all(create_table_tasks) + .await + .into_iter() + .collect::<Result<Vec<_>>>()?; + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + info!( + "Reconciled regions for logical tables: {:?}, physical table: {}, table_id: {}", + ctx.persistent_ctx + .create_tables + .iter() + .map(|(table_id, _)| table_id) + .collect::<Vec<_>>(), + table_name, + table_id + ); + ctx.persistent_ctx.create_tables.clear(); + return Ok((Box::new(UpdateTableInfos), Status::executing(true))); + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl ReconcileRegions { + fn make_request( + &self, + region_numbers: &[u32], + ctx: &ReconcileLogicalTablesContext, + ) -> Result<RegionRequest> { + let physical_table_id = ctx.table_id(); + let table_name = ctx.table_name(); + let create_tables = &ctx.persistent_ctx.create_tables; + + let mut requests = Vec::with_capacity(region_numbers.len() * create_tables.len()); + for (table_id, table_info) in create_tables { + let request_builder = + create_region_request_from_raw_table_info(table_info, physical_table_id)?; + let storage_path = + region_storage_path(&table_name.catalog_name, &table_name.schema_name); + + for region_number in region_numbers { + let region_id = RegionId::new(*table_id, *region_number); + let one_region_request = + request_builder.build_one(region_id, storage_path.clone(), &HashMap::new()); + requests.push(one_region_request); + } + } + + Ok(RegionRequest { + header: Some(RegionRequestHeader { + tracing_context: TracingContext::from_current_span().to_w3c(), + ..Default::default() + }), + body: Some(region_request::Body::Creates(CreateRequests { requests })), + }) + } +} + +/// Creates a region request builder from a raw table info. +fn create_region_request_from_raw_table_info( + raw_table_info: &RawTableInfo, + physical_table_id: TableId, +) -> Result<CreateRequestBuilder> { + let template = build_template_from_raw_table_info(raw_table_info)?; + Ok(CreateRequestBuilder::new(template, Some(physical_table_id))) +} diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_end.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_end.rs new file mode 100644 index 0000000000..a3c074cbe6 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_end.rs @@ -0,0 +1,53 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; + +use crate::error::Result; +use crate::reconciliation::reconcile_logical_tables::{ReconcileLogicalTablesContext, State}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct ReconciliationEnd; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconciliationEnd { + async fn next( + &mut self, + ctx: &mut ReconcileLogicalTablesContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + let metrics = ctx.metrics(); + + info!( + "Logical tables reconciliation completed. logical tables: {:?}, physical_table_id: {}, table_name: {}, procedure_id: {}, metrics: {}", + ctx.persistent_ctx.logical_table_ids, + table_id, + table_name, + procedure_ctx.procedure_id, + metrics + ); + Ok((Box::new(ReconciliationEnd), Status::done())) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_start.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_start.rs new file mode 100644 index 0000000000..1649abdc07 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables/reconciliation_start.rs @@ -0,0 +1,192 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::{ensure, OptionExt}; +use store_api::storage::TableId; +use table::table_name::TableName; + +use crate::ddl::utils::region_metadata_lister::RegionMetadataLister; +use crate::ddl::utils::table_id::get_all_table_ids_by_names; +use crate::ddl::utils::table_info::all_logical_table_routes_have_same_physical_id; +use crate::error::{self, Result}; +use crate::metrics; +use crate::reconciliation::reconcile_logical_tables::resolve_table_metadatas::ResolveTableMetadatas; +use crate::reconciliation::reconcile_logical_tables::{ + ReconcileLogicalTablesContext, ReconcileLogicalTablesProcedure, State, +}; +use crate::reconciliation::utils::check_column_metadatas_consistent; + +/// The start state of the reconciliation procedure. 
+#[derive(Debug, Serialize, Deserialize)] +pub struct ReconciliationStart; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconciliationStart { + async fn next( + &mut self, + ctx: &mut ReconcileLogicalTablesContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + let (physical_table_id, physical_table_route) = ctx + .table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await?; + ensure!( + physical_table_id == table_id, + error::UnexpectedSnafu { + err_msg: format!( + "Expected physical table: {}, but it's a logical table of table: {}", + table_name, physical_table_id + ), + } + ); + + let region_metadata_lister = RegionMetadataLister::new(ctx.node_manager.clone()); + let region_metadatas = { + let _timer = metrics::METRIC_META_RECONCILIATION_LIST_REGION_METADATA_DURATION + .with_label_values(&[metrics::TABLE_TYPE_PHYSICAL]) + .start_timer(); + region_metadata_lister + .list(physical_table_id, &physical_table_route.region_routes) + .await? + }; + + ensure!(!region_metadatas.is_empty(), { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileLogicalTablesProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + metrics::STATS_TYPE_NO_REGION_METADATA, + ]) + .inc(); + + error::UnexpectedSnafu { + err_msg: format!( + "No region metadata found for physical table: {}, table_id: {}", + table_name, table_id + ), + } + }); + + ensure!(region_metadatas.iter().all(|r| r.is_some()), { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileLogicalTablesProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + metrics::STATS_TYPE_REGION_NOT_OPEN, + ]) + .inc(); + error::UnexpectedSnafu { + err_msg: format!( + "Some regions of the physical table are not open. physical table: {}, table_id: {}", + table_name, table_id + ), + } + }); + + // Safety: checked above + let region_metadatas = region_metadatas + .into_iter() + .map(|r| r.unwrap()) + .collect::>(); + let _region_metadata = check_column_metadatas_consistent(®ion_metadatas).context( + error::UnexpectedSnafu { + err_msg: format!( + "Column metadatas are not consistent for physical table: {}, table_id: {}", + table_name, table_id + ), + }, + )?; + + // TODO(weny): ensure all columns in region metadata can be found in table info. + // Validates the logical tables. + Self::validate_schema(&ctx.persistent_ctx.logical_tables)?; + let table_refs = ctx + .persistent_ctx + .logical_tables + .iter() + .map(|t| t.table_ref()) + .collect::>(); + let table_ids = get_all_table_ids_by_names( + ctx.table_metadata_manager.table_name_manager(), + &table_refs, + ) + .await?; + Self::validate_logical_table_routes(ctx, &table_ids).await?; + + let table_name = ctx.table_name(); + info!( + "Starting reconciliation for logical tables: {:?}, physical_table_id: {}, table_name: {}, procedure_id: {}", + table_ids, table_id, table_name, procedure_ctx.procedure_id + ); + + ctx.persistent_ctx.physical_table_route = Some(physical_table_route); + ctx.persistent_ctx.logical_table_ids = table_ids; + Ok((Box::new(ResolveTableMetadatas), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl ReconciliationStart { + /// Validates all the logical tables have the same catalog and schema. 
+ fn validate_schema(logical_tables: &[TableName]) -> Result<()> { + let is_same_schema = logical_tables.windows(2).all(|pair| { + pair[0].catalog_name == pair[1].catalog_name + && pair[0].schema_name == pair[1].schema_name + }); + + ensure!( + is_same_schema, + error::UnexpectedSnafu { + err_msg: "The logical tables have different schemas", + } + ); + + Ok(()) + } + + async fn validate_logical_table_routes( + ctx: &mut ReconcileLogicalTablesContext, + table_ids: &[TableId], + ) -> Result<()> { + let all_logical_table_routes_have_same_physical_id = + all_logical_table_routes_have_same_physical_id( + ctx.table_metadata_manager.table_route_manager(), + table_ids, + ctx.table_id(), + ) + .await?; + + ensure!( + all_logical_table_routes_have_same_physical_id, + error::UnexpectedSnafu { + err_msg: "All the logical tables should have the same physical table id", + } + ); + + Ok(()) + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables/resolve_table_metadatas.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables/resolve_table_metadatas.rs new file mode 100644 index 0000000000..6b08ff81a6 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables/resolve_table_metadatas.rs @@ -0,0 +1,156 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::{info, warn}; +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use crate::ddl::utils::region_metadata_lister::RegionMetadataLister; +use crate::ddl::utils::table_info::get_all_table_info_values_by_table_ids; +use crate::error::{self, Result}; +use crate::metrics; +use crate::reconciliation::reconcile_logical_tables::reconcile_regions::ReconcileRegions; +use crate::reconciliation::reconcile_logical_tables::{ + ReconcileLogicalTablesContext, ReconcileLogicalTablesProcedure, State, +}; +use crate::reconciliation::utils::{ + check_column_metadatas_consistent, need_update_logical_table_info, +}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct ResolveTableMetadatas; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ResolveTableMetadatas { + async fn next( + &mut self, + ctx: &mut ReconcileLogicalTablesContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_names = ctx + .persistent_ctx + .logical_tables + .iter() + .map(|t| t.table_ref()) + .collect::>(); + let table_ids = &ctx.persistent_ctx.logical_table_ids; + + let mut create_tables = vec![]; + let mut update_table_infos = vec![]; + + let table_info_values = get_all_table_info_values_by_table_ids( + ctx.table_metadata_manager.table_info_manager(), + table_ids, + &table_names, + ) + .await?; + + // Safety: The physical table route is set in `ReconciliationStart` state. 
+ let region_routes = &ctx + .persistent_ctx + .physical_table_route + .as_ref() + .unwrap() + .region_routes; + let region_metadata_lister = RegionMetadataLister::new(ctx.node_manager.clone()); + let mut metadata_consistent_count = 0; + let mut metadata_inconsistent_count = 0; + let mut create_tables_count = 0; + for (table_id, table_info_value) in table_ids.iter().zip(table_info_values.iter()) { + let region_metadatas = { + let _timer = metrics::METRIC_META_RECONCILIATION_LIST_REGION_METADATA_DURATION + .with_label_values(&[metrics::TABLE_TYPE_LOGICAL]) + .start_timer(); + region_metadata_lister + .list(*table_id, region_routes) + .await? + }; + + ensure!(!region_metadatas.is_empty(), { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileLogicalTablesProcedure::TYPE_NAME, + metrics::TABLE_TYPE_LOGICAL, + metrics::STATS_TYPE_NO_REGION_METADATA, + ]) + .inc(); + + error::UnexpectedSnafu { + err_msg: format!( + "No region metadata found for table: {}, table_id: {}", + table_info_value.table_info.name, table_id + ), + } + }); + + if region_metadatas.iter().any(|r| r.is_none()) { + create_tables_count += 1; + create_tables.push((*table_id, table_info_value.table_info.clone())); + continue; + } + + // Safety: checked above that all the region metadatas are present. + let region_metadatas = region_metadatas + .into_iter() + .map(|r| r.unwrap()) + .collect::<Vec<_>>(); + if let Some(column_metadatas) = check_column_metadatas_consistent(&region_metadatas) { + metadata_consistent_count += 1; + if need_update_logical_table_info(&table_info_value.table_info, &column_metadatas) { + update_table_infos.push((*table_id, column_metadatas)); + } + } else { + metadata_inconsistent_count += 1; + // Inconsistent column metadatas across the logical regions don't affect reads or writes, + // so it's safe to continue. + warn!( + "Found inconsistent column metadatas for table: {}, table_id: {}. Keeping the inconsistent column metadatas as-is", + table_info_value.table_info.name, table_id + ); + } + } + + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + info!( + "Resolving table metadatas for physical table: {}, table_id: {}, updating table infos: {:?}, creating tables: {:?}", + table_name, + table_id, + update_table_infos + .iter() + .map(|(table_id, _)| *table_id) + .collect::<Vec<_>>(), + create_tables + .iter() + .map(|(table_id, _)| *table_id) + .collect::<Vec<_>>(), + ); + ctx.persistent_ctx.update_table_infos = update_table_infos; + ctx.persistent_ctx.create_tables = create_tables; + // Update metrics. + let metrics = ctx.mut_metrics(); + metrics.column_metadata_consistent_count = metadata_consistent_count; + metrics.column_metadata_inconsistent_count = metadata_inconsistent_count; + metrics.create_tables_count = create_tables_count; + Ok((Box::new(ReconcileRegions), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs b/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs new file mode 100644 index 0000000000..e82d210573 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_logical_tables/update_table_infos.rs @@ -0,0 +1,182 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::HashMap; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use store_api::metadata::ColumnMetadata; +use store_api::storage::TableId; +use table::metadata::RawTableInfo; +use table::table_name::TableName; +use table::table_reference::TableReference; + +use crate::cache_invalidator::Context as CacheContext; +use crate::ddl::utils::table_info::{ + batch_update_table_info_values, get_all_table_info_values_by_table_ids, +}; +use crate::error::Result; +use crate::instruction::CacheIdent; +use crate::reconciliation::reconcile_logical_tables::reconciliation_end::ReconciliationEnd; +use crate::reconciliation::reconcile_logical_tables::{ReconcileLogicalTablesContext, State}; +use crate::reconciliation::utils::build_table_meta_from_column_metadatas; + +#[derive(Debug, Serialize, Deserialize)] +pub struct UpdateTableInfos; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for UpdateTableInfos { + async fn next( + &mut self, + ctx: &mut ReconcileLogicalTablesContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + if ctx.persistent_ctx.update_table_infos.is_empty() { + return Ok((Box::new(ReconciliationEnd), Status::executing(false))); + } + + let all_table_names = ctx + .persistent_ctx + .logical_table_ids + .iter() + .cloned() + .zip( + ctx.persistent_ctx + .logical_tables + .iter() + .map(|t| t.table_ref()), + ) + .collect::>(); + let table_ids = ctx + .persistent_ctx + .update_table_infos + .iter() + .map(|(table_id, _)| *table_id) + .collect::>(); + let table_names = table_ids + .iter() + .map(|table_id| *all_table_names.get(table_id).unwrap()) + .collect::>(); + let table_info_values = get_all_table_info_values_by_table_ids( + ctx.table_metadata_manager.table_info_manager(), + &table_ids, + &table_names, + ) + .await?; + + let mut table_info_values_to_update = + Vec::with_capacity(ctx.persistent_ctx.update_table_infos.len()); + for ((table_id, column_metadatas), table_info_value) in ctx + .persistent_ctx + .update_table_infos + .iter() + .zip(table_info_values.into_iter()) + { + let new_table_info = Self::build_new_table_info( + *table_id, + column_metadatas, + &table_info_value.table_info, + )?; + table_info_values_to_update.push((table_info_value, new_table_info)); + } + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + + let updated_table_info_num = table_info_values_to_update.len(); + batch_update_table_info_values(&ctx.table_metadata_manager, table_info_values_to_update) + .await?; + + info!( + "Updated table infos for logical tables: {:?}, physical table: {}, table_id: {}", + ctx.persistent_ctx + .update_table_infos + .iter() + .map(|(table_id, _)| table_id) + .collect::>(), + table_id, + table_name, + ); + + let cache_ctx = CacheContext { + subject: Some(format!( + "Invalidate table by reconcile logical tables, physical_table_id: {}", + table_id + )), + }; + let idents = Self::build_cache_ident_keys(table_id, table_name, &table_ids, &table_names); + ctx.cache_invalidator + 
.invalidate(&cache_ctx, &idents) + .await?; + + ctx.persistent_ctx.update_table_infos.clear(); + // Update metrics. + let metrics = ctx.mut_metrics(); + metrics.update_table_info_count = updated_table_info_num; + Ok((Box::new(ReconciliationEnd), Status::executing(false))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl UpdateTableInfos { + fn build_new_table_info( + table_id: TableId, + column_metadatas: &[ColumnMetadata], + table_info: &RawTableInfo, + ) -> Result { + let table_ref = table_info.table_ref(); + let table_meta = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_info.meta, + None, + column_metadatas, + )?; + + let mut new_table_info = table_info.clone(); + new_table_info.meta = table_meta; + new_table_info.ident.version = table_info.ident.version + 1; + new_table_info.sort_columns(); + + Ok(new_table_info) + } + + fn build_cache_ident_keys( + physical_table_id: TableId, + physical_table_name: &TableName, + table_ids: &[TableId], + table_names: &[TableReference], + ) -> Vec { + let mut cache_keys = Vec::with_capacity(table_ids.len() * 2 + 2); + cache_keys.push(CacheIdent::TableId(physical_table_id)); + cache_keys.push(CacheIdent::TableName(physical_table_name.clone())); + cache_keys.extend( + table_ids + .iter() + .map(|table_id| CacheIdent::TableId(*table_id)), + ); + cache_keys.extend( + table_names + .iter() + .map(|table_ref| CacheIdent::TableName((*table_ref).into())), + ); + + cache_keys + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_table.rs b/src/common/meta/src/reconciliation/reconcile_table.rs new file mode 100644 index 0000000000..c1ca21b971 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_table.rs @@ -0,0 +1,280 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub(crate) mod reconcile_regions; +pub(crate) mod reconciliation_end; +pub(crate) mod reconciliation_start; +pub(crate) mod resolve_column_metadata; +pub(crate) mod update_table_info; + +use std::any::Any; +use std::fmt::Debug; + +use common_procedure::error::{FromJsonSnafu, ToJsonSnafu}; +use common_procedure::{ + Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure, + Result as ProcedureResult, Status, +}; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; +use store_api::metadata::ColumnMetadata; +use store_api::storage::TableId; +use table::metadata::RawTableMeta; +use table::table_name::TableName; +use tonic::async_trait; + +use crate::cache_invalidator::CacheInvalidatorRef; +use crate::error::Result; +use crate::key::table_info::TableInfoValue; +use crate::key::table_route::PhysicalTableRouteValue; +use crate::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; +use crate::lock_key::{CatalogLock, SchemaLock, TableNameLock}; +use crate::metrics; +use crate::node_manager::NodeManagerRef; +use crate::reconciliation::reconcile_table::reconciliation_start::ReconciliationStart; +use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveStrategy; +use crate::reconciliation::utils::{ + build_table_meta_from_column_metadatas, Context, ReconcileTableMetrics, +}; + +pub struct ReconcileTableContext { + pub node_manager: NodeManagerRef, + pub table_metadata_manager: TableMetadataManagerRef, + pub cache_invalidator: CacheInvalidatorRef, + pub persistent_ctx: PersistentContext, + pub volatile_ctx: VolatileContext, +} + +impl ReconcileTableContext { + /// Creates a new [`ReconcileTableContext`] with the given [`Context`] and [`PersistentContext`]. + pub fn new(ctx: Context, persistent_ctx: PersistentContext) -> Self { + Self { + node_manager: ctx.node_manager, + table_metadata_manager: ctx.table_metadata_manager, + cache_invalidator: ctx.cache_invalidator, + persistent_ctx, + volatile_ctx: VolatileContext::default(), + } + } + + /// Returns the physical table name. + pub(crate) fn table_name(&self) -> &TableName { + &self.persistent_ctx.table_name + } + + /// Returns the physical table id. + pub(crate) fn table_id(&self) -> TableId { + self.persistent_ctx.table_id + } + + /// Builds a [`RawTableMeta`] from the provided [`ColumnMetadata`]s. + pub(crate) fn build_table_meta( + &self, + column_metadatas: &[ColumnMetadata], + ) -> Result { + // Safety: The table info value is set in `ReconciliationStart` state. + let table_info_value = self.persistent_ctx.table_info_value.as_ref().unwrap(); + let table_id = self.table_id(); + let table_ref = self.table_name().table_ref(); + let name_to_ids = table_info_value.table_info.name_to_ids(); + let table_meta = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_info_value.table_info.meta, + name_to_ids, + column_metadatas, + )?; + + Ok(table_meta) + } + + /// Returns a mutable reference to the metrics. + pub(crate) fn mut_metrics(&mut self) -> &mut ReconcileTableMetrics { + &mut self.volatile_ctx.metrics + } + + /// Returns a reference to the metrics. + pub(crate) fn metrics(&self) -> &ReconcileTableMetrics { + &self.volatile_ctx.metrics + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct PersistentContext { + pub(crate) table_id: TableId, + pub(crate) table_name: TableName, + pub(crate) resolve_strategy: ResolveStrategy, + /// The table info value. + /// The value will be set in `ReconciliationStart` state. 
+ pub(crate) table_info_value: Option>, + // The physical table route. + // The value will be set in `ReconciliationStart` state. + pub(crate) physical_table_route: Option, + // Whether the procedure is a subprocedure. + pub(crate) is_subprocedure: bool, +} + +impl PersistentContext { + pub(crate) fn new( + table_id: TableId, + table_name: TableName, + resolve_strategy: ResolveStrategy, + is_subprocedure: bool, + ) -> Self { + Self { + table_id, + table_name, + resolve_strategy, + table_info_value: None, + physical_table_route: None, + is_subprocedure, + } + } +} + +#[derive(Default)] +pub(crate) struct VolatileContext { + pub(crate) table_meta: Option, + pub(crate) metrics: ReconcileTableMetrics, +} + +pub struct ReconcileTableProcedure { + pub context: ReconcileTableContext, + state: Box, +} + +impl ReconcileTableProcedure { + /// Creates a new [`ReconcileTableProcedure`] with the given [`Context`] and [`PersistentContext`]. + pub fn new( + ctx: Context, + table_id: TableId, + table_name: TableName, + resolve_strategy: ResolveStrategy, + is_subprocedure: bool, + ) -> Self { + let persistent_ctx = + PersistentContext::new(table_id, table_name, resolve_strategy, is_subprocedure); + let context = ReconcileTableContext::new(ctx, persistent_ctx); + let state = Box::new(ReconciliationStart); + Self { context, state } + } +} + +impl ReconcileTableProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::ReconcileTable"; + + pub(crate) fn from_json(ctx: Context, json: &str) -> ProcedureResult { + let ProcedureDataOwned { + state, + persistent_ctx, + } = serde_json::from_str(json).context(FromJsonSnafu)?; + let context = ReconcileTableContext::new(ctx, persistent_ctx); + Ok(Self { context, state }) + } +} + +#[derive(Debug, Serialize)] +struct ProcedureData<'a> { + state: &'a dyn State, + persistent_ctx: &'a PersistentContext, +} + +#[derive(Debug, Deserialize)] +struct ProcedureDataOwned { + state: Box, + persistent_ctx: PersistentContext, +} + +#[async_trait] +impl Procedure for ReconcileTableProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + let state = &mut self.state; + + let procedure_name = Self::TYPE_NAME; + let step = state.name(); + let _timer = metrics::METRIC_META_RECONCILIATION_PROCEDURE + .with_label_values(&[procedure_name, step]) + .start_timer(); + match state.next(&mut self.context, _ctx).await { + Ok((next, status)) => { + *state = next; + Ok(status) + } + Err(e) => { + if e.is_retry_later() { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_RETRYABLE]) + .inc(); + Err(ProcedureError::retry_later(e)) + } else { + metrics::METRIC_META_RECONCILIATION_PROCEDURE_ERROR + .with_label_values(&[procedure_name, step, metrics::ERROR_TYPE_EXTERNAL]) + .inc(); + Err(ProcedureError::external(e)) + } + } + } + } + + fn dump(&self) -> ProcedureResult { + let data = ProcedureData { + state: self.state.as_ref(), + persistent_ctx: &self.context.persistent_ctx, + }; + serde_json::to_string(&data).context(ToJsonSnafu) + } + + fn lock_key(&self) -> LockKey { + let table_ref = &self.context.table_name().table_ref(); + + if self.context.persistent_ctx.is_subprocedure { + // The catalog and schema are already locked by the parent procedure. + // Only lock the table name. 
+ return LockKey::new(vec![TableNameLock::new( + table_ref.catalog, + table_ref.schema, + table_ref.table, + ) + .into()]); + } + + LockKey::new(vec![ + CatalogLock::Read(table_ref.catalog).into(), + SchemaLock::read(table_ref.catalog, table_ref.schema).into(), + TableNameLock::new(table_ref.catalog, table_ref.schema, table_ref.table).into(), + ]) + } +} + +#[async_trait::async_trait] +#[typetag::serde(tag = "reconcile_table_state")] +pub(crate) trait State: Sync + Send + Debug { + fn name(&self) -> &'static str { + let type_name = std::any::type_name::(); + // short name + type_name.split("::").last().unwrap_or(type_name) + } + + async fn next( + &mut self, + ctx: &mut ReconcileTableContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)>; + + fn as_any(&self) -> &dyn Any; +} diff --git a/src/common/meta/src/reconciliation/reconcile_table/reconcile_regions.rs b/src/common/meta/src/reconciliation/reconcile_table/reconcile_regions.rs new file mode 100644 index 0000000000..52c90d8a02 --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_table/reconcile_regions.rs @@ -0,0 +1,199 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::{HashMap, HashSet}; + +use api::v1::column_def::try_as_column_def; +use api::v1::region::region_request::Body; +use api::v1::region::{ + alter_request, AlterRequest, RegionColumnDef, RegionRequest, RegionRequestHeader, SyncColumns, +}; +use api::v1::{ColumnDef, SemanticType}; +use async_trait::async_trait; +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use common_telemetry::tracing_context::TracingContext; +use futures::future; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt}; +use store_api::metadata::ColumnMetadata; +use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY; +use store_api::storage::{ColumnId, RegionId}; + +use crate::ddl::utils::{add_peer_context_if_needed, extract_column_metadatas}; +use crate::error::{ConvertColumnDefSnafu, Result, UnexpectedSnafu}; +use crate::reconciliation::reconcile_table::reconciliation_end::ReconciliationEnd; +use crate::reconciliation::reconcile_table::update_table_info::UpdateTableInfo; +use crate::reconciliation::reconcile_table::{ReconcileTableContext, State}; +use crate::rpc::router::{find_leaders, region_distribution}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct ReconcileRegions { + column_metadatas: Vec, + region_ids: HashSet, +} + +impl ReconcileRegions { + pub fn new(column_metadatas: Vec, region_ids: Vec) -> Self { + Self { + column_metadatas, + region_ids: region_ids.into_iter().collect(), + } + } +} + +#[async_trait] +#[typetag::serde] +impl State for ReconcileRegions { + async fn next( + &mut self, + ctx: &mut ReconcileTableContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_meta = ctx.build_table_meta(&self.column_metadatas)?; + ctx.volatile_ctx.table_meta = 
Some(table_meta); + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + + let primary_keys = self + .column_metadatas + .iter() + .filter(|c| c.semantic_type == SemanticType::Tag) + .map(|c| c.column_schema.name.to_string()) + .collect::<HashSet<_>>(); + let column_defs = self + .column_metadatas + .iter() + .map(|c| { + let column_def = try_as_column_def( + &c.column_schema, + primary_keys.contains(&c.column_schema.name), + ) + .context(ConvertColumnDefSnafu { + column: &c.column_schema.name, + })?; + + Ok((c.column_id, column_def)) + }) + .collect::<Result<Vec<_>>>()?; + + // Sends sync column requests to the datanodes. + // Safety: The physical table route is set in `ReconciliationStart` state. + let region_routes = &ctx + .persistent_ctx + .physical_table_route + .as_ref() + .unwrap() + .region_routes; + let region_distribution = region_distribution(region_routes); + let leaders = find_leaders(region_routes) + .into_iter() + .map(|p| (p.id, p)) + .collect::<HashMap<_, _>>(); + let mut sync_column_tasks = Vec::with_capacity(self.region_ids.len()); + for (datanode_id, region_role_set) in region_distribution { + if region_role_set.leader_regions.is_empty() { + continue; + } + // Safety: It contains all leaders in the region routes. + let peer = leaders.get(&datanode_id).unwrap(); + for region_id in region_role_set.leader_regions { + let region_id = RegionId::new(ctx.persistent_ctx.table_id, region_id); + if self.region_ids.contains(&region_id) { + let requester = ctx.node_manager.datanode(peer).await; + let request = make_alter_region_request(region_id, &column_defs); + let peer = peer.clone(); + + sync_column_tasks.push(async move { + requester + .handle(request) + .await + .map_err(add_peer_context_if_needed(peer)) + }); + } + } + } + + let mut results = future::join_all(sync_column_tasks) + .await + .into_iter() + .collect::<Result<Vec<_>>>()?; + + // Ensures all the column metadatas are the same. + let column_metadatas = + extract_column_metadatas(&mut results, TABLE_COLUMN_METADATA_EXTENSION_KEY)?.context( + UnexpectedSnafu { + err_msg: format!( + "The table column metadata schemas from datanodes are not the same, table: {}, table_id: {}", + table_name, + table_id + ), + }, + )?; + + // Checks that all the column metadatas are consistent, and updates the table info if needed. + if column_metadatas != self.column_metadatas { + info!("Datanode column metadatas are not consistent with metasrv, updating metasrv's column metadatas, table: {}, table_id: {}", table_name, table_id); + // Safety: the table info value is fetched in the `ResolveColumnMetadata` state. + let table_info_value = ctx.persistent_ctx.table_info_value.clone().unwrap(); + return Ok(( + Box::new(UpdateTableInfo::new(table_info_value, column_metadatas)), + Status::executing(true), + )); + } + + Ok((Box::new(ReconciliationEnd), Status::executing(false))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Makes an alter region request to sync columns.
+fn make_alter_region_request( + region_id: RegionId, + column_defs: &[(ColumnId, ColumnDef)], +) -> RegionRequest { + let kind = alter_request::Kind::SyncColumns(to_region_sync_columns(column_defs)); + + let alter_request = AlterRequest { + region_id: region_id.as_u64(), + schema_version: 0, + kind: Some(kind), + }; + + RegionRequest { + header: Some(RegionRequestHeader { + tracing_context: TracingContext::from_current_span().to_w3c(), + ..Default::default() + }), + body: Some(Body::Alter(alter_request)), + } +} + +fn to_region_sync_columns(column_defs: &[(ColumnId, ColumnDef)]) -> SyncColumns { + let region_column_defs = column_defs + .iter() + .map(|(column_id, column_def)| RegionColumnDef { + column_id: *column_id, + column_def: Some(column_def.clone()), + }) + .collect::>(); + + SyncColumns { + column_defs: region_column_defs, + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_table/reconciliation_end.rs b/src/common/meta/src/reconciliation/reconcile_table/reconciliation_end.rs new file mode 100644 index 0000000000..9f28d7f4be --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_table/reconciliation_end.rs @@ -0,0 +1,53 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use tonic::async_trait; + +use crate::error::Result; +use crate::reconciliation::reconcile_table::{ReconcileTableContext, State}; + +/// The state of the reconciliation end. +/// This state is used to indicate that the reconciliation is done. +#[derive(Debug, Serialize, Deserialize)] +pub struct ReconciliationEnd; + +#[async_trait] +#[typetag::serde] +impl State for ReconciliationEnd { + async fn next( + &mut self, + ctx: &mut ReconcileTableContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + let metrics = ctx.metrics(); + + info!( + "Physical table reconciliation completed. table_name: {}, table_id: {}, procedure_id: {}, metrics: {}", + table_name, table_id, procedure_ctx.procedure_id, metrics + ); + + Ok((Box::new(ReconciliationEnd), Status::done())) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_table/reconciliation_start.rs b/src/common/meta/src/reconciliation/reconcile_table/reconciliation_start.rs new file mode 100644 index 0000000000..2fcc8ded0c --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_table/reconciliation_start.rs @@ -0,0 +1,134 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use crate::ddl::utils::region_metadata_lister::RegionMetadataLister; +use crate::error::{self, Result}; +use crate::metrics::{self}; +use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveColumnMetadata; +use crate::reconciliation::reconcile_table::{ + ReconcileTableContext, ReconcileTableProcedure, State, +}; + +/// The start state of the reconciliation procedure. +/// +/// This state is used to prepare the table for reconciliation. +/// It will: +/// 1. Check the table id and table name consistency. +/// 2. Ensures the table is a physical table. +/// 3. List the region metadatas for the physical table. +#[derive(Debug, Serialize, Deserialize)] +pub struct ReconciliationStart; + +#[async_trait::async_trait] +#[typetag::serde] +impl State for ReconciliationStart { + async fn next( + &mut self, + ctx: &mut ReconcileTableContext, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_id = ctx.table_id(); + let table_name = ctx.table_name(); + + let (physical_table_id, physical_table_route) = ctx + .table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await?; + ensure!( + physical_table_id == table_id, + error::UnexpectedSnafu { + err_msg: format!( + "Reconcile table only works for physical table, but got logical table: {}, table_id: {}", + table_name, table_id + ), + } + ); + + info!( + "Reconciling table: {}, table_id: {}, procedure_id: {}", + table_name, table_id, procedure_ctx.procedure_id + ); + // TODO(weny): Repairs the table route if needed. + let region_metadata_lister = RegionMetadataLister::new(ctx.node_manager.clone()); + + let region_metadatas = { + let _timer = metrics::METRIC_META_RECONCILIATION_LIST_REGION_METADATA_DURATION + .with_label_values(&[metrics::TABLE_TYPE_PHYSICAL]) + .start_timer(); + // Always list region metadatas for the physical table. + region_metadata_lister + .list(physical_table_id, &physical_table_route.region_routes) + .await? + }; + + ensure!(!region_metadatas.is_empty(), { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileTableProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + metrics::STATS_TYPE_NO_REGION_METADATA, + ]) + .inc(); + + error::UnexpectedSnafu { + err_msg: format!( + "No region metadata found for table: {}, table_id: {}", + table_name, table_id + ), + } + }); + + ensure!(region_metadatas.iter().all(|r| r.is_some()), { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileTableProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + metrics::STATS_TYPE_REGION_NOT_OPEN, + ]) + .inc(); + + error::UnexpectedSnafu { + err_msg: format!( + "Some regions are not opened, table: {}, table_id: {}", + table_name, table_id + ), + } + }); + + // Persist the physical table route. + // TODO(weny): refetch the physical table route if repair is needed. 
+ ctx.persistent_ctx.physical_table_route = Some(physical_table_route); + let region_metadatas = region_metadatas.into_iter().map(|r| r.unwrap()).collect(); + Ok(( + Box::new(ResolveColumnMetadata::new( + ctx.persistent_ctx.resolve_strategy, + region_metadatas, + )), + // We don't persist the state of this step. + Status::executing(false), + )) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_table/resolve_column_metadata.rs b/src/common/meta/src/reconciliation/reconcile_table/resolve_column_metadata.rs new file mode 100644 index 0000000000..97fd7d473b --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_table/resolve_column_metadata.rs @@ -0,0 +1,170 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use async_trait::async_trait; +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::OptionExt; +use store_api::metadata::RegionMetadata; +use strum::AsRefStr; + +use crate::error::{self, MissingColumnIdsSnafu, Result}; +use crate::reconciliation::reconcile_table::reconcile_regions::ReconcileRegions; +use crate::reconciliation::reconcile_table::update_table_info::UpdateTableInfo; +use crate::reconciliation::reconcile_table::{ReconcileTableContext, State}; +use crate::reconciliation::utils::{ + build_column_metadata_from_table_info, check_column_metadatas_consistent, + resolve_column_metadatas_with_latest, resolve_column_metadatas_with_metasrv, + ResolveColumnMetadataResult, +}; + +/// Strategy for resolving column metadata inconsistencies. +#[derive(Debug, Serialize, Deserialize, Clone, Copy, Default, AsRefStr)] +pub enum ResolveStrategy { + #[default] + /// Trusts the latest column metadata from datanode. + UseLatest, + + /// Always uses the column metadata from metasrv. + UseMetasrv, + + /// Aborts the resolution process if inconsistencies are detected. + AbortOnConflict, +} + +impl From for ResolveStrategy { + fn from(strategy: api::v1::meta::ResolveStrategy) -> Self { + match strategy { + api::v1::meta::ResolveStrategy::UseMetasrv => Self::UseMetasrv, + api::v1::meta::ResolveStrategy::UseLatest => Self::UseLatest, + api::v1::meta::ResolveStrategy::AbortOnConflict => Self::AbortOnConflict, + } + } +} + +/// State responsible for resolving inconsistencies in column metadata across physical regions. 
+#[derive(Debug, Serialize, Deserialize)] +pub struct ResolveColumnMetadata { + strategy: ResolveStrategy, + region_metadata: Vec, +} + +impl ResolveColumnMetadata { + pub fn new(strategy: ResolveStrategy, region_metadata: Vec) -> Self { + Self { + strategy, + region_metadata, + } + } +} + +#[async_trait] +#[typetag::serde] +impl State for ResolveColumnMetadata { + async fn next( + &mut self, + ctx: &mut ReconcileTableContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let table_id = ctx.persistent_ctx.table_id; + let table_name = &ctx.persistent_ctx.table_name; + + let table_info_value = ctx + .table_metadata_manager + .table_info_manager() + .get(table_id) + .await? + .with_context(|| error::TableNotFoundSnafu { + table_name: table_name.to_string(), + })?; + ctx.persistent_ctx.table_info_value = Some(table_info_value); + + if let Some(column_metadatas) = check_column_metadatas_consistent(&self.region_metadata) { + // Safety: fetched in the above. + let table_info_value = ctx.persistent_ctx.table_info_value.clone().unwrap(); + info!( + "Column metadatas are consistent for table: {}, table_id: {}.", + table_name, table_id + ); + + // Update metrics. + ctx.mut_metrics().resolve_column_metadata_result = + Some(ResolveColumnMetadataResult::Consistent); + return Ok(( + Box::new(UpdateTableInfo::new(table_info_value, column_metadatas)), + Status::executing(false), + )); + }; + + match self.strategy { + ResolveStrategy::UseMetasrv => { + let table_info_value = ctx.persistent_ctx.table_info_value.as_ref().unwrap(); + let name_to_ids = table_info_value + .table_info + .name_to_ids() + .context(MissingColumnIdsSnafu)?; + let column_metadata = build_column_metadata_from_table_info( + &table_info_value.table_info.meta.schema.column_schemas, + &table_info_value.table_info.meta.primary_key_indices, + &name_to_ids, + )?; + + let region_ids = + resolve_column_metadatas_with_metasrv(&column_metadata, &self.region_metadata)?; + + // Update metrics. + let metrics = ctx.mut_metrics(); + metrics.resolve_column_metadata_result = + Some(ResolveColumnMetadataResult::Inconsistent(self.strategy)); + Ok(( + Box::new(ReconcileRegions::new(column_metadata, region_ids)), + Status::executing(true), + )) + } + ResolveStrategy::UseLatest => { + let (column_metadatas, region_ids) = + resolve_column_metadatas_with_latest(&self.region_metadata)?; + + // Update metrics. + let metrics = ctx.mut_metrics(); + metrics.resolve_column_metadata_result = + Some(ResolveColumnMetadataResult::Inconsistent(self.strategy)); + Ok(( + Box::new(ReconcileRegions::new(column_metadatas, region_ids)), + Status::executing(true), + )) + } + ResolveStrategy::AbortOnConflict => { + let table_name = table_name.to_string(); + + // Update metrics. + let metrics = ctx.mut_metrics(); + metrics.resolve_column_metadata_result = + Some(ResolveColumnMetadataResult::Inconsistent(self.strategy)); + error::ColumnMetadataConflictsSnafu { + table_name, + table_id, + } + .fail() + } + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/reconcile_table/update_table_info.rs b/src/common/meta/src/reconciliation/reconcile_table/update_table_info.rs new file mode 100644 index 0000000000..16284a22ef --- /dev/null +++ b/src/common/meta/src/reconciliation/reconcile_table/update_table_info.rs @@ -0,0 +1,129 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use store_api::metadata::ColumnMetadata; +use tonic::async_trait; + +use crate::cache_invalidator::Context as CacheContext; +use crate::error::Result; +use crate::instruction::CacheIdent; +use crate::key::table_info::TableInfoValue; +use crate::key::DeserializedValueWithBytes; +use crate::reconciliation::reconcile_table::reconciliation_end::ReconciliationEnd; +use crate::reconciliation::reconcile_table::{ReconcileTableContext, State}; +use crate::rpc::router::region_distribution; + +/// Updates the table info with the new column metadatas. +#[derive(Debug, Serialize, Deserialize)] +pub struct UpdateTableInfo { + table_info_value: DeserializedValueWithBytes, + column_metadatas: Vec, +} + +impl UpdateTableInfo { + pub fn new( + table_info_value: DeserializedValueWithBytes, + column_metadatas: Vec, + ) -> Self { + Self { + table_info_value, + column_metadatas, + } + } +} + +#[async_trait] +#[typetag::serde] +impl State for UpdateTableInfo { + async fn next( + &mut self, + ctx: &mut ReconcileTableContext, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + let new_table_meta = match &ctx.volatile_ctx.table_meta { + Some(table_meta) => table_meta.clone(), + None => ctx.build_table_meta(&self.column_metadatas)?, + }; + + let region_routes = &ctx + .persistent_ctx + .physical_table_route + .as_ref() + .unwrap() + .region_routes; + let region_distribution = region_distribution(region_routes); + let current_table_info_value = ctx.persistent_ctx.table_info_value.as_ref().unwrap(); + let new_table_info = { + let mut new_table_info = current_table_info_value.table_info.clone(); + new_table_info.meta = new_table_meta; + new_table_info + }; + + if new_table_info.meta == current_table_info_value.table_info.meta { + info!( + "Table info is already up to date for table: {}, table_id: {}", + ctx.table_name(), + ctx.table_id() + ); + return Ok((Box::new(ReconciliationEnd), Status::executing(true))); + } + + info!( + "Updating table info for table: {}, table_id: {}. new table meta: {:?}, current table meta: {:?}", + ctx.table_name(), + ctx.table_id(), + new_table_info.meta, + current_table_info_value.table_info.meta, + ); + ctx.table_metadata_manager + .update_table_info( + current_table_info_value, + Some(region_distribution), + new_table_info, + ) + .await?; + + let table_ref = ctx.table_name().table_ref(); + let table_id = ctx.table_id(); + let cache_ctx = CacheContext { + subject: Some(format!( + "Invalidate table cache by reconciling table {}, table_id: {}", + table_ref, table_id, + )), + }; + ctx.cache_invalidator + .invalidate( + &cache_ctx, + &[ + CacheIdent::TableName(table_ref.into()), + CacheIdent::TableId(table_id), + ], + ) + .await?; + // Update metrics. 
+ let metrics = ctx.mut_metrics(); + metrics.update_table_info = true; + + Ok((Box::new(ReconciliationEnd), Status::executing(true))) + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/meta/src/reconciliation/utils.rs b/src/common/meta/src/reconciliation/utils.rs new file mode 100644 index 0000000000..42cbc6703c --- /dev/null +++ b/src/common/meta/src/reconciliation/utils.rs @@ -0,0 +1,1267 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::fmt::{self, Display}; +use std::ops::AddAssign; +use std::time::Instant; + +use api::v1::SemanticType; +use common_procedure::{watcher, Context as ProcedureContext, ProcedureId}; +use common_telemetry::{error, warn}; +use datatypes::schema::ColumnSchema; +use futures::future::{join_all, try_join_all}; +use snafu::{ensure, OptionExt, ResultExt}; +use store_api::metadata::{ColumnMetadata, RegionMetadata}; +use store_api::storage::{RegionId, TableId}; +use table::metadata::{RawTableInfo, RawTableMeta}; +use table::table_name::TableName; +use table::table_reference::TableReference; + +use crate::cache_invalidator::CacheInvalidatorRef; +use crate::error::{ + ColumnIdMismatchSnafu, ColumnNotFoundSnafu, MismatchColumnIdSnafu, + MissingColumnInColumnMetadataSnafu, ProcedureStateReceiverNotFoundSnafu, + ProcedureStateReceiverSnafu, Result, TimestampMismatchSnafu, UnexpectedSnafu, + WaitProcedureSnafu, +}; +use crate::key::TableMetadataManagerRef; +use crate::metrics; +use crate::node_manager::NodeManagerRef; +use crate::reconciliation::reconcile_logical_tables::ReconcileLogicalTablesProcedure; +use crate::reconciliation::reconcile_table::resolve_column_metadata::ResolveStrategy; +use crate::reconciliation::reconcile_table::ReconcileTableProcedure; + +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct PartialRegionMetadata<'a> { + pub(crate) column_metadatas: &'a [ColumnMetadata], + pub(crate) primary_key: &'a [u32], + pub(crate) table_id: TableId, +} + +impl<'a> From<&'a RegionMetadata> for PartialRegionMetadata<'a> { + fn from(region_metadata: &'a RegionMetadata) -> Self { + Self { + column_metadatas: ®ion_metadata.column_metadatas, + primary_key: ®ion_metadata.primary_key, + table_id: region_metadata.region_id.table_id(), + } + } +} + +/// Checks if the column metadatas are consistent. +/// +/// The column metadatas are consistent if: +/// - The column metadatas are the same. +/// - The primary key are the same. +/// - The table id of the region metadatas are the same. +/// +/// ## Panic +/// Panic if region_metadatas is empty. 
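The consistency check declared below leans on the usual pairwise argument: if every adjacent pair of region metadatas compares equal, they are all equal, and `windows(2)` yields nothing for fewer than two elements. A tiny standalone illustration of that property, and of why the empty case must be excluded before indexing element 0 (the panic documented above):

fn all_equal<T: PartialEq>(items: &[T]) -> bool {
    // Zero or one element: `windows(2)` is empty and `all` returns true.
    items.windows(2).all(|w| w[0] == w[1])
}

fn first_if_uniform<T: PartialEq + Clone>(items: &[T]) -> Option<T> {
    if !all_equal(items) {
        return None;
    }
    // On an empty slice this indexing would panic, which is exactly the
    // "Panic if region_metadatas is empty" caveat above.
    Some(items[0].clone())
}

fn main() {
    assert_eq!(first_if_uniform(&[1, 1, 1]), Some(1));
    assert_eq!(first_if_uniform(&[1, 2, 1]), None);
}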
+pub(crate) fn check_column_metadatas_consistent( + region_metadatas: &[RegionMetadata], +) -> Option> { + let is_column_metadata_consistent = region_metadatas + .windows(2) + .all(|w| PartialRegionMetadata::from(&w[0]) == PartialRegionMetadata::from(&w[1])); + + if !is_column_metadata_consistent { + return None; + } + + Some(region_metadatas[0].column_metadatas.clone()) +} + +/// Resolves column metadata inconsistencies among the given region metadatas +/// by using the column metadata from the metasrv as the source of truth. +/// +/// All region metadatas whose column metadata differs from the given `column_metadatas` +/// will be marked for reconciliation. +/// +/// Returns the region ids that need to be reconciled. +pub(crate) fn resolve_column_metadatas_with_metasrv( + column_metadatas: &[ColumnMetadata], + region_metadatas: &[RegionMetadata], +) -> Result> { + let is_same_table = region_metadatas + .windows(2) + .all(|w| w[0].region_id.table_id() == w[1].region_id.table_id()); + + ensure!( + is_same_table, + UnexpectedSnafu { + err_msg: "Region metadatas are not from the same table" + } + ); + + let mut regions_ids = vec![]; + for region_metadata in region_metadatas { + if region_metadata.column_metadatas != column_metadatas { + check_column_metadata_invariants(column_metadatas, ®ion_metadata.column_metadatas)?; + regions_ids.push(region_metadata.region_id); + } + } + Ok(regions_ids) +} + +/// Resolves column metadata inconsistencies among the given region metadatas +/// by selecting the column metadata with the highest schema version. +/// +/// This strategy assumes that at most two versions of column metadata may exist, +/// due to the poison mechanism, making the highest schema version a safe choice. +/// +/// Returns the resolved column metadata and the region ids that need to be reconciled. +pub(crate) fn resolve_column_metadatas_with_latest( + region_metadatas: &[RegionMetadata], +) -> Result<(Vec, Vec)> { + let is_same_table = region_metadatas + .windows(2) + .all(|w| w[0].region_id.table_id() == w[1].region_id.table_id()); + + ensure!( + is_same_table, + UnexpectedSnafu { + err_msg: "Region metadatas are not from the same table" + } + ); + + let latest_region_metadata = region_metadatas + .iter() + .max_by_key(|c| c.schema_version) + .context(UnexpectedSnafu { + err_msg: "All Region metadatas have the same schema version", + })?; + let latest_column_metadatas = PartialRegionMetadata::from(latest_region_metadata); + + let mut region_ids = vec![]; + for region_metadata in region_metadatas { + if PartialRegionMetadata::from(region_metadata) != latest_column_metadatas { + check_column_metadata_invariants( + &latest_region_metadata.column_metadatas, + ®ion_metadata.column_metadatas, + )?; + region_ids.push(region_metadata.region_id); + } + } + + // TODO(weny): verify the new column metadatas are acceptable for regions. + Ok((latest_region_metadata.column_metadatas.clone(), region_ids)) +} + +/// Constructs a vector of [`ColumnMetadata`] from the provided table information. +/// +/// This function maps each [`ColumnSchema`] to its corresponding [`ColumnMetadata`] by +/// determining the semantic type (Tag, Timestamp, or Field) and retrieving the column ID +/// from the `name_to_ids` mapping. +/// +/// Returns an error if any column name is missing in the mapping. 
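The builder declared below derives each column's semantic type from two facts already present in the table info: membership in the primary key and the time-index flag, and it fails if a name is absent from `name_to_ids`. A simplified, self-contained sketch of just the classification step (string names only; the real function also clones the column schema and resolves the column id):

#[derive(Debug, PartialEq)]
enum SemanticType {
    Tag,
    Timestamp,
    Field,
}

fn classify(name: &str, is_time_index: bool, primary_keys: &[&str]) -> SemanticType {
    // Same precedence as the function below: primary-key membership wins,
    // then the time index, everything else is a field.
    if primary_keys.contains(&name) {
        SemanticType::Tag
    } else if is_time_index {
        SemanticType::Timestamp
    } else {
        SemanticType::Field
    }
}

fn main() {
    let primary_keys = ["host"];
    assert_eq!(classify("host", false, &primary_keys), SemanticType::Tag);
    assert_eq!(classify("ts", true, &primary_keys), SemanticType::Timestamp);
    assert_eq!(classify("cpu", false, &primary_keys), SemanticType::Field);
}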
+pub(crate) fn build_column_metadata_from_table_info( + column_schemas: &[ColumnSchema], + primary_key_indexes: &[usize], + name_to_ids: &HashMap, +) -> Result> { + let primary_names = primary_key_indexes + .iter() + .map(|i| column_schemas[*i].name.as_str()) + .collect::>(); + + column_schemas + .iter() + .map(|column_schema| { + let column_id = *name_to_ids + .get(column_schema.name.as_str()) + .with_context(|| UnexpectedSnafu { + err_msg: format!( + "Column name {} not found in name_to_ids", + column_schema.name + ), + })?; + + let semantic_type = if primary_names.contains(&column_schema.name.as_str()) { + SemanticType::Tag + } else if column_schema.is_time_index() { + SemanticType::Timestamp + } else { + SemanticType::Field + }; + Ok(ColumnMetadata { + column_schema: column_schema.clone(), + semantic_type, + column_id, + }) + }) + .collect::>>() +} + +/// Checks whether the schema invariants hold between the existing and new column metadata. +/// +/// Invariants: +/// - Primary key (Tag) columns must exist in the new metadata, with identical name and ID. +/// - Timestamp column must remain exactly the same in name and ID. +pub(crate) fn check_column_metadata_invariants( + new_column_metadatas: &[ColumnMetadata], + column_metadatas: &[ColumnMetadata], +) -> Result<()> { + let new_primary_keys = new_column_metadatas + .iter() + .filter(|c| c.semantic_type == SemanticType::Tag) + .map(|c| (c.column_schema.name.as_str(), c.column_id)) + .collect::>(); + + let old_primary_keys = column_metadatas + .iter() + .filter(|c| c.semantic_type == SemanticType::Tag) + .map(|c| (c.column_schema.name.as_str(), c.column_id)); + + for (name, id) in old_primary_keys { + let column_id = new_primary_keys + .get(name) + .cloned() + .context(ColumnNotFoundSnafu { + column_name: name, + column_id: id, + })?; + + ensure!( + column_id == id, + ColumnIdMismatchSnafu { + column_name: name, + expected_column_id: id, + actual_column_id: column_id, + } + ); + } + + let new_ts_column = new_column_metadatas + .iter() + .find(|c| c.semantic_type == SemanticType::Timestamp) + .map(|c| (c.column_schema.name.as_str(), c.column_id)) + .context(UnexpectedSnafu { + err_msg: "Timestamp column not found in new column metadata", + })?; + + let old_ts_column = column_metadatas + .iter() + .find(|c| c.semantic_type == SemanticType::Timestamp) + .map(|c| (c.column_schema.name.as_str(), c.column_id)) + .context(UnexpectedSnafu { + err_msg: "Timestamp column not found in column metadata", + })?; + ensure!( + new_ts_column == old_ts_column, + TimestampMismatchSnafu { + expected_column_name: old_ts_column.0, + expected_column_id: old_ts_column.1, + actual_column_name: new_ts_column.0, + actual_column_id: new_ts_column.1, + } + ); + + Ok(()) +} + +/// Builds a [`RawTableMeta`] from the provided [`ColumnMetadata`]s. +/// +/// Returns an error if: +/// - Any column is missing in the `name_to_ids`(if `name_to_ids` is provided). +/// - The column id in table metadata is not the same as the column id in the column metadata.(if `name_to_ids` is provided) +/// - The table index is missing in the column metadata. +/// - The primary key or partition key columns are missing in the column metadata. 
+/// +/// TODO(weny): add tests +pub(crate) fn build_table_meta_from_column_metadatas( + table_id: TableId, + table_ref: TableReference, + table_meta: &RawTableMeta, + name_to_ids: Option>, + column_metadata: &[ColumnMetadata], +) -> Result { + let column_in_column_metadata = column_metadata + .iter() + .map(|c| (c.column_schema.name.as_str(), c)) + .collect::>(); + let primary_key_names = table_meta + .primary_key_indices + .iter() + .map(|i| table_meta.schema.column_schemas[*i].name.as_str()) + .collect::>(); + let partition_key_names = table_meta + .partition_key_indices + .iter() + .map(|i| table_meta.schema.column_schemas[*i].name.as_str()) + .collect::>(); + ensure!( + column_metadata + .iter() + .any(|c| c.semantic_type == SemanticType::Timestamp), + UnexpectedSnafu { + err_msg: format!( + "Missing table index in column metadata, table: {}, table_id: {}", + table_ref, table_id + ), + } + ); + + if let Some(name_to_ids) = &name_to_ids { + // Ensures all primary key and partition key exists in the column metadata. + for column_name in primary_key_names.iter().chain(partition_key_names.iter()) { + let column_in_column_metadata = column_in_column_metadata + .get(column_name) + .with_context(|| MissingColumnInColumnMetadataSnafu { + column_name: column_name.to_string(), + table_name: table_ref.to_string(), + table_id, + })?; + + let column_id = *name_to_ids + .get(*column_name) + .with_context(|| UnexpectedSnafu { + err_msg: format!("column id not found in name_to_ids: {}", column_name), + })?; + ensure!( + column_id == column_in_column_metadata.column_id, + MismatchColumnIdSnafu { + column_name: column_name.to_string(), + column_id, + table_name: table_ref.to_string(), + table_id, + } + ); + } + } else { + warn!( + "`name_to_ids` is not provided, table: {}, table_id: {}", + table_ref, table_id + ); + } + + let mut new_raw_table_meta = table_meta.clone(); + let primary_key_indices = &mut new_raw_table_meta.primary_key_indices; + let partition_key_indices = &mut new_raw_table_meta.partition_key_indices; + let value_indices = &mut new_raw_table_meta.value_indices; + let time_index = &mut new_raw_table_meta.schema.timestamp_index; + let columns = &mut new_raw_table_meta.schema.column_schemas; + let column_ids = &mut new_raw_table_meta.column_ids; + let next_column_id = &mut new_raw_table_meta.next_column_id; + + column_ids.clear(); + value_indices.clear(); + columns.clear(); + primary_key_indices.clear(); + partition_key_indices.clear(); + + for (idx, col) in column_metadata.iter().enumerate() { + if partition_key_names.contains(&col.column_schema.name.as_str()) { + partition_key_indices.push(idx); + } + match col.semantic_type { + SemanticType::Tag => { + primary_key_indices.push(idx); + } + SemanticType::Field => { + value_indices.push(idx); + } + SemanticType::Timestamp => { + value_indices.push(idx); + *time_index = Some(idx); + } + } + + columns.push(col.column_schema.clone()); + column_ids.push(col.column_id); + } + + *next_column_id = column_ids + .iter() + .max() + .map(|max| max + 1) + .unwrap_or(*next_column_id) + .max(*next_column_id); + + if let Some(time_index) = *time_index { + new_raw_table_meta.schema.column_schemas[time_index].set_time_index(); + } + + Ok(new_raw_table_meta) +} + +/// Returns true if the logical table info needs to be updated. +/// +/// The logical table only support to add columns, so we can check the length of column metadatas +/// to determine whether the logical table info needs to be updated. 
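Because a logical table can only gain columns, never drop or rename them, comparing column counts is sufficient for the staleness check defined below; a short standalone illustration of that assumption:

fn main() {
    // Column names as recorded in the table info vs. as reported by the region.
    let table_info_cols = vec!["host", "ts", "val"];
    let region_cols = vec!["host", "ts", "val", "new_label"];

    // Under add-only evolution, a length mismatch can only mean the region is ahead.
    let needs_update = table_info_cols.len() != region_cols.len();
    assert!(needs_update);
}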
+pub(crate) fn need_update_logical_table_info( + table_info: &RawTableInfo, + column_metadatas: &[ColumnMetadata], +) -> bool { + table_info.meta.schema.column_schemas.len() != column_metadatas.len() +} + +/// The result of waiting for inflight subprocedures. +pub struct PartialSuccessResult<'a> { + pub failed_procedures: Vec<&'a SubprocedureMeta>, + pub success_procedures: Vec<&'a SubprocedureMeta>, +} + +/// The result of waiting for inflight subprocedures. +pub enum WaitForInflightSubproceduresResult<'a> { + Success(Vec<&'a SubprocedureMeta>), + PartialSuccess(PartialSuccessResult<'a>), +} + +/// Wait for inflight subprocedures. +/// +/// If `fail_fast` is true, the function will return an error if any subprocedure fails. +/// Otherwise, the function will continue waiting for all subprocedures to complete. +pub(crate) async fn wait_for_inflight_subprocedures<'a>( + procedure_ctx: &ProcedureContext, + subprocedures: &'a [SubprocedureMeta], + fail_fast: bool, +) -> Result> { + let mut receivers = Vec::with_capacity(subprocedures.len()); + for subprocedure in subprocedures { + let procedure_id = subprocedure.procedure_id(); + let receiver = procedure_ctx + .provider + .procedure_state_receiver(procedure_id) + .await + .context(ProcedureStateReceiverSnafu { procedure_id })? + .context(ProcedureStateReceiverNotFoundSnafu { procedure_id })?; + receivers.push((receiver, subprocedure)); + } + + let mut tasks = Vec::with_capacity(receivers.len()); + for (receiver, subprocedure) in receivers.iter_mut() { + tasks.push(async move { + watcher::wait(receiver).await.inspect_err(|e| { + error!(e; "inflight subprocedure failed, parent procedure_id: {}, procedure: {}", procedure_ctx.procedure_id, subprocedure); + }) + }); + } + + if fail_fast { + try_join_all(tasks).await.context(WaitProcedureSnafu)?; + return Ok(WaitForInflightSubproceduresResult::Success( + subprocedures.iter().collect(), + )); + } + + // If fail_fast is false, we need to wait for all subprocedures to complete. + let results = join_all(tasks).await; + let failed_procedures_num = results.iter().filter(|r| r.is_err()).count(); + if failed_procedures_num == 0 { + return Ok(WaitForInflightSubproceduresResult::Success( + subprocedures.iter().collect(), + )); + } + warn!( + "{} inflight subprocedures failed, total: {}, parent procedure_id: {}", + failed_procedures_num, + subprocedures.len(), + procedure_ctx.procedure_id + ); + + let mut failed_procedures = Vec::with_capacity(failed_procedures_num); + let mut success_procedures = Vec::with_capacity(subprocedures.len() - failed_procedures_num); + for (result, subprocedure) in results.into_iter().zip(subprocedures) { + if result.is_err() { + failed_procedures.push(subprocedure); + } else { + success_procedures.push(subprocedure); + } + } + + Ok(WaitForInflightSubproceduresResult::PartialSuccess( + PartialSuccessResult { + failed_procedures, + success_procedures, + }, + )) +} + +#[derive(Clone)] +pub struct Context { + pub node_manager: NodeManagerRef, + pub table_metadata_manager: TableMetadataManagerRef, + pub cache_invalidator: CacheInvalidatorRef, +} + +/// Metadata for an inflight physical table subprocedure. +pub struct PhysicalTableMeta { + pub procedure_id: ProcedureId, + pub table_id: TableId, + pub table_name: TableName, +} + +/// Metadata for an inflight logical table subprocedure. 
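`wait_for_inflight_subprocedures` above has two modes: with `fail_fast` the first error aborts the wait via `try_join_all`, otherwise every subprocedure is awaited and the outcomes are split into failed and successful sets. A standalone sketch of the non-fail-fast path using the same `futures` primitives, with toy futures instead of procedure watchers:

use futures::future::join_all;

#[tokio::main]
async fn main() {
    // Three toy "subprocedures"; the second one fails.
    let tasks = (0..3).map(|i| async move {
        if i == 1 {
            Err(format!("subprocedure {i} failed"))
        } else {
            Ok(i)
        }
    });

    // fail_fast = false: wait for everything, then partition by outcome.
    let results: Vec<Result<i32, String>> = join_all(tasks).await;
    let (succeeded, failed): (Vec<_>, Vec<_>) = results.iter().partition(|r| r.is_ok());
    assert_eq!(succeeded.len(), 2);
    assert_eq!(failed.len(), 1);
}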
+pub struct LogicalTableMeta { + pub procedure_id: ProcedureId, + pub physical_table_id: TableId, + pub physical_table_name: TableName, + pub logical_tables: Vec<(TableId, TableName)>, +} + +/// Metadata for an inflight database subprocedure. +pub struct ReconcileDatabaseMeta { + pub procedure_id: ProcedureId, + pub catalog: String, + pub schema: String, +} + +/// The inflight subprocedure metadata. +pub enum SubprocedureMeta { + PhysicalTable(PhysicalTableMeta), + LogicalTable(LogicalTableMeta), + Database(ReconcileDatabaseMeta), +} + +impl Display for SubprocedureMeta { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + SubprocedureMeta::PhysicalTable(meta) => { + write!( + f, + "ReconcilePhysicalTable(procedure_id: {}, table_id: {}, table_name: {})", + meta.procedure_id, meta.table_id, meta.table_name + ) + } + SubprocedureMeta::LogicalTable(meta) => { + write!( + f, + "ReconcileLogicalTable(procedure_id: {}, physical_table_id: {}, physical_table_name: {}, logical_tables: {:?})", + meta.procedure_id, meta.physical_table_id, meta.physical_table_name, meta.logical_tables + ) + } + SubprocedureMeta::Database(meta) => { + write!( + f, + "ReconcileDatabase(procedure_id: {}, catalog: {}, schema: {})", + meta.procedure_id, meta.catalog, meta.schema + ) + } + } + } +} + +impl SubprocedureMeta { + /// Creates a new logical table subprocedure metadata. + pub fn new_logical_table( + procedure_id: ProcedureId, + physical_table_id: TableId, + physical_table_name: TableName, + logical_tables: Vec<(TableId, TableName)>, + ) -> Self { + Self::LogicalTable(LogicalTableMeta { + procedure_id, + physical_table_id, + physical_table_name, + logical_tables, + }) + } + + /// Creates a new physical table subprocedure metadata. + pub fn new_physical_table( + procedure_id: ProcedureId, + table_id: TableId, + table_name: TableName, + ) -> Self { + Self::PhysicalTable(PhysicalTableMeta { + procedure_id, + table_id, + table_name, + }) + } + + /// Creates a new reconcile database subprocedure metadata. + pub fn new_reconcile_database( + procedure_id: ProcedureId, + catalog: String, + schema: String, + ) -> Self { + Self::Database(ReconcileDatabaseMeta { + procedure_id, + catalog, + schema, + }) + } + + /// Returns the procedure id of the subprocedure. + pub fn procedure_id(&self) -> ProcedureId { + match self { + SubprocedureMeta::PhysicalTable(meta) => meta.procedure_id, + SubprocedureMeta::LogicalTable(meta) => meta.procedure_id, + SubprocedureMeta::Database(meta) => meta.procedure_id, + } + } + + /// Returns the number of tables will be reconciled. + pub fn table_num(&self) -> usize { + match self { + SubprocedureMeta::PhysicalTable(_) => 1, + SubprocedureMeta::LogicalTable(meta) => meta.logical_tables.len(), + SubprocedureMeta::Database(_) => 0, + } + } + + /// Returns the number of databases will be reconciled. + pub fn database_num(&self) -> usize { + match self { + SubprocedureMeta::Database(_) => 1, + _ => 0, + } + } +} + +/// The metrics of reconciling catalog. 
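The catalog-level metrics defined below are meant to be accumulated with `+=` as each database finishes, then logged through their `Display` impl. A hedged usage sketch, written as a unit test that could sit next to the definitions in this module (the numbers are made up):

#[cfg(test)]
mod reconcile_catalog_metrics_usage {
    use super::*;

    #[test]
    fn accumulate_catalog_metrics() {
        let mut total = ReconcileCatalogMetrics::default();
        total += ReconcileCatalogMetrics {
            succeeded_databases: 3,
            failed_databases: 0,
        };
        total += ReconcileCatalogMetrics {
            succeeded_databases: 1,
            failed_databases: 2,
        };
        assert_eq!(total.succeeded_databases, 4);
        assert_eq!(total.failed_databases, 2);
        assert_eq!(total.to_string(), "succeeded_databases: 4, failed_databases: 2");
    }
}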
+#[derive(Clone, Default)] +pub struct ReconcileCatalogMetrics { + pub succeeded_databases: usize, + pub failed_databases: usize, +} + +impl AddAssign for ReconcileCatalogMetrics { + fn add_assign(&mut self, other: Self) { + self.succeeded_databases += other.succeeded_databases; + self.failed_databases += other.failed_databases; + } +} + +impl Display for ReconcileCatalogMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "succeeded_databases: {}, failed_databases: {}", + self.succeeded_databases, self.failed_databases + ) + } +} + +impl From> for ReconcileCatalogMetrics { + fn from(result: WaitForInflightSubproceduresResult<'_>) -> Self { + match result { + WaitForInflightSubproceduresResult::Success(subprocedures) => ReconcileCatalogMetrics { + succeeded_databases: subprocedures.len(), + failed_databases: 0, + }, + WaitForInflightSubproceduresResult::PartialSuccess(PartialSuccessResult { + failed_procedures, + success_procedures, + }) => { + let succeeded_databases = success_procedures + .iter() + .map(|subprocedure| subprocedure.database_num()) + .sum(); + let failed_databases = failed_procedures + .iter() + .map(|subprocedure| subprocedure.database_num()) + .sum(); + ReconcileCatalogMetrics { + succeeded_databases, + failed_databases, + } + } + } + } +} + +/// The metrics of reconciling database. +#[derive(Clone, Default)] +pub struct ReconcileDatabaseMetrics { + pub succeeded_tables: usize, + pub failed_tables: usize, + pub succeeded_procedures: usize, + pub failed_procedures: usize, +} + +impl Display for ReconcileDatabaseMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "succeeded_tables: {}, failed_tables: {}, succeeded_procedures: {}, failed_procedures: {}", self.succeeded_tables, self.failed_tables, self.succeeded_procedures, self.failed_procedures) + } +} + +impl AddAssign for ReconcileDatabaseMetrics { + fn add_assign(&mut self, other: Self) { + self.succeeded_tables += other.succeeded_tables; + self.failed_tables += other.failed_tables; + self.succeeded_procedures += other.succeeded_procedures; + self.failed_procedures += other.failed_procedures; + } +} + +impl From> for ReconcileDatabaseMetrics { + fn from(result: WaitForInflightSubproceduresResult<'_>) -> Self { + match result { + WaitForInflightSubproceduresResult::Success(subprocedures) => { + let table_num = subprocedures + .iter() + .map(|subprocedure| subprocedure.table_num()) + .sum(); + ReconcileDatabaseMetrics { + succeeded_procedures: subprocedures.len(), + failed_procedures: 0, + succeeded_tables: table_num, + failed_tables: 0, + } + } + WaitForInflightSubproceduresResult::PartialSuccess(PartialSuccessResult { + failed_procedures, + success_procedures, + }) => { + let succeeded_tables = success_procedures + .iter() + .map(|subprocedure| subprocedure.table_num()) + .sum(); + let failed_tables = failed_procedures + .iter() + .map(|subprocedure| subprocedure.table_num()) + .sum(); + ReconcileDatabaseMetrics { + succeeded_procedures: success_procedures.len(), + failed_procedures: failed_procedures.len(), + succeeded_tables, + failed_tables, + } + } + } + } +} + +/// The metrics of reconciling logical tables. 
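The per-procedure metric structs below report to the Prometheus counters from `Drop`, so the counters are bumped exactly once when the owning procedure ends, regardless of which state finished or failed it. A stripped-down standalone sketch of that flush-on-drop pattern (an atomic stands in for the real counter vectors):

use std::sync::atomic::{AtomicU64, Ordering};

static UPDATED_TABLES: AtomicU64 = AtomicU64::new(0);

#[derive(Default)]
struct ProcedureMetrics {
    update_table_info_count: u64,
}

impl Drop for ProcedureMetrics {
    fn drop(&mut self) {
        // Flushed once, whenever the owner goes out of scope.
        UPDATED_TABLES.fetch_add(self.update_table_info_count, Ordering::Relaxed);
    }
}

fn main() {
    {
        let mut metrics = ProcedureMetrics::default();
        metrics.update_table_info_count += 1;
    } // Dropped here, counter flushed.
    assert_eq!(UPDATED_TABLES.load(Ordering::Relaxed), 1);
}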
+#[derive(Clone)] +pub struct ReconcileLogicalTableMetrics { + pub start_time: Instant, + pub update_table_info_count: usize, + pub create_tables_count: usize, + pub column_metadata_consistent_count: usize, + pub column_metadata_inconsistent_count: usize, +} + +impl Default for ReconcileLogicalTableMetrics { + fn default() -> Self { + Self { + start_time: Instant::now(), + update_table_info_count: 0, + create_tables_count: 0, + column_metadata_consistent_count: 0, + column_metadata_inconsistent_count: 0, + } + } +} + +const CREATE_TABLES: &str = "create_tables"; +const UPDATE_TABLE_INFO: &str = "update_table_info"; +const COLUMN_METADATA_CONSISTENT: &str = "column_metadata_consistent"; +const COLUMN_METADATA_INCONSISTENT: &str = "column_metadata_inconsistent"; + +impl ReconcileLogicalTableMetrics { + /// The total number of tables that have been reconciled. + pub fn total_table_count(&self) -> usize { + self.create_tables_count + + self.column_metadata_consistent_count + + self.column_metadata_inconsistent_count + } +} + +impl Drop for ReconcileLogicalTableMetrics { + fn drop(&mut self) { + let procedure_name = ReconcileLogicalTablesProcedure::TYPE_NAME; + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[procedure_name, metrics::TABLE_TYPE_LOGICAL, CREATE_TABLES]) + .inc_by(self.create_tables_count as u64); + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + procedure_name, + metrics::TABLE_TYPE_LOGICAL, + UPDATE_TABLE_INFO, + ]) + .inc_by(self.update_table_info_count as u64); + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + procedure_name, + metrics::TABLE_TYPE_LOGICAL, + COLUMN_METADATA_CONSISTENT, + ]) + .inc_by(self.column_metadata_consistent_count as u64); + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + procedure_name, + metrics::TABLE_TYPE_LOGICAL, + COLUMN_METADATA_INCONSISTENT, + ]) + .inc_by(self.column_metadata_inconsistent_count as u64); + } +} + +impl Display for ReconcileLogicalTableMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let elapsed = self.start_time.elapsed(); + if self.create_tables_count > 0 { + write!(f, "create_tables_count: {}, ", self.create_tables_count)?; + } + if self.update_table_info_count > 0 { + write!( + f, + "update_table_info_count: {}, ", + self.update_table_info_count + )?; + } + if self.column_metadata_consistent_count > 0 { + write!( + f, + "column_metadata_consistent_count: {}, ", + self.column_metadata_consistent_count + )?; + } + if self.column_metadata_inconsistent_count > 0 { + write!( + f, + "column_metadata_inconsistent_count: {}, ", + self.column_metadata_inconsistent_count + )?; + } + + write!( + f, + "total_table_count: {}, elapsed: {:?}", + self.total_table_count(), + elapsed + ) + } +} + +/// The result of resolving column metadata. +#[derive(Clone, Copy)] +pub enum ResolveColumnMetadataResult { + Consistent, + Inconsistent(ResolveStrategy), +} + +impl Display for ResolveColumnMetadataResult { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ResolveColumnMetadataResult::Consistent => write!(f, "Consistent"), + ResolveColumnMetadataResult::Inconsistent(strategy) => { + let strategy_str = strategy.as_ref(); + write!(f, "Inconsistent({})", strategy_str) + } + } + } +} + +/// The metrics of reconciling physical tables. +#[derive(Clone)] +pub struct ReconcileTableMetrics { + /// The start time of the reconciliation. + pub start_time: Instant, + /// The result of resolving column metadata. 
+ pub resolve_column_metadata_result: Option, + /// Whether the table info has been updated. + pub update_table_info: bool, +} + +impl Drop for ReconcileTableMetrics { + fn drop(&mut self) { + if let Some(resolve_column_metadata_result) = self.resolve_column_metadata_result { + match resolve_column_metadata_result { + ResolveColumnMetadataResult::Consistent => { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileTableProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + COLUMN_METADATA_CONSISTENT, + ]) + .inc(); + } + ResolveColumnMetadataResult::Inconsistent(strategy) => { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileTableProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + COLUMN_METADATA_INCONSISTENT, + ]) + .inc(); + metrics::METRIC_META_RECONCILIATION_RESOLVED_COLUMN_METADATA + .with_label_values(&[strategy.as_ref()]) + .inc(); + } + } + } + if self.update_table_info { + metrics::METRIC_META_RECONCILIATION_STATS + .with_label_values(&[ + ReconcileTableProcedure::TYPE_NAME, + metrics::TABLE_TYPE_PHYSICAL, + UPDATE_TABLE_INFO, + ]) + .inc(); + } + } +} + +impl Default for ReconcileTableMetrics { + fn default() -> Self { + Self { + start_time: Instant::now(), + resolve_column_metadata_result: None, + update_table_info: false, + } + } +} + +impl Display for ReconcileTableMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let elapsed = self.start_time.elapsed(); + if let Some(resolve_column_metadata_result) = self.resolve_column_metadata_result { + write!( + f, + "resolve_column_metadata_result: {}, ", + resolve_column_metadata_result + )?; + } + write!( + f, + "update_table_info: {}, elapsed: {:?}", + self.update_table_info, elapsed + ) + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::collections::HashMap; + use std::sync::Arc; + + use api::v1::SemanticType; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder}; + use store_api::metadata::ColumnMetadata; + use store_api::storage::RegionId; + use table::metadata::{RawTableMeta, TableMetaBuilder}; + use table::table_reference::TableReference; + + use super::*; + use crate::ddl::test_util::region_metadata::build_region_metadata; + use crate::error::Error; + use crate::reconciliation::utils::check_column_metadatas_consistent; + + fn new_test_schema() -> Schema { + let column_schemas = vec![ + ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), + ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + ColumnSchema::new("col2", ConcreteDataType::int32_datatype(), true), + ]; + SchemaBuilder::try_from(column_schemas) + .unwrap() + .version(123) + .build() + .unwrap() + } + + fn new_test_column_metadatas() -> Vec { + vec![ + ColumnMetadata { + column_schema: ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true), + semantic_type: SemanticType::Tag, + column_id: 0, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ) + .with_time_index(true), + semantic_type: SemanticType::Timestamp, + column_id: 1, + }, + ColumnMetadata { + column_schema: ColumnSchema::new("col2", ConcreteDataType::int32_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 2, + }, + ] + } + + fn new_test_raw_table_info() -> RawTableMeta { + let mut table_meta_builder = TableMetaBuilder::empty(); + let 
table_meta = table_meta_builder + .schema(Arc::new(new_test_schema())) + .primary_key_indices(vec![0]) + .partition_key_indices(vec![2]) + .next_column_id(4) + .build() + .unwrap(); + + table_meta.into() + } + + #[test] + fn test_build_table_info_from_column_metadatas_identical() { + let column_metadatas = new_test_column_metadatas(); + let table_id = 1; + let table_ref = TableReference::full("test_catalog", "test_schema", "test_table"); + let mut table_meta = new_test_raw_table_info(); + table_meta.column_ids = vec![0, 1, 2]; + let name_to_ids = HashMap::from([ + ("col1".to_string(), 0), + ("ts".to_string(), 1), + ("col2".to_string(), 2), + ]); + + let new_table_meta = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_meta, + Some(name_to_ids), + &column_metadatas, + ) + .unwrap(); + assert_eq!(new_table_meta, table_meta); + } + + #[test] + fn test_build_table_info_from_column_metadatas() { + let mut column_metadatas = new_test_column_metadatas(); + column_metadatas.push(ColumnMetadata { + column_schema: ColumnSchema::new("col3", ConcreteDataType::string_datatype(), true), + semantic_type: SemanticType::Tag, + column_id: 3, + }); + + let table_id = 1; + let table_ref = TableReference::full("test_catalog", "test_schema", "test_table"); + let table_meta = new_test_raw_table_info(); + let name_to_ids = HashMap::from([ + ("col1".to_string(), 0), + ("ts".to_string(), 1), + ("col2".to_string(), 2), + ]); + + let new_table_meta = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_meta, + Some(name_to_ids), + &column_metadatas, + ) + .unwrap(); + + assert_eq!(new_table_meta.primary_key_indices, vec![0, 3]); + assert_eq!(new_table_meta.partition_key_indices, vec![2]); + assert_eq!(new_table_meta.value_indices, vec![1, 2]); + assert_eq!(new_table_meta.schema.timestamp_index, Some(1)); + assert_eq!(new_table_meta.column_ids, vec![0, 1, 2, 3]); + assert_eq!(new_table_meta.next_column_id, 4); + } + + #[test] + fn test_build_table_info_from_column_metadatas_with_incorrect_name_to_ids() { + let column_metadatas = new_test_column_metadatas(); + let table_id = 1; + let table_ref = TableReference::full("test_catalog", "test_schema", "test_table"); + let table_meta = new_test_raw_table_info(); + let name_to_ids = HashMap::from([ + ("col1".to_string(), 0), + ("ts".to_string(), 1), + // Change column id of col2 to 3. + ("col2".to_string(), 3), + ]); + + let err = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_meta, + Some(name_to_ids), + &column_metadatas, + ) + .unwrap_err(); + + assert_matches!(err, Error::MismatchColumnId { .. 
}); + } + + #[test] + fn test_build_table_info_from_column_metadatas_with_missing_time_index() { + let mut column_metadatas = new_test_column_metadatas(); + column_metadatas.retain(|c| c.semantic_type != SemanticType::Timestamp); + let table_id = 1; + let table_ref = TableReference::full("test_catalog", "test_schema", "test_table"); + let table_meta = new_test_raw_table_info(); + let name_to_ids = HashMap::from([ + ("col1".to_string(), 0), + ("ts".to_string(), 1), + ("col2".to_string(), 2), + ]); + + let err = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_meta, + Some(name_to_ids), + &column_metadatas, + ) + .unwrap_err(); + + assert!( + err.to_string() + .contains("Missing table index in column metadata"), + "err: {}", + err + ); + } + + #[test] + fn test_build_table_info_from_column_metadatas_with_missing_column() { + let mut column_metadatas = new_test_column_metadatas(); + // Remove primary key column. + column_metadatas.retain(|c| c.column_id != 0); + let table_id = 1; + let table_ref = TableReference::full("test_catalog", "test_schema", "test_table"); + let table_meta = new_test_raw_table_info(); + let name_to_ids = HashMap::from([ + ("col1".to_string(), 0), + ("ts".to_string(), 1), + ("col2".to_string(), 2), + ]); + + let err = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_meta, + Some(name_to_ids.clone()), + &column_metadatas, + ) + .unwrap_err(); + assert_matches!(err, Error::MissingColumnInColumnMetadata { .. }); + + let mut column_metadatas = new_test_column_metadatas(); + // Remove partition key column. + column_metadatas.retain(|c| c.column_id != 2); + + let err = build_table_meta_from_column_metadatas( + table_id, + table_ref, + &table_meta, + Some(name_to_ids), + &column_metadatas, + ) + .unwrap_err(); + assert_matches!(err, Error::MissingColumnInColumnMetadata { .. 
}); + } + + #[test] + fn test_check_column_metadatas_consistent() { + let column_metadatas = new_test_column_metadatas(); + let region_metadata1 = build_region_metadata(RegionId::new(1024, 0), &column_metadatas); + let region_metadata2 = build_region_metadata(RegionId::new(1024, 1), &column_metadatas); + let result = + check_column_metadatas_consistent(&[region_metadata1, region_metadata2]).unwrap(); + assert_eq!(result, column_metadatas); + + let region_metadata1 = build_region_metadata(RegionId::new(1025, 0), &column_metadatas); + let region_metadata2 = build_region_metadata(RegionId::new(1024, 1), &column_metadatas); + let result = check_column_metadatas_consistent(&[region_metadata1, region_metadata2]); + assert!(result.is_none()); + } + + #[test] + fn test_check_column_metadata_invariants() { + let column_metadatas = new_test_column_metadatas(); + let mut new_column_metadatas = column_metadatas.clone(); + new_column_metadatas.push(ColumnMetadata { + column_schema: ColumnSchema::new("col3", ConcreteDataType::int32_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 3, + }); + check_column_metadata_invariants(&new_column_metadatas, &column_metadatas).unwrap(); + } + + #[test] + fn test_check_column_metadata_invariants_missing_primary_key_column_or_ts_column() { + let column_metadatas = new_test_column_metadatas(); + let mut new_column_metadatas = column_metadatas.clone(); + new_column_metadatas.retain(|c| c.semantic_type != SemanticType::Timestamp); + check_column_metadata_invariants(&new_column_metadatas, &column_metadatas).unwrap_err(); + + let column_metadatas = new_test_column_metadatas(); + let mut new_column_metadatas = column_metadatas.clone(); + new_column_metadatas.retain(|c| c.semantic_type != SemanticType::Tag); + check_column_metadata_invariants(&new_column_metadatas, &column_metadatas).unwrap_err(); + } + + #[test] + fn test_check_column_metadata_invariants_mismatch_column_id() { + let column_metadatas = new_test_column_metadatas(); + let mut new_column_metadatas = column_metadatas.clone(); + if let Some(col) = new_column_metadatas + .iter_mut() + .find(|c| c.semantic_type == SemanticType::Timestamp) + { + col.column_id = 100; + } + check_column_metadata_invariants(&new_column_metadatas, &column_metadatas).unwrap_err(); + + let column_metadatas = new_test_column_metadatas(); + let mut new_column_metadatas = column_metadatas.clone(); + if let Some(col) = new_column_metadatas + .iter_mut() + .find(|c| c.semantic_type == SemanticType::Tag) + { + col.column_id = 100; + } + check_column_metadata_invariants(&new_column_metadatas, &column_metadatas).unwrap_err(); + } + + #[test] + fn test_resolve_column_metadatas_with_use_metasrv_strategy() { + let column_metadatas = new_test_column_metadatas(); + let region_metadata1 = build_region_metadata(RegionId::new(1024, 0), &column_metadatas); + let mut metasrv_column_metadatas = region_metadata1.column_metadatas.clone(); + metasrv_column_metadatas.push(ColumnMetadata { + column_schema: ColumnSchema::new("col3", ConcreteDataType::int32_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 3, + }); + let result = + resolve_column_metadatas_with_metasrv(&metasrv_column_metadatas, &[region_metadata1]) + .unwrap(); + + assert_eq!(result, vec![RegionId::new(1024, 0)]); + } + + #[test] + fn test_resolve_column_metadatas_with_use_latest_strategy() { + let column_metadatas = new_test_column_metadatas(); + let region_metadata1 = build_region_metadata(RegionId::new(1024, 0), &column_metadatas); + let mut 
new_column_metadatas = column_metadatas.clone(); + new_column_metadatas.push(ColumnMetadata { + column_schema: ColumnSchema::new("col3", ConcreteDataType::int32_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 3, + }); + + let mut region_metadata2 = + build_region_metadata(RegionId::new(1024, 1), &new_column_metadatas); + region_metadata2.schema_version = 2; + + let (resolved_column_metadatas, region_ids) = + resolve_column_metadatas_with_latest(&[region_metadata1, region_metadata2]).unwrap(); + assert_eq!(region_ids, vec![RegionId::new(1024, 0)]); + assert_eq!(resolved_column_metadatas, new_column_metadatas); + } +} diff --git a/src/common/meta/src/rpc/ddl.rs b/src/common/meta/src/rpc/ddl.rs index 99eb24cebb..fc46ba9c9c 100644 --- a/src/common/meta/src/rpc/ddl.rs +++ b/src/common/meta/src/rpc/ddl.rs @@ -1358,6 +1358,7 @@ mod tests { options: Default::default(), created_on: Default::default(), partition_key_indices: Default::default(), + column_ids: Default::default(), }; // construct RawTableInfo @@ -1423,6 +1424,6 @@ mod tests { create_table_task.table_info.meta.primary_key_indices, vec![2] ); - assert_eq!(create_table_task.table_info.meta.value_indices, vec![1]); + assert_eq!(create_table_task.table_info.meta.value_indices, vec![0, 1]); } } diff --git a/src/common/meta/src/sequence.rs b/src/common/meta/src/sequence.rs index b1acc961d4..0e26b4c594 100644 --- a/src/common/meta/src/sequence.rs +++ b/src/common/meta/src/sequence.rs @@ -15,6 +15,7 @@ use std::ops::Range; use std::sync::Arc; +use common_telemetry::{debug, warn}; use snafu::ensure; use tokio::sync::Mutex; @@ -82,15 +83,43 @@ pub struct Sequence { } impl Sequence { + /// Returns the next value and increments the sequence. pub async fn next(&self) -> Result { let mut inner = self.inner.lock().await; inner.next().await } + /// Returns the range of available sequences. pub async fn min_max(&self) -> Range { let inner = self.inner.lock().await; inner.initial..inner.max } + + /// Returns the current value stored in the remote storage without incrementing the sequence. + /// + /// This function always fetches the true current state from the remote storage (KV backend), + /// ignoring any local cache to provide the most accurate view of the sequence's remote state. + /// It does not consume or advance the sequence value. + /// + /// Note: Since this always queries the remote storage, it may be slower than `next()` but + /// provides the most accurate and up-to-date information about the sequence state. + pub async fn peek(&self) -> Result { + let inner = self.inner.lock().await; + inner.peek().await + } + + /// Jumps to the given value. + /// + /// The next value must be greater than both: + /// 1. The current local next value + /// 2. The current value stored in the remote storage (KV backend) + /// + /// This ensures the sequence can only move forward and maintains consistency + /// across different instances accessing the same sequence. 
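Together, `peek` and `jump_to` let an operator inspect the stored high-water mark and force the sequence past it without consuming values. A usage sketch in the spirit of the tests further below, assuming the same imports the test module already brings in (`SequenceBuilder`, `MemoryKvBackend`, `Arc`):

#[tokio::test]
async fn jump_then_allocate() {
    let kv_backend = Arc::new(MemoryKvBackend::default());
    let seq = SequenceBuilder::new("example_seq", kv_backend)
        .initial(1024)
        .step(10)
        .build();

    // Nothing persisted yet, so peek() reports the configured initial value.
    assert_eq!(seq.peek().await.unwrap(), 1024);
    // jump_to() must be strictly greater than both the local and the stored value.
    seq.jump_to(2000).await.unwrap();
    assert_eq!(seq.next().await.unwrap(), 2000);
    // Jumping to a value that is not strictly greater is rejected.
    assert!(seq.jump_to(2000).await.is_err());
}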
+ pub async fn jump_to(&self, next: u64) -> Result<()> { + let mut inner = self.inner.lock().await; + inner.jump_to(next).await + } } struct Inner { @@ -121,6 +150,7 @@ impl Inner { if range.contains(&self.next) { let res = Ok(self.next); self.next += 1; + debug!("sequence {} next: {}", self.name, self.next); return res; } self.range = None; @@ -129,6 +159,10 @@ impl Inner { let range = self.next_range().await?; self.next = range.start; self.range = Some(range); + debug!( + "sequence {} next: {}, range: {:?}", + self.name, self.next, self.range + ); } } } @@ -139,6 +173,26 @@ impl Inner { .fail() } + /// Returns the current value from remote storage without advancing the sequence. + /// If no value exists in remote storage, returns the initial value. + pub async fn peek(&self) -> Result { + let key = self.name.as_bytes(); + let value = self.generator.get(key).await?.map(|kv| kv.value); + let next = if let Some(value) = value { + let next = self.initial.max(self.parse_sequence_value(value)?); + debug!("The next value of sequence {} is {}", self.name, next); + next + } else { + debug!( + "The next value of sequence {} is not set, use initial value {}", + self.name, self.initial + ); + self.initial + }; + + Ok(next) + } + pub async fn next_range(&self) -> Result> { let key = self.name.as_bytes(); let mut start = self.next; @@ -172,16 +226,7 @@ impl Inner { if !res.success { if let Some(kv) = res.prev_kv { - let v: [u8; 8] = match kv.value.clone().try_into() { - Ok(a) => a, - Err(v) => { - return error::UnexpectedSequenceValueSnafu { - err_msg: format!("Not a valid u64 for '{}': {v:?}", self.name), - } - .fail() - } - }; - let v = u64::from_le_bytes(v); + let v = self.parse_sequence_value(kv.value.clone())?; // If the existed value is smaller than the initial, we should start from the initial. start = v.max(self.initial); expect = kv.value; @@ -203,11 +248,76 @@ impl Inner { } .fail() } + + /// Jumps to the given value. + /// + /// The next value must be greater than both: + /// 1. The current local next value (self.next) + /// 2. The current value stored in the remote storage (KV backend) + /// + /// This ensures the sequence can only move forward and maintains consistency + /// across different instances accessing the same sequence. + pub async fn jump_to(&mut self, next: u64) -> Result<()> { + let key = self.name.as_bytes(); + let current = self.generator.get(key).await?.map(|kv| kv.value); + + let curr_val = match ¤t { + Some(val) => self.initial.max(self.parse_sequence_value(val.clone())?), + None => self.initial, + }; + + ensure!( + next > curr_val, + error::UnexpectedSnafu { + err_msg: format!( + "The next value {} is not greater than the current next value {}", + next, curr_val + ), + } + ); + + let expect = current.unwrap_or_default(); + + let req = CompareAndPutRequest { + key: key.to_vec(), + expect, + value: u64::to_le_bytes(next).to_vec(), + }; + let res = self.generator.compare_and_put(req).await?; + ensure!( + res.success, + error::UnexpectedSnafu { + err_msg: format!("Failed to reset sequence {} to {}", self.name, next), + } + ); + warn!("Sequence {} jumped to {}", self.name, next); + // Reset the sequence to the initial value. 
+ self.initial = next; + self.next = next; + self.range = None; + + Ok(()) + } + + /// Converts a Vec to u64 with proper error handling for sequence values + fn parse_sequence_value(&self, value: Vec) -> Result { + let v: [u8; 8] = match value.try_into() { + Ok(a) => a, + Err(v) => { + return error::UnexpectedSequenceValueSnafu { + err_msg: format!("Not a valid u64 for '{}': {v:?}", self.name), + } + .fail() + } + }; + Ok(u64::from_le_bytes(v)) + } } #[cfg(test)] mod tests { use std::any::Any; + use std::assert_matches::assert_matches; use std::collections::HashSet; use std::sync::Arc; @@ -308,7 +418,29 @@ mod tests { } #[tokio::test] - async fn test_sequence_out_of_rage() { + async fn test_sequence_set() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + let seq = SequenceBuilder::new("test_seq", kv_backend.clone()) + .initial(1024) + .step(10) + .build(); + seq.jump_to(1025).await.unwrap(); + assert_eq!(seq.next().await.unwrap(), 1025); + let err = seq.jump_to(1025).await.unwrap_err(); + assert_matches!(err, Error::Unexpected { .. }); + assert_eq!(seq.next().await.unwrap(), 1026); + + seq.jump_to(1048).await.unwrap(); + // Recreate the sequence to test the sequence is reset correctly. + let seq = SequenceBuilder::new("test_seq", kv_backend) + .initial(1024) + .step(10) + .build(); + assert_eq!(seq.next().await.unwrap(), 1048); + } + + #[tokio::test] + async fn test_sequence_out_of_range() { let seq = SequenceBuilder::new("test_seq", Arc::new(MemoryKvBackend::default())) .initial(u64::MAX - 10) .step(10) @@ -378,4 +510,139 @@ mod tests { let next = seq.next().await; assert!(next.is_err()); } + + #[tokio::test] + async fn test_sequence_peek() { + common_telemetry::init_default_ut_logging(); + let kv_backend = Arc::new(MemoryKvBackend::default()); + let seq = SequenceBuilder::new("test_seq", kv_backend.clone()) + .step(10) + .initial(1024) + .build(); + // The sequence value in the kv backend is not set, so the peek value should be the initial value. + assert_eq!(seq.peek().await.unwrap(), 1024); + + for i in 0..11 { + let v = seq.next().await.unwrap(); + assert_eq!(v, 1024 + i); + } + let seq = SequenceBuilder::new("test_seq", kv_backend) + .initial(1024) + .build(); + // The sequence is not initialized, it will fetch the value from the kv backend. 
+ assert_eq!(seq.peek().await.unwrap(), 1044); + } + + #[tokio::test] + async fn test_sequence_peek_shared_storage() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + let shared_seq = "shared_seq"; + + // Create two sequence instances with the SAME name but DIFFERENT configs + let seq1 = SequenceBuilder::new(shared_seq, kv_backend.clone()) + .initial(100) + .step(5) + .build(); + let seq2 = SequenceBuilder::new(shared_seq, kv_backend.clone()) + .initial(200) // different initial + .step(3) // different step + .build(); + + // Initially both return their own initial values when no remote value exists + assert_eq!(seq1.peek().await.unwrap(), 100); + assert_eq!(seq2.peek().await.unwrap(), 200); + + // seq1 calls next() to allocate range and update remote storage + assert_eq!(seq1.next().await.unwrap(), 100); + // After seq1.next(), remote storage has 100 + seq1.step(5) = 105 + + // seq2 should now see the updated remote value through peek(), not its own initial(200) + assert_eq!(seq1.peek().await.unwrap(), 105); + assert_eq!(seq2.peek().await.unwrap(), 200); // sees seq1's update, but use its own initial(200) + + // seq2 calls next(), should start from its initial(200) + assert_eq!(seq2.next().await.unwrap(), 200); + // After seq2.next(), remote storage updated to 200 + seq2.step(3) = 203 + + // Both should see the new remote value (seq2's step was used) + assert_eq!(seq1.peek().await.unwrap(), 203); + assert_eq!(seq2.peek().await.unwrap(), 203); + + // seq1 calls next(), should start from its next(105) + assert_eq!(seq1.next().await.unwrap(), 101); + assert_eq!(seq1.next().await.unwrap(), 102); + assert_eq!(seq1.next().await.unwrap(), 103); + assert_eq!(seq1.next().await.unwrap(), 104); + assert_eq!(seq1.next().await.unwrap(), 203); + // After seq1.next(), remote storage updated to 203 + seq1.step(5) = 208 + assert_eq!(seq1.peek().await.unwrap(), 208); + assert_eq!(seq2.peek().await.unwrap(), 208); + } + + #[tokio::test] + async fn test_sequence_peek_initial_max_logic() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + + // Manually set a small value in storage + let key = seq_name("test_max").into_bytes(); + kv_backend + .put( + PutRequest::new() + .with_key(key) + .with_value(u64::to_le_bytes(50)), + ) + .await + .unwrap(); + + // Create sequence with larger initial value + let seq = SequenceBuilder::new("test_max", kv_backend) + .initial(100) // larger than remote value (50) + .build(); + + // peek() should return max(initial, remote) = max(100, 50) = 100 + assert_eq!(seq.peek().await.unwrap(), 100); + + // next() should start from the larger initial value + assert_eq!(seq.next().await.unwrap(), 100); + } + + #[tokio::test] + async fn test_sequence_initial_greater_than_storage() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + + // Test sequence behavior when initial > storage value + // This verifies the max(storage, initial) logic works correctly + + // Step 1: Establish a low value in storage + let seq1 = SequenceBuilder::new("max_test", kv_backend.clone()) + .initial(10) + .step(5) + .build(); + assert_eq!(seq1.next().await.unwrap(), 10); // storage: 15 + + // Step 2: Create sequence with much larger initial + let seq2 = SequenceBuilder::new("max_test", kv_backend.clone()) + .initial(100) // much larger than storage (15) + .step(5) + .build(); + + // seq2 should start from max(15, 100) = 100 (its initial value) + assert_eq!(seq2.next().await.unwrap(), 100); // storage updated to: 105 + assert_eq!(seq2.peek().await.unwrap(), 105); + + // Step 3: Verify 
subsequent sequences continue from updated storage + let seq3 = SequenceBuilder::new("max_test", kv_backend) + .initial(50) // smaller than current storage (105) + .step(1) + .build(); + + // seq3 should use max(105, 50) = 105 (storage value) + assert_eq!(seq3.peek().await.unwrap(), 105); + assert_eq!(seq3.next().await.unwrap(), 105); // storage: 106 + + // This demonstrates the correct max(storage, initial) behavior: + // - Sequences never generate values below their initial requirement + // - Storage always reflects the highest allocated value + // - Value gaps (15-99) are acceptable to maintain minimum constraints + } } diff --git a/src/common/meta/src/snapshot.rs b/src/common/meta/src/snapshot.rs index 6cd94768a9..08aebbc99c 100644 --- a/src/common/meta/src/snapshot.rs +++ b/src/common/meta/src/snapshot.rs @@ -21,7 +21,7 @@ use std::time::Instant; use common_telemetry::info; use file::{Metadata, MetadataContent}; -use futures::TryStreamExt; +use futures::{future, TryStreamExt}; use object_store::ObjectStore; use snafu::{OptionExt, ResultExt}; use strum::Display; @@ -30,6 +30,7 @@ use crate::error::{ Error, InvalidFileExtensionSnafu, InvalidFileNameSnafu, InvalidFilePathSnafu, ReadObjectSnafu, Result, WriteObjectSnafu, }; +use crate::key::{CANDIDATES_ROOT, ELECTION_KEY}; use crate::kv_backend::KvBackendRef; use crate::range_stream::{PaginationStream, DEFAULT_PAGE_SIZE}; use crate::rpc::store::{BatchPutRequest, RangeRequest}; @@ -162,6 +163,11 @@ pub struct MetadataSnapshotManager { /// The maximum size of the request to put metadata, use 1MiB by default. const MAX_REQUEST_SIZE: usize = 1024 * 1024; +/// Returns true if the key is an internal key. +fn is_internal_key(kv: &FileKeyValue) -> bool { + kv.key.starts_with(ELECTION_KEY.as_bytes()) || kv.key.starts_with(CANDIDATES_ROOT.as_bytes()) +} + impl MetadataSnapshotManager { pub fn new(kv_backend: KvBackendRef, object_store: ObjectStore) -> Self { Self { @@ -250,7 +256,10 @@ impl MetadataSnapshotManager { }) }) .into_stream(); - let keyvalues = stream.try_collect::>().await?; + let keyvalues = stream + .try_filter(|f| future::ready(!is_internal_key(f))) + .try_collect::>() + .await?; let num_keyvalues = keyvalues.len(); let document = Document::new( Metadata::new(), diff --git a/src/common/procedure-test/Cargo.toml b/src/common/procedure-test/Cargo.toml index 07c8436646..e445a235bc 100644 --- a/src/common/procedure-test/Cargo.toml +++ b/src/common/procedure-test/Cargo.toml @@ -11,3 +11,4 @@ workspace = true async-trait.workspace = true common-procedure = { workspace = true, features = ["testing"] } snafu.workspace = true +tokio.workspace = true diff --git a/src/common/procedure-test/src/lib.rs b/src/common/procedure-test/src/lib.rs index 9e98a4972e..f7a3ecb710 100644 --- a/src/common/procedure-test/src/lib.rs +++ b/src/common/procedure-test/src/lib.rs @@ -26,6 +26,7 @@ use common_procedure::{ Context, ContextProvider, Output, PoisonKey, Procedure, ProcedureId, ProcedureState, ProcedureWithId, Result, Status, StringKey, }; +use tokio::sync::watch::Receiver; /// A Mock [ContextProvider]. 
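The snapshot change in the hunk above drops election and candidate bookkeeping keys by filtering the key-value stream before it is collected. A standalone sketch of the same `try_filter` + `future::ready` idiom on a toy stream (the `__internal/` prefix is made up; the real prefixes come from `ELECTION_KEY` and `CANDIDATES_ROOT`):

use futures::{future, stream, TryStreamExt};

#[tokio::main]
async fn main() {
    // A fallible stream of (key, value) pairs, standing in for the pagination stream.
    let kvs = stream::iter(vec![
        Ok::<_, std::io::Error>(("__internal/election".to_string(), vec![1u8])),
        Ok(("greptime/table/1024".to_string(), vec![2u8])),
    ]);

    // Drop internal keys before collecting, as the dump path now does.
    let kept: Vec<(String, Vec<u8>)> = kvs
        .try_filter(|(key, _)| future::ready(!key.starts_with("__internal/")))
        .try_collect()
        .await
        .unwrap();

    assert_eq!(kept.len(), 1);
    assert!(kept[0].0.starts_with("greptime/"));
}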
#[derive(Default)] @@ -57,6 +58,13 @@ impl ContextProvider for MockContextProvider { Ok(self.states.get(&procedure_id).cloned()) } + async fn procedure_state_receiver( + &self, + _procedure_id: ProcedureId, + ) -> Result<Option<Receiver<ProcedureState>>> { + Ok(None) + } + async fn try_put_poison(&self, key: &PoisonKey, procedure_id: ProcedureId) -> Result<()> { self.poison_manager .try_put_poison(key.to_string(), procedure_id.to_string()) diff --git a/src/common/procedure/src/error.rs b/src/common/procedure/src/error.rs index 90bf9dc5f6..128418d670 100644 --- a/src/common/procedure/src/error.rs +++ b/src/common/procedure/src/error.rs @@ -28,6 +28,19 @@ use crate::PoisonKey; #[snafu(visibility(pub))] #[stack_trace_debug] pub enum Error { + #[snafu(display("Failed to check procedure manager status"))] + CheckStatus { + source: BoxedError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Manager is paused"))] + ManagerPasued { + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Failed to execute procedure due to external error, clean poisons: {}", clean_poisons @@ -246,7 +259,8 @@ impl ErrorExt for Error { | Error::ListState { source, .. } | Error::PutPoison { source, .. } | Error::DeletePoison { source, .. } - | Error::GetPoison { source, .. } => source.status_code(), + | Error::GetPoison { source, .. } + | Error::CheckStatus { source, .. } => source.status_code(), Error::ToJson { .. } | Error::DeleteState { .. } @@ -259,7 +273,8 @@ Error::RetryTimesExceeded { .. } | Error::RollbackTimesExceeded { .. } - | Error::ManagerNotStart { .. } => StatusCode::IllegalState, + | Error::ManagerNotStart { .. } + | Error::ManagerPasued { .. } => StatusCode::IllegalState, Error::RollbackNotSupported { .. } => StatusCode::Unsupported, Error::LoaderConflict { .. } | Error::DuplicateProcedure { ..
} => { diff --git a/src/common/procedure/src/local.rs b/src/common/procedure/src/local.rs index 811f858d92..9b80884436 100644 --- a/src/common/procedure/src/local.rs +++ b/src/common/procedure/src/local.rs @@ -22,6 +22,7 @@ use std::time::{Duration, Instant}; use async_trait::async_trait; use backon::ExponentialBuilder; +use common_error::ext::BoxedError; use common_runtime::{RepeatedTask, TaskFunction}; use common_telemetry::tracing_context::{FutureExt, TracingContext}; use common_telemetry::{error, info, tracing}; @@ -30,9 +31,10 @@ use tokio::sync::watch::{self, Receiver, Sender}; use tokio::sync::{Mutex as TokioMutex, Notify}; use crate::error::{ - self, DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, - PoisonKeyNotDefinedSnafu, ProcedureNotFoundSnafu, Result, StartRemoveOutdatedMetaTaskSnafu, - StopRemoveOutdatedMetaTaskSnafu, TooManyRunningProceduresSnafu, + self, CheckStatusSnafu, DuplicateProcedureSnafu, Error, LoaderConflictSnafu, + ManagerNotStartSnafu, ManagerPasuedSnafu, PoisonKeyNotDefinedSnafu, ProcedureNotFoundSnafu, + Result, StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu, + TooManyRunningProceduresSnafu, }; use crate::local::runner::Runner; use crate::procedure::{BoxedProcedureLoader, InitProcedureState, PoisonKeys, ProcedureInfo}; @@ -245,6 +247,13 @@ impl ContextProvider for ManagerContext { Ok(self.state(procedure_id)) } + async fn procedure_state_receiver( + &self, + procedure_id: ProcedureId, + ) -> Result>> { + Ok(self.state_receiver(procedure_id)) + } + async fn try_put_poison(&self, key: &PoisonKey, procedure_id: ProcedureId) -> Result<()> { { // validate the procedure exists @@ -343,6 +352,14 @@ impl ManagerContext { procedures.get(&procedure_id).map(|meta| meta.state()) } + /// Returns the [Receiver] of specific `procedure_id`. + fn state_receiver(&self, procedure_id: ProcedureId) -> Option> { + let procedures = self.procedures.read().unwrap(); + procedures + .get(&procedure_id) + .map(|meta| meta.state_receiver.clone()) + } + /// Returns the [ProcedureMeta] of all procedures. fn list_procedure(&self) -> Vec { let procedures = self.procedures.read().unwrap(); @@ -522,6 +539,14 @@ impl Default for ManagerConfig { } } +type PauseAwareRef = Arc; + +#[async_trait] +pub trait PauseAware: Send + Sync { + /// Returns true if the procedure manager is paused. + async fn is_paused(&self) -> std::result::Result; +} + /// A [ProcedureManager] that maintains procedure states locally. pub struct LocalManager { manager_ctx: Arc, @@ -531,6 +556,7 @@ pub struct LocalManager { /// GC task. 
remove_outdated_meta_task: TokioMutex>>, config: ManagerConfig, + pause_aware: Option, } impl LocalManager { @@ -539,6 +565,7 @@ impl LocalManager { config: ManagerConfig, state_store: StateStoreRef, poison_store: PoisonStoreRef, + pause_aware: Option, ) -> LocalManager { let manager_ctx = Arc::new(ManagerContext::new(poison_store)); @@ -549,6 +576,7 @@ impl LocalManager { retry_delay: config.retry_delay, remove_outdated_meta_task: TokioMutex::new(None), config, + pause_aware, } } @@ -719,6 +747,17 @@ impl LocalManager { let loaders = self.manager_ctx.loaders.lock().unwrap(); loaders.contains_key(name) } + + async fn check_status(&self) -> Result<()> { + if let Some(pause_aware) = self.pause_aware.as_ref() { + ensure!( + !pause_aware.is_paused().await.context(CheckStatusSnafu)?, + ManagerPasuedSnafu + ); + } + + Ok(()) + } } #[async_trait] @@ -774,6 +813,7 @@ impl ProcedureManager for LocalManager { !self.manager_ctx.contains_procedure(procedure_id), DuplicateProcedureSnafu { procedure_id } ); + self.check_status().await?; self.submit_root( procedure.id, @@ -979,7 +1019,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.start(); manager @@ -1004,7 +1044,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(object_store.clone())); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.start(); manager @@ -1058,7 +1098,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.start(); let procedure_id = ProcedureId::random(); @@ -1110,7 +1150,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.start(); #[derive(Debug)] @@ -1191,7 +1231,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); let mut procedure = ProcedureToLoad::new("submit"); procedure.lock_key = LockKey::single_exclusive("test.submit"); @@ -1219,7 +1259,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.start().await.unwrap(); manager.stop().await.unwrap(); @@ -1256,7 +1296,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(object_store.clone())); let poison_manager = 
Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.set_running(); let mut procedure = ProcedureToLoad::new("submit"); @@ -1338,7 +1378,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.set_running(); manager @@ -1463,7 +1503,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(object_store.clone())); let poison_manager = Arc::new(InMemoryPoisonStore::new()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.manager_ctx.start(); let notify = Arc::new(Notify::new()); diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs index 677fb33745..aacb61f6e1 100644 --- a/src/common/procedure/src/local/runner.rs +++ b/src/common/procedure/src/local/runner.rs @@ -601,6 +601,7 @@ mod tests { use futures_util::FutureExt; use object_store::{EntryMode, ObjectStore}; use tokio::sync::mpsc; + use tokio::sync::watch::Receiver; use super::*; use crate::local::{test_util, DynamicKeyLockGuard}; @@ -668,6 +669,13 @@ mod tests { unimplemented!() } + async fn procedure_state_receiver( + &self, + _procedure_id: ProcedureId, + ) -> Result>> { + unimplemented!() + } + async fn try_put_poison( &self, _key: &PoisonKey, diff --git a/src/common/procedure/src/procedure.rs b/src/common/procedure/src/procedure.rs index e208f754b0..0b5746d268 100644 --- a/src/common/procedure/src/procedure.rs +++ b/src/common/procedure/src/procedure.rs @@ -22,6 +22,7 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; use smallvec::{smallvec, SmallVec}; use snafu::{ResultExt, Snafu}; +use tokio::sync::watch::Receiver; use uuid::Uuid; use crate::error::{self, Error, Result}; @@ -58,6 +59,14 @@ pub enum Status { } impl Status { + /// Returns a [Status::Suspended] with given `subprocedures` and `persist` flag. + pub fn suspended(subprocedures: Vec, persist: bool) -> Status { + Status::Suspended { + subprocedures, + persist, + } + } + /// Returns a [Status::Poisoned] with given `keys` and `error`. pub fn poisoned(keys: impl IntoIterator, error: Error) -> Status { Status::Poisoned { @@ -140,6 +149,11 @@ pub trait ContextProvider: Send + Sync { /// Query the procedure state. async fn procedure_state(&self, procedure_id: ProcedureId) -> Result>; + async fn procedure_state_receiver( + &self, + procedure_id: ProcedureId, + ) -> Result>>; + /// Try to put a poison key for a procedure. /// /// This method is used to mark a resource as being operated on by a procedure. 
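The hunks above give `LocalManager` an optional `PauseAware` hook: when the hook is present and reports a paused manager, `submit` now fails with the "Manager is paused" `IllegalState` error instead of accepting work. A minimal sketch of wiring a custom switch follows; the `StaticPauseSwitch` type is illustrative only (this patch actually plugs in `RuntimeSwitchManager`), the `is_paused` signature is reconstructed from the `check_status` call site (a `bool` wrapped in a `BoxedError` result), and the import path assumes `PauseAware` is reachable from `common_procedure::local`, where it is declared.

use std::sync::atomic::{AtomicBool, Ordering};

use async_trait::async_trait;
use common_error::ext::BoxedError;
use common_procedure::local::PauseAware;

/// Illustrative pause switch backed by an atomic flag. The implementation this patch
/// wires in is `RuntimeSwitchManager`, which reads the switch from the metadata kv-backend.
#[derive(Default)]
struct StaticPauseSwitch(AtomicBool);

#[async_trait]
impl PauseAware for StaticPauseSwitch {
    /// Reports whether procedure submission should currently be rejected.
    async fn is_paused(&self) -> std::result::Result<bool, BoxedError> {
        Ok(self.0.load(Ordering::Relaxed))
    }
}

Passing `Some(Arc::new(StaticPauseSwitch::default()))` as the new fourth argument of `LocalManager::new` (the updated tests pass `None` to keep the previous behavior) is then enough for `ProcedureManager::submit` to start rejecting procedures while the switch is on.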
diff --git a/src/common/procedure/src/watcher.rs b/src/common/procedure/src/watcher.rs index 99af9a2dc7..48d4c8559d 100644 --- a/src/common/procedure/src/watcher.rs +++ b/src/common/procedure/src/watcher.rs @@ -83,7 +83,7 @@ mod tests { }; let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir))); let poison_manager = Arc::new(InMemoryPoisonStore::default()); - let manager = LocalManager::new(config, state_store, poison_manager); + let manager = LocalManager::new(config, state_store, poison_manager, None); manager.start().await.unwrap(); #[derive(Debug)] diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index 5c87641947..66bd5c98b3 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -24,6 +24,7 @@ use common_error::ext::BoxedError; use common_greptimedb_telemetry::GreptimeDBTelemetryTask; use common_meta::cache::{LayeredCacheRegistry, SchemaCacheRef, TableSchemaCacheRef}; use common_meta::key::datanode_table::{DatanodeTableManager, DatanodeTableValue}; +use common_meta::key::runtime_switch::RuntimeSwitchManager; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; use common_meta::wal_options_allocator::prepare_wal_options; @@ -230,6 +231,12 @@ impl DatanodeBuilder { .new_region_server(schema_metadata_manager, region_event_listener) .await?; + // TODO(weny): Considering introducing a readonly kv_backend trait. + let runtime_switch_manager = RuntimeSwitchManager::new(self.kv_backend.clone()); + let is_recovery_mode = runtime_switch_manager + .recovery_mode() + .await + .context(GetMetadataSnafu)?; let datanode_table_manager = DatanodeTableManager::new(self.kv_backend.clone()); let table_values = datanode_table_manager .tables(node_id) @@ -242,6 +249,8 @@ impl DatanodeBuilder { table_values, !controlled_by_metasrv, self.opts.init_regions_parallelism, + // Ignore nonexistent regions in recovery mode. 
+ is_recovery_mode, ); if self.opts.init_regions_in_background { @@ -323,6 +332,12 @@ impl DatanodeBuilder { ) -> Result<()> { let node_id = self.opts.node_id.context(MissingNodeIdSnafu)?; + let runtime_switch_manager = RuntimeSwitchManager::new(kv_backend.clone()); + let is_recovery_mode = runtime_switch_manager + .recovery_mode() + .await + .context(GetMetadataSnafu)?; + let datanode_table_manager = DatanodeTableManager::new(kv_backend.clone()); let table_values = datanode_table_manager .tables(node_id) @@ -335,6 +350,7 @@ impl DatanodeBuilder { table_values, open_with_writable, self.opts.init_regions_parallelism, + is_recovery_mode, ) .await } @@ -558,6 +574,7 @@ async fn open_all_regions( table_values: Vec, open_with_writable: bool, init_regions_parallelism: usize, + ignore_nonexistent_region: bool, ) -> Result<()> { let mut regions = vec![]; #[cfg(feature = "enterprise")] @@ -616,18 +633,30 @@ async fn open_all_regions( } let open_regions = region_server - .handle_batch_open_requests(init_regions_parallelism, region_requests) + .handle_batch_open_requests( + init_regions_parallelism, + region_requests, + ignore_nonexistent_region, + ) .await?; - ensure!( - open_regions.len() == num_regions, - error::UnexpectedSnafu { - violated: format!( - "Expected to open {} of regions, only {} of regions has opened", - num_regions, - open_regions.len() - ) - } - ); + if !ignore_nonexistent_region { + ensure!( + open_regions.len() == num_regions, + error::UnexpectedSnafu { + violated: format!( + "Expected to open {} of regions, only {} of regions has opened", + num_regions, + open_regions.len() + ) + } + ); + } else if open_regions.len() != num_regions { + warn!( + "ignore nonexistent region, expected to open {} of regions, only {} of regions has opened", + num_regions, + open_regions.len() + ); + } for region_id in open_regions { if open_with_writable { @@ -660,19 +689,31 @@ async fn open_all_regions( } let open_regions = region_server - .handle_batch_open_requests(init_regions_parallelism, region_requests) + .handle_batch_open_requests( + init_regions_parallelism, + region_requests, + ignore_nonexistent_region, + ) .await?; - ensure!( - open_regions.len() == num_regions, - error::UnexpectedSnafu { - violated: format!( - "Expected to open {} of follower regions, only {} of regions has opened", - num_regions, - open_regions.len() - ) - } - ); + if !ignore_nonexistent_region { + ensure!( + open_regions.len() == num_regions, + error::UnexpectedSnafu { + violated: format!( + "Expected to open {} of follower regions, only {} of regions has opened", + num_regions, + open_regions.len() + ) + } + ); + } else if open_regions.len() != num_regions { + warn!( + "ignore nonexistent region, expected to open {} of follower regions, only {} of regions has opened", + num_regions, + open_regions.len() + ); + } } info!("all regions are opened"); diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index 5c81f6ab46..4914100b80 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -387,6 +387,14 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to serialize json"))] + SerializeJson { + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -457,6 +465,7 @@ impl ErrorExt for Error { StatusCode::RegionBusy } MissingCache { .. } => StatusCode::Internal, + SerializeJson { .. 
} => StatusCode::Internal, } } diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index d188958c96..447ed76d82 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -20,12 +20,14 @@ use std::time::Duration; use api::region::RegionResponse; use api::v1::region::sync_request::ManifestInfo; -use api::v1::region::{region_request, RegionResponse as RegionResponseV1, SyncRequest}; +use api::v1::region::{ + region_request, ListMetadataRequest, RegionResponse as RegionResponseV1, SyncRequest, +}; use api::v1::{ResponseHeader, Status}; use arrow_flight::{FlightData, Ticket}; use async_trait::async_trait; use bytes::Bytes; -use common_error::ext::BoxedError; +use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_query::request::QueryRequest; use common_query::OutputData; @@ -47,6 +49,7 @@ pub use query::dummy_catalog::{ DummyCatalogList, DummyTableProviderFactory, TableProviderFactoryRef, }; use query::QueryEngineRef; +use serde_json; use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult}; use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream}; use servers::grpc::region_server::RegionServerHandler; @@ -71,10 +74,10 @@ use tonic::{Request, Response, Result as TonicResult}; use crate::error::{ self, BuildRegionRequestsSnafu, ConcurrentQueryLimiterClosedSnafu, ConcurrentQueryLimiterTimeoutSnafu, DataFusionSnafu, DecodeLogicalPlanSnafu, - ExecuteLogicalPlanSnafu, FindLogicalRegionsSnafu, HandleBatchDdlRequestSnafu, - HandleBatchOpenRequestSnafu, HandleRegionRequestSnafu, NewPlanDecoderSnafu, - RegionEngineNotFoundSnafu, RegionNotFoundSnafu, RegionNotReadySnafu, Result, - StopRegionEngineSnafu, UnexpectedSnafu, UnsupportedOutputSnafu, + ExecuteLogicalPlanSnafu, FindLogicalRegionsSnafu, GetRegionMetadataSnafu, + HandleBatchDdlRequestSnafu, HandleBatchOpenRequestSnafu, HandleRegionRequestSnafu, + NewPlanDecoderSnafu, RegionEngineNotFoundSnafu, RegionNotFoundSnafu, RegionNotReadySnafu, + Result, SerializeJsonSnafu, StopRegionEngineSnafu, UnexpectedSnafu, UnsupportedOutputSnafu, }; use crate::event_listener::RegionServerEventListenerRef; @@ -138,12 +141,12 @@ impl RegionServer { /// Finds the region's engine by its id. If the region is not ready, returns `None`. pub fn find_engine(&self, region_id: RegionId) -> Result> { - self.inner - .get_engine(region_id, &RegionChange::None) - .map(|x| match x { - CurrentEngine::Engine(engine) => Some(engine), - CurrentEngine::EarlyReturn(_) => None, - }) + match self.inner.get_engine(region_id, &RegionChange::None) { + Ok(CurrentEngine::Engine(engine)) => Ok(Some(engine)), + Ok(CurrentEngine::EarlyReturn(_)) => Ok(None), + Err(error::Error::RegionNotFound { .. 
}) => Ok(None), + Err(err) => Err(err), + } } #[tracing::instrument(skip_all)] @@ -151,9 +154,10 @@ impl RegionServer { &self, parallelism: usize, requests: Vec<(RegionId, RegionOpenRequest)>, + ignore_nonexistent_region: bool, ) -> Result> { self.inner - .handle_batch_open_requests(parallelism, requests) + .handle_batch_open_requests(parallelism, requests, ignore_nonexistent_region) .await } @@ -411,6 +415,7 @@ impl RegionServer { Ok(RegionResponse { affected_rows, extensions, + metadata: Vec::new(), }) } @@ -440,6 +445,7 @@ impl RegionServer { Ok(RegionResponse { affected_rows, extensions, + metadata: Vec::new(), }) } @@ -472,6 +478,48 @@ impl RegionServer { .map(|_| RegionResponse::new(AffectedRows::default())) } + /// Handles the ListMetadata request and retrieves metadata for specified regions. + /// + /// Returns the results as a JSON-serialized list in the [RegionResponse]. It serializes + /// non-existing regions as `null`. + #[tracing::instrument(skip_all)] + async fn handle_list_metadata_request( + &self, + request: &ListMetadataRequest, + ) -> Result { + let mut region_metadatas = Vec::new(); + // Collect metadata for each region + for region_id in &request.region_ids { + let region_id = RegionId::from_u64(*region_id); + // Get the engine. + let Some(engine) = self.find_engine(region_id)? else { + region_metadatas.push(None); + continue; + }; + + match engine.get_metadata(region_id).await { + Ok(metadata) => region_metadatas.push(Some(metadata)), + Err(err) => { + if err.status_code() == StatusCode::RegionNotFound { + region_metadatas.push(None); + } else { + Err(err).with_context(|_| GetRegionMetadataSnafu { + engine: engine.name(), + region_id, + })?; + } + } + } + } + + // Serialize metadata to JSON + let json_result = serde_json::to_vec(®ion_metadatas).context(SerializeJsonSnafu)?; + + let response = RegionResponse::from_metadata(json_result); + + Ok(response) + } + /// Sync region manifest and registers new opened logical regions. 
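Before the `sync_region` method that the comment above documents, one note on the caller side of `handle_list_metadata_request`: the handler packs the requested regions' metadata into `RegionResponse::metadata` as a JSON array, with `null` entries for regions this datanode does not host. A minimal decoding sketch, mirroring what the unit tests further down do; the `decode_list_metadata` helper name is illustrative, not part of this patch.

use store_api::metadata::RegionMetadata;

/// Illustrative helper: decode the `metadata` bytes of a ListMetadata response.
/// Regions the datanode does not host were serialized as `null`, so they come back as `None`.
fn decode_list_metadata(bytes: &[u8]) -> serde_json::Result<Vec<Option<RegionMetadata>>> {
    serde_json::from_slice(bytes)
}

A caller can then, for example, treat each `None` entry as a region that is missing on the datanode and needs attention during recovery.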
pub async fn sync_region( &self, @@ -503,6 +551,10 @@ impl RegionServerHandler for RegionServer { region_request::Body::Sync(sync_request) => { self.handle_sync_region_request(sync_request).await } + region_request::Body::ListMetadata(list_metadata_request) => { + self.handle_list_metadata_request(list_metadata_request) + .await + } _ => self.handle_requests_in_serial(request).await, } .map_err(BoxedError::new) @@ -517,6 +569,7 @@ impl RegionServerHandler for RegionServer { }), affected_rows: response.affected_rows as _, extensions: response.extensions, + metadata: response.metadata, }) } } @@ -747,6 +800,7 @@ impl RegionServerInner { engine: RegionEngineRef, parallelism: usize, requests: Vec<(RegionId, RegionOpenRequest)>, + ignore_nonexistent_region: bool, ) -> Result> { let region_changes = requests .iter() @@ -784,8 +838,14 @@ impl RegionServerInner { } Err(e) => { self.unset_region_status(region_id, &engine, *region_change); - error!(e; "Failed to open region: {}", region_id); - errors.push(e); + if e.status_code() == StatusCode::RegionNotFound + && ignore_nonexistent_region + { + warn!("Region {} not found, ignore it, source: {:?}", region_id, e); + } else { + error!(e; "Failed to open region: {}", region_id); + errors.push(e); + } } } } @@ -814,6 +874,7 @@ impl RegionServerInner { &self, parallelism: usize, requests: Vec<(RegionId, RegionOpenRequest)>, + ignore_nonexistent_region: bool, ) -> Result> { let mut engine_grouped_requests: HashMap> = HashMap::with_capacity(requests.len()); @@ -836,8 +897,13 @@ impl RegionServerInner { .with_context(|| RegionEngineNotFoundSnafu { name: &engine })? .clone(); results.push( - self.handle_batch_open_requests_inner(engine, parallelism, requests) - .await, + self.handle_batch_open_requests_inner( + engine, + parallelism, + requests, + ignore_nonexistent_region, + ) + .await, ) } @@ -902,6 +968,7 @@ impl RegionServerInner { Ok(RegionResponse { affected_rows: result.affected_rows, extensions: result.extensions, + metadata: Vec::new(), }) } Err(err) => { @@ -971,6 +1038,7 @@ impl RegionServerInner { Ok(RegionResponse { affected_rows: result.affected_rows, extensions: result.extensions, + metadata: Vec::new(), }) } Err(err) => { @@ -1243,8 +1311,11 @@ mod tests { use std::assert_matches::assert_matches; + use api::v1::SemanticType; use common_error::ext::ErrorExt; + use datatypes::prelude::ConcreteDataType; use mito2::test_util::CreateRequestBuilder; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; use store_api::region_engine::RegionEngine; use store_api::region_request::{RegionDropRequest, RegionOpenRequest, RegionTruncateRequest}; use store_api::storage::RegionId; @@ -1432,6 +1503,81 @@ mod tests { assert!(status.is_some()); } + #[tokio::test] + async fn test_batch_open_region_ignore_nonexistent_regions() { + common_telemetry::init_default_ut_logging(); + let mut mock_region_server = mock_region_server(); + let (engine, _receiver) = MockRegionEngine::with_mock_fn( + MITO_ENGINE_NAME, + Box::new(|region_id, _request| { + if region_id == RegionId::new(1, 1) { + error::RegionNotFoundSnafu { region_id }.fail() + } else { + Ok(0) + } + }), + ); + mock_region_server.register_engine(engine.clone()); + + let region_ids = mock_region_server + .handle_batch_open_requests( + 8, + vec![ + ( + RegionId::new(1, 1), + RegionOpenRequest { + engine: MITO_ENGINE_NAME.to_string(), + region_dir: String::new(), + options: Default::default(), + skip_wal_replay: false, + }, + ), + ( + RegionId::new(1, 2), + RegionOpenRequest { + 
engine: MITO_ENGINE_NAME.to_string(), + region_dir: String::new(), + options: Default::default(), + skip_wal_replay: false, + }, + ), + ], + true, + ) + .await + .unwrap(); + assert_eq!(region_ids, vec![RegionId::new(1, 2)]); + + let err = mock_region_server + .handle_batch_open_requests( + 8, + vec![ + ( + RegionId::new(1, 1), + RegionOpenRequest { + engine: MITO_ENGINE_NAME.to_string(), + region_dir: String::new(), + options: Default::default(), + skip_wal_replay: false, + }, + ), + ( + RegionId::new(1, 2), + RegionOpenRequest { + engine: MITO_ENGINE_NAME.to_string(), + region_dir: String::new(), + options: Default::default(), + skip_wal_replay: false, + }, + ), + ], + false, + ) + .await + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::Unexpected); + } + struct CurrentEngineTest { region_id: RegionId, current_region_status: Option, @@ -1606,4 +1752,175 @@ mod tests { let forth_query = p.acquire().await; assert!(forth_query.is_ok()); } + + fn mock_region_metadata(region_id: RegionId) -> RegionMetadata { + let mut metadata_builder = RegionMetadataBuilder::new(region_id); + metadata_builder.push_column_metadata(ColumnMetadata { + column_schema: datatypes::schema::ColumnSchema::new( + "timestamp", + ConcreteDataType::timestamp_nanosecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 0, + }); + metadata_builder.push_column_metadata(ColumnMetadata { + column_schema: datatypes::schema::ColumnSchema::new( + "file", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }); + metadata_builder.push_column_metadata(ColumnMetadata { + column_schema: datatypes::schema::ColumnSchema::new( + "message", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }); + metadata_builder.primary_key(vec![1]); + metadata_builder.build().unwrap() + } + + #[tokio::test] + async fn test_handle_list_metadata_request() { + common_telemetry::init_default_ut_logging(); + + let mut mock_region_server = mock_region_server(); + let region_id_1 = RegionId::new(1, 0); + let region_id_2 = RegionId::new(2, 0); + + let metadata_1 = mock_region_metadata(region_id_1); + let metadata_2 = mock_region_metadata(region_id_2); + let metadatas = vec![Some(metadata_1.clone()), Some(metadata_2.clone())]; + + let metadata_1 = Arc::new(metadata_1); + let metadata_2 = Arc::new(metadata_2); + let (engine, _) = MockRegionEngine::with_metadata_mock_fn( + MITO_ENGINE_NAME, + Box::new(move |region_id| { + if region_id == region_id_1 { + Ok(metadata_1.clone()) + } else if region_id == region_id_2 { + Ok(metadata_2.clone()) + } else { + error::RegionNotFoundSnafu { region_id }.fail() + } + }), + ); + + mock_region_server.register_engine(engine.clone()); + mock_region_server + .inner + .region_map + .insert(region_id_1, RegionEngineWithStatus::Ready(engine.clone())); + mock_region_server + .inner + .region_map + .insert(region_id_2, RegionEngineWithStatus::Ready(engine.clone())); + + // All regions exist. 
+ let list_metadata_request = ListMetadataRequest { + region_ids: vec![region_id_1.as_u64(), region_id_2.as_u64()], + }; + let response = mock_region_server + .handle_list_metadata_request(&list_metadata_request) + .await + .unwrap(); + let decoded_metadata: Vec> = + serde_json::from_slice(&response.metadata).unwrap(); + assert_eq!(metadatas, decoded_metadata); + } + + #[tokio::test] + async fn test_handle_list_metadata_not_found() { + common_telemetry::init_default_ut_logging(); + + let mut mock_region_server = mock_region_server(); + let region_id_1 = RegionId::new(1, 0); + let region_id_2 = RegionId::new(2, 0); + + let metadata_1 = mock_region_metadata(region_id_1); + let metadatas = vec![Some(metadata_1.clone()), None]; + + let metadata_1 = Arc::new(metadata_1); + let (engine, _) = MockRegionEngine::with_metadata_mock_fn( + MITO_ENGINE_NAME, + Box::new(move |region_id| { + if region_id == region_id_1 { + Ok(metadata_1.clone()) + } else { + error::RegionNotFoundSnafu { region_id }.fail() + } + }), + ); + + mock_region_server.register_engine(engine.clone()); + mock_region_server + .inner + .region_map + .insert(region_id_1, RegionEngineWithStatus::Ready(engine.clone())); + + // Not in region map. + let list_metadata_request = ListMetadataRequest { + region_ids: vec![region_id_1.as_u64(), region_id_2.as_u64()], + }; + let response = mock_region_server + .handle_list_metadata_request(&list_metadata_request) + .await + .unwrap(); + let decoded_metadata: Vec> = + serde_json::from_slice(&response.metadata).unwrap(); + assert_eq!(metadatas, decoded_metadata); + + // Not in region engine. + mock_region_server + .inner + .region_map + .insert(region_id_2, RegionEngineWithStatus::Ready(engine.clone())); + let response = mock_region_server + .handle_list_metadata_request(&list_metadata_request) + .await + .unwrap(); + let decoded_metadata: Vec> = + serde_json::from_slice(&response.metadata).unwrap(); + assert_eq!(metadatas, decoded_metadata); + } + + #[tokio::test] + async fn test_handle_list_metadata_failed() { + common_telemetry::init_default_ut_logging(); + + let mut mock_region_server = mock_region_server(); + let region_id_1 = RegionId::new(1, 0); + + let (engine, _) = MockRegionEngine::with_metadata_mock_fn( + MITO_ENGINE_NAME, + Box::new(move |region_id| { + error::UnexpectedSnafu { + violated: format!("Failed to get region {region_id}"), + } + .fail() + }), + ); + + mock_region_server.register_engine(engine.clone()); + mock_region_server + .inner + .region_map + .insert(region_id_1, RegionEngineWithStatus::Ready(engine.clone())); + + // Failed to get. 
+ let list_metadata_request = ListMetadataRequest { + region_ids: vec![region_id_1.as_u64()], + }; + mock_region_server + .handle_list_metadata_request(&list_metadata_request) + .await + .unwrap_err(); + } } diff --git a/src/datanode/src/tests.rs b/src/datanode/src/tests.rs index 4c0b95c2ef..b5c59a35a5 100644 --- a/src/datanode/src/tests.rs +++ b/src/datanode/src/tests.rs @@ -108,11 +108,15 @@ pub type MockRequestHandler = pub type MockSetReadonlyGracefullyHandler = Box Result + Send + Sync>; +pub type MockGetMetadataHandler = + Box Result + Send + Sync>; + pub struct MockRegionEngine { sender: Sender<(RegionId, RegionRequest)>, pub(crate) handle_request_delay: Option, pub(crate) handle_request_mock_fn: Option, pub(crate) handle_set_readonly_gracefully_mock_fn: Option, + pub(crate) handle_get_metadata_mock_fn: Option, pub(crate) mock_role: Option>, engine: String, } @@ -127,6 +131,7 @@ impl MockRegionEngine { sender: tx, handle_request_mock_fn: None, handle_set_readonly_gracefully_mock_fn: None, + handle_get_metadata_mock_fn: None, mock_role: None, engine: engine.to_string(), }), @@ -146,6 +151,27 @@ impl MockRegionEngine { sender: tx, handle_request_mock_fn: Some(mock_fn), handle_set_readonly_gracefully_mock_fn: None, + handle_get_metadata_mock_fn: None, + mock_role: None, + engine: engine.to_string(), + }), + rx, + ) + } + + pub fn with_metadata_mock_fn( + engine: &str, + mock_fn: MockGetMetadataHandler, + ) -> (Arc, Receiver<(RegionId, RegionRequest)>) { + let (tx, rx) = tokio::sync::mpsc::channel(8); + + ( + Arc::new(Self { + handle_request_delay: None, + sender: tx, + handle_request_mock_fn: None, + handle_set_readonly_gracefully_mock_fn: None, + handle_get_metadata_mock_fn: Some(mock_fn), mock_role: None, engine: engine.to_string(), }), @@ -166,6 +192,7 @@ impl MockRegionEngine { sender: tx, handle_request_mock_fn: None, handle_set_readonly_gracefully_mock_fn: None, + handle_get_metadata_mock_fn: None, mock_role: None, engine: engine.to_string(), }; @@ -208,7 +235,11 @@ impl RegionEngine for MockRegionEngine { unimplemented!() } - async fn get_metadata(&self, _region_id: RegionId) -> Result { + async fn get_metadata(&self, region_id: RegionId) -> Result { + if let Some(mock_fn) = &self.handle_get_metadata_mock_fn { + return mock_fn(region_id).map_err(BoxedError::new); + }; + unimplemented!() } diff --git a/src/datatypes/src/schema/constraint.rs b/src/datatypes/src/schema/constraint.rs index c6551687a4..e7bd5d2063 100644 --- a/src/datatypes/src/schema/constraint.rs +++ b/src/datatypes/src/schema/constraint.rs @@ -57,6 +57,15 @@ impl TryFrom for Vec { } } +impl TryFrom<&ColumnDefaultConstraint> for Vec { + type Error = error::Error; + + fn try_from(value: &ColumnDefaultConstraint) -> std::result::Result { + let s = serde_json::to_string(value).context(error::SerializeSnafu)?; + Ok(s.into_bytes()) + } +} + impl Display for ColumnDefaultConstraint { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { diff --git a/src/flow/src/server.rs b/src/flow/src/server.rs index d065253d9b..6af119381c 100644 --- a/src/flow/src/server.rs +++ b/src/flow/src/server.rs @@ -24,11 +24,11 @@ use catalog::CatalogManagerRef; use common_base::Plugins; use common_error::ext::BoxedError; use common_meta::cache::{LayeredCacheRegistryRef, TableFlownodeSetCacheRef, TableRouteCacheRef}; -use common_meta::ddl::ProcedureExecutorRef; use common_meta::key::flow::FlowMetadataManagerRef; use common_meta::key::TableMetadataManagerRef; use common_meta::kv_backend::KvBackendRef; use 
common_meta::node_manager::{Flownode, NodeManagerRef}; +use common_meta::procedure_executor::ProcedureExecutorRef; use common_query::Output; use common_runtime::JoinHandle; use common_telemetry::tracing::info; diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index 038e203014..23294e75f2 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -39,6 +39,7 @@ use common_base::cancellation::CancellableFuture; use common_base::Plugins; use common_config::KvBackendConfig; use common_error::ext::{BoxedError, ErrorExt}; +use common_meta::key::runtime_switch::RuntimeSwitchManager; use common_meta::key::TableMetadataManagerRef; use common_meta::kv_backend::KvBackendRef; use common_meta::state_store::KvStateStore; @@ -132,10 +133,12 @@ impl Instance { max_running_procedures: procedure_config.max_running_procedures, ..Default::default() }; + let runtime_switch_manager = Arc::new(RuntimeSwitchManager::new(kv_backend.clone())); let procedure_manager = Arc::new(LocalManager::new( manager_config, kv_state_store.clone(), kv_state_store, + Some(runtime_switch_manager), )); Ok((kv_backend, procedure_manager)) diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs index e9c132da42..113a8ad879 100644 --- a/src/frontend/src/instance/builder.rs +++ b/src/frontend/src/instance/builder.rs @@ -20,11 +20,11 @@ use catalog::CatalogManagerRef; use common_base::Plugins; use common_meta::cache::{LayeredCacheRegistryRef, TableRouteCacheRef}; use common_meta::cache_invalidator::{CacheInvalidatorRef, DummyCacheInvalidator}; -use common_meta::ddl::ProcedureExecutorRef; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::TableMetadataManager; use common_meta::kv_backend::KvBackendRef; use common_meta::node_manager::NodeManagerRef; +use common_meta::procedure_executor::ProcedureExecutorRef; use operator::delete::Deleter; use operator::flow::FlowServiceOperator; use operator::insert::Inserter; @@ -157,7 +157,8 @@ impl FrontendBuilder { self.catalog_manager.clone(), )); - let flow_metadata_manager = Arc::new(FlowMetadataManager::new(kv_backend.clone())); + let flow_metadata_manager: Arc = + Arc::new(FlowMetadataManager::new(kv_backend.clone())); let flow_service = FlowServiceOperator::new(flow_metadata_manager, node_manager.clone()); let query_engine = QueryEngineFactory::new_with_plugins( diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs index 1e4f49077b..ebf2a6a167 100644 --- a/src/meta-client/src/client.rs +++ b/src/meta-client/src/client.rs @@ -24,7 +24,7 @@ mod util; use std::fmt::Debug; use std::sync::Arc; -use api::v1::meta::{ProcedureDetailResponse, Role}; +use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse, Role}; pub use ask_leader::{AskLeader, LeaderProvider, LeaderProviderRef}; use cluster::Client as ClusterClient; pub use cluster::ClusterKvBackend; @@ -34,12 +34,12 @@ use common_meta::cluster::{ ClusterInfo, MetasrvStatus, NodeInfo, NodeInfoKey, NodeStatus, Role as ClusterRole, }; use common_meta::datanode::{DatanodeStatKey, DatanodeStatValue, RegionStat}; -use common_meta::ddl::{ExecutorContext, ProcedureExecutor}; use common_meta::error::{ self as meta_error, ExternalSnafu, Result as MetaResult, UnsupportedSnafu, }; use common_meta::key::flow::flow_state::{FlowStat, FlowStateManager}; use common_meta::kv_backend::KvBackendRef; +use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutor}; use common_meta::range_stream::PaginationStream; use 
common_meta::rpc::ddl::{SubmitDdlTaskRequest, SubmitDdlTaskResponse}; use common_meta::rpc::procedure::{ @@ -275,6 +275,17 @@ impl ProcedureExecutor for MetaClient { .context(meta_error::ExternalSnafu) } + async fn reconcile( + &self, + _ctx: &ExecutorContext, + request: ReconcileRequest, + ) -> MetaResult { + self.reconcile(request) + .await + .map_err(BoxedError::new) + .context(meta_error::ExternalSnafu) + } + async fn add_region_follower( &self, _ctx: &ExecutorContext, @@ -611,6 +622,11 @@ impl MetaClient { .await } + /// Reconcile the procedure state. + pub async fn reconcile(&self, request: ReconcileRequest) -> Result { + self.procedure_client()?.reconcile(request).await + } + /// Submit a DDL task pub async fn submit_ddl_task( &self, diff --git a/src/meta-client/src/client/procedure.rs b/src/meta-client/src/client/procedure.rs index f5408c0216..f63abe1b42 100644 --- a/src/meta-client/src/client/procedure.rs +++ b/src/meta-client/src/client/procedure.rs @@ -20,7 +20,7 @@ use api::v1::meta::procedure_service_client::ProcedureServiceClient; use api::v1::meta::{ DdlTaskRequest, DdlTaskResponse, MigrateRegionRequest, MigrateRegionResponse, ProcedureDetailRequest, ProcedureDetailResponse, ProcedureId, ProcedureStateResponse, - QueryProcedureRequest, ResponseHeader, Role, + QueryProcedureRequest, ReconcileRequest, ReconcileResponse, ResponseHeader, Role, }; use common_grpc::channel_manager::ChannelManager; use common_telemetry::tracing_context::TracingContext; @@ -98,6 +98,12 @@ impl Client { .await } + /// Reconcile the procedure state. + pub async fn reconcile(&self, request: ReconcileRequest) -> Result { + let inner = self.inner.read().await; + inner.reconcile(request).await + } + pub async fn list_procedures(&self) -> Result { let inner = self.inner.read().await; inner.list_procedures().await @@ -253,6 +259,26 @@ impl Inner { .await } + async fn reconcile(&self, request: ReconcileRequest) -> Result { + let mut req = request; + req.set_header( + self.id, + self.role, + TracingContext::from_current_span().to_w3c(), + ); + + self.with_retry( + "reconcile", + move |mut client| { + let req = req.clone(); + + async move { client.reconcile(req).await.map(|res| res.into_inner()) } + }, + |resp: &ReconcileResponse| &resp.header, + ) + .await + } + async fn query_procedure_state(&self, pid: &str) -> Result { let mut req = QueryProcedureRequest { pid: Some(ProcedureId { key: pid.into() }), diff --git a/src/meta-srv/Cargo.toml b/src/meta-srv/Cargo.toml index c2b042059e..955b47f32b 100644 --- a/src/meta-srv/Cargo.toml +++ b/src/meta-srv/Cargo.toml @@ -20,6 +20,9 @@ local-ip-address.workspace = true [dependencies] api.workspace = true async-trait.workspace = true +axum.workspace = true +axum-extra.workspace = true +axum-macros.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true @@ -76,6 +79,7 @@ tokio-stream = { workspace = true, features = ["net"] } toml.workspace = true tonic.workspace = true tower.workspace = true +tower-http.workspace = true typetag.workspace = true url = "2.3" uuid.workspace = true @@ -86,6 +90,7 @@ client = { workspace = true, features = ["testing"] } common-meta = { workspace = true, features = ["testing"] } common-procedure-test.workspace = true common-wal = { workspace = true, features = ["testing"] } +hyper = "0.14" session.workspace = true tracing = "0.1" tracing-subscriber.workspace = true diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index c9affc41ad..0a042cd484 100644 --- a/src/meta-srv/src/bootstrap.rs +++ 
b/src/meta-srv/src/bootstrap.rs @@ -71,6 +71,7 @@ use crate::selector::round_robin::RoundRobinSelector; use crate::selector::weight_compute::RegionNumsBasedWeightCompute; use crate::selector::SelectorType; use crate::service::admin; +use crate::service::admin::admin_axum_router; use crate::{error, Result}; pub struct MetasrvInstance { @@ -94,17 +95,20 @@ pub struct MetasrvInstance { } impl MetasrvInstance { - pub async fn new( - opts: MetasrvOptions, - plugins: Plugins, - metasrv: Metasrv, - ) -> Result { + pub async fn new(metasrv: Metasrv) -> Result { + let opts = metasrv.options().clone(); + let plugins = metasrv.plugins().clone(); + let metasrv = Arc::new(metasrv); + + // Wire up the admin_axum_router as an extra router + let extra_routers = admin_axum_router(metasrv.clone()); + let http_server = HttpServerBuilder::new(opts.http.clone()) .with_metrics_handler(MetricsHandler) .with_greptime_config_options(opts.to_toml().context(error::TomlFormatSnafu)?) + .with_extra_router(extra_routers) .build(); - let metasrv = Arc::new(metasrv); // put metasrv into plugins for later use plugins.insert::>(metasrv.clone()); let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) @@ -132,6 +136,7 @@ impl MetasrvInstance { self.signal_sender = Some(tx); + // Start gRPC server with admin services for backward compatibility let mut router = router(self.metasrv.clone()); if let Some(configurator) = self.metasrv.plugins().get::() { router = configurator.config_grpc(router); diff --git a/src/meta-srv/src/election.rs b/src/meta-srv/src/election.rs index 8163e2b9ad..f8d4a1e2bd 100644 --- a/src/meta-srv/src/election.rs +++ b/src/meta-srv/src/election.rs @@ -27,9 +27,6 @@ use tokio::sync::broadcast::{self, Receiver, Sender}; use crate::error::Result; use crate::metasrv::MetasrvNodeInfo; -pub const ELECTION_KEY: &str = "__metasrv_election"; -pub const CANDIDATES_ROOT: &str = "__metasrv_election_candidates/"; - pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600; const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2; diff --git a/src/meta-srv/src/election/etcd.rs b/src/meta-srv/src/election/etcd.rs index bf4bfa049d..936f9548ac 100644 --- a/src/meta-srv/src/election/etcd.rs +++ b/src/meta-srv/src/election/etcd.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use std::time::Duration; use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS}; +use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, info, warn}; use etcd_client::{ Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions, @@ -28,7 +29,7 @@ use tokio::time::{timeout, MissedTickBehavior}; use crate::election::{ listen_leader_change, send_leader_change_and_set_flags, Election, LeaderChangeMessage, - LeaderKey, CANDIDATES_ROOT, CANDIDATE_LEASE_SECS, ELECTION_KEY, KEEP_ALIVE_INTERVAL_SECS, + LeaderKey, CANDIDATE_LEASE_SECS, KEEP_ALIVE_INTERVAL_SECS, }; use crate::error; use crate::error::Result; diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/meta-srv/src/election/rds/mysql.rs index 27b348a83a..e36e1bfacd 100644 --- a/src/meta-srv/src/election/rds/mysql.rs +++ b/src/meta-srv/src/election/rds/mysql.rs @@ -16,6 +16,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; +use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, warn}; use common_time::Timestamp; use snafu::{ensure, OptionExt, ResultExt}; @@ -29,7 +30,6 @@ use 
tokio::time::MissedTickBehavior; use crate::election::rds::{parse_value_and_expire_time, Lease, RdsLeaderKey, LEASE_SEP}; use crate::election::{ listen_leader_change, send_leader_change_and_set_flags, Election, LeaderChangeMessage, - CANDIDATES_ROOT, ELECTION_KEY, }; use crate::error::{ AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu, diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/meta-srv/src/election/rds/postgres.rs index b8c4ff718e..7caa3a249b 100644 --- a/src/meta-srv/src/election/rds/postgres.rs +++ b/src/meta-srv/src/election/rds/postgres.rs @@ -16,6 +16,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; +use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY}; use common_telemetry::{error, warn}; use common_time::Timestamp; use deadpool_postgres::{Manager, Pool}; @@ -28,7 +29,6 @@ use tokio_postgres::Row; use crate::election::rds::{parse_value_and_expire_time, Lease, RdsLeaderKey, LEASE_SEP}; use crate::election::{ listen_leader_change, send_leader_change_and_set_flags, Election, LeaderChangeMessage, - CANDIDATES_ROOT, ELECTION_KEY, }; use crate::error::{ DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu, diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index b1ea799e5f..70d0c107c9 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -107,6 +107,13 @@ pub enum Error { source: common_meta::error::Error, }, + #[snafu(display("Failed to init reconciliation manager"))] + InitReconciliationManager { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + #[snafu(display("Failed to create default catalog and schema"))] InitMetadata { #[snafu(implicit)] @@ -121,6 +128,20 @@ pub enum Error { source: common_meta::error::Error, }, + #[snafu(display("Failed to set next sequence number"))] + SetNextSequence { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + + #[snafu(display("Failed to peek sequence number"))] + PeekSequence { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + #[snafu(display("Failed to start telemetry task"))] StartTelemetryTask { #[snafu(implicit)] @@ -135,6 +156,13 @@ pub enum Error { source: common_meta::error::Error, }, + #[snafu(display("Failed to submit reconcile procedure"))] + SubmitReconcileProcedure { + #[snafu(implicit)] + location: Location, + source: common_meta::error::Error, + }, + #[snafu(display("Failed to invalidate table cache"))] InvalidateTableCache { #[snafu(implicit)] @@ -695,8 +723,8 @@ pub enum Error { location: Location, }, - #[snafu(display("Maintenance mode manager error"))] - MaintenanceModeManager { + #[snafu(display("Runtime switch manager error"))] + RuntimeSwitchManager { source: common_meta::error::Error, #[snafu(implicit)] location: Location, @@ -1017,20 +1045,23 @@ impl ErrorExt for Error { | Error::ListTables { source, .. } => source.status_code(), Error::StartTelemetryTask { source, .. } => source.status_code(), - Error::NextSequence { source, .. } => source.status_code(), + Error::NextSequence { source, .. } + | Error::SetNextSequence { source, .. } + | Error::PeekSequence { source, .. } => source.status_code(), Error::DowngradeLeader { source, .. } => source.status_code(), Error::RegisterProcedureLoader { source, .. } => source.status_code(), - Error::SubmitDdlTask { source, .. } => source.status_code(), + Error::SubmitDdlTask { source, .. 
} + | Error::SubmitReconcileProcedure { source, .. } => source.status_code(), Error::ConvertProtoData { source, .. } | Error::TableMetadataManager { source, .. } - | Error::MaintenanceModeManager { source, .. } + | Error::RuntimeSwitchManager { source, .. } | Error::KvBackend { source, .. } | Error::UnexpectedLogicalRouteTable { source, .. } | Error::UpdateTopicNameValue { source, .. } => source.status_code(), - Error::InitMetadata { source, .. } | Error::InitDdlManager { source, .. } => { - source.status_code() - } + Error::InitMetadata { source, .. } + | Error::InitDdlManager { source, .. } + | Error::InitReconciliationManager { source, .. } => source.status_code(), Error::Other { source, .. } => source.status_code(), Error::LookupPeer { source, .. } => source.status_code(), diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 50797d44c4..b37fdcf082 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -25,9 +25,9 @@ use common_base::Plugins; use common_config::{Configurable, DEFAULT_DATA_HOME}; use common_greptimedb_telemetry::GreptimeDBTelemetryTask; use common_meta::cache_invalidator::CacheInvalidatorRef; -use common_meta::ddl::ProcedureExecutorRef; +use common_meta::ddl_manager::DdlManagerRef; use common_meta::distributed_time_constants; -use common_meta::key::maintenance::MaintenanceModeManagerRef; +use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; use common_meta::key::TableMetadataManagerRef; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef}; use common_meta::leadership_notifier::{ @@ -35,8 +35,10 @@ use common_meta::leadership_notifier::{ }; use common_meta::node_expiry_listener::NodeExpiryListener; use common_meta::peer::Peer; +use common_meta::reconciliation::manager::ReconciliationManagerRef; use common_meta::region_keeper::MemoryRegionKeeperRef; use common_meta::region_registry::LeaderRegionRegistryRef; +use common_meta::sequence::SequenceRef; use common_meta::wal_options_allocator::WalOptionsAllocatorRef; use common_options::datanode::DatanodeClientOptions; use common_procedure::options::ProcedureConfig; @@ -425,10 +427,10 @@ pub struct Metasrv { election: Option, procedure_manager: ProcedureManagerRef, mailbox: MailboxRef, - procedure_executor: ProcedureExecutorRef, + ddl_manager: DdlManagerRef, wal_options_allocator: WalOptionsAllocatorRef, table_metadata_manager: TableMetadataManagerRef, - maintenance_mode_manager: MaintenanceModeManagerRef, + runtime_switch_manager: RuntimeSwitchManagerRef, memory_region_keeper: MemoryRegionKeeperRef, greptimedb_telemetry_task: Arc, region_migration_manager: RegionMigrationManagerRef, @@ -436,6 +438,8 @@ pub struct Metasrv { cache_invalidator: CacheInvalidatorRef, leader_region_registry: LeaderRegionRegistryRef, wal_prune_ticker: Option, + table_id_sequence: SequenceRef, + reconciliation_manager: ReconciliationManagerRef, plugins: Plugins, } @@ -675,8 +679,8 @@ impl Metasrv { &self.mailbox } - pub fn procedure_executor(&self) -> &ProcedureExecutorRef { - &self.procedure_executor + pub fn ddl_manager(&self) -> &DdlManagerRef { + &self.ddl_manager } pub fn procedure_manager(&self) -> &ProcedureManagerRef { @@ -687,8 +691,8 @@ impl Metasrv { &self.table_metadata_manager } - pub fn maintenance_mode_manager(&self) -> &MaintenanceModeManagerRef { - &self.maintenance_mode_manager + pub fn runtime_switch_manager(&self) -> &RuntimeSwitchManagerRef { + &self.runtime_switch_manager } pub fn memory_region_keeper(&self) -> &MemoryRegionKeeperRef { @@ 
-707,6 +711,14 @@ impl Metasrv { self.plugins.get::() } + pub fn table_id_sequence(&self) -> &SequenceRef { + &self.table_id_sequence + } + + pub fn reconciliation_manager(&self) -> &ReconciliationManagerRef { + &self.reconciliation_manager + } + pub fn plugins(&self) -> &Plugins { &self.plugins } diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs index c9c1b8dbdf..b001de2c7e 100644 --- a/src/meta-srv/src/metasrv/builder.rs +++ b/src/meta-srv/src/metasrv/builder.rs @@ -29,11 +29,12 @@ use common_meta::ddl_manager::DdlManager; use common_meta::distributed_time_constants; use common_meta::key::flow::flow_state::FlowStateManager; use common_meta::key::flow::FlowMetadataManager; -use common_meta::key::maintenance::MaintenanceModeManager; +use common_meta::key::runtime_switch::{RuntimeSwitchManager, RuntimeSwitchManagerRef}; use common_meta::key::TableMetadataManager; use common_meta::kv_backend::memory::MemoryKvBackend; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; use common_meta::node_manager::NodeManagerRef; +use common_meta::reconciliation::manager::ReconciliationManager; use common_meta::region_keeper::MemoryRegionKeeper; use common_meta::region_registry::LeaderRegionRegistry; use common_meta::sequence::SequenceBuilder; @@ -193,7 +194,9 @@ impl MetasrvBuilder { let selector = selector.unwrap_or_else(|| Arc::new(LeaseBasedSelector::default())); let pushers = Pushers::default(); let mailbox = build_mailbox(&kv_backend, &pushers); - let procedure_manager = build_procedure_manager(&options, &kv_backend); + let runtime_switch_manager = Arc::new(RuntimeSwitchManager::new(kv_backend.clone())); + let procedure_manager = + build_procedure_manager(&options, &kv_backend, &runtime_switch_manager); let table_metadata_manager = Arc::new(TableMetadataManager::new( leader_cached_kv_backend.clone() as _, @@ -201,7 +204,7 @@ impl MetasrvBuilder { let flow_metadata_manager = Arc::new(FlowMetadataManager::new( leader_cached_kv_backend.clone() as _, )); - let maintenance_mode_manager = Arc::new(MaintenanceModeManager::new(kv_backend.clone())); + let selector_ctx = SelectorContext { server_addr: options.grpc.server_addr.clone(), datanode_lease_secs: distributed_time_constants::DATANODE_LEASE_SECS, @@ -233,6 +236,7 @@ impl MetasrvBuilder { peer_allocator, )) }); + let table_id_sequence = table_metadata_allocator.table_id_sequence(); let flow_selector = Arc::new(RoundRobinSelector::new( SelectTarget::Flownode, @@ -339,7 +343,7 @@ impl MetasrvBuilder { selector_ctx.clone(), supervisor_selector, region_migration_manager.clone(), - maintenance_mode_manager.clone(), + runtime_switch_manager.clone(), peer_lookup_service.clone(), ); @@ -354,7 +358,7 @@ impl MetasrvBuilder { let leader_region_registry = Arc::new(LeaderRegionRegistry::default()); let ddl_context = DdlContext { - node_manager, + node_manager: node_manager.clone(), cache_invalidator: cache_invalidator.clone(), memory_region_keeper: memory_region_keeper.clone(), leader_region_registry: leader_region_registry.clone(), @@ -440,6 +444,16 @@ impl MetasrvBuilder { .to_string_lossy() .to_string(); + let reconciliation_manager = Arc::new(ReconciliationManager::new( + node_manager.clone(), + table_metadata_manager.clone(), + cache_invalidator.clone(), + procedure_manager.clone(), + )); + reconciliation_manager + .try_start() + .context(error::InitReconciliationManagerSnafu)?; + Ok(Metasrv { state, started: Arc::new(AtomicBool::new(false)), @@ -458,10 +472,10 @@ impl MetasrvBuilder { election, 
procedure_manager, mailbox, - procedure_executor: ddl_manager, + ddl_manager, wal_options_allocator, table_metadata_manager, - maintenance_mode_manager, + runtime_switch_manager, greptimedb_telemetry_task: get_greptimedb_telemetry_task( Some(metasrv_home), meta_peer_client, @@ -475,6 +489,8 @@ impl MetasrvBuilder { cache_invalidator, leader_region_registry, wal_prune_ticker, + table_id_sequence, + reconciliation_manager, }) } } @@ -504,6 +520,7 @@ fn build_mailbox(kv_backend: &KvBackendRef, pushers: &Pushers) -> MailboxRef { fn build_procedure_manager( options: &MetasrvOptions, kv_backend: &KvBackendRef, + runtime_switch_manager: &RuntimeSwitchManagerRef, ) -> ProcedureManagerRef { let manager_config = ManagerConfig { max_retry_times: options.procedure.max_retry_times, @@ -524,6 +541,7 @@ fn build_procedure_manager( manager_config, kv_state_store.clone(), kv_state_store, + Some(runtime_switch_manager.clone()), )) } diff --git a/src/meta-srv/src/procedure/region_migration/test_util.rs b/src/meta-srv/src/procedure/region_migration/test_util.rs index ae5b6736e4..a60a60743c 100644 --- a/src/meta-srv/src/procedure/region_migration/test_util.rs +++ b/src/meta-srv/src/procedure/region_migration/test_util.rs @@ -92,6 +92,7 @@ impl TestingEnv { ManagerConfig::default(), state_store, poison_manager, + None, )); Self { diff --git a/src/meta-srv/src/procedure/utils.rs b/src/meta-srv/src/procedure/utils.rs index f2420522ae..fd0eca50c2 100644 --- a/src/meta-srv/src/procedure/utils.rs +++ b/src/meta-srv/src/procedure/utils.rs @@ -103,6 +103,7 @@ pub mod mock { }), affected_rows: 0, extensions: Default::default(), + metadata: Vec::new(), }) } } @@ -195,6 +196,7 @@ pub mod test_data { options: TableOptions::default(), created_on: DateTime::default(), partition_key_indices: vec![], + column_ids: vec![], }, table_type: TableType::Base, } diff --git a/src/meta-srv/src/procedure/wal_prune/test_util.rs b/src/meta-srv/src/procedure/wal_prune/test_util.rs index baa9129d3c..53436d7bef 100644 --- a/src/meta-srv/src/procedure/wal_prune/test_util.rs +++ b/src/meta-srv/src/procedure/wal_prune/test_util.rs @@ -52,6 +52,7 @@ impl TestEnv { ManagerConfig::default(), state_store, poison_manager, + None, )); let mailbox_ctx = MailboxContext::new(mailbox_sequence); diff --git a/src/meta-srv/src/region/supervisor.rs b/src/meta-srv/src/region/supervisor.rs index edbe29d85c..be904dc97f 100644 --- a/src/meta-srv/src/region/supervisor.rs +++ b/src/meta-srv/src/region/supervisor.rs @@ -20,7 +20,7 @@ use std::time::Duration; use async_trait::async_trait; use common_meta::datanode::Stat; use common_meta::ddl::{DetectingRegion, RegionFailureDetectorController}; -use common_meta::key::maintenance::MaintenanceModeManagerRef; +use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; use common_meta::leadership_notifier::LeadershipChangeListener; use common_meta::peer::{Peer, PeerLookupServiceRef}; use common_meta::DatanodeId; @@ -225,7 +225,7 @@ pub struct RegionSupervisor { /// Region migration manager. region_migration_manager: RegionMigrationManagerRef, /// The maintenance mode manager. 
- maintenance_mode_manager: MaintenanceModeManagerRef, + runtime_switch_manager: RuntimeSwitchManagerRef, /// Peer lookup service peer_lookup: PeerLookupServiceRef, } @@ -296,7 +296,7 @@ impl RegionSupervisor { selector_context: SelectorContext, selector: RegionSupervisorSelector, region_migration_manager: RegionMigrationManagerRef, - maintenance_mode_manager: MaintenanceModeManagerRef, + runtime_switch_manager: RuntimeSwitchManagerRef, peer_lookup: PeerLookupServiceRef, ) -> Self { Self { @@ -306,7 +306,7 @@ impl RegionSupervisor { selector_context, selector, region_migration_manager, - maintenance_mode_manager, + runtime_switch_manager, peer_lookup, } } @@ -426,10 +426,10 @@ impl RegionSupervisor { } pub(crate) async fn is_maintenance_mode_enabled(&self) -> Result { - self.maintenance_mode_manager + self.runtime_switch_manager .maintenance_mode() .await - .context(error::MaintenanceModeManagerSnafu) + .context(error::RuntimeSwitchManagerSnafu) } async fn select_peers( @@ -622,7 +622,7 @@ pub(crate) mod tests { use std::time::Duration; use common_meta::ddl::RegionFailureDetectorController; - use common_meta::key::maintenance; + use common_meta::key::runtime_switch; use common_meta::peer::Peer; use common_meta::test_util::NoopPeerLookupService; use common_time::util::current_time_millis; @@ -650,8 +650,8 @@ pub(crate) mod tests { env.procedure_manager().clone(), context_factory, )); - let maintenance_mode_manager = - Arc::new(maintenance::MaintenanceModeManager::new(env.kv_backend())); + let runtime_switch_manager = + Arc::new(runtime_switch::RuntimeSwitchManager::new(env.kv_backend())); let peer_lookup = Arc::new(NoopPeerLookupService); let (tx, rx) = RegionSupervisor::channel(); @@ -662,7 +662,7 @@ pub(crate) mod tests { selector_context, RegionSupervisorSelector::NaiveSelector(selector), region_migration_manager, - maintenance_mode_manager, + runtime_switch_manager, peer_lookup, ), tx, diff --git a/src/meta-srv/src/service.rs b/src/meta-srv/src/service.rs index e260b8b980..c2eab57ce3 100644 --- a/src/meta-srv/src/service.rs +++ b/src/meta-srv/src/service.rs @@ -23,6 +23,7 @@ mod heartbeat; pub mod mailbox; pub mod procedure; pub mod store; +pub(crate) mod utils; pub type GrpcResult = Result, Status>; pub type GrpcStream = Pin> + Send + Sync + 'static>>; diff --git a/src/meta-srv/src/service/admin.rs b/src/meta-srv/src/service/admin.rs index c2b7b59794..de588904e0 100644 --- a/src/meta-srv/src/service/admin.rs +++ b/src/meta-srv/src/service/admin.rs @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod health; -mod heartbeat; -mod leader; -mod maintenance; -mod node_lease; +pub(crate) mod health; +pub(crate) mod heartbeat; +pub(crate) mod leader; +pub(crate) mod maintenance; +pub(crate) mod node_lease; +pub(crate) mod procedure; +pub(crate) mod recovery; +pub(crate) mod sequencer; mod util; use std::collections::HashMap; @@ -24,6 +27,9 @@ use std::convert::Infallible; use std::sync::Arc; use std::task::{Context, Poll}; +use axum::http::StatusCode; +use axum::response::IntoResponse; +use axum::{routing, Router as AxumRouter}; use bytes::Bytes; use http_body_util::{BodyExt, Full}; use tonic::body::BoxBody; @@ -31,6 +37,17 @@ use tonic::codegen::{empty_body, http, BoxFuture, Service}; use tonic::server::NamedService; use crate::metasrv::Metasrv; +use crate::service::admin::health::HealthHandler; +use crate::service::admin::heartbeat::HeartBeatHandler; +use crate::service::admin::leader::LeaderHandler; +use crate::service::admin::maintenance::MaintenanceHandler; +use crate::service::admin::node_lease::NodeLeaseHandler; +use crate::service::admin::procedure::ProcedureManagerHandler; +use crate::service::admin::recovery::{ + get_recovery_mode, set_recovery_mode, unset_recovery_mode, RecoveryHandler, +}; +use crate::service::admin::sequencer::TableIdSequenceHandler; +use crate::service::admin::util::{to_axum_json_response, to_axum_not_found_response}; pub fn make_admin_service(metasrv: Arc) -> Admin { let router = Router::new().route("/health", health::HealthHandler); @@ -56,10 +73,25 @@ pub fn make_admin_service(metasrv: Arc) -> Admin { }, ); - let router = router.route( - "/maintenance", + let router = router.routes( + &[ + "/maintenance", + "/maintenance/status", + "/maintenance/enable", + "/maintenance/disable", + ], maintenance::MaintenanceHandler { - manager: metasrv.maintenance_mode_manager().clone(), + manager: metasrv.runtime_switch_manager().clone(), + }, + ); + let router = router.routes( + &[ + "/procedure-manager/pause", + "/procedure-manager/resume", + "/procedure-manager/status", + ], + procedure::ProcedureManagerHandler { + manager: metasrv.runtime_switch_manager().clone(), }, ); let router = Router::nest("/admin", router); @@ -97,10 +129,7 @@ impl NamedService for Admin { const NAME: &'static str = "admin"; } -impl Service> for Admin -where - T: Send, -{ +impl Service> for Admin { type Response = http::Response; type Error = Infallible; type Future = BoxFuture; @@ -109,7 +138,7 @@ where Poll::Ready(Ok(())) } - fn call(&mut self, req: http::Request) -> Self::Future { + fn call(&mut self, req: http::Request) -> Self::Future { let router = self.router.clone(); let query_params = req .uri() @@ -128,7 +157,7 @@ where #[derive(Default)] pub struct Router { - handlers: HashMap>, + handlers: HashMap>, } impl Router { @@ -153,7 +182,17 @@ impl Router { pub fn route(mut self, path: &str, handler: impl HttpHandler + 'static) -> Self { check_path(path); - let _ = self.handlers.insert(path.to_owned(), Box::new(handler)); + let _ = self.handlers.insert(path.to_owned(), Arc::new(handler)); + + self + } + + pub fn routes(mut self, paths: &[&str], handler: impl HttpHandler + 'static) -> Self { + let handler = Arc::new(handler); + for path in paths { + check_path(path); + let _ = self.handlers.insert(path.to_string(), handler.clone()); + } self } @@ -200,11 +239,268 @@ fn boxed(body: String) -> BoxBody { .boxed_unsync() } +/// Expose admin HTTP endpoints as an Axum router for the main HTTP server. 
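+///
+/// Everything is nested under `/admin`: `health`, `node-lease`, `leader`,
+/// `heartbeat` (plus `/help`), `maintenance` (plus `/status`, `/enable`, `/disable`),
+/// `procedure-manager/{status,pause,resume}`, `recovery/{status,enable,disable}`,
+/// and `sequence/table/{next-id,set-next-id}`.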
+pub fn admin_axum_router(metasrv: Arc) -> AxumRouter { + let node_lease_handler = Arc::new(NodeLeaseHandler { + meta_peer_client: metasrv.meta_peer_client().clone(), + }); + let heartbeat_handler = Arc::new(HeartBeatHandler { + meta_peer_client: metasrv.meta_peer_client().clone(), + }); + let leader_handler = Arc::new(LeaderHandler { + election: metasrv.election().cloned(), + }); + let maintenance_handler = Arc::new(MaintenanceHandler { + manager: metasrv.runtime_switch_manager().clone(), + }); + let procedure_handler = Arc::new(ProcedureManagerHandler { + manager: metasrv.runtime_switch_manager().clone(), + }); + let recovery_handler = Arc::new(RecoveryHandler { + manager: metasrv.runtime_switch_manager().clone(), + }); + let table_id_sequence_handler = Arc::new(TableIdSequenceHandler { + table_id_sequence: metasrv.table_id_sequence().clone(), + runtime_switch_manager: metasrv.runtime_switch_manager().clone(), + }); + let sequence_router = AxumRouter::new().nest( + "/table", + AxumRouter::new() + .route("/next-id", routing::get(sequencer::get_next_table_id)) + .route("/set-next-id", routing::post(sequencer::set_next_table_id)) + .with_state(table_id_sequence_handler), + ); + + let health_router = AxumRouter::new().route( + "/", + routing::get({ + move || { + let handler = HealthHandler; + async move { + match handler + .handle("/health", http::Method::GET, &Default::default()) + .await + { + Ok(status) => status.body().clone().into_response(), + Err(e) => { + common_telemetry::error!(e; "Health handler failed"); + StatusCode::INTERNAL_SERVER_ERROR.into_response() + } + } + } + } + }), + ); + + let node_lease_router = AxumRouter::new().route( + "/", + routing::get({ + let handler = node_lease_handler.clone(); + move || async move { + match handler + .handle("/node-lease", http::Method::GET, &Default::default()) + .await + { + Ok(resp) => resp.body().clone().into_response(), + Err(e) => { + common_telemetry::error!(e; "Node lease handler failed"); + StatusCode::INTERNAL_SERVER_ERROR.into_response() + } + } + } + }), + ); + + let leader_router = AxumRouter::new().route( + "/", + routing::get({ + let handler = leader_handler.clone(); + move || async move { + match handler + .handle("/leader", http::Method::GET, &Default::default()) + .await + { + Ok(resp) => resp.body().clone().into_response(), + Err(e) => { + common_telemetry::error!(e; "Leader handler failed"); + StatusCode::INTERNAL_SERVER_ERROR.into_response() + } + } + } + }), + ); + + let heartbeat_router = AxumRouter::new() + .route( + "/", + routing::get({ + let handler = heartbeat_handler.clone(); + move || async move { + match handler + .handle("/heartbeat", http::Method::GET, &Default::default()) + .await + { + Ok(resp) => resp.body().clone().into_response(), + Err(e) => { + common_telemetry::error!(e; "Heartbeat handler failed"); + StatusCode::INTERNAL_SERVER_ERROR.into_response() + } + } + } + }), + ) + .route( + "/help", + routing::get({ + let handler = heartbeat_handler.clone(); + move || async move { + match handler + .handle("/heartbeat/help", http::Method::GET, &Default::default()) + .await + { + Ok(resp) => resp.body().clone().into_response(), + Err(e) => { + common_telemetry::error!(e; "Heartbeat help handler failed"); + StatusCode::INTERNAL_SERVER_ERROR.into_response() + } + } + } + }), + ); + + let maintenance_router = AxumRouter::new() + .route( + "/", + routing::get({ + let handler = maintenance_handler.clone(); + move || async move { + match handler.get_maintenance().await { + Ok(resp) => to_axum_json_response(resp), 
+ Err(e) => { + common_telemetry::error!(e; "Maintenance handler failed"); + to_axum_not_found_response() + } + } + } + }), + ) + .route( + "/status", + routing::get({ + let handler = maintenance_handler.clone(); + move || async move { + match handler.get_maintenance().await { + Ok(resp) => to_axum_json_response(resp), + Err(e) => { + common_telemetry::error!(e; "Maintenance status handler failed"); + to_axum_not_found_response() + } + } + } + }), + ) + .route( + "/enable", + routing::post({ + let handler = maintenance_handler.clone(); + move || async move { + match handler.set_maintenance().await { + Ok(resp) => to_axum_json_response(resp), + Err(e) => { + common_telemetry::error!(e; "Maintenance enable handler failed"); + to_axum_not_found_response() + } + } + } + }), + ) + .route( + "/disable", + routing::post({ + let handler = maintenance_handler.clone(); + move || async move { + match handler.unset_maintenance().await { + Ok(resp) => to_axum_json_response(resp), + Err(e) => { + common_telemetry::error!(e; "Maintenance disable handler failed"); + to_axum_not_found_response() + } + } + } + }), + ); + + let procedure_router = AxumRouter::new() + .route( + "/status", + routing::get({ + let handler = procedure_handler.clone(); + move || async move { + match handler.get_procedure_manager_status().await { + Ok(resp) => to_axum_json_response(resp), + Err(e) => { + common_telemetry::error!(e; "Procedure manager status handler failed"); + to_axum_not_found_response() + } + } + } + }), + ) + .route( + "/pause", + routing::post({ + let handler = procedure_handler.clone(); + move || async move { + match handler.pause_procedure_manager().await { + Ok(resp) => to_axum_json_response(resp), + Err(e) => { + common_telemetry::error!(e; "Procedure manager pause handler failed"); + to_axum_not_found_response() + } + } + } + }), + ) + .route( + "/resume", + routing::post({ + let handler = procedure_handler.clone(); + move || async move { + match handler.resume_procedure_manager().await { + Ok(resp) => to_axum_json_response(resp), + Err(e) => { + common_telemetry::error!(e; "Procedure manager resume handler failed"); + to_axum_not_found_response() + } + } + } + }), + ); + + let recovery_router = AxumRouter::new() + .route("/enable", routing::post(set_recovery_mode)) + .route("/disable", routing::post(unset_recovery_mode)) + .route("/status", routing::get(get_recovery_mode)) + .with_state(recovery_handler); + + let admin_router = AxumRouter::new() + .nest("/health", health_router) + .nest("/node-lease", node_lease_router) + .nest("/leader", leader_router) + .nest("/heartbeat", heartbeat_router) + .nest("/maintenance", maintenance_router) + .nest("/procedure-manager", procedure_router) + .nest("/recovery", recovery_router) + .nest("/sequence", sequence_router); + + AxumRouter::new().nest("/admin", admin_router) +} + #[cfg(test)] mod tests { use common_meta::kv_backend::memory::MemoryKvBackend; use common_meta::kv_backend::KvBackendRef; - use tokio::io::{AsyncReadExt, AsyncWriteExt}; + use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream}; use super::*; use crate::metasrv::builder::MetasrvBuilder; @@ -325,6 +621,13 @@ mod tests { metasrv } + async fn send_request(client: &mut DuplexStream, request: &[u8]) -> String { + client.write_all(request).await.unwrap(); + let mut buf = vec![0; 1024]; + let n = client.read(&mut buf).await.unwrap(); + String::from_utf8_lossy(&buf[..n]).to_string() + } + #[tokio::test(flavor = "multi_thread")] async fn test_metasrv_maintenance_mode() { 
common_telemetry::init_default_ut_logging(); @@ -343,73 +646,617 @@ mod tests { }); // Get maintenance mode - let http_request = b"GET /admin/maintenance HTTP/1.1\r\nHost: localhost\r\n\r\n"; - client.write_all(http_request).await.unwrap(); - let mut buf = vec![0; 1024]; - let n = client.read(&mut buf).await.unwrap(); - let response = String::from_utf8_lossy(&buf[..n]); + let response = send_request( + &mut client, + b"GET /admin/maintenance HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; assert!(response.contains(r#"{"enabled":false}"#)); assert!(response.contains("200 OK")); // Set maintenance mode to true - let http_post = b"POST /admin/maintenance?enable=true HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n"; - client.write_all(http_post).await.unwrap(); - let mut buf = vec![0; 1024]; - let n = client.read(&mut buf).await.unwrap(); - let response = String::from_utf8_lossy(&buf[..n]); + let response = send_request( + &mut client, + b"POST /admin/maintenance?enable=true HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; assert!(response.contains(r#"{"enabled":true}"#)); assert!(response.contains("200 OK")); let enabled = metasrv - .maintenance_mode_manager() + .runtime_switch_manager() .maintenance_mode() .await .unwrap(); assert!(enabled); // Get maintenance mode again - let http_request = b"GET /admin/maintenance HTTP/1.1\r\nHost: localhost\r\n\r\n"; - client.write_all(http_request).await.unwrap(); - let mut buf = vec![0; 1024]; - let n = client.read(&mut buf).await.unwrap(); - let response = String::from_utf8_lossy(&buf[..n]); + let response = send_request( + &mut client, + b"GET /admin/maintenance HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; assert!(response.contains(r#"{"enabled":true}"#)); assert!(response.contains("200 OK")); // Set maintenance mode to false - let http_post = b"POST /admin/maintenance?enable=false HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n"; - client.write_all(http_post).await.unwrap(); - let mut buf = vec![0; 1024]; - let n = client.read(&mut buf).await.unwrap(); - let response = String::from_utf8_lossy(&buf[..n]); + let response = send_request( + &mut client, + b"POST /admin/maintenance?enable=false HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; assert!(response.contains(r#"{"enabled":false}"#)); assert!(response.contains("200 OK")); let enabled = metasrv - .maintenance_mode_manager() + .runtime_switch_manager() .maintenance_mode() .await .unwrap(); assert!(!enabled); // Set maintenance mode to true via GET request - let http_request = - b"GET /admin/maintenance?enable=true HTTP/1.1\r\nHost: localhost\r\n\r\n"; - client.write_all(http_request).await.unwrap(); - let mut buf = vec![0; 1024]; - let n = client.read(&mut buf).await.unwrap(); - let response = String::from_utf8_lossy(&buf[..n]); + let response = send_request( + &mut client, + b"GET /admin/maintenance?enable=true HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; assert!(response.contains(r#"{"enabled":true}"#)); assert!(response.contains("200 OK")); // Set maintenance mode to false via GET request - let http_request = - b"PUT /admin/maintenance?enable=false HTTP/1.1\r\nHost: localhost\r\n\r\n"; - client.write_all(http_request).await.unwrap(); - let mut buf = vec![0; 1024]; - let n = client.read(&mut buf).await.unwrap(); - let response = String::from_utf8_lossy(&buf[..n]); + let response = send_request( + &mut client, + b"PUT /admin/maintenance?enable=false HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; 
assert!(response.contains(r#"{"enabled":false}"#)); assert!(response.contains("200 OK")); + + // Get maintenance mode via status path + let response = send_request( + &mut client, + b"GET /admin/maintenance/status HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; + assert!(response.contains(r#"{"enabled":false}"#)); + + // Set maintenance mode via enable path + let response = send_request( + &mut client, + b"POST /admin/maintenance/enable HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; + assert!(response.contains(r#"{"enabled":true}"#)); + + // Unset maintenance mode via disable path + let response = send_request( + &mut client, + b"POST /admin/maintenance/disable HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; + assert!(response.contains(r#"{"enabled":false}"#)); + + // send POST request to status path + let response = send_request( + &mut client, + b"POST /admin/maintenance/status HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; + assert!(response.contains("404 Not Found")); + + // send GET request to enable path + let response = send_request( + &mut client, + b"GET /admin/maintenance/enable HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; + assert!(response.contains("404 Not Found")); + + // send GET request to disable path + let response = send_request( + &mut client, + b"GET /admin/maintenance/disable HTTP/1.1\r\nHost: localhost\r\nContent-Length: 0\r\n\r\n", + ) + .await; + assert!(response.contains("404 Not Found")); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_metasrv_procedure_manager_handler() { + common_telemetry::init_default_ut_logging(); + let kv_backend = Arc::new(MemoryKvBackend::new()); + let metasrv = test_metasrv(kv_backend).await; + metasrv.try_start().await.unwrap(); + + let (mut client, server) = tokio::io::duplex(1024); + let metasrv = Arc::new(metasrv); + let service = metasrv.clone(); + let _handle = tokio::spawn(async move { + let router = bootstrap::router(service); + router + .serve_with_incoming(futures::stream::iter(vec![Ok::<_, std::io::Error>(server)])) + .await + }); + + // send GET request to procedure-manager/status path + let response = send_request( + &mut client, + b"GET /admin/procedure-manager/status HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; + assert!(response.contains("200 OK")); + assert!( + response.contains(r#"{"status":"running"}"#), + "response: {}", + response + ); + + // send POST request to procedure-manager/pause path + let response = send_request( + &mut client, + b"POST /admin/procedure-manager/pause HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; + assert!(response.contains("200 OK")); + assert!(response.contains(r#"{"status":"paused"}"#)); + + // send POST request to procedure-manager/resume path + let response = send_request( + &mut client, + b"POST /admin/procedure-manager/resume HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; + assert!(response.contains("200 OK")); + assert!( + response.contains(r#"{"status":"running"}"#), + "response: {}", + response + ); + + // send GET request to procedure-manager/resume path + let response = send_request( + &mut client, + b"GET /admin/procedure-manager/resume HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; + assert!(response.contains("404 Not Found")); + + // send GET request to procedure-manager/pause path + let response = send_request( + &mut client, + b"GET /admin/procedure-manager/pause HTTP/1.1\r\nHost: localhost\r\n\r\n", + ) + .await; + 
assert!(response.contains("404 Not Found")); + } +} + +#[cfg(test)] +mod axum_admin_tests { + use std::sync::Arc; + + use axum::body::{to_bytes, Body}; + use axum::http::{Method, Request, StatusCode}; + use common_meta::kv_backend::memory::MemoryKvBackend; + use tower::ServiceExt; // for `oneshot` + + use super::*; + use crate::metasrv::builder::MetasrvBuilder; + use crate::metasrv::MetasrvOptions; + use crate::service::admin::sequencer::NextTableIdResponse; + + async fn setup_axum_app() -> AxumRouter { + let kv_backend = Arc::new(MemoryKvBackend::new()); + let metasrv = MetasrvBuilder::new() + .options(MetasrvOptions::default()) + .kv_backend(kv_backend) + .build() + .await + .unwrap(); + let metasrv = Arc::new(metasrv); + admin_axum_router(metasrv) + } + + async fn get_body_string(resp: axum::response::Response) -> String { + let body_bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + String::from_utf8_lossy(&body_bytes).to_string() + } + + async fn into_bytes(resp: axum::response::Response) -> Vec { + let body_bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + body_bytes.to_vec() + } + + #[tokio::test] + async fn test_admin_health() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/health") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.to_lowercase().contains("ok")); + } + + #[tokio::test] + async fn test_admin_node_lease() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/node-lease") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_admin_heartbeat() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/heartbeat") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_admin_heartbeat_help() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/heartbeat/help") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_admin_leader() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/leader") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_admin_maintenance() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/maintenance") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("enabled")); + } + + #[tokio::test] + async fn test_admin_maintenance_status() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/maintenance/status") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("enabled")); + } + + #[tokio::test] + async fn test_admin_maintenance_enable_disable() { + // Enable maintenance + let response = setup_axum_app() + 
.await + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/admin/maintenance/enable") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("enabled")); + // Disable maintenance + let response = setup_axum_app() + .await + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/admin/maintenance/disable") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("enabled")); + } + + #[tokio::test] + async fn test_admin_procedure_manager_status() { + let app = setup_axum_app().await; + let response = app + .oneshot( + Request::builder() + .uri("/admin/procedure-manager/status") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("status")); + } + + #[tokio::test] + async fn test_admin_procedure_manager_pause_resume() { + // Pause + let response = setup_axum_app() + .await + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/admin/procedure-manager/pause") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("paused")); + // Resume + let response = setup_axum_app() + .await + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/admin/procedure-manager/resume") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("running")); + } + + #[tokio::test] + async fn test_admin_recovery() { + let app = setup_axum_app().await; + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/recovery/status") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("false")); + + // Enable recovery + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/recovery/enable") + .method(Method::POST) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("true")); + + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/recovery/status") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("true")); + + // Disable recovery + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/recovery/disable") + .method(Method::POST) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("false")); + + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/admin/recovery/status") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = get_body_string(response).await; + assert!(body.contains("false")); + } + + 
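A minimal usage sketch, assuming only what this diff shows: `admin_axum_router` returns a plain `axum::Router` with every route already nested under `/admin`, so a hosting HTTP server needs just one `merge` call to expose these endpoints. The helper name `with_admin_routes` and the surrounding application router are hypothetical; the actual wiring into the main HTTP server lives outside this hunk.

    use std::sync::Arc;

    use axum::Router;

    use crate::metasrv::Metasrv;
    use crate::service::admin::admin_axum_router;

    // Hypothetical glue code: mount the admin endpoints onto an existing app router.
    // This only illustrates that a single `merge` is sufficient; it is not the
    // wiring used by the server itself.
    fn with_admin_routes(app: Router, metasrv: Arc<Metasrv>) -> Router {
        // `admin_axum_router` already prefixes its routes with `/admin`,
        // so merging exposes e.g. `/admin/health` and `/admin/recovery/status`.
        app.merge(admin_axum_router(metasrv))
    }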
#[tokio::test] + async fn test_admin_sequence_table_id() { + common_telemetry::init_default_ut_logging(); + let kv_backend = Arc::new(MemoryKvBackend::new()); + let metasrv = MetasrvBuilder::new() + .options(MetasrvOptions::default()) + .kv_backend(kv_backend) + .build() + .await + .unwrap(); + let metasrv = Arc::new(metasrv); + let runtime_switch_manager = metasrv.runtime_switch_manager().clone(); + let app = admin_axum_router(metasrv); + // Set recovery mode to true + runtime_switch_manager.set_recovery_mode().await.unwrap(); + let response = app + .clone() + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/admin/sequence/table/next-id") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = into_bytes(response).await; + let resp: NextTableIdResponse = serde_json::from_slice(&body).unwrap(); + assert_eq!(resp.next_table_id, 1024); + + // Bad request + let response = app + .clone() + .oneshot( + Request::builder() + .method(Method::POST) + .header(http::header::CONTENT_TYPE, "application/json") + .uri("/admin/sequence/table/set-next-id") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::BAD_REQUEST); + + // Bad next id + let response = app + .clone() + .oneshot( + Request::builder() + .method(Method::POST) + .header(http::header::CONTENT_TYPE, "application/json") + .uri("/admin/sequence/table/set-next-id") + .body(Body::from(r#"{"next_table_id": 0}"#)) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR); + let body = get_body_string(response).await; + assert!(body.contains("is not greater than the current next value")); + + // Set next id + let response = app + .clone() + .oneshot( + Request::builder() + .method(Method::POST) + .header(http::header::CONTENT_TYPE, "application/json") + .uri("/admin/sequence/table/set-next-id") + .body(Body::from(r#"{"next_table_id": 2048}"#)) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + + // Set next id + let response = app + .clone() + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/admin/sequence/table/next-id") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + let body = into_bytes(response).await; + let resp: NextTableIdResponse = serde_json::from_slice(&body).unwrap(); + assert_eq!(resp.next_table_id, 2048); + + // Set recovery mode to false + runtime_switch_manager.unset_recovery_mode().await.unwrap(); + // Set next id with recovery mode disabled + let response = app + .clone() + .oneshot( + Request::builder() + .method(Method::POST) + .header(http::header::CONTENT_TYPE, "application/json") + .uri("/admin/sequence/table/set-next-id") + .body(Body::from(r#"{"next_table_id": 2049}"#)) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR); + let body = get_body_string(response).await; + assert!(body.contains("Setting next table id is only allowed in recovery mode")); } } diff --git a/src/meta-srv/src/service/admin/health.rs b/src/meta-srv/src/service/admin/health.rs index 76f79fc8ad..1394c389c8 100644 --- a/src/meta-srv/src/service/admin/health.rs +++ b/src/meta-srv/src/service/admin/health.rs @@ -21,6 +21,7 @@ use crate::service::admin::HttpHandler; const HTTP_OK: &str = "OK\n"; +#[derive(Clone)] pub struct HealthHandler; #[async_trait::async_trait] diff --git 
a/src/meta-srv/src/service/admin/leader.rs b/src/meta-srv/src/service/admin/leader.rs index 207176e6af..aa952083c6 100644 --- a/src/meta-srv/src/service/admin/leader.rs +++ b/src/meta-srv/src/service/admin/leader.rs @@ -21,6 +21,7 @@ use crate::error::{self, Result}; use crate::metasrv::ElectionRef; use crate::service::admin::HttpHandler; +#[derive(Clone)] pub struct LeaderHandler { pub election: Option, } diff --git a/src/meta-srv/src/service/admin/maintenance.rs b/src/meta-srv/src/service/admin/maintenance.rs index 0b012187b8..e324a02f2b 100644 --- a/src/meta-srv/src/service/admin/maintenance.rs +++ b/src/meta-srv/src/service/admin/maintenance.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; -use common_meta::key::maintenance::MaintenanceModeManagerRef; +use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; use common_telemetry::{info, warn}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -22,18 +22,19 @@ use tonic::codegen::http; use tonic::codegen::http::Response; use crate::error::{ - self, InvalidHttpBodySnafu, MaintenanceModeManagerSnafu, MissingRequiredParameterSnafu, - ParseBoolSnafu, Result, UnsupportedSnafu, + self, MissingRequiredParameterSnafu, ParseBoolSnafu, Result, RuntimeSwitchManagerSnafu, + UnsupportedSnafu, }; +use crate::service::admin::util::{to_json_response, to_not_found_response}; use crate::service::admin::HttpHandler; #[derive(Clone)] pub struct MaintenanceHandler { - pub manager: MaintenanceModeManagerRef, + pub manager: RuntimeSwitchManagerRef, } #[derive(Debug, Serialize, Deserialize)] -struct MaintenanceResponse { +pub(crate) struct MaintenanceResponse { enabled: bool, } @@ -48,79 +49,124 @@ impl TryFrom for String { } impl MaintenanceHandler { - async fn get_maintenance(&self) -> crate::Result> { + pub(crate) async fn get_maintenance(&self) -> crate::Result { let enabled = self .manager .maintenance_mode() .await - .context(MaintenanceModeManagerSnafu)?; - let response = MaintenanceResponse { enabled }.try_into()?; - http::Response::builder() - .status(http::StatusCode::OK) - .body(response) - .context(InvalidHttpBodySnafu) + .context(RuntimeSwitchManagerSnafu)?; + Ok(MaintenanceResponse { enabled }) } - async fn set_maintenance( + pub(crate) async fn set_maintenance(&self) -> crate::Result { + self.manager + .set_maintenance_mode() + .await + .context(RuntimeSwitchManagerSnafu)?; + // TODO(weny): Add a record to the system events. + info!("Enable the maintenance mode."); + Ok(MaintenanceResponse { enabled: true }) + } + + pub(crate) async fn unset_maintenance(&self) -> crate::Result { + self.manager + .unset_maintenance_mode() + .await + .context(RuntimeSwitchManagerSnafu)?; + // TODO(weny): Add a record to the system events. + info!("Disable the maintenance mode."); + Ok(MaintenanceResponse { enabled: false }) + } + + async fn handle_legacy_maintenance( &self, params: &HashMap, - ) -> crate::Result> { - let enable = params - .get("enable") - .map(|v| v.parse::()) - .context(MissingRequiredParameterSnafu { param: "enable" })? 
- .context(ParseBoolSnafu { - err_msg: "'enable' must be 'true' or 'false'", - })?; - + ) -> crate::Result { + let enable = get_enable_from_params(params)?; if enable { - self.manager - .set_maintenance_mode() - .await - .context(MaintenanceModeManagerSnafu)?; - info!("Enable the maintenance mode."); + self.set_maintenance().await } else { - self.manager - .unset_maintenance_mode() - .await - .context(MaintenanceModeManagerSnafu)?; - info!("Disable the maintenance mode."); - }; - - let response = MaintenanceResponse { enabled: enable }.try_into()?; - http::Response::builder() - .status(http::StatusCode::OK) - .body(response) - .context(InvalidHttpBodySnafu) + self.unset_maintenance().await + } } } +fn get_enable_from_params(params: &HashMap) -> crate::Result { + params + .get("enable") + .map(|v| v.parse::()) + .context(MissingRequiredParameterSnafu { param: "enable" })? + .context(ParseBoolSnafu { + err_msg: "'enable' must be 'true' or 'false'", + }) +} + +const MAINTENANCE_PATH: &str = "maintenance"; +const ENABLE_SUFFIX: &str = "enable"; +const DISABLE_SUFFIX: &str = "disable"; +const STATUS_SUFFIX: &str = "status"; + #[async_trait::async_trait] impl HttpHandler for MaintenanceHandler { + // TODO(weny): Remove the legacy version of the maintenance API. + // However, we need to keep the legacy version for a while to avoid breaking the existing operators. async fn handle( &self, - _: &str, + path: &str, method: http::Method, params: &HashMap, ) -> crate::Result> { match method { http::Method::GET => { - if params.is_empty() { - self.get_maintenance().await - } else { + if path.ends_with(STATUS_SUFFIX) { + // Handle GET request to '/admin/maintenance/status' + let response = self.get_maintenance().await?; + to_json_response(response) + } else if path.ends_with(MAINTENANCE_PATH) && params.is_empty() { + // Handle GET request to '/admin/maintenance'. (The legacy version) + let response = self.get_maintenance().await?; + to_json_response(response) + } else if path.ends_with(MAINTENANCE_PATH) { + // Handle GET request to '/admin/maintenance' with URL parameters. (The legacy version) warn!( "Found URL parameters in '/admin/maintenance' request, it's deprecated, will be removed in the future" ); // The old version operator will send GET request with URL parameters, // so we need to support it. - self.set_maintenance(params).await + let response = self.handle_legacy_maintenance(params).await?; + to_json_response(response) + } else { + to_not_found_response() } } http::Method::PUT => { - warn!("Found PUT request to '/admin/maintenance', it's deprecated, will be removed in the future"); - self.set_maintenance(params).await + // Handle PUT request to '/admin/maintenance' with URL parameters. (The legacy version) + if path.ends_with(MAINTENANCE_PATH) { + warn!("Found PUT request to '/admin/maintenance', it's deprecated, will be removed in the future"); + let response = self.handle_legacy_maintenance(params).await?; + to_json_response(response) + } else { + to_not_found_response() + } + } + http::Method::POST => { + // Handle POST request to '/admin/maintenance/enable' + if path.ends_with(ENABLE_SUFFIX) { + let response = self.set_maintenance().await?; + to_json_response(response) + } else if path.ends_with(DISABLE_SUFFIX) { + // Handle POST request to '/admin/maintenance/disable' + let response = self.unset_maintenance().await?; + to_json_response(response) + } else if path.ends_with(MAINTENANCE_PATH) { + // Handle POST request to '/admin/maintenance' with URL parameters. 
(The legacy version) + warn!("Found PUT request to '/admin/maintenance', it's deprecated, will be removed in the future"); + let response = self.handle_legacy_maintenance(params).await?; + to_json_response(response) + } else { + to_not_found_response() + } } - http::Method::POST => self.set_maintenance(params).await, _ => UnsupportedSnafu { operation: format!("http method {method}"), } diff --git a/src/meta-srv/src/service/admin/node_lease.rs b/src/meta-srv/src/service/admin/node_lease.rs index 26089450d2..bce3591054 100644 --- a/src/meta-srv/src/service/admin/node_lease.rs +++ b/src/meta-srv/src/service/admin/node_lease.rs @@ -24,6 +24,7 @@ use crate::key::{DatanodeLeaseKey, LeaseValue}; use crate::lease; use crate::service::admin::HttpHandler; +#[derive(Clone)] pub struct NodeLeaseHandler { pub meta_peer_client: MetaPeerClientRef, } diff --git a/src/meta-srv/src/service/admin/procedure.rs b/src/meta-srv/src/service/admin/procedure.rs new file mode 100644 index 0000000000..a0a6ee87e9 --- /dev/null +++ b/src/meta-srv/src/service/admin/procedure.rs @@ -0,0 +1,125 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; +use tonic::codegen::http; +use tonic::codegen::http::Response; + +use crate::error::RuntimeSwitchManagerSnafu; +use crate::service::admin::util::{to_json_response, to_not_found_response}; +use crate::service::admin::HttpHandler; + +#[derive(Clone)] +pub struct ProcedureManagerHandler { + pub manager: RuntimeSwitchManagerRef, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ProcedureManagerStatusResponse { + status: ProcedureManagerStatus, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +enum ProcedureManagerStatus { + Paused, + Running, +} + +impl ProcedureManagerHandler { + pub(crate) async fn pause_procedure_manager( + &self, + ) -> crate::Result { + self.manager + .pasue_procedure() + .await + .context(RuntimeSwitchManagerSnafu)?; + // TODO(weny): Add a record to the system events. + info!("Pause the procedure manager."); + Ok(ProcedureManagerStatusResponse { + status: ProcedureManagerStatus::Paused, + }) + } + + pub(crate) async fn resume_procedure_manager( + &self, + ) -> crate::Result { + self.manager + .resume_procedure() + .await + .context(RuntimeSwitchManagerSnafu)?; + // TODO(weny): Add a record to the system events. 
+ info!("Resume the procedure manager."); + Ok(ProcedureManagerStatusResponse { + status: ProcedureManagerStatus::Running, + }) + } + + pub(crate) async fn get_procedure_manager_status( + &self, + ) -> crate::Result { + let is_paused = self + .manager + .is_procedure_paused() + .await + .context(RuntimeSwitchManagerSnafu)?; + let response = ProcedureManagerStatusResponse { + status: if is_paused { + ProcedureManagerStatus::Paused + } else { + ProcedureManagerStatus::Running + }, + }; + + Ok(response) + } +} + +#[async_trait::async_trait] +impl HttpHandler for ProcedureManagerHandler { + async fn handle( + &self, + path: &str, + method: http::Method, + _: &HashMap, + ) -> crate::Result> { + match method { + http::Method::GET => { + if path.ends_with("status") { + let response = self.get_procedure_manager_status().await?; + to_json_response(response) + } else { + to_not_found_response() + } + } + http::Method::POST => { + if path.ends_with("pause") { + let response = self.pause_procedure_manager().await?; + to_json_response(response) + } else if path.ends_with("resume") { + let response = self.resume_procedure_manager().await?; + to_json_response(response) + } else { + to_not_found_response() + } + } + _ => to_not_found_response(), + } + } +} diff --git a/src/meta-srv/src/service/admin/recovery.rs b/src/meta-srv/src/service/admin/recovery.rs new file mode 100644 index 0000000000..d35d52450d --- /dev/null +++ b/src/meta-srv/src/service/admin/recovery.rs @@ -0,0 +1,63 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::{IntoResponse, Response}; +use axum::Json; +use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; +use serde::{Deserialize, Serialize}; +use servers::http::result::error_result::ErrorResponse; + +pub(crate) type RecoveryHandlerRef = Arc; + +pub(crate) struct RecoveryHandler { + pub(crate) manager: RuntimeSwitchManagerRef, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct RecoveryResponse { + pub enabled: bool, +} + +/// Get the recovery mode. +#[axum_macros::debug_handler] +pub(crate) async fn get_recovery_mode(State(handler): State) -> Response { + let enabled = handler.manager.recovery_mode().await; + + match enabled { + Ok(enabled) => (StatusCode::OK, Json(RecoveryResponse { enabled })).into_response(), + Err(e) => ErrorResponse::from_error(e).into_response(), + } +} + +/// Set the recovery mode. +#[axum_macros::debug_handler] +pub(crate) async fn set_recovery_mode(State(handler): State) -> Response { + match handler.manager.set_recovery_mode().await { + Ok(_) => (StatusCode::OK, Json(RecoveryResponse { enabled: true })).into_response(), + Err(e) => ErrorResponse::from_error(e).into_response(), + } +} + +/// Unset the recovery mode. 
+#[axum_macros::debug_handler] +pub(crate) async fn unset_recovery_mode(State(handler): State) -> Response { + match handler.manager.unset_recovery_mode().await { + Ok(_) => (StatusCode::OK, Json(RecoveryResponse { enabled: false })).into_response(), + Err(e) => ErrorResponse::from_error(e).into_response(), + } +} diff --git a/src/meta-srv/src/service/admin/sequencer.rs b/src/meta-srv/src/service/admin/sequencer.rs new file mode 100644 index 0000000000..fa0558672e --- /dev/null +++ b/src/meta-srv/src/service/admin/sequencer.rs @@ -0,0 +1,100 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use axum::extract::{self, State}; +use axum::http::StatusCode; +use axum::response::{IntoResponse, Response}; +use axum::Json; +use common_meta::key::runtime_switch::RuntimeSwitchManagerRef; +use common_meta::sequence::SequenceRef; +use serde::{Deserialize, Serialize}; +use servers::http::result::error_result::ErrorResponse; +use snafu::{ensure, ResultExt}; + +use crate::error::{ + PeekSequenceSnafu, Result, RuntimeSwitchManagerSnafu, SetNextSequenceSnafu, UnexpectedSnafu, +}; + +pub type TableIdSequenceHandlerRef = Arc; + +#[derive(Clone)] +pub(crate) struct TableIdSequenceHandler { + pub(crate) table_id_sequence: SequenceRef, + pub(crate) runtime_switch_manager: RuntimeSwitchManagerRef, +} + +impl TableIdSequenceHandler { + async fn set_next_table_id(&self, next_table_id: u32) -> Result<()> { + ensure!( + self.runtime_switch_manager + .recovery_mode() + .await + .context(RuntimeSwitchManagerSnafu)?, + UnexpectedSnafu { + violated: "Setting next table id is only allowed in recovery mode", + } + ); + + self.table_id_sequence + .jump_to(next_table_id as u64) + .await + .context(SetNextSequenceSnafu) + } + + async fn peek_table_id(&self) -> Result { + let next_table_id = self + .table_id_sequence + .peek() + .await + .context(PeekSequenceSnafu)?; + Ok(next_table_id as u32) + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct NextTableIdResponse { + pub(crate) next_table_id: u32, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ResetTableIdRequest { + pub(crate) next_table_id: u32, +} + +/// Set the next table id. +#[axum_macros::debug_handler] +pub(crate) async fn set_next_table_id( + State(handler): State, + extract::Json(ResetTableIdRequest { next_table_id }): extract::Json, +) -> Response { + match handler.set_next_table_id(next_table_id).await { + Ok(_) => (StatusCode::OK, Json(NextTableIdResponse { next_table_id })).into_response(), + Err(e) => ErrorResponse::from_error(e).into_response(), + } +} + +/// Get the next table id without incrementing the sequence. 
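+/// Unlike `set-next-id`, this read-only handler does not check recovery mode.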
+#[axum_macros::debug_handler] +pub(crate) async fn get_next_table_id( + State(handler): State, +) -> Response { + match handler.peek_table_id().await { + Ok(next_table_id) => { + (StatusCode::OK, Json(NextTableIdResponse { next_table_id })).into_response() + } + Err(e) => ErrorResponse::from_error(e).into_response(), + } +} diff --git a/src/meta-srv/src/service/admin/util.rs b/src/meta-srv/src/service/admin/util.rs index cdabf38a63..c47145656d 100644 --- a/src/meta-srv/src/service/admin/util.rs +++ b/src/meta-srv/src/service/admin/util.rs @@ -12,11 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Debug; + +use axum::http::StatusCode; +use axum::response::{IntoResponse, Response}; +use axum::Json; +use serde::Serialize; use snafu::ResultExt; use tonic::codegen::http; use crate::error::{self, Result}; +/// Returns a 200 response with a text body. pub fn to_text_response(text: &str) -> Result> { http::Response::builder() .header("Content-Type", "text/plain") @@ -24,3 +31,36 @@ pub fn to_text_response(text: &str) -> Result> { .body(text.to_string()) .context(error::InvalidHttpBodySnafu) } + +/// Returns a 200 response with a JSON body. +pub fn to_json_response(response: T) -> Result> +where + T: Serialize + Debug, +{ + let response = serde_json::to_string(&response).context(error::SerializeToJsonSnafu { + input: format!("{response:?}"), + })?; + http::Response::builder() + .header("Content-Type", "application/json") + .status(http::StatusCode::OK) + .body(response) + .context(error::InvalidHttpBodySnafu) +} + +/// Converts any serializable type to an Axum JSON response with status 200. +pub fn to_axum_json_response(value: T) -> Response { + (StatusCode::OK, Json(value)).into_response() +} + +/// Returns a 404 response with an empty body. +pub fn to_axum_not_found_response() -> Response { + (StatusCode::NOT_FOUND, "").into_response() +} + +/// Returns a 404 response with an empty body. +pub fn to_not_found_response() -> Result> { + http::Response::builder() + .status(http::StatusCode::NOT_FOUND) + .body("".to_string()) + .context(error::InvalidHttpBodySnafu) +} diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs index a4a26edcac..d80481b462 100644 --- a/src/meta-srv/src/service/cluster.rs +++ b/src/meta-srv/src/service/cluster.rs @@ -14,30 +14,21 @@ use api::v1::meta::{ cluster_server, BatchGetRequest as PbBatchGetRequest, BatchGetResponse as PbBatchGetResponse, - Error, MetasrvNodeInfo, MetasrvPeersRequest, MetasrvPeersResponse, - RangeRequest as PbRangeRequest, RangeResponse as PbRangeResponse, ResponseHeader, + MetasrvNodeInfo, MetasrvPeersRequest, MetasrvPeersResponse, RangeRequest as PbRangeRequest, + RangeResponse as PbRangeResponse, }; use common_telemetry::warn; use snafu::ResultExt; -use tonic::{Request, Response}; +use tonic::Request; use crate::metasrv::Metasrv; use crate::service::GrpcResult; -use crate::{error, metasrv}; +use crate::{check_leader, error, metasrv}; #[async_trait::async_trait] impl cluster_server::Cluster for Metasrv { async fn batch_get(&self, req: Request) -> GrpcResult { - if !self.is_leader() { - let is_not_leader = ResponseHeader::failed(Error::is_not_leader()); - let resp = PbBatchGetResponse { - header: Some(is_not_leader), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `batch_get` request have reached the meta. 
Detail: {:?}.", req); - return Ok(Response::new(resp)); - } + check_leader!(self, req, PbBatchGetResponse, "`batch_get`"); let req = req.into_inner().into(); let resp = self @@ -51,16 +42,7 @@ impl cluster_server::Cluster for Metasrv { } async fn range(&self, req: Request) -> GrpcResult { - if !self.is_leader() { - let is_not_leader = ResponseHeader::failed(Error::is_not_leader()); - let resp = PbRangeResponse { - header: Some(is_not_leader), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `range` request have reached the meta. Detail: {:?}.", req); - return Ok(Response::new(resp)); - } + check_leader!(self, req, PbRangeResponse, "`range`"); let req = req.into_inner().into(); let res = self @@ -77,16 +59,7 @@ impl cluster_server::Cluster for Metasrv { &self, req: Request, ) -> GrpcResult { - if !self.is_leader() { - let is_not_leader = ResponseHeader::failed(Error::is_not_leader()); - let resp = MetasrvPeersResponse { - header: Some(is_not_leader), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `metasrv_peers` request have reached the meta. Detail: {:?}.", req); - return Ok(Response::new(resp)); - } + check_leader!(self, req, MetasrvPeersResponse, "`metasrv_peers`"); let leader_addr = &self.options().grpc.server_addr; let (leader, followers) = match self.election() { diff --git a/src/meta-srv/src/service/procedure.rs b/src/meta-srv/src/service/procedure.rs index c388e74e15..260951ceb1 100644 --- a/src/meta-srv/src/service/procedure.rs +++ b/src/meta-srv/src/service/procedure.rs @@ -15,23 +15,26 @@ use std::sync::Arc; use std::time::Duration; +use api::v1::meta::reconcile_request::Target; use api::v1::meta::{ procedure_service_server, DdlTaskRequest as PbDdlTaskRequest, - DdlTaskResponse as PbDdlTaskResponse, Error, MigrateRegionRequest, MigrateRegionResponse, + DdlTaskResponse as PbDdlTaskResponse, MigrateRegionRequest, MigrateRegionResponse, ProcedureDetailRequest, ProcedureDetailResponse, ProcedureStateResponse, QueryProcedureRequest, - ResponseHeader, + ReconcileCatalog, ReconcileDatabase, ReconcileRequest, ReconcileResponse, ReconcileTable, + ResolveStrategy, }; -use common_meta::ddl::ExecutorContext; +use common_meta::procedure_executor::ExecutorContext; use common_meta::rpc::ddl::{DdlTask, SubmitDdlTaskRequest}; use common_meta::rpc::procedure; use common_telemetry::warn; use snafu::{OptionExt, ResultExt}; -use tonic::{Request, Response}; +use table::table_reference::TableReference; +use tonic::Request; -use crate::error; use crate::metasrv::Metasrv; use crate::procedure::region_migration::manager::RegionMigrationProcedureTask; use crate::service::GrpcResult; +use crate::{check_leader, error}; #[async_trait::async_trait] impl procedure_service_server::ProcedureService for Metasrv { @@ -39,15 +42,12 @@ impl procedure_service_server::ProcedureService for Metasrv { &self, request: Request, ) -> GrpcResult { - if !self.is_leader() { - let resp = ProcedureStateResponse { - header: Some(ResponseHeader::failed(Error::is_not_leader())), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `query procedure state` request have reached the meta. Detail: {:?}.", request); - return Ok(Response::new(resp)); - } + check_leader!( + self, + request, + ProcedureStateResponse, + "`query procedure state`" + ); let QueryProcedureRequest { header, pid, .. 
} = request.into_inner(); let _header = header.context(error::MissingRequestHeaderSnafu)?; @@ -69,15 +69,7 @@ impl procedure_service_server::ProcedureService for Metasrv { } async fn ddl(&self, request: Request) -> GrpcResult { - if !self.is_leader() { - let resp = PbDdlTaskResponse { - header: Some(ResponseHeader::failed(Error::is_not_leader())), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `ddl` request have reached the meta. Detail: {:?}.", request); - return Ok(Response::new(resp)); - } + check_leader!(self, request, PbDdlTaskResponse, "`ddl`"); let PbDdlTaskRequest { header, @@ -98,7 +90,7 @@ impl procedure_service_server::ProcedureService for Metasrv { .context(error::ConvertProtoDataSnafu)?; let resp = self - .procedure_executor() + .ddl_manager() .submit_ddl_task( &ExecutorContext { tracing_context: Some(header.tracing_context), @@ -119,15 +111,7 @@ impl procedure_service_server::ProcedureService for Metasrv { &self, request: Request, ) -> GrpcResult { - if !self.is_leader() { - let resp = MigrateRegionResponse { - header: Some(ResponseHeader::failed(Error::is_not_leader())), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `migrate` request have reached the meta. Detail: {:?}.", request); - return Ok(Response::new(resp)); - } + check_leader!(self, request, MigrateRegionResponse, "`migrate`"); let MigrateRegionRequest { header, @@ -166,19 +150,79 @@ impl procedure_service_server::ProcedureService for Metasrv { Ok(Response::new(resp)) } + async fn reconcile(&self, request: Request) -> GrpcResult { + check_leader!(self, request, ReconcileResponse, "`reconcile`"); + + let ReconcileRequest { header, target } = request.into_inner(); + let _header = header.context(error::MissingRequestHeaderSnafu)?; + let target = target.context(error::MissingRequiredParameterSnafu { param: "target" })?; + let parse_resolve_strategy = |resolve_strategy: i32| { + ResolveStrategy::try_from(resolve_strategy) + .ok() + .context(error::UnexpectedSnafu { + violated: format!("Invalid resolve strategy: {}", resolve_strategy), + }) + }; + let procedure_id = match target { + Target::ReconcileTable(table) => { + let ReconcileTable { + catalog_name, + schema_name, + table_name, + resolve_strategy, + } = table; + let resolve_strategy = parse_resolve_strategy(resolve_strategy)?; + let table_ref = TableReference::full(&catalog_name, &schema_name, &table_name); + self.reconciliation_manager() + .reconcile_table(table_ref, resolve_strategy.into()) + .await + .context(error::SubmitReconcileProcedureSnafu)? 
+ } + Target::ReconcileDatabase(database) => { + let ReconcileDatabase { + catalog_name, + database_name, + resolve_strategy, + parallelism, + } = database; + let resolve_strategy = parse_resolve_strategy(resolve_strategy)?; + self.reconciliation_manager().reconcile_database( + catalog_name, + database_name, + resolve_strategy.into(), + parallelism as usize, + ) + } + Target::ReconcileCatalog(catalog) => { + let ReconcileCatalog { + catalog_name, + resolve_strategy, + parallelism, + } = catalog; + let resolve_strategy = parse_resolve_strategy(resolve_strategy)?; + self.reconciliation_manager().reconcile_catalog( + catalog_name, + resolve_strategy.into(), + parallelism as usize, + ) + } + }; + Ok(Response::new(ReconcileResponse { + pid: Some(procedure::pid_to_pb_pid(procedure_id)), + ..Default::default() + })) + } + async fn details( &self, request: Request, ) -> GrpcResult { - if !self.is_leader() { - let resp = ProcedureDetailResponse { - header: Some(ResponseHeader::failed(Error::is_not_leader())), - ..Default::default() - }; - - warn!("The current meta is not leader, but a `procedure details` request have reached the meta. Detail: {:?}.", request); - return Ok(Response::new(resp)); - } + check_leader!( + self, + request, + ProcedureDetailResponse, + "`procedure details`" + ); let ProcedureDetailRequest { header } = request.into_inner(); let _header = header.context(error::MissingRequestHeaderSnafu)?; diff --git a/src/meta-srv/src/service/utils.rs b/src/meta-srv/src/service/utils.rs new file mode 100644 index 0000000000..309af50822 --- /dev/null +++ b/src/meta-srv/src/service/utils.rs @@ -0,0 +1,34 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[macro_export] +macro_rules! check_leader { + ($self:expr, $request:expr, $resp_ty:ty, $msg:expr) => { + use common_telemetry::warn; + use api::v1::meta::{ResponseHeader, Error}; + use tonic::Response; + + if !$self.is_leader() { + warn!( + "The current metasrv is not the leader, but a {} request has reached the meta. 
Detail: {:?}.", + $msg, $request + ); + let mut resp: $resp_ty = Default::default(); + resp.header = Some(ResponseHeader::failed( + Error::is_not_leader(), + )); + return Ok(Response::new(resp)); + } + }; +} diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml index 0306a38ade..04cfcf0284 100644 --- a/src/metric-engine/Cargo.toml +++ b/src/metric-engine/Cargo.toml @@ -29,6 +29,7 @@ itertools.workspace = true lazy_static = "1.4" mito-codec.workspace = true mito2.workspace = true +moka.workspace = true mur3 = "0.1" object-store.workspace = true prometheus.workspace = true @@ -40,5 +41,6 @@ store-api.workspace = true tokio.workspace = true [dev-dependencies] +common-meta = { workspace = true, features = ["testing"] } common-test-util.workspace = true mito2 = { workspace = true, features = ["test"] } diff --git a/src/metric-engine/src/data_region.rs b/src/metric-engine/src/data_region.rs index 80aacc2848..a359e697d7 100644 --- a/src/metric-engine/src/data_region.rs +++ b/src/metric-engine/src/data_region.rs @@ -214,8 +214,11 @@ impl DataRegion { match request.kind { AlterKind::SetRegionOptions { options: _ } | AlterKind::UnsetRegionOptions { keys: _ } - | AlterKind::SetIndex { options: _ } - | AlterKind::UnsetIndex { options: _ } => { + | AlterKind::SetIndexes { options: _ } + | AlterKind::UnsetIndexes { options: _ } + | AlterKind::SyncColumns { + column_metadatas: _, + } => { let region_id = utils::to_data_region_id(region_id); self.mito .handle_request(region_id, RegionRequest::Alter(request)) diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index dd67cb3bd2..161a851d29 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -158,6 +158,7 @@ impl RegionEngine for MetricEngine { Ok(RegionResponse { affected_rows: rows, extensions: extension_return_value, + metadata: Vec::new(), }) } BatchRegionDdlRequest::Alter(requests) => { @@ -171,6 +172,7 @@ impl RegionEngine for MetricEngine { Ok(RegionResponse { affected_rows: rows, extensions: extension_return_value, + metadata: Vec::new(), }) } BatchRegionDdlRequest::Drop(requests) => { @@ -243,6 +245,7 @@ impl RegionEngine for MetricEngine { result.map_err(BoxedError::new).map(|rows| RegionResponse { affected_rows: rows, extensions: extension_return_value, + metadata: Vec::new(), }) } @@ -439,6 +442,7 @@ impl MetricEngine { Ok(RegionResponse { affected_rows, extensions, + metadata: Vec::new(), }) } } diff --git a/src/metric-engine/src/engine/alter.rs b/src/metric-engine/src/engine/alter.rs index 22ef54cab8..e7bedaf374 100644 --- a/src/metric-engine/src/engine/alter.rs +++ b/src/metric-engine/src/engine/alter.rs @@ -66,7 +66,7 @@ impl MetricEngineInner { let mut manifest_infos = Vec::with_capacity(1); self.alter_logical_regions(physical_region_id, requests, extension_return_value) .await?; - append_manifest_info(&self.mito, region_id, &mut manifest_infos); + append_manifest_info(&self.mito, physical_region_id, &mut manifest_infos); encode_manifest_info_to_extensions(&manifest_infos, extension_return_value)?; } else { let grouped_requests = @@ -222,13 +222,17 @@ mod test { use std::time::Duration; use api::v1::SemanticType; - use datatypes::data_type::ConcreteDataType; - use datatypes::schema::ColumnSchema; - use store_api::metadata::ColumnMetadata; - use store_api::region_request::{AddColumn, SetRegionOption}; + use common_meta::ddl::test_util::assert_column_name_and_id; + use common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; + use 
store_api::metric_engine_consts::ALTER_PHYSICAL_EXTENSION_KEY; + use store_api::region_engine::RegionEngine; + use store_api::region_request::{ + AlterKind, BatchRegionDdlRequest, RegionAlterRequest, SetRegionOption, + }; + use store_api::storage::consts::ReservedColumnId; + use store_api::storage::RegionId; - use super::*; - use crate::test_util::TestEnv; + use crate::test_util::{alter_logical_region_request, create_logical_region_request, TestEnv}; #[tokio::test] async fn test_alter_region() { @@ -239,22 +243,7 @@ mod test { // alter physical region let physical_region_id = env.default_physical_region_id(); - let request = RegionAlterRequest { - kind: AlterKind::AddColumns { - columns: vec![AddColumn { - column_metadata: ColumnMetadata { - column_id: 0, - semantic_type: SemanticType::Tag, - column_schema: ColumnSchema::new( - "tag1", - ConcreteDataType::string_datatype(), - false, - ), - }, - location: None, - }], - }, - }; + let request = alter_logical_region_request(&["tag1"]); let result = engine_inner .alter_physical_region(physical_region_id, request.clone()) @@ -287,14 +276,18 @@ mod test { assert!(!is_column_exist); let region_id = env.default_logical_region_id(); - engine_inner - .alter_logical_regions( - physical_region_id, - vec![(region_id, request)], - &mut HashMap::new(), - ) + let response = env + .metric() + .handle_batch_ddl_requests(BatchRegionDdlRequest::Alter(vec![( + region_id, + request.clone(), + )])) .await .unwrap(); + let manifest_infos = parse_manifest_infos_from_extensions(&response.extensions).unwrap(); + assert_eq!(manifest_infos[0].0, physical_region_id); + assert!(manifest_infos[0].1.is_metric()); + let semantic_type = metadata_region .column_semantic_type(physical_region_id, logical_region_id, "tag1") .await @@ -307,5 +300,77 @@ mod test { .unwrap() .unwrap(); assert_eq!(timestamp_index, SemanticType::Timestamp); + let column_metadatas = + parse_column_metadatas(&response.extensions, ALTER_PHYSICAL_EXTENSION_KEY).unwrap(); + assert_column_name_and_id( + &column_metadatas, + &[ + ("greptime_timestamp", 0), + ("greptime_value", 1), + ("__table_id", ReservedColumnId::table_id()), + ("__tsid", ReservedColumnId::tsid()), + ("job", 2), + ("tag1", 3), + ], + ); + } + + #[tokio::test] + async fn test_alter_logical_regions() { + let env = TestEnv::new().await; + let engine = env.metric(); + let physical_region_id1 = RegionId::new(1024, 0); + let physical_region_id2 = RegionId::new(1024, 1); + let logical_region_id1 = RegionId::new(1025, 0); + let logical_region_id2 = RegionId::new(1025, 1); + env.create_physical_region(physical_region_id1, "/test_dir1") + .await; + env.create_physical_region(physical_region_id2, "/test_dir2") + .await; + + let region_create_request1 = crate::test_util::create_logical_region_request( + &["job"], + physical_region_id1, + "logical1", + ); + let region_create_request2 = + create_logical_region_request(&["job"], physical_region_id2, "logical2"); + engine + .handle_batch_ddl_requests(BatchRegionDdlRequest::Create(vec![ + (logical_region_id1, region_create_request1), + (logical_region_id2, region_create_request2), + ])) + .await + .unwrap(); + + let region_alter_request1 = alter_logical_region_request(&["tag1"]); + let region_alter_request2 = alter_logical_region_request(&["tag1"]); + let response = engine + .handle_batch_ddl_requests(BatchRegionDdlRequest::Alter(vec![ + (logical_region_id1, region_alter_request1), + (logical_region_id2, region_alter_request2), + ])) + .await + .unwrap(); + + let manifest_infos = 
parse_manifest_infos_from_extensions(&response.extensions).unwrap(); + assert_eq!(manifest_infos.len(), 2); + let region_ids = manifest_infos.into_iter().map(|i| i.0).collect::>(); + assert!(region_ids.contains(&physical_region_id1)); + assert!(region_ids.contains(&physical_region_id2)); + + let column_metadatas = + parse_column_metadatas(&response.extensions, ALTER_PHYSICAL_EXTENSION_KEY).unwrap(); + assert_column_name_and_id( + &column_metadatas, + &[ + ("greptime_timestamp", 0), + ("greptime_value", 1), + ("__table_id", ReservedColumnId::table_id()), + ("__tsid", ReservedColumnId::tsid()), + ("job", 2), + ("tag1", 3), + ], + ); } } diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs index 64c7cd7e14..96d064f454 100644 --- a/src/metric-engine/src/engine/create.rs +++ b/src/metric-engine/src/engine/create.rs @@ -80,7 +80,8 @@ impl MetricEngineInner { } ); let (region_id, request) = requests.pop().unwrap(); - self.create_physical_region(region_id, request).await?; + self.create_physical_region(region_id, request, extension_return_value) + .await?; return Ok(0); } else if first_request @@ -122,6 +123,7 @@ impl MetricEngineInner { &self, region_id: RegionId, request: RegionCreateRequest, + extension_return_value: &mut HashMap>, ) -> Result<()> { let physical_region_options = PhysicalRegionOptions::try_from(&request.options)?; let (data_region_id, metadata_region_id) = Self::transform_region_id(region_id); @@ -162,7 +164,8 @@ impl MetricEngineInner { .context(UnexpectedRequestSnafu { reason: "No time index column found", })?; - self.mito + let response = self + .mito .handle_request( data_region_id, RegionRequest::Create(create_data_region_request), @@ -176,6 +179,7 @@ impl MetricEngineInner { region_id: data_region_id, }, )?; + extension_return_value.extend(response.extensions); info!("Created physical metric region {region_id}, primary key encoding={primary_key_encoding}, physical_region_options={physical_region_options:?}"); PHYSICAL_REGION_COUNT.inc(); @@ -613,12 +617,15 @@ pub(crate) fn region_options_for_metadata_region( #[cfg(test)] mod test { + use common_meta::ddl::test_util::assert_column_name_and_id; + use common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY}; + use store_api::region_request::BatchRegionDdlRequest; use super::*; use crate::config::EngineConfig; use crate::engine::MetricEngine; - use crate::test_util::TestEnv; + use crate::test_util::{create_logical_region_request, TestEnv}; #[test] fn test_verify_region_create_request() { @@ -807,4 +814,50 @@ mod test { ); assert!(!metadata_region_request.options.contains_key("skip_wal")); } + + #[tokio::test] + async fn test_create_logical_regions() { + let env = TestEnv::new().await; + let engine = env.metric(); + let physical_region_id1 = RegionId::new(1024, 0); + let physical_region_id2 = RegionId::new(1024, 1); + let logical_region_id1 = RegionId::new(1025, 0); + let logical_region_id2 = RegionId::new(1025, 1); + env.create_physical_region(physical_region_id1, "/test_dir1") + .await; + env.create_physical_region(physical_region_id2, "/test_dir2") + .await; + + let region_create_request1 = + create_logical_region_request(&["job"], physical_region_id1, "logical1"); + let region_create_request2 = + create_logical_region_request(&["job"], physical_region_id2, "logical2"); + + let response = engine + .handle_batch_ddl_requests(BatchRegionDdlRequest::Create(vec![ + 
(logical_region_id1, region_create_request1), + (logical_region_id2, region_create_request2), + ])) + .await + .unwrap(); + + let manifest_infos = parse_manifest_infos_from_extensions(&response.extensions).unwrap(); + assert_eq!(manifest_infos.len(), 2); + let region_ids = manifest_infos.into_iter().map(|i| i.0).collect::>(); + assert!(region_ids.contains(&physical_region_id1)); + assert!(region_ids.contains(&physical_region_id2)); + + let column_metadatas = + parse_column_metadatas(&response.extensions, ALTER_PHYSICAL_EXTENSION_KEY).unwrap(); + assert_column_name_and_id( + &column_metadatas, + &[ + ("greptime_timestamp", 0), + ("greptime_value", 1), + ("__table_id", ReservedColumnId::table_id()), + ("__tsid", ReservedColumnId::tsid()), + ("job", 2), + ], + ); + } } diff --git a/src/metric-engine/src/error.rs b/src/metric-engine/src/error.rs index d4fcb4e5b2..91881b5624 100644 --- a/src/metric-engine/src/error.rs +++ b/src/metric-engine/src/error.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::any::Any; +use std::sync::Arc; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; @@ -304,6 +305,13 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Get value from cache"))] + CacheGet { + source: Arc, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -362,6 +370,8 @@ impl ErrorExt for Error { StartRepeatedTask { source, .. } => source.status_code(), MetricManifestInfo { .. } => StatusCode::Internal, + + CacheGet { source, .. } => source.status_code(), } } diff --git a/src/metric-engine/src/metadata_region.rs b/src/metric-engine/src/metadata_region.rs index 5e6532517a..c4e4878d0f 100644 --- a/src/metric-engine/src/metadata_region.rs +++ b/src/metric-engine/src/metadata_region.rs @@ -13,19 +13,23 @@ // limitations under the License. use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; +use std::time::Duration; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; use async_stream::try_stream; use base64::engine::general_purpose::STANDARD_NO_PAD; use base64::Engine; +use common_base::readable_size::ReadableSize; use common_recordbatch::{RecordBatch, SendableRecordBatchStream}; use datafusion::prelude::{col, lit}; use futures_util::stream::BoxStream; use futures_util::TryStreamExt; use mito2::engine::MitoEngine; +use moka::future::Cache; +use moka::policy::EvictionPolicy; use snafu::{OptionExt, ResultExt}; use store_api::metadata::ColumnMetadata; use store_api::metric_engine_consts::{ @@ -39,9 +43,9 @@ use store_api::storage::{RegionId, ScanRequest}; use tokio::sync::{OwnedRwLockReadGuard, OwnedRwLockWriteGuard, RwLock}; use crate::error::{ - CollectRecordBatchStreamSnafu, DecodeColumnValueSnafu, DeserializeColumnMetadataSnafu, - LogicalRegionNotFoundSnafu, MitoReadOperationSnafu, MitoWriteOperationSnafu, - ParseRegionIdSnafu, Result, + CacheGetSnafu, CollectRecordBatchStreamSnafu, DecodeColumnValueSnafu, + DeserializeColumnMetadataSnafu, LogicalRegionNotFoundSnafu, MitoReadOperationSnafu, + MitoWriteOperationSnafu, ParseRegionIdSnafu, Result, }; use crate::utils; @@ -62,6 +66,11 @@ const COLUMN_PREFIX: &str = "__column_"; /// itself. pub struct MetadataRegion { pub(crate) mito: MitoEngine, + /// The cache for contents(key-value pairs) of region metadata. 
+ /// + /// The cache should be invalidated when any new values are put into the metadata region or any + /// values are deleted from the metadata region. + cache: Cache, /// Logical lock for operations that need to be serialized. Like update & read region columns. /// /// Region entry will be registered on creating and opening logical region, and deregistered on @@ -69,10 +78,30 @@ pub struct MetadataRegion { logical_region_lock: RwLock>>>, } +#[derive(Clone)] +struct RegionMetadataCacheEntry { + key_values: Arc>, + size: usize, +} + +/// The max size of the region metadata cache. +const MAX_CACHE_SIZE: u64 = ReadableSize::mb(128).as_bytes(); +/// The TTL of the region metadata cache. +const CACHE_TTL: Duration = Duration::from_secs(5 * 60); + impl MetadataRegion { pub fn new(mito: MitoEngine) -> Self { + let cache = Cache::builder() + .max_capacity(MAX_CACHE_SIZE) + // Use the LRU eviction policy to minimize frequent mito scans. + // Recently accessed items are retained longer in the cache. + .eviction_policy(EvictionPolicy::lru()) + .time_to_live(CACHE_TTL) + .weigher(|_, v: &RegionMetadataCacheEntry| v.size as u32) + .build(); Self { mito, + cache, logical_region_lock: RwLock::new(HashMap::new()), } } @@ -351,21 +380,60 @@ impl MetadataRegion { } } - pub async fn get_all_with_prefix( - &self, - region_id: RegionId, - prefix: &str, - ) -> Result> { - let scan_req = MetadataRegion::build_prefix_read_request(prefix, false); + fn build_read_request() -> ScanRequest { + let projection = vec![ + METADATA_SCHEMA_KEY_COLUMN_INDEX, + METADATA_SCHEMA_VALUE_COLUMN_INDEX, + ]; + ScanRequest { + projection: Some(projection), + ..Default::default() + } + } + + async fn load_all(&self, metadata_region_id: RegionId) -> Result { + let scan_req = MetadataRegion::build_read_request(); let record_batch_stream = self .mito - .scan_to_stream(region_id, scan_req) + .scan_to_stream(metadata_region_id, scan_req) .await .context(MitoReadOperationSnafu)?; - decode_batch_stream(record_batch_stream, decode_record_batch_to_key_and_value) - .try_collect::>() + let kv = decode_batch_stream(record_batch_stream, decode_record_batch_to_key_and_value) + .try_collect::>() + .await?; + let mut size = 0; + for (k, v) in kv.iter() { + size += k.len(); + size += v.len(); + } + let kv = Arc::new(kv); + Ok(RegionMetadataCacheEntry { + key_values: kv, + size, + }) + } + + async fn get_all_with_prefix( + &self, + metadata_region_id: RegionId, + prefix: &str, + ) -> Result> { + let region_metadata = self + .cache + .try_get_with(metadata_region_id, self.load_all(metadata_region_id)) .await + .context(CacheGetSnafu)?; + + let range = region_metadata.key_values.range(prefix.to_string()..); + let mut result = HashMap::new(); + for (k, v) in range { + if !k.starts_with(prefix) { + break; + } + result.insert(k.to_string(), v.to_string()); + } + Ok(result) } pub async fn get_all_key_with_prefix( @@ -387,15 +455,18 @@ impl MetadataRegion { /// Delete the given keys. For performance consideration, this method /// doesn't check if those keys exist or not. - async fn delete(&self, region_id: RegionId, keys: &[String]) -> Result<()> { + async fn delete(&self, metadata_region_id: RegionId, keys: &[String]) -> Result<()> { let delete_request = Self::build_delete_request(keys); self.mito .handle_request( - region_id, + metadata_region_id, store_api::region_request::RegionRequest::Delete(delete_request), ) .await .context(MitoWriteOperationSnafu)?; + // Invalidates the region metadata cache if any values are deleted from the metadata region. 
+ self.cache.invalidate(&metadata_region_id).await; + Ok(()) } @@ -485,7 +556,7 @@ impl MetadataRegion { write_region_id: bool, logical_regions: impl Iterator)>, ) -> Result<()> { - let region_id = utils::to_metadata_region_id(physical_region_id); + let metadata_region_id = utils::to_metadata_region_id(physical_region_id); let iter = logical_regions .into_iter() .flat_map(|(logical_region_id, column_metadatas)| { @@ -512,11 +583,13 @@ impl MetadataRegion { let put_request = MetadataRegion::build_put_request_from_iter(iter.into_iter()); self.mito .handle_request( - region_id, + metadata_region_id, store_api::region_request::RegionRequest::Put(put_request), ) .await .context(MitoWriteOperationSnafu)?; + // Invalidates the region metadata cache if any new values are put into the metadata region. + self.cache.invalidate(&metadata_region_id).await; Ok(()) } diff --git a/src/metric-engine/src/test_util.rs b/src/metric-engine/src/test_util.rs index b8a7130e5a..0f681c2372 100644 --- a/src/metric-engine/src/test_util.rs +++ b/src/metric-engine/src/test_util.rs @@ -16,6 +16,7 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema as PbColumnSchema, Row, SemanticType, Value}; +use common_meta::ddl::utils::parse_column_metadatas; use common_telemetry::debug; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; @@ -26,12 +27,14 @@ use object_store::util::join_dir; use object_store::ObjectStore; use store_api::metadata::ColumnMetadata; use store_api::metric_engine_consts::{ - LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY, + ALTER_PHYSICAL_EXTENSION_KEY, LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME, + PHYSICAL_TABLE_METADATA_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY, }; use store_api::region_engine::RegionEngine; use store_api::region_request::{ AddColumn, AlterKind, RegionAlterRequest, RegionCreateRequest, RegionOpenRequest, RegionRequest, }; +use store_api::storage::consts::ReservedColumnId; use store_api::storage::{ColumnId, RegionId}; use crate::config::EngineConfig; @@ -116,13 +119,8 @@ impl TestEnv { (mito, metric) } - /// Create regions in [MetricEngine] under [`default_region_id`] - /// and region dir `"test_metric_region"`. - /// - /// This method will create one logical region with three columns `(ts, val, job)` - /// under [`default_logical_region_id`]. - pub async fn init_metric_region(&self) { - let region_id = self.default_physical_region_id(); + /// Create regions in [MetricEngine] with specific `physical_region_id`. 
+ pub async fn create_physical_region(&self, physical_region_id: RegionId, region_dir: &str) { let region_create_request = RegionCreateRequest { engine: METRIC_ENGINE_NAME.to_string(), column_metadatas: vec![ @@ -149,26 +147,88 @@ impl TestEnv { options: [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())] .into_iter() .collect(), - region_dir: self.default_region_dir(), + region_dir: region_dir.to_string(), }; // create physical region - self.metric() - .handle_request(region_id, RegionRequest::Create(region_create_request)) + let response = self + .metric() + .handle_request( + physical_region_id, + RegionRequest::Create(region_create_request), + ) .await .unwrap(); + let column_metadatas = + parse_column_metadatas(&response.extensions, TABLE_COLUMN_METADATA_EXTENSION_KEY) + .unwrap(); + assert_eq!(column_metadatas.len(), 4); + } - // create logical region - let region_id = self.default_logical_region_id(); + /// Create logical region in [MetricEngine] with specific `physical_region_id` and `logical_region_id`. + pub async fn create_logical_region( + &self, + physical_region_id: RegionId, + logical_region_id: RegionId, + ) { let region_create_request = create_logical_region_request( &["job"], - self.default_physical_region_id(), + physical_region_id, "test_metric_logical_region", ); - self.metric() - .handle_request(region_id, RegionRequest::Create(region_create_request)) + let response = self + .metric() + .handle_request( + logical_region_id, + RegionRequest::Create(region_create_request), + ) .await .unwrap(); + let column_metadatas = + parse_column_metadatas(&response.extensions, ALTER_PHYSICAL_EXTENSION_KEY).unwrap(); + assert_eq!(column_metadatas.len(), 5); + let column_names = column_metadatas + .iter() + .map(|c| c.column_schema.name.as_str()) + .collect::>(); + let column_ids = column_metadatas + .iter() + .map(|c| c.column_id) + .collect::>(); + assert_eq!( + column_names, + vec![ + "greptime_timestamp", + "greptime_value", + "__table_id", + "__tsid", + "job", + ] + ); + assert_eq!( + column_ids, + vec![ + 0, + 1, + ReservedColumnId::table_id(), + ReservedColumnId::tsid(), + 2, + ] + ); + } + + /// Create regions in [MetricEngine] under [`default_region_id`] + /// and region dir `"test_metric_region"`. + /// + /// This method will create one logical region with three columns `(ts, val, job)` + /// under [`default_logical_region_id`]. + pub async fn init_metric_region(&self) { + let physical_region_id = self.default_physical_region_id(); + self.create_physical_region(physical_region_id, &self.default_region_dir()) + .await; + let logical_region_id = self.default_logical_region_id(); + self.create_logical_region(physical_region_id, logical_region_id) + .await; } pub fn metadata_region(&self) -> MetadataRegion { @@ -274,6 +334,30 @@ pub fn create_logical_region_request( } } +/// Generate a [RegionAlterRequest] for logical region. +/// Only need to specify tag column's name +pub fn alter_logical_region_request(tags: &[&str]) -> RegionAlterRequest { + RegionAlterRequest { + kind: AlterKind::AddColumns { + columns: tags + .iter() + .map(|tag| AddColumn { + column_metadata: ColumnMetadata { + column_id: 0, + semantic_type: SemanticType::Tag, + column_schema: ColumnSchema::new( + tag.to_string(), + ConcreteDataType::string_datatype(), + false, + ), + }, + location: None, + }) + .collect::>(), + }, + } +} + /// Generate a row schema with given tag columns. /// /// The result will also contains default timestamp and value column at beginning. 
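Aside: the `MetadataRegion` changes earlier in this patch wrap the per-region key-value scan in a moka read-through cache (size-weighted, LRU-evicted, TTL-bounded) that is invalidated on every put and delete against the metadata region. Below is a minimal, self-contained sketch of that caching pattern, assuming moka 0.12 with the `future` feature and a tokio runtime; the `MetadataCache` type, the `u64` region-id key, and the stubbed loader are illustrative stand-ins, not code from this patch.

use std::collections::BTreeMap;
use std::sync::Arc;
use std::time::Duration;

use moka::future::Cache;
use moka::policy::EvictionPolicy;

type KvSnapshot = Arc<BTreeMap<String, String>>;

/// A read-through cache keyed by region id, weighed by the byte size of the
/// cached key-value snapshot, evicted by LRU, and expired after a TTL.
struct MetadataCache {
    cache: Cache<u64, KvSnapshot>,
}

impl MetadataCache {
    fn new(max_bytes: u64, ttl: Duration) -> Self {
        let cache = Cache::builder()
            .max_capacity(max_bytes)
            // LRU keeps recently scanned regions resident and avoids repeated mito scans.
            .eviction_policy(EvictionPolicy::lru())
            .time_to_live(ttl)
            // Weigh each entry by the total length of its keys and values.
            .weigher(|_region_id, snapshot: &KvSnapshot| {
                snapshot.iter().map(|(k, v)| k.len() + v.len()).sum::<usize>() as u32
            })
            .build();
        Self { cache }
    }

    /// Returns the cached snapshot, running `load` once on a miss.
    async fn get_or_load<F, Fut>(
        &self,
        region_id: u64,
        load: F,
    ) -> Result<KvSnapshot, Arc<std::io::Error>>
    where
        F: FnOnce() -> Fut,
        Fut: std::future::Future<Output = Result<KvSnapshot, std::io::Error>>,
    {
        self.cache.try_get_with(region_id, load()).await
    }

    /// Drops the cached snapshot; the next read reloads from the source.
    async fn invalidate(&self, region_id: u64) {
        self.cache.invalidate(&region_id).await;
    }
}

#[tokio::main]
async fn main() {
    // Mirrors the patch's defaults: a 128 MiB capacity and a five-minute TTL.
    let cache = MetadataCache::new(128 * 1024 * 1024, Duration::from_secs(5 * 60));

    // First read loads from the (stubbed) source; later reads hit the cache.
    let load = || async {
        let mut kv = BTreeMap::new();
        kv.insert("__column_1".to_string(), "greptime_timestamp".to_string());
        Ok(Arc::new(kv))
    };
    let snapshot = cache.get_or_load(1024, load).await.unwrap();
    assert_eq!(snapshot.len(), 1);

    // After a write or delete against the metadata region, drop the entry so the
    // next read observes the new contents rather than a stale snapshot.
    cache.invalidate(1024).await;
}

Weighing entries by byte size keeps the capacity cap meaningful regardless of how many regions are cached, and invalidating on every write trades a reload on the next read for never serving stale column metadata.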
diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index 0c245944af..d86ab4018a 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -81,8 +81,10 @@ use store_api::codec::PrimaryKeyEncoding; use store_api::logstore::provider::Provider; use store_api::logstore::LogStore; use store_api::manifest::ManifestVersion; -use store_api::metadata::RegionMetadataRef; -use store_api::metric_engine_consts::MANIFEST_INFO_EXTENSION_KEY; +use store_api::metadata::{ColumnMetadata, RegionMetadataRef}; +use store_api::metric_engine_consts::{ + MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY, +}; use store_api::region_engine::{ BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, @@ -95,7 +97,7 @@ use crate::cache::CacheStrategy; use crate::config::MitoConfig; use crate::error::{ InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result, - SerdeJsonSnafu, + SerdeJsonSnafu, SerializeColumnMetadataSnafu, }; use crate::manifest::action::RegionEdit; use crate::memtable::MemtableStats; @@ -250,6 +252,22 @@ impl MitoEngine { Ok(()) } + fn encode_column_metadatas_to_extensions( + region_id: &RegionId, + column_metadatas: Vec, + extensions: &mut HashMap>, + ) -> Result<()> { + extensions.insert( + TABLE_COLUMN_METADATA_EXTENSION_KEY.to_string(), + ColumnMetadata::encode_list(&column_metadatas).context(SerializeColumnMetadataSnafu)?, + ); + info!( + "Added column metadatas: {:?} to extensions, region_id: {:?}", + column_metadatas, region_id + ); + Ok(()) + } + /// Find the current version's memtables and SSTs stats by region_id. /// The stats must be collected in one place one time to ensure data consistency. 
pub fn find_memtable_and_sst_stats( @@ -603,6 +621,7 @@ impl RegionEngine for MitoEngine { .start_timer(); let is_alter = matches!(request, RegionRequest::Alter(_)); + let is_create = matches!(request, RegionRequest::Create(_)); let mut response = self .inner .handle_request(region_id, request) @@ -611,14 +630,11 @@ impl RegionEngine for MitoEngine { .map_err(BoxedError::new)?; if is_alter { - if let Some(statistic) = self.region_statistic(region_id) { - Self::encode_manifest_info_to_extensions( - ®ion_id, - statistic.manifest, - &mut response.extensions, - ) + self.handle_alter_response(region_id, &mut response) + .map_err(BoxedError::new)?; + } else if is_create { + self.handle_create_response(region_id, &mut response) .map_err(BoxedError::new)?; - } } Ok(response) @@ -711,6 +727,55 @@ impl RegionEngine for MitoEngine { } } +impl MitoEngine { + fn handle_alter_response( + &self, + region_id: RegionId, + response: &mut RegionResponse, + ) -> Result<()> { + if let Some(statistic) = self.region_statistic(region_id) { + Self::encode_manifest_info_to_extensions( + ®ion_id, + statistic.manifest, + &mut response.extensions, + )?; + } + let column_metadatas = self + .inner + .find_region(region_id) + .ok() + .map(|r| r.metadata().column_metadatas.clone()); + if let Some(column_metadatas) = column_metadatas { + Self::encode_column_metadatas_to_extensions( + ®ion_id, + column_metadatas, + &mut response.extensions, + )?; + } + Ok(()) + } + + fn handle_create_response( + &self, + region_id: RegionId, + response: &mut RegionResponse, + ) -> Result<()> { + let column_metadatas = self + .inner + .find_region(region_id) + .ok() + .map(|r| r.metadata().column_metadatas.clone()); + if let Some(column_metadatas) = column_metadatas { + Self::encode_column_metadatas_to_extensions( + ®ion_id, + column_metadatas, + &mut response.extensions, + )?; + } + Ok(()) + } +} + // Tests methods. 
#[cfg(any(test, feature = "test"))] #[allow(clippy::too_many_arguments)] diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index 365306c77d..1345f4210f 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -20,16 +20,18 @@ use std::time::Duration; use api::v1::value::ValueData; use api::v1::{ColumnDataType, Row, Rows, SemanticType}; use common_error::ext::ErrorExt; +use common_meta::ddl::utils::{parse_column_metadatas, parse_manifest_infos_from_extensions}; use common_recordbatch::RecordBatches; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, FulltextAnalyzer, FulltextBackend, FulltextOptions}; use store_api::metadata::ColumnMetadata; -use store_api::region_engine::{RegionEngine, RegionRole}; +use store_api::metric_engine_consts::TABLE_COLUMN_METADATA_EXTENSION_KEY; +use store_api::region_engine::{RegionEngine, RegionManifestInfo, RegionRole}; use store_api::region_request::{ - AddColumn, AddColumnLocation, AlterKind, ApiSetIndexOptions, RegionAlterRequest, - RegionOpenRequest, RegionRequest, SetRegionOption, + AddColumn, AddColumnLocation, AlterKind, RegionAlterRequest, RegionOpenRequest, RegionRequest, + SetIndexOption, SetRegionOption, }; -use store_api::storage::{RegionId, ScanRequest}; +use store_api::storage::{ColumnId, RegionId, ScanRequest}; use crate::config::MitoConfig; use crate::engine::listener::{AlterFlushListener, NotifyRegionChangeResultListener}; @@ -71,18 +73,18 @@ fn add_tag1() -> RegionAlterRequest { fn alter_column_inverted_index() -> RegionAlterRequest { RegionAlterRequest { - kind: AlterKind::SetIndex { - options: ApiSetIndexOptions::Inverted { + kind: AlterKind::SetIndexes { + options: vec![SetIndexOption::Inverted { column_name: "tag_0".to_string(), - }, + }], }, } } fn alter_column_fulltext_options() -> RegionAlterRequest { RegionAlterRequest { - kind: AlterKind::SetIndex { - options: ApiSetIndexOptions::Fulltext { + kind: AlterKind::SetIndexes { + options: vec![SetIndexOption::Fulltext { column_name: "tag_0".to_string(), options: FulltextOptions::new_unchecked( true, @@ -92,7 +94,7 @@ fn alter_column_fulltext_options() -> RegionAlterRequest { 1000, 0.01, ), - }, + }], }, } } @@ -113,6 +115,17 @@ fn check_region_version( assert_eq!(flushed_sequence, version_data.version.flushed_sequence); } +fn assert_column_metadatas(column_name: &[(&str, ColumnId)], column_metadatas: &[ColumnMetadata]) { + assert_eq!(column_name.len(), column_metadatas.len()); + for (name, id) in column_name { + let column_metadata = column_metadatas + .iter() + .find(|c| c.column_id == *id) + .unwrap(); + assert_eq!(column_metadata.column_schema.name, *name); + } +} + #[tokio::test] async fn test_alter_region() { common_telemetry::init_default_ut_logging(); @@ -136,10 +149,16 @@ async fn test_alter_region() { let column_schemas = rows_schema(&request); let region_dir = request.region_dir.clone(); - engine + let response = engine .handle_request(region_id, RegionRequest::Create(request)) .await .unwrap(); + let column_metadatas = + parse_column_metadatas(&response.extensions, TABLE_COLUMN_METADATA_EXTENSION_KEY).unwrap(); + assert_column_metadatas( + &[("tag_0", 0), ("field_0", 1), ("ts", 2)], + &column_metadatas, + ); let rows = Rows { schema: column_schemas, @@ -148,7 +167,7 @@ async fn test_alter_region() { put_rows(&engine, region_id, rows).await; let request = add_tag1(); - engine + let response = engine .handle_request(region_id, RegionRequest::Alter(request)) .await .unwrap(); @@ 
-164,6 +183,18 @@ async fn test_alter_region() { scan_check_after_alter(&engine, region_id, expected).await; check_region_version(&engine, region_id, 1, 3, 1, 3); + let mut manifests = parse_manifest_infos_from_extensions(&response.extensions).unwrap(); + assert_eq!(manifests.len(), 1); + let (return_region_id, manifest) = manifests.remove(0); + assert_eq!(return_region_id, region_id); + assert_eq!(manifest, RegionManifestInfo::mito(2, 1)); + let column_metadatas = + parse_column_metadatas(&response.extensions, TABLE_COLUMN_METADATA_EXTENSION_KEY).unwrap(); + assert_column_metadatas( + &[("tag_0", 0), ("field_0", 1), ("ts", 2), ("tag_1", 3)], + &column_metadatas, + ); + // Reopen region. let engine = env.reopen_engine(engine, MitoConfig::default()).await; engine diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index db11cd957b..864a05bb6d 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -108,6 +108,14 @@ pub enum Error { error: serde_json::Error, }, + #[snafu(display("Failed to serialize column metadata"))] + SerializeColumnMetadata { + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Invalid scan index, start: {}, end: {}", start, end))] InvalidScanIndex { start: ManifestVersion, @@ -1087,7 +1095,8 @@ impl ErrorExt for Error { | UnexpectedImpureDefault { .. } | NoCheckpoint { .. } | NoManifests { .. } - | InstallManifestTo { .. } => StatusCode::Unexpected, + | InstallManifestTo { .. } + | SerializeColumnMetadata { .. } => StatusCode::Unexpected, RegionNotFound { .. } => StatusCode::RegionNotFound, ObjectStoreNotFound { .. } diff --git a/src/operator/src/expr_helper.rs b/src/operator/src/expr_helper.rs index 2b2cf7a63e..9f6e514353 100644 --- a/src/operator/src/expr_helper.rs +++ b/src/operator/src/expr_helper.rs @@ -26,9 +26,9 @@ use api::v1::{ ColumnDataType, ColumnDataTypeExtension, CreateFlowExpr, CreateTableExpr, CreateViewExpr, DropColumn, DropColumns, DropDefaults, ExpireAfter, FulltextBackend as PbFulltextBackend, ModifyColumnType, ModifyColumnTypes, RenameTable, SemanticType, SetDatabaseOptions, - SetFulltext, SetIndex, SetInverted, SetSkipping, SetTableOptions, + SetFulltext, SetIndex, SetIndexes, SetInverted, SetSkipping, SetTableOptions, SkippingIndexType as PbSkippingIndexType, TableName, UnsetDatabaseOptions, UnsetFulltext, - UnsetIndex, UnsetInverted, UnsetSkipping, UnsetTableOptions, + UnsetIndex, UnsetIndexes, UnsetInverted, UnsetSkipping, UnsetTableOptions, }; use common_error::ext::BoxedError; use common_grpc_expr::util::ColumnExpr; @@ -441,6 +441,7 @@ fn columns_to_column_schemas( .collect::>>() } +// TODO(weny): refactor this function to use `try_as_column_def` pub fn column_schemas_to_defs( column_schemas: Vec, primary_keys: &[String], @@ -576,64 +577,83 @@ pub(crate) fn to_alter_table_expr( AlterTableOperation::UnsetTableOptions { keys } => { AlterTableKind::UnsetTableOptions(UnsetTableOptions { keys }) } - AlterTableOperation::SetIndex { options } => AlterTableKind::SetIndex(match options { - sql::statements::alter::SetIndexOperation::Fulltext { - column_name, - options, - } => SetIndex { - options: Some(set_index::Options::Fulltext(SetFulltext { - column_name: column_name.value, - enable: options.enable, - analyzer: match options.analyzer { - FulltextAnalyzer::English => Analyzer::English.into(), - FulltextAnalyzer::Chinese => Analyzer::Chinese.into(), - }, - case_sensitive: options.case_sensitive, - backend: match options.backend { - FulltextBackend::Bloom => 
PbFulltextBackend::Bloom.into(), - FulltextBackend::Tantivy => PbFulltextBackend::Tantivy.into(), - }, - granularity: options.granularity as u64, - false_positive_rate: options.false_positive_rate(), - })), - }, - sql::statements::alter::SetIndexOperation::Inverted { column_name } => SetIndex { - options: Some(set_index::Options::Inverted(SetInverted { - column_name: column_name.value, - })), - }, - sql::statements::alter::SetIndexOperation::Skipping { - column_name, - options, - } => SetIndex { - options: Some(set_index::Options::Skipping(SetSkipping { - column_name: column_name.value, - enable: true, - granularity: options.granularity as u64, - false_positive_rate: options.false_positive_rate(), - skipping_index_type: match options.index_type { - SkippingIndexType::BloomFilter => PbSkippingIndexType::BloomFilter.into(), - }, - })), - }, - }), - AlterTableOperation::UnsetIndex { options } => AlterTableKind::UnsetIndex(match options { - sql::statements::alter::UnsetIndexOperation::Fulltext { column_name } => UnsetIndex { - options: Some(unset_index::Options::Fulltext(UnsetFulltext { - column_name: column_name.value, - })), - }, - sql::statements::alter::UnsetIndexOperation::Inverted { column_name } => UnsetIndex { - options: Some(unset_index::Options::Inverted(UnsetInverted { - column_name: column_name.value, - })), - }, - sql::statements::alter::UnsetIndexOperation::Skipping { column_name } => UnsetIndex { - options: Some(unset_index::Options::Skipping(UnsetSkipping { - column_name: column_name.value, - })), - }, - }), + AlterTableOperation::SetIndex { options } => { + let option = match options { + sql::statements::alter::SetIndexOperation::Fulltext { + column_name, + options, + } => SetIndex { + options: Some(set_index::Options::Fulltext(SetFulltext { + column_name: column_name.value, + enable: options.enable, + analyzer: match options.analyzer { + FulltextAnalyzer::English => Analyzer::English.into(), + FulltextAnalyzer::Chinese => Analyzer::Chinese.into(), + }, + case_sensitive: options.case_sensitive, + backend: match options.backend { + FulltextBackend::Bloom => PbFulltextBackend::Bloom.into(), + FulltextBackend::Tantivy => PbFulltextBackend::Tantivy.into(), + }, + granularity: options.granularity as u64, + false_positive_rate: options.false_positive_rate(), + })), + }, + sql::statements::alter::SetIndexOperation::Inverted { column_name } => SetIndex { + options: Some(set_index::Options::Inverted(SetInverted { + column_name: column_name.value, + })), + }, + sql::statements::alter::SetIndexOperation::Skipping { + column_name, + options, + } => SetIndex { + options: Some(set_index::Options::Skipping(SetSkipping { + column_name: column_name.value, + enable: true, + granularity: options.granularity as u64, + false_positive_rate: options.false_positive_rate(), + skipping_index_type: match options.index_type { + SkippingIndexType::BloomFilter => { + PbSkippingIndexType::BloomFilter.into() + } + }, + })), + }, + }; + AlterTableKind::SetIndexes(SetIndexes { + set_indexes: vec![option], + }) + } + AlterTableOperation::UnsetIndex { options } => { + let option = match options { + sql::statements::alter::UnsetIndexOperation::Fulltext { column_name } => { + UnsetIndex { + options: Some(unset_index::Options::Fulltext(UnsetFulltext { + column_name: column_name.value, + })), + } + } + sql::statements::alter::UnsetIndexOperation::Inverted { column_name } => { + UnsetIndex { + options: Some(unset_index::Options::Inverted(UnsetInverted { + column_name: column_name.value, + })), + } + } + 
sql::statements::alter::UnsetIndexOperation::Skipping { column_name } => { + UnsetIndex { + options: Some(unset_index::Options::Skipping(UnsetSkipping { + column_name: column_name.value, + })), + } + } + }; + + AlterTableKind::UnsetIndexes(UnsetIndexes { + unset_indexes: vec![option], + }) + } AlterTableOperation::DropDefaults { columns } => { AlterTableKind::DropDefaults(DropDefaults { drop_defaults: columns diff --git a/src/operator/src/procedure.rs b/src/operator/src/procedure.rs index 87f805acb1..6212bdf4c3 100644 --- a/src/operator/src/procedure.rs +++ b/src/operator/src/procedure.rs @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +use api::v1::meta::ReconcileRequest; use async_trait::async_trait; use catalog::CatalogManagerRef; use common_error::ext::BoxedError; use common_function::handlers::ProcedureServiceHandler; -use common_meta::ddl::{ExecutorContext, ProcedureExecutorRef}; +use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutorRef}; use common_meta::rpc::procedure::{ AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse, RemoveRegionFollowerRequest, @@ -57,6 +58,17 @@ impl ProcedureServiceHandler for ProcedureServiceOperator { .map(|pid| String::from_utf8_lossy(&pid.key).to_string())) } + async fn reconcile(&self, request: ReconcileRequest) -> QueryResult> { + Ok(self + .procedure_executor + .reconcile(&ExecutorContext::default(), request) + .await + .map_err(BoxedError::new) + .context(query_error::ProcedureServiceSnafu)? + .pid + .map(|pid| String::from_utf8_lossy(&pid.key).to_string())) + } + async fn query_procedure_state(&self, pid: &str) -> QueryResult { self.procedure_executor .query_procedure_state(&ExecutorContext::default(), pid) diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index 698396528f..f9b99578cd 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -36,12 +36,12 @@ use client::RecordBatches; use common_error::ext::BoxedError; use common_meta::cache::TableRouteCacheRef; use common_meta::cache_invalidator::CacheInvalidatorRef; -use common_meta::ddl::ProcedureExecutorRef; use common_meta::key::flow::{FlowMetadataManager, FlowMetadataManagerRef}; use common_meta::key::schema_name::SchemaNameKey; use common_meta::key::view_info::{ViewInfoManager, ViewInfoManagerRef}; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; +use common_meta::procedure_executor::ProcedureExecutorRef; use common_query::Output; use common_telemetry::tracing; use common_time::range::TimestampRange; diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs index d0bcc2797c..7e9c4c3e05 100644 --- a/src/operator/src/statement/ddl.rs +++ b/src/operator/src/statement/ddl.rs @@ -31,10 +31,10 @@ use common_catalog::{format_full_flow_name, format_full_table_name}; use common_error::ext::BoxedError; use common_meta::cache_invalidator::Context; use common_meta::ddl::create_flow::FlowType; -use common_meta::ddl::ExecutorContext; use common_meta::instruction::CacheIdent; use common_meta::key::schema_name::{SchemaName, SchemaNameKey}; use common_meta::key::NAME_PATTERN; +use common_meta::procedure_executor::ExecutorContext; #[cfg(feature = "enterprise")] use common_meta::rpc::ddl::trigger::CreateTriggerTask; use common_meta::rpc::ddl::{ @@ -1644,6 +1644,7 @@ fn create_table_info( options: table_options, created_on: Utc::now(), partition_key_indices, + 
column_ids: vec![], }; let desc = if create_table.desc.is_empty() { diff --git a/src/query/src/dist_plan/analyzer/test.rs b/src/query/src/dist_plan/analyzer/test.rs index 863f9fd3b4..22314317a4 100644 --- a/src/query/src/dist_plan/analyzer/test.rs +++ b/src/query/src/dist_plan/analyzer/test.rs @@ -87,6 +87,7 @@ impl TestTable { options: Default::default(), created_on: Default::default(), partition_key_indices: vec![0, 1], + column_ids: vec![], }; let table_info = TableInfoBuilder::default() diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs index c279f92448..ff77eee4fd 100644 --- a/src/store-api/src/metadata.rs +++ b/src/store-api/src/metadata.rs @@ -29,7 +29,7 @@ use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use datatypes::arrow; use datatypes::arrow::datatypes::FieldRef; -use datatypes::schema::{ColumnSchema, FulltextOptions, Schema, SchemaRef, SkippingIndexOptions}; +use datatypes::schema::{ColumnSchema, FulltextOptions, Schema, SchemaRef}; use datatypes::types::TimestampType; use serde::de::Error; use serde::{Deserialize, Deserializer, Serialize}; @@ -37,8 +37,7 @@ use snafu::{ensure, Location, OptionExt, ResultExt, Snafu}; use crate::codec::PrimaryKeyEncoding; use crate::region_request::{ - AddColumn, AddColumnLocation, AlterKind, ApiSetIndexOptions, ApiUnsetIndexOptions, - ModifyColumnType, + AddColumn, AddColumnLocation, AlterKind, ModifyColumnType, SetIndexOption, UnsetIndexOption, }; use crate::storage::consts::is_internal_column; use crate::storage::{ColumnId, RegionId}; @@ -582,30 +581,8 @@ impl RegionMetadataBuilder { AlterKind::AddColumns { columns } => self.add_columns(columns)?, AlterKind::DropColumns { names } => self.drop_columns(&names), AlterKind::ModifyColumnTypes { columns } => self.modify_column_types(columns)?, - AlterKind::SetIndex { options } => match options { - ApiSetIndexOptions::Fulltext { - column_name, - options, - } => self.change_column_fulltext_options(column_name, true, Some(options))?, - ApiSetIndexOptions::Inverted { column_name } => { - self.change_column_inverted_index_options(column_name, true)? - } - ApiSetIndexOptions::Skipping { - column_name, - options, - } => self.change_column_skipping_index_options(column_name, Some(options))?, - }, - AlterKind::UnsetIndex { options } => match options { - ApiUnsetIndexOptions::Fulltext { column_name } => { - self.change_column_fulltext_options(column_name, false, None)? - } - ApiUnsetIndexOptions::Inverted { column_name } => { - self.change_column_inverted_index_options(column_name, false)? - } - ApiUnsetIndexOptions::Skipping { column_name } => { - self.change_column_skipping_index_options(column_name, None)? 
- } - }, + AlterKind::SetIndexes { options } => self.set_indexes(options)?, + AlterKind::UnsetIndexes { options } => self.unset_indexes(options)?, AlterKind::SetRegionOptions { options: _ } => { // nothing to be done with RegionMetadata } @@ -615,6 +592,19 @@ impl RegionMetadataBuilder { AlterKind::DropDefaults { names } => { self.drop_defaults(names)?; } + AlterKind::SyncColumns { column_metadatas } => { + self.primary_key = column_metadatas + .iter() + .filter_map(|column_metadata| { + if column_metadata.semantic_type == SemanticType::Tag { + Some(column_metadata.column_id) + } else { + None + } + }) + .collect::>(); + self.column_metadatas = column_metadatas; + } } Ok(self) } @@ -741,92 +731,124 @@ impl RegionMetadataBuilder { Ok(()) } - fn change_column_inverted_index_options( - &mut self, - column_name: String, - value: bool, - ) -> Result<()> { - for column_meta in self.column_metadatas.iter_mut() { - if column_meta.column_schema.name == column_name { - column_meta.column_schema.set_inverted_index(value) + fn set_indexes(&mut self, options: Vec) -> Result<()> { + let mut set_index_map: HashMap<_, Vec<_>> = HashMap::new(); + for option in &options { + set_index_map + .entry(option.column_name()) + .or_default() + .push(option); + } + + for column_metadata in self.column_metadatas.iter_mut() { + if let Some(options) = set_index_map.remove(&column_metadata.column_schema.name) { + for option in options { + Self::set_index(column_metadata, option)?; + } } } + Ok(()) } - fn change_column_fulltext_options( - &mut self, - column_name: String, - enable: bool, - options: Option, - ) -> Result<()> { - for column_meta in self.column_metadatas.iter_mut() { - if column_meta.column_schema.name == column_name { + fn unset_indexes(&mut self, options: Vec) -> Result<()> { + let mut unset_index_map: HashMap<_, Vec<_>> = HashMap::new(); + for option in &options { + unset_index_map + .entry(option.column_name()) + .or_default() + .push(option); + } + + for column_metadata in self.column_metadatas.iter_mut() { + if let Some(options) = unset_index_map.remove(&column_metadata.column_schema.name) { + for option in options { + Self::unset_index(column_metadata, option)?; + } + } + } + + Ok(()) + } + + fn set_index(column_metadata: &mut ColumnMetadata, options: &SetIndexOption) -> Result<()> { + match options { + SetIndexOption::Fulltext { + column_name, + options, + } => { ensure!( - column_meta.column_schema.data_type.is_string(), + column_metadata.column_schema.data_type.is_string(), + InvalidColumnOptionSnafu { + column_name, + msg: "FULLTEXT index only supports string type".to_string(), + } + ); + let current_fulltext_options = column_metadata + .column_schema + .fulltext_options() + .with_context(|_| GetFulltextOptionsSnafu { + column_name: column_name.to_string(), + })?; + set_column_fulltext_options( + column_metadata, + column_name, + options, + current_fulltext_options, + )?; + } + SetIndexOption::Inverted { .. 
} => { + column_metadata.column_schema.set_inverted_index(true) + } + SetIndexOption::Skipping { + column_name, + options, + } => { + column_metadata + .column_schema + .set_skipping_options(options) + .context(UnsetSkippingIndexOptionsSnafu { column_name })?; + } + } + + Ok(()) + } + + fn unset_index(column_metadata: &mut ColumnMetadata, options: &UnsetIndexOption) -> Result<()> { + match options { + UnsetIndexOption::Fulltext { column_name } => { + ensure!( + column_metadata.column_schema.data_type.is_string(), InvalidColumnOptionSnafu { column_name, msg: "FULLTEXT index only supports string type".to_string(), } ); - let current_fulltext_options = column_meta + let current_fulltext_options = column_metadata .column_schema .fulltext_options() - .context(SetFulltextOptionsSnafu { - column_name: column_name.clone(), + .with_context(|_| GetFulltextOptionsSnafu { + column_name: column_name.to_string(), })?; - if enable { - ensure!( - options.is_some(), - InvalidColumnOptionSnafu { - column_name, - msg: "FULLTEXT index options must be provided", - } - ); - set_column_fulltext_options( - column_meta, - column_name, - options.unwrap(), - current_fulltext_options, - )?; - } else { - unset_column_fulltext_options( - column_meta, - column_name, - current_fulltext_options, - )?; - } - break; + unset_column_fulltext_options( + column_metadata, + column_name, + current_fulltext_options, + )?; + } + UnsetIndexOption::Inverted { .. } => { + column_metadata.column_schema.set_inverted_index(false) + } + UnsetIndexOption::Skipping { column_name } => { + column_metadata + .column_schema + .unset_skipping_options() + .context(UnsetSkippingIndexOptionsSnafu { column_name })?; } } - Ok(()) - } - fn change_column_skipping_index_options( - &mut self, - column_name: String, - options: Option, - ) -> Result<()> { - for column_meta in self.column_metadatas.iter_mut() { - if column_meta.column_schema.name == column_name { - if let Some(options) = &options { - column_meta - .column_schema - .set_skipping_options(options) - .context(UnsetSkippingIndexOptionsSnafu { - column_name: column_name.clone(), - })?; - } else { - column_meta.column_schema.unset_skipping_options().context( - UnsetSkippingIndexOptionsSnafu { - column_name: column_name.clone(), - }, - )?; - } - } - } Ok(()) } @@ -1019,6 +1041,14 @@ pub enum MetadataError { location: Location, }, + #[snafu(display("Failed to get fulltext options for column {}", column_name))] + GetFulltextOptions { + column_name: String, + source: datatypes::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to set skipping index options for column {}", column_name))] SetSkippingIndexOptions { column_name: String, @@ -1094,8 +1124,8 @@ impl ErrorExt for MetadataError { /// * case_sensitive fn set_column_fulltext_options( column_meta: &mut ColumnMetadata, - column_name: String, - options: FulltextOptions, + column_name: &str, + options: &FulltextOptions, current_options: Option, ) -> Result<()> { if let Some(current_options) = current_options { @@ -1112,7 +1142,7 @@ fn set_column_fulltext_options( column_meta .column_schema - .set_fulltext_options(&options) + .set_fulltext_options(options) .context(SetFulltextOptionsSnafu { column_name })?; Ok(()) @@ -1120,7 +1150,7 @@ fn set_column_fulltext_options( fn unset_column_fulltext_options( column_meta: &mut ColumnMetadata, - column_name: String, + column_name: &str, current_options: Option, ) -> Result<()> { if let Some(mut current_options) = current_options @@ -1625,8 +1655,8 @@ mod test { let mut builder = 
RegionMetadataBuilder::from_existing(metadata); builder - .alter(AlterKind::SetIndex { - options: ApiSetIndexOptions::Fulltext { + .alter(AlterKind::SetIndexes { + options: vec![SetIndexOption::Fulltext { column_name: "b".to_string(), options: FulltextOptions::new_unchecked( true, @@ -1636,7 +1666,7 @@ mod test { 1000, 0.01, ), - }, + }], }) .unwrap(); let metadata = builder.build().unwrap(); @@ -1656,10 +1686,10 @@ mod test { let mut builder = RegionMetadataBuilder::from_existing(metadata); builder - .alter(AlterKind::UnsetIndex { - options: ApiUnsetIndexOptions::Fulltext { + .alter(AlterKind::UnsetIndexes { + options: vec![UnsetIndexOption::Fulltext { column_name: "b".to_string(), - }, + }], }) .unwrap(); let metadata = builder.build().unwrap(); diff --git a/src/store-api/src/metric_engine_consts.rs b/src/store-api/src/metric_engine_consts.rs index bf0f405812..9c4d4974a7 100644 --- a/src/store-api/src/metric_engine_consts.rs +++ b/src/store-api/src/metric_engine_consts.rs @@ -73,6 +73,10 @@ pub const LOGICAL_TABLE_METADATA_KEY: &str = "on_physical_table"; /// Represent a list of column metadata that are added to physical table. pub const ALTER_PHYSICAL_EXTENSION_KEY: &str = "ALTER_PHYSICAL"; +/// HashMap key to be used in the region server's extension response. +/// Represent the column metadata of a table. +pub const TABLE_COLUMN_METADATA_EXTENSION_KEY: &str = "TABLE_COLUMN_METADATA"; + /// HashMap key to be used in the region server's extension response. /// Represent the manifest info of a region. pub const MANIFEST_INFO_EXTENSION_KEY: &str = "MANIFEST_INFO"; diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index fc27c2ff10..3cf9bb0fc1 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -460,7 +460,7 @@ pub struct RegionStatistic { } /// The manifest info of a region. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum RegionManifestInfo { Mito { manifest_version: u64, @@ -703,6 +703,7 @@ pub trait RegionEngine: Send + Sync { Ok(RegionResponse { affected_rows, extensions, + metadata: Vec::new(), }) } diff --git a/src/store-api/src/region_request.rs b/src/store-api/src/region_request.rs index 32f5d151b7..2a75c38321 100644 --- a/src/store-api/src/region_request.rs +++ b/src/store-api/src/region_request.rs @@ -27,8 +27,8 @@ use api::v1::region::{ DropRequests, FlushRequest, InsertRequests, OpenRequest, TruncateRequest, }; use api::v1::{ - self, set_index, Analyzer, ArrowIpc, FulltextBackend as PbFulltextBackend, Option as PbOption, - Rows, SemanticType, SkippingIndexType as PbSkippingIndexType, WriteHint, + self, Analyzer, ArrowIpc, FulltextBackend as PbFulltextBackend, Option as PbOption, Rows, + SemanticType, SkippingIndexType as PbSkippingIndexType, WriteHint, }; pub use common_base::AffectedRows; use common_grpc::flight::FlightDecoder; @@ -154,6 +154,10 @@ impl RegionRequest { reason: "Sync request should be handled separately by RegionServer", } .fail(), + region_request::Body::ListMetadata(_) => UnexpectedSnafu { + reason: "ListMetadata request should be handled separately by RegionServer", + } + .fail(), } } @@ -527,18 +531,22 @@ pub enum AlterKind { /// Unset region options. UnsetRegionOptions { keys: Vec }, /// Set index options. - SetIndex { options: ApiSetIndexOptions }, + SetIndexes { options: Vec }, /// Unset index options. 
- UnsetIndex { options: ApiUnsetIndexOptions }, + UnsetIndexes { options: Vec }, /// Drop column default value. DropDefaults { /// Name of columns to drop. names: Vec, }, + /// Sync column metadatas. + SyncColumns { + column_metadatas: Vec, + }, } #[derive(Debug, PartialEq, Eq, Clone)] -pub enum ApiSetIndexOptions { +pub enum SetIndexOption { Fulltext { column_name: String, options: FulltextOptions, @@ -552,49 +560,121 @@ pub enum ApiSetIndexOptions { }, } -impl ApiSetIndexOptions { +impl SetIndexOption { + /// Returns the column name of the index option. pub fn column_name(&self) -> &String { match self { - ApiSetIndexOptions::Fulltext { column_name, .. } => column_name, - ApiSetIndexOptions::Inverted { column_name } => column_name, - ApiSetIndexOptions::Skipping { column_name, .. } => column_name, + SetIndexOption::Fulltext { column_name, .. } => column_name, + SetIndexOption::Inverted { column_name } => column_name, + SetIndexOption::Skipping { column_name, .. } => column_name, } } + /// Returns true if the index option is fulltext. pub fn is_fulltext(&self) -> bool { match self { - ApiSetIndexOptions::Fulltext { .. } => true, - ApiSetIndexOptions::Inverted { .. } => false, - ApiSetIndexOptions::Skipping { .. } => false, + SetIndexOption::Fulltext { .. } => true, + SetIndexOption::Inverted { .. } => false, + SetIndexOption::Skipping { .. } => false, } } } +impl TryFrom for SetIndexOption { + type Error = MetadataError; + + fn try_from(value: v1::SetIndex) -> Result { + let option = value.options.context(InvalidRawRegionRequestSnafu { + err: "missing options in SetIndex", + })?; + + let opt = match option { + v1::set_index::Options::Fulltext(x) => SetIndexOption::Fulltext { + column_name: x.column_name.clone(), + options: FulltextOptions::new( + x.enable, + as_fulltext_option_analyzer( + Analyzer::try_from(x.analyzer).context(DecodeProtoSnafu)?, + ), + x.case_sensitive, + as_fulltext_option_backend( + PbFulltextBackend::try_from(x.backend).context(DecodeProtoSnafu)?, + ), + x.granularity as u32, + x.false_positive_rate, + ) + .context(InvalidIndexOptionSnafu)?, + }, + v1::set_index::Options::Inverted(i) => SetIndexOption::Inverted { + column_name: i.column_name, + }, + v1::set_index::Options::Skipping(s) => SetIndexOption::Skipping { + column_name: s.column_name, + options: SkippingIndexOptions::new( + s.granularity as u32, + s.false_positive_rate, + as_skipping_index_type( + PbSkippingIndexType::try_from(s.skipping_index_type) + .context(DecodeProtoSnafu)?, + ), + ) + .context(InvalidIndexOptionSnafu)?, + }, + }; + + Ok(opt) + } +} + #[derive(Debug, PartialEq, Eq, Clone)] -pub enum ApiUnsetIndexOptions { +pub enum UnsetIndexOption { Fulltext { column_name: String }, Inverted { column_name: String }, Skipping { column_name: String }, } -impl ApiUnsetIndexOptions { +impl UnsetIndexOption { pub fn column_name(&self) -> &String { match self { - ApiUnsetIndexOptions::Fulltext { column_name } => column_name, - ApiUnsetIndexOptions::Inverted { column_name } => column_name, - ApiUnsetIndexOptions::Skipping { column_name } => column_name, + UnsetIndexOption::Fulltext { column_name } => column_name, + UnsetIndexOption::Inverted { column_name } => column_name, + UnsetIndexOption::Skipping { column_name } => column_name, } } pub fn is_fulltext(&self) -> bool { match self { - ApiUnsetIndexOptions::Fulltext { .. } => true, - ApiUnsetIndexOptions::Inverted { .. } => false, - ApiUnsetIndexOptions::Skipping { .. } => false, + UnsetIndexOption::Fulltext { .. } => true, + UnsetIndexOption::Inverted { .. 
} => false, + UnsetIndexOption::Skipping { .. } => false, } } } +impl TryFrom for UnsetIndexOption { + type Error = MetadataError; + + fn try_from(value: v1::UnsetIndex) -> Result { + let option = value.options.context(InvalidRawRegionRequestSnafu { + err: "missing options in UnsetIndex", + })?; + + let opt = match option { + v1::unset_index::Options::Fulltext(f) => UnsetIndexOption::Fulltext { + column_name: f.column_name, + }, + v1::unset_index::Options::Inverted(i) => UnsetIndexOption::Inverted { + column_name: i.column_name, + }, + v1::unset_index::Options::Skipping(s) => UnsetIndexOption::Skipping { + column_name: s.column_name, + }, + }; + + Ok(opt) + } +} + impl AlterKind { /// Returns an error if the alter kind is invalid. /// @@ -618,25 +698,91 @@ impl AlterKind { } AlterKind::SetRegionOptions { .. } => {} AlterKind::UnsetRegionOptions { .. } => {} - AlterKind::SetIndex { options } => { - Self::validate_column_alter_index_option( - options.column_name(), - metadata, - options.is_fulltext(), - )?; + AlterKind::SetIndexes { options } => { + for option in options { + Self::validate_column_alter_index_option( + option.column_name(), + metadata, + option.is_fulltext(), + )?; + } } - AlterKind::UnsetIndex { options } => { - Self::validate_column_alter_index_option( - options.column_name(), - metadata, - options.is_fulltext(), - )?; + AlterKind::UnsetIndexes { options } => { + for option in options { + Self::validate_column_alter_index_option( + option.column_name(), + metadata, + option.is_fulltext(), + )?; + } } AlterKind::DropDefaults { names } => { names .iter() .try_for_each(|name| Self::validate_column_to_drop(name, metadata))?; } + AlterKind::SyncColumns { column_metadatas } => { + let new_primary_keys = column_metadatas + .iter() + .filter(|c| c.semantic_type == SemanticType::Tag) + .map(|c| (c.column_schema.name.as_str(), c.column_id)) + .collect::>(); + + let old_primary_keys = metadata + .column_metadatas + .iter() + .filter(|c| c.semantic_type == SemanticType::Tag) + .map(|c| (c.column_schema.name.as_str(), c.column_id)); + + for (name, id) in old_primary_keys { + let primary_key = + new_primary_keys + .get(name) + .with_context(|| InvalidRegionRequestSnafu { + region_id: metadata.region_id, + err: format!("column {} is not a primary key", name), + })?; + + ensure!( + *primary_key == id, + InvalidRegionRequestSnafu { + region_id: metadata.region_id, + err: format!( + "column with same name {} has different id, existing: {}, got: {}", + name, id, primary_key + ), + } + ); + } + + let new_ts_column = column_metadatas + .iter() + .find(|c| c.semantic_type == SemanticType::Timestamp) + .map(|c| (c.column_schema.name.as_str(), c.column_id)) + .context(InvalidRegionRequestSnafu { + region_id: metadata.region_id, + err: "timestamp column not found", + })?; + + // Safety: timestamp column must exist. + let old_ts_column = metadata + .column_metadatas + .iter() + .find(|c| c.semantic_type == SemanticType::Timestamp) + .map(|c| (c.column_schema.name.as_str(), c.column_id)) + .unwrap(); + + ensure!( + new_ts_column == old_ts_column, + InvalidRegionRequestSnafu { + region_id: metadata.region_id, + err: format!( + "timestamp column {} has different id, existing: {}, got: {}", + old_ts_column.0, old_ts_column.1, new_ts_column.1 + ), + } + ); + } } Ok(()) } @@ -660,15 +806,18 @@ impl AlterKind { true } AlterKind::UnsetRegionOptions { .. } => true, - AlterKind::SetIndex { options, .. 
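// Self-contained sketch of the `SyncColumns` validation above: every primary
// key column of the existing region must appear in the incoming column
// metadata with the same column id, and the time index column must keep both
// its name and id. `Col` is a stand-in for `ColumnMetadata`.

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Semantic { Tag, Field, Timestamp }

#[derive(Clone, Debug)]
struct Col { name: String, id: u32, semantic: Semantic }

fn validate_sync_columns(existing: &[Col], incoming: &[Col]) -> Result<(), String> {
    let new_pks: HashMap<&str, u32> = incoming
        .iter()
        .filter(|c| c.semantic == Semantic::Tag)
        .map(|c| (c.name.as_str(), c.id))
        .collect();

    for c in existing.iter().filter(|c| c.semantic == Semantic::Tag) {
        let id = new_pks
            .get(c.name.as_str())
            .ok_or_else(|| format!("column {} is not a primary key", c.name))?;
        if *id != c.id {
            return Err(format!(
                "column with same name {} has different id, existing: {}, got: {}",
                c.name, c.id, id
            ));
        }
    }

    let new_ts = incoming
        .iter()
        .find(|c| c.semantic == Semantic::Timestamp)
        .ok_or("timestamp column not found")?;
    let old_ts = existing
        .iter()
        .find(|c| c.semantic == Semantic::Timestamp)
        .expect("existing metadata always has a time index");
    if (new_ts.name.as_str(), new_ts.id) != (old_ts.name.as_str(), old_ts.id) {
        return Err(format!(
            "timestamp column {} has different id, existing: {}, got: {}",
            old_ts.name, old_ts.id, new_ts.id
        ));
    }
    Ok(())
}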
} => { - metadata.column_by_name(options.column_name()).is_some() - } - AlterKind::UnsetIndex { options } => { - metadata.column_by_name(options.column_name()).is_some() - } + AlterKind::SetIndexes { options, .. } => options + .iter() + .any(|option| metadata.column_by_name(option.column_name()).is_some()), + AlterKind::UnsetIndexes { options } => options + .iter() + .any(|option| metadata.column_by_name(option.column_name()).is_some()), AlterKind::DropDefaults { names } => names .iter() .any(|name| metadata.column_by_name(name).is_some()), + AlterKind::SyncColumns { column_metadatas } => { + metadata.column_metadatas != *column_metadatas + } } } @@ -756,65 +905,36 @@ impl TryFrom for AlterKind { .map(|key| UnsetRegionOption::try_from(key.as_str())) .collect::>>()?, }, - alter_request::Kind::SetIndex(o) => match o.options.unwrap() { - set_index::Options::Fulltext(x) => AlterKind::SetIndex { - options: ApiSetIndexOptions::Fulltext { - column_name: x.column_name.clone(), - options: FulltextOptions::new( - x.enable, - as_fulltext_option_analyzer( - Analyzer::try_from(x.analyzer).context(DecodeProtoSnafu)?, - ), - x.case_sensitive, - as_fulltext_option_backend( - PbFulltextBackend::try_from(x.backend).context(DecodeProtoSnafu)?, - ), - x.granularity as u32, - x.false_positive_rate, - ) - .context(InvalidIndexOptionSnafu)?, - }, - }, - set_index::Options::Inverted(i) => AlterKind::SetIndex { - options: ApiSetIndexOptions::Inverted { - column_name: i.column_name, - }, - }, - set_index::Options::Skipping(s) => AlterKind::SetIndex { - options: ApiSetIndexOptions::Skipping { - column_name: s.column_name, - options: SkippingIndexOptions::new( - s.granularity as u32, - s.false_positive_rate, - as_skipping_index_type( - PbSkippingIndexType::try_from(s.skipping_index_type) - .context(DecodeProtoSnafu)?, - ), - ) - .context(InvalidIndexOptionSnafu)?, - }, - }, + alter_request::Kind::SetIndex(o) => AlterKind::SetIndexes { + options: vec![SetIndexOption::try_from(o)?], }, - alter_request::Kind::UnsetIndex(o) => match o.options.unwrap() { - v1::unset_index::Options::Fulltext(f) => AlterKind::UnsetIndex { - options: ApiUnsetIndexOptions::Fulltext { - column_name: f.column_name, - }, - }, - v1::unset_index::Options::Inverted(i) => AlterKind::UnsetIndex { - options: ApiUnsetIndexOptions::Inverted { - column_name: i.column_name, - }, - }, - v1::unset_index::Options::Skipping(s) => AlterKind::UnsetIndex { - options: ApiUnsetIndexOptions::Skipping { - column_name: s.column_name, - }, - }, + alter_request::Kind::UnsetIndex(o) => AlterKind::UnsetIndexes { + options: vec![UnsetIndexOption::try_from(o)?], + }, + alter_request::Kind::SetIndexes(o) => AlterKind::SetIndexes { + options: o + .set_indexes + .into_iter() + .map(SetIndexOption::try_from) + .collect::>>()?, + }, + alter_request::Kind::UnsetIndexes(o) => AlterKind::UnsetIndexes { + options: o + .unset_indexes + .into_iter() + .map(UnsetIndexOption::try_from) + .collect::>>()?, }, alter_request::Kind::DropDefaults(x) => AlterKind::DropDefaults { names: x.drop_defaults.into_iter().map(|x| x.column_name).collect(), }, + alter_request::Kind::SyncColumns(x) => AlterKind::SyncColumns { + column_metadatas: x + .column_defs + .into_iter() + .map(ColumnMetadata::try_from_column_def) + .collect::>>()?, + }, }; Ok(alter_kind) @@ -1187,6 +1307,7 @@ impl fmt::Display for RegionRequest { #[cfg(test)] mod tests { + use api::v1::region::RegionColumnDef; use api::v1::{ColumnDataType, ColumnDef}; use datatypes::prelude::ConcreteDataType; @@ -1646,8 +1767,8 @@ mod tests { 
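// Sketch of the backward-compatibility mapping above: the legacy singular
// `SetIndex` / `UnsetIndex` proto kinds are wrapped into one-element vectors,
// while the new batched kinds are converted item by item, so both wire shapes
// land on the same internal plural variant. Types are simplified stand-ins,
// not the generated proto API.

#[derive(Debug)]
enum PbKind {
    SetIndex(String),        // legacy: one column per request
    SetIndexes(Vec<String>), // new: many columns per request
}

#[derive(Debug)]
enum Internal {
    SetIndexes { columns: Vec<String> },
}

fn from_pb(kind: PbKind) -> Internal {
    match kind {
        PbKind::SetIndex(col) => Internal::SetIndexes { columns: vec![col] },
        PbKind::SetIndexes(cols) => Internal::SetIndexes { columns: cols },
    }
}

fn main() {
    println!("{:?}", from_pb(PbKind::SetIndex("host".into())));
    println!("{:?}", from_pb(PbKind::SetIndexes(vec!["a".into(), "b".into()])));
}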
#[test] fn test_validate_modify_column_fulltext_options() { - let kind = AlterKind::SetIndex { - options: ApiSetIndexOptions::Fulltext { + let kind = AlterKind::SetIndexes { + options: vec![SetIndexOption::Fulltext { column_name: "tag_0".to_string(), options: FulltextOptions::new_unchecked( true, @@ -1657,21 +1778,93 @@ mod tests { 1000, 0.01, ), - }, + }], }; let request = RegionAlterRequest { kind }; let mut metadata = new_metadata(); metadata.schema_version = 1; request.validate(&metadata).unwrap(); - let kind = AlterKind::UnsetIndex { - options: ApiUnsetIndexOptions::Fulltext { + let kind = AlterKind::UnsetIndexes { + options: vec![UnsetIndexOption::Fulltext { column_name: "tag_0".to_string(), - }, + }], }; let request = RegionAlterRequest { kind }; let mut metadata = new_metadata(); metadata.schema_version = 1; request.validate(&metadata).unwrap(); } + + #[test] + fn test_validate_sync_columns() { + let metadata = new_metadata(); + let kind = AlterKind::SyncColumns { + column_metadatas: vec![ + ColumnMetadata { + column_schema: ColumnSchema::new( + "tag_1", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Tag, + column_id: 5, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "field_2", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 6, + }, + ], + }; + let err = kind.validate(&metadata).unwrap_err(); + assert!(err.to_string().contains("not a primary key")); + + // Change the timestamp column name. + let mut column_metadatas_with_different_ts_column = metadata.column_metadatas.clone(); + let ts_column = column_metadatas_with_different_ts_column + .iter_mut() + .find(|c| c.semantic_type == SemanticType::Timestamp) + .unwrap(); + ts_column.column_schema.name = "ts1".to_string(); + + let kind = AlterKind::SyncColumns { + column_metadatas: column_metadatas_with_different_ts_column, + }; + let err = kind.validate(&metadata).unwrap_err(); + assert!(err + .to_string() + .contains("timestamp column ts has different id")); + + // Change the primary key column name. + let mut column_metadatas_with_different_pk_column = metadata.column_metadatas.clone(); + let pk_column = column_metadatas_with_different_pk_column + .iter_mut() + .find(|c| c.column_schema.name == "tag_0") + .unwrap(); + pk_column.column_id = 100; + let kind = AlterKind::SyncColumns { + column_metadatas: column_metadatas_with_different_pk_column, + }; + let err = kind.validate(&metadata).unwrap_err(); + assert!(err + .to_string() + .contains("column with same name tag_0 has different id")); + + // Add a new field column. 
+ let mut column_metadatas_with_new_field_column = metadata.column_metadatas.clone(); + column_metadatas_with_new_field_column.push(ColumnMetadata { + column_schema: ColumnSchema::new("field_2", ConcreteDataType::string_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 4, + }); + let kind = AlterKind::SyncColumns { + column_metadatas: column_metadatas_with_new_field_column, + }; + kind.validate(&metadata).unwrap(); + } } diff --git a/src/table/src/metadata.rs b/src/table/src/metadata.rs index f68d85164d..3f1ca58f03 100644 --- a/src/table/src/metadata.rs +++ b/src/table/src/metadata.rs @@ -35,9 +35,10 @@ use store_api::storage::{ColumnDescriptor, ColumnDescriptorBuilder, ColumnId, Re use crate::error::{self, Result}; use crate::requests::{ - AddColumnRequest, AlterKind, ModifyColumnTypeRequest, SetIndexOptions, TableOptions, - UnsetIndexOptions, + AddColumnRequest, AlterKind, ModifyColumnTypeRequest, SetIndexOption, TableOptions, + UnsetIndexOption, }; +use crate::table_reference::TableReference; pub type TableId = u32; pub type TableVersion = u64; @@ -134,6 +135,8 @@ pub struct TableMeta { pub created_on: DateTime, #[builder(default = "Vec::new()")] pub partition_key_indices: Vec, + #[builder(default = "Vec::new()")] + pub column_ids: Vec, } impl TableMetaBuilder { @@ -150,6 +153,7 @@ impl TableMetaBuilder { options: None, created_on: None, partition_key_indices: None, + column_ids: None, } } } @@ -178,6 +182,7 @@ impl TableMetaBuilder { options: None, created_on: None, partition_key_indices: None, + column_ids: None, } } } @@ -231,39 +236,8 @@ impl TableMeta { AlterKind::RenameTable { .. } => Ok(self.new_meta_builder()), AlterKind::SetTableOptions { options } => self.set_table_options(options), AlterKind::UnsetTableOptions { keys } => self.unset_table_options(keys), - AlterKind::SetIndex { options } => match options { - SetIndexOptions::Fulltext { - column_name, - options, - } => self.change_column_fulltext_options( - table_name, - column_name, - true, - Some(options), - ), - SetIndexOptions::Inverted { column_name } => { - self.change_column_modify_inverted_index(table_name, column_name, true) - } - SetIndexOptions::Skipping { - column_name, - options, - } => self.change_column_skipping_index_options( - table_name, - column_name, - Some(options), - ), - }, - AlterKind::UnsetIndex { options } => match options { - UnsetIndexOptions::Fulltext { column_name } => { - self.change_column_fulltext_options(table_name, column_name, false, None) - } - UnsetIndexOptions::Inverted { column_name } => { - self.change_column_modify_inverted_index(table_name, column_name, false) - } - UnsetIndexOptions::Skipping { column_name } => { - self.change_column_skipping_index_options(table_name, column_name, None) - } - }, + AlterKind::SetIndexes { options } => self.set_indexes(table_name, options), + AlterKind::UnsetIndexes { options } => self.unset_indexes(table_name, options), AlterKind::DropDefaults { names } => self.drop_defaults(table_name, names), } } @@ -305,30 +279,38 @@ impl TableMeta { self.set_table_options(&requests) } - /// Creates a [TableMetaBuilder] with modified column inverted index. 
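// Sketch of what persisting `column_ids` in `TableMeta` buys: with the ids
// stored next to the schema, the table metadata alone is enough to rebuild a
// name -> column id mapping during metadata recovery. An empty vector (for
// example, metadata created before this change) simply means "ids unknown",
// and the real `name_to_ids` later in this patch returns `None` in that case
// so callers can fall back. Stand-in types, not the real `TableMeta`.

type ColumnId = u32;

struct TableMetaSketch {
    column_names: Vec<String>, // stands in for the schema's column order
    column_ids: Vec<ColumnId>, // parallel to `column_names`; may be empty
}

impl TableMetaSketch {
    // Returns `None` when the ids were not persisted or are out of sync.
    fn name_to_id(&self) -> Option<std::collections::HashMap<&str, ColumnId>> {
        if self.column_ids.len() != self.column_names.len() {
            return None;
        }
        Some(
            self.column_names
                .iter()
                .map(String::as_str)
                .zip(self.column_ids.iter().copied())
                .collect(),
        )
    }
}

fn main() {
    let old = TableMetaSketch { column_names: vec!["ts".into(), "host".into()], column_ids: vec![] };
    let new = TableMetaSketch { column_names: vec!["ts".into(), "host".into()], column_ids: vec![0, 1] };
    assert!(old.name_to_id().is_none());
    assert_eq!(new.name_to_id().unwrap()["host"], 1);
}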
- fn change_column_modify_inverted_index( + fn set_indexes( &self, table_name: &str, - column_name: &str, - value: bool, + requests: &[SetIndexOption], ) -> Result { let table_schema = &self.schema; - let mut meta_builder = self.new_meta_builder(); - - let mut columns: Vec = - Vec::with_capacity(table_schema.column_schemas().len()); - - for column_schema in table_schema.column_schemas().iter() { - if column_schema.name == column_name { - let mut new_column_schema = column_schema.clone(); - new_column_schema.set_inverted_index(value); - columns.push(new_column_schema); - } else { - columns.push(column_schema.clone()); - } + let mut set_index_options: HashMap<&str, Vec<_>> = HashMap::new(); + for request in requests { + let column_name = request.column_name(); + table_schema + .column_index_by_name(column_name) + .with_context(|| error::ColumnNotExistsSnafu { + column_name, + table_name, + })?; + set_index_options + .entry(column_name) + .or_default() + .push(request); + } + + let mut meta_builder = self.new_meta_builder(); + let mut columns: Vec<_> = Vec::with_capacity(table_schema.column_schemas().len()); + for mut column in table_schema.column_schemas().iter().cloned() { + if let Some(request) = set_index_options.get(column.name.as_str()) { + for request in request { + self.set_index(&mut column, request)?; + } + } + columns.push(column); } - // TODO(CookiePieWw): This part for all alter table operations is similar. We can refactor it. let mut builder = SchemaBuilder::try_from_columns(columns) .with_context(|_| error::SchemaBuildSnafu { msg: format!("Failed to convert column schemas into schema for table {table_name}"), @@ -339,12 +321,17 @@ impl TableMeta { builder = builder.add_metadata(k, v); } - let new_schema = builder.build().with_context(|_| error::SchemaBuildSnafu { - msg: format!( - "Table {table_name} cannot change fulltext options for column {column_name}", - ), + let new_schema = builder.build().with_context(|_| { + let column_names = requests + .iter() + .map(|request| request.column_name()) + .collect::>(); + error::SchemaBuildSnafu { + msg: format!( + "Table {table_name} cannot set index options with columns {column_names:?}", + ), + } })?; - let _ = meta_builder .schema(Arc::new(new_schema)) .primary_key_indices(self.primary_key_indices.clone()); @@ -352,68 +339,38 @@ impl TableMeta { Ok(meta_builder) } - /// Creates a [TableMetaBuilder] with modified column fulltext options. 
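// Sketch of the grouping strategy used by `set_indexes` above: validate every
// referenced column up front, bucket the options by column name, then rebuild
// the column list in a single pass, applying all options for a column at once.
// Stand-in types; the real code mutates `ColumnSchema` and rebuilds a `Schema`.

use std::collections::HashMap;

#[derive(Clone, Debug)]
struct ColSketch {
    name: String,
    inverted: bool,
    fulltext: bool,
}

enum IndexOpt {
    Inverted { column: String },
    Fulltext { column: String },
}

impl IndexOpt {
    fn column(&self) -> &str {
        match self {
            IndexOpt::Inverted { column } | IndexOpt::Fulltext { column } => column,
        }
    }
}

fn set_indexes(columns: &[ColSketch], opts: &[IndexOpt]) -> Result<Vec<ColSketch>, String> {
    // Fail fast if an option names a column that does not exist, and group the
    // rest so each column is rewritten only once.
    let mut by_column: HashMap<&str, Vec<&IndexOpt>> = HashMap::new();
    for opt in opts {
        if !columns.iter().any(|c| c.name == opt.column()) {
            return Err(format!("column {} not found", opt.column()));
        }
        by_column.entry(opt.column()).or_default().push(opt);
    }

    Ok(columns
        .iter()
        .cloned()
        .map(|mut col| {
            if let Some(opts) = by_column.get(col.name.as_str()) {
                for opt in opts {
                    match opt {
                        IndexOpt::Inverted { .. } => col.inverted = true,
                        IndexOpt::Fulltext { .. } => col.fulltext = true,
                    }
                }
            }
            col
        })
        .collect())
}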
- fn change_column_fulltext_options( + fn unset_indexes( &self, table_name: &str, - column_name: &str, - enable: bool, - options: Option<&FulltextOptions>, + requests: &[UnsetIndexOption], ) -> Result { let table_schema = &self.schema; - let mut meta_builder = self.new_meta_builder(); - - let column = &table_schema - .column_schema_by_name(column_name) - .with_context(|| error::ColumnNotExistsSnafu { - column_name, - table_name, - })?; - - ensure!( - column.data_type.is_string(), - error::InvalidColumnOptionSnafu { - column_name, - msg: "FULLTEXT index only supports string type", - } - ); - - let current_fulltext_options = column - .fulltext_options() - .context(error::SetFulltextOptionsSnafu { column_name })?; - - let mut columns = Vec::with_capacity(table_schema.column_schemas().len()); - for column_schema in table_schema.column_schemas() { - if column_schema.name == column_name { - let mut new_column_schema = column_schema.clone(); - if enable { - ensure!( - options.is_some(), - error::InvalidColumnOptionSnafu { - column_name, - msg: "FULLTEXT index options must be provided", - } - ); - set_column_fulltext_options( - &mut new_column_schema, - column_name, - options.unwrap(), - current_fulltext_options.clone(), - )? - } else { - unset_column_fulltext_options( - &mut new_column_schema, - column_name, - current_fulltext_options.clone(), - )? - } - columns.push(new_column_schema); - } else { - columns.push(column_schema.clone()); - } + let mut set_index_options: HashMap<&str, Vec<_>> = HashMap::new(); + for request in requests { + let column_name = request.column_name(); + table_schema + .column_index_by_name(column_name) + .with_context(|| error::ColumnNotExistsSnafu { + column_name, + table_name, + })?; + set_index_options + .entry(column_name) + .or_default() + .push(request); + } + + let mut meta_builder = self.new_meta_builder(); + let mut columns: Vec<_> = Vec::with_capacity(table_schema.column_schemas().len()); + for mut column in table_schema.column_schemas().iter().cloned() { + if let Some(request) = set_index_options.get(column.name.as_str()) { + for request in request { + self.unset_index(&mut column, request)?; + } + } + columns.push(column); } - // TODO(CookiePieWw): This part for all alter table operations is similar. We can refactor it. let mut builder = SchemaBuilder::try_from_columns(columns) .with_context(|_| error::SchemaBuildSnafu { msg: format!("Failed to convert column schemas into schema for table {table_name}"), @@ -424,12 +381,17 @@ impl TableMeta { builder = builder.add_metadata(k, v); } - let new_schema = builder.build().with_context(|_| error::SchemaBuildSnafu { - msg: format!( - "Table {table_name} cannot change fulltext options for column {column_name}", - ), + let new_schema = builder.build().with_context(|_| { + let column_names = requests + .iter() + .map(|request| request.column_name()) + .collect::>(); + error::SchemaBuildSnafu { + msg: format!( + "Table {table_name} cannot set index options with columns {column_names:?}", + ), + } })?; - let _ = meta_builder .schema(Arc::new(new_schema)) .primary_key_indices(self.primary_key_indices.clone()); @@ -437,54 +399,70 @@ impl TableMeta { Ok(meta_builder) } - /// Creates a [TableMetaBuilder] with modified column skipping index options. 
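// Sketch of the per-column application that the refactor funnels everything
// into (see `set_index` / `unset_index` below): one small function mutates a
// single column schema, and the only kind-specific rule kept from the old
// per-index methods is that a FULLTEXT index requires a string column.
// Stand-in types chosen for illustration.

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum DataType { String, Int64 }

#[derive(Debug)]
struct ColumnSketch {
    name: String,
    data_type: DataType,
    inverted: bool,
    fulltext: bool,
    skipping_granularity: Option<u32>,
}

enum SetIndex {
    Fulltext,
    Inverted,
    Skipping { granularity: u32 },
}

fn apply_set_index(col: &mut ColumnSketch, opt: &SetIndex) -> Result<(), String> {
    match opt {
        SetIndex::Fulltext => {
            if col.data_type != DataType::String {
                return Err(format!(
                    "FULLTEXT index only supports string type, column: {}",
                    col.name
                ));
            }
            col.fulltext = true;
        }
        SetIndex::Inverted => col.inverted = true,
        SetIndex::Skipping { granularity } => col.skipping_granularity = Some(*granularity),
    }
    Ok(())
}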
- fn change_column_skipping_index_options( - &self, - table_name: &str, - column_name: &str, - options: Option<&SkippingIndexOptions>, - ) -> Result { - let table_schema = &self.schema; - let mut meta_builder = self.new_meta_builder(); - - let mut columns = Vec::with_capacity(table_schema.column_schemas().len()); - for column_schema in table_schema.column_schemas() { - if column_schema.name == column_name { - let mut new_column_schema = column_schema.clone(); - if let Some(options) = options { - set_column_skipping_index_options( - &mut new_column_schema, + fn set_index(&self, column_schema: &mut ColumnSchema, request: &SetIndexOption) -> Result<()> { + match request { + SetIndexOption::Fulltext { + column_name, + options, + } => { + ensure!( + column_schema.data_type.is_string(), + error::InvalidColumnOptionSnafu { column_name, - options, - )?; - } else { - unset_column_skipping_index_options(&mut new_column_schema, column_name)?; - } - columns.push(new_column_schema); - } else { - columns.push(column_schema.clone()); + msg: "FULLTEXT index only supports string type", + } + ); + + let current_fulltext_options = column_schema + .fulltext_options() + .context(error::SetFulltextOptionsSnafu { column_name })?; + set_column_fulltext_options( + column_schema, + column_name, + options, + current_fulltext_options, + )?; + } + SetIndexOption::Inverted { column_name } => { + debug_assert_eq!(column_schema.name, *column_name); + column_schema.set_inverted_index(true); + } + SetIndexOption::Skipping { + column_name, + options, + } => { + set_column_skipping_index_options(column_schema, column_name, options)?; } } - let mut builder = SchemaBuilder::try_from_columns(columns) - .with_context(|_| error::SchemaBuildSnafu { - msg: format!("Failed to convert column schemas into schema for table {table_name}"), - })? - .version(table_schema.version() + 1); + Ok(()) + } - for (k, v) in table_schema.metadata().iter() { - builder = builder.add_metadata(k, v); + fn unset_index( + &self, + column_schema: &mut ColumnSchema, + request: &UnsetIndexOption, + ) -> Result<()> { + match request { + UnsetIndexOption::Fulltext { column_name } => { + let current_fulltext_options = column_schema + .fulltext_options() + .context(error::SetFulltextOptionsSnafu { column_name })?; + unset_column_fulltext_options( + column_schema, + column_name, + current_fulltext_options.clone(), + )? + } + UnsetIndexOption::Inverted { .. } => { + column_schema.set_inverted_index(false); + } + UnsetIndexOption::Skipping { column_name } => { + unset_column_skipping_index_options(column_schema, column_name)?; + } } - let new_schema = builder.build().with_context(|_| error::SchemaBuildSnafu { - msg: format!("Failed to convert column schemas into schema for table {table_name}"), - })?; - - let _ = meta_builder - .schema(Arc::new(new_schema)) - .primary_key_indices(self.primary_key_indices.clone()); - - Ok(meta_builder) + Ok(()) } // TODO(yingwen): Remove this. @@ -1109,12 +1087,13 @@ pub struct RawTableMeta { /// The indices of columns in primary key. Note that the index of timestamp column /// is not included. Order matters to this array. pub primary_key_indices: Vec, - /// The indices of columns in value. Order doesn't matter to this array. + /// The indices of columns in value. The index of timestamp column is included. + /// Order doesn't matter to this array. pub value_indices: Vec, /// Engine type of this table. Usually in small case. pub engine: String, /// Next column id of a new column. - /// Deprecated. 
See https://github.com/GreptimeTeam/greptimedb/issues/2982 + /// It's used to ensure all columns with the same name across all regions have the same column id. pub next_column_id: ColumnId, pub region_numbers: Vec, pub options: TableOptions, @@ -1122,6 +1101,10 @@ pub struct RawTableMeta { /// Order doesn't matter to this array. #[serde(default)] pub partition_key_indices: Vec, + /// Map of column name to column id. + /// Note: This field may be empty for older versions that did not include this field. + #[serde(default)] + pub column_ids: Vec, } impl From for RawTableMeta { @@ -1136,6 +1119,7 @@ impl From for RawTableMeta { options: meta.options, created_on: meta.created_on, partition_key_indices: meta.partition_key_indices, + column_ids: meta.column_ids, } } } @@ -1154,6 +1138,7 @@ impl TryFrom for TableMeta { options: raw.options, created_on: raw.created_on, partition_key_indices: raw.partition_key_indices, + column_ids: raw.column_ids, }) } } @@ -1171,6 +1156,24 @@ pub struct RawTableInfo { } impl RawTableInfo { + /// Returns the map of column name to column id. + /// + /// Note: This method may return an empty map for older versions that did not include this field. + pub fn name_to_ids(&self) -> Option> { + if self.meta.column_ids.len() != self.meta.schema.column_schemas.len() { + None + } else { + Some( + self.meta + .column_ids + .iter() + .enumerate() + .map(|(index, id)| (self.meta.schema.column_schemas[index].name.clone(), *id)) + .collect(), + ) + } + } + /// Sort the columns in [RawTableInfo], logical tables require it. pub fn sort_columns(&mut self) { let column_schemas = &self.meta.schema.column_schemas; @@ -1181,6 +1184,7 @@ impl RawTableInfo { .map(|index| column_schemas[*index].name.clone()) .collect::>(); + let name_to_ids = self.name_to_ids().unwrap_or_default(); self.meta .schema .column_schemas @@ -1190,21 +1194,27 @@ impl RawTableInfo { let mut primary_key_indices = Vec::with_capacity(primary_keys.len()); let mut timestamp_index = None; let mut value_indices = - Vec::with_capacity(self.meta.schema.column_schemas.len() - primary_keys.len() - 1); + Vec::with_capacity(self.meta.schema.column_schemas.len() - primary_keys.len()); + let mut column_ids = Vec::with_capacity(self.meta.schema.column_schemas.len()); for (index, column_schema) in self.meta.schema.column_schemas.iter().enumerate() { if primary_keys.contains(&column_schema.name) { primary_key_indices.push(index); } else if column_schema.is_time_index() { + value_indices.push(index); timestamp_index = Some(index); } else { value_indices.push(index); } + if let Some(id) = name_to_ids.get(&column_schema.name) { + column_ids.push(*id); + } } // Overwrite table meta self.meta.schema.timestamp_index = timestamp_index; self.meta.primary_key_indices = primary_key_indices; self.meta.value_indices = value_indices; + self.meta.column_ids = column_ids; } /// Extracts region options from table info. @@ -1213,6 +1223,15 @@ impl RawTableInfo { pub fn to_region_options(&self) -> HashMap { HashMap::from(&self.meta.options) } + + /// Returns the table reference. 
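// Sketch of why `#[serde(default)]` matters for the new `column_ids` field:
// metadata persisted by an older version has no such key, and with the default
// it deserializes to an empty vector instead of failing, which is exactly the
// case `name_to_ids` guards against by comparing lengths. Simplified stand-in
// for `RawTableMeta`; assumes the `serde` and `serde_json` crates.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct RawMetaSketch {
    next_column_id: u32,
    #[serde(default)]
    column_ids: Vec<u32>,
}

fn main() {
    // Metadata written before the field existed still deserializes.
    let old: RawMetaSketch = serde_json::from_str(r#"{ "next_column_id": 3 }"#).unwrap();
    assert!(old.column_ids.is_empty());

    // Metadata written by the current version carries the ids.
    let new: RawMetaSketch =
        serde_json::from_str(r#"{ "next_column_id": 3, "column_ids": [0, 1, 2] }"#).unwrap();
    assert_eq!(new.column_ids, vec![0, 1, 2]);
}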
+ pub fn table_ref(&self) -> TableReference { + TableReference::full( + self.catalog_name.as_str(), + self.schema_name.as_str(), + self.name.as_str(), + ) + } } impl From for RawTableInfo { @@ -1929,11 +1948,11 @@ mod tests { .build() .unwrap(); - let alter_kind = AlterKind::SetIndex { - options: SetIndexOptions::Fulltext { + let alter_kind = AlterKind::SetIndexes { + options: vec![SetIndexOption::Fulltext { column_name: "col1".to_string(), options: FulltextOptions::default(), - }, + }], }; let err = meta .builder_with_alter_kind("my_table", &alter_kind) @@ -1948,8 +1967,8 @@ mod tests { let new_meta = add_columns_to_meta_with_location(&meta); assert_eq!(meta.region_numbers, new_meta.region_numbers); - let alter_kind = AlterKind::SetIndex { - options: SetIndexOptions::Fulltext { + let alter_kind = AlterKind::SetIndexes { + options: vec![SetIndexOption::Fulltext { column_name: "my_tag_first".to_string(), options: FulltextOptions::new_unchecked( true, @@ -1959,7 +1978,7 @@ mod tests { 1000, 0.01, ), - }, + }], }; let new_meta = new_meta .builder_with_alter_kind("my_table", &alter_kind) @@ -1978,10 +1997,10 @@ mod tests { ); assert!(fulltext_options.case_sensitive); - let alter_kind = AlterKind::UnsetIndex { - options: UnsetIndexOptions::Fulltext { + let alter_kind = AlterKind::UnsetIndexes { + options: vec![UnsetIndexOption::Fulltext { column_name: "my_tag_first".to_string(), - }, + }], }; let new_meta = new_meta .builder_with_alter_kind("my_table", &alter_kind) diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs index 7cbf5b3e1f..d176d043ca 100644 --- a/src/table/src/requests.rs +++ b/src/table/src/requests.rs @@ -251,11 +251,11 @@ pub enum AlterKind { UnsetTableOptions { keys: Vec, }, - SetIndex { - options: SetIndexOptions, + SetIndexes { + options: Vec, }, - UnsetIndex { - options: UnsetIndexOptions, + UnsetIndexes { + options: Vec, }, DropDefaults { names: Vec, @@ -263,7 +263,7 @@ pub enum AlterKind { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub enum SetIndexOptions { +pub enum SetIndexOption { Fulltext { column_name: String, options: FulltextOptions, @@ -277,13 +277,35 @@ pub enum SetIndexOptions { }, } +impl SetIndexOption { + /// Returns the column name of the index option. + pub fn column_name(&self) -> &str { + match self { + SetIndexOption::Fulltext { column_name, .. } => column_name, + SetIndexOption::Inverted { column_name, .. } => column_name, + SetIndexOption::Skipping { column_name, .. } => column_name, + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] -pub enum UnsetIndexOptions { +pub enum UnsetIndexOption { Fulltext { column_name: String }, Inverted { column_name: String }, Skipping { column_name: String }, } +impl UnsetIndexOption { + /// Returns the column name of the index option. + pub fn column_name(&self) -> &str { + match self { + UnsetIndexOption::Fulltext { column_name, .. } => column_name, + UnsetIndexOption::Inverted { column_name, .. } => column_name, + UnsetIndexOption::Skipping { column_name, .. 
} => column_name, + } + } +} + #[derive(Debug)] pub struct InsertRequest { pub catalog_name: String, diff --git a/src/table/src/table/numbers.rs b/src/table/src/table/numbers.rs index ba0162d506..bba3479843 100644 --- a/src/table/src/table/numbers.rs +++ b/src/table/src/table/numbers.rs @@ -81,6 +81,7 @@ impl NumbersTable { options: Default::default(), created_on: Default::default(), partition_key_indices: vec![], + column_ids: vec![], }; let table_info = TableInfoBuilder::default() diff --git a/tests-integration/src/standalone.rs b/tests-integration/src/standalone.rs index 1525b04a55..98ec0e6d3a 100644 --- a/tests-integration/src/standalone.rs +++ b/tests-integration/src/standalone.rs @@ -35,6 +35,7 @@ use common_meta::ddl_manager::DdlManager; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::TableMetadataManager; use common_meta::kv_backend::KvBackendRef; +use common_meta::procedure_executor::LocalProcedureExecutor; use common_meta::region_keeper::MemoryRegionKeeper; use common_meta::region_registry::LeaderRegionRegistry; use common_meta::sequence::SequenceBuilder; @@ -215,7 +216,7 @@ impl GreptimeDbStandaloneBuilder { flow_id_sequence, )); - let ddl_task_executor = Arc::new( + let ddl_manager = Arc::new( DdlManager::try_new( DdlContext { node_manager: node_manager.clone(), @@ -233,6 +234,10 @@ impl GreptimeDbStandaloneBuilder { ) .unwrap(), ); + let procedure_executor = Arc::new(LocalProcedureExecutor::new( + ddl_manager, + procedure_manager.clone(), + )); let server_addr = opts.frontend_options().grpc.server_addr.clone(); @@ -242,7 +247,7 @@ impl GreptimeDbStandaloneBuilder { cache_registry.clone(), catalog_manager.clone(), node_manager.clone(), - ddl_task_executor.clone(), + procedure_executor.clone(), Arc::new(ProcessManager::new(server_addr, None)), ) .with_plugin(plugins) @@ -265,7 +270,7 @@ impl GreptimeDbStandaloneBuilder { catalog_manager.clone(), kv_backend.clone(), cache_registry.clone(), - ddl_task_executor.clone(), + procedure_executor.clone(), node_manager.clone(), ) .await
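// Sketch of the indirection introduced in the standalone builder above:
// instead of handing the `DdlManager` to the frontend directly, it is wrapped
// in a procedure executor and callers depend only on that abstraction. The
// trait and method names below are hypothetical stand-ins for illustration,
// not the actual `common_meta::procedure_executor` API.

use std::sync::Arc;

// Hypothetical minimal abstraction the frontend would program against.
trait ProcedureExecutorSketch: Send + Sync {
    fn submit(&self, procedure: &str) -> Result<String, String>;
}

struct DdlManagerSketch;

impl DdlManagerSketch {
    fn run_ddl(&self, procedure: &str) -> Result<String, String> {
        Ok(format!("executed {procedure} locally"))
    }
}

// Local (standalone) implementation that simply delegates to the DDL manager.
struct LocalExecutorSketch {
    ddl: Arc<DdlManagerSketch>,
}

impl ProcedureExecutorSketch for LocalExecutorSketch {
    fn submit(&self, procedure: &str) -> Result<String, String> {
        self.ddl.run_ddl(procedure)
    }
}

fn main() {
    let ddl = Arc::new(DdlManagerSketch);
    let executor: Arc<dyn ProcedureExecutorSketch> = Arc::new(LocalExecutorSketch { ddl });
    // Frontend components receive `executor` and never see the DdlManager type.
    println!("{:?}", executor.submit("reconcile table"));
}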