From ec3a3aed3764730c4b3331a09e39820a38a0e4ae Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 4 Feb 2023 01:32:29 +0200 Subject: [PATCH] Dump current tenant config (#3534) The PR adds an endpoint to show a tenant's current config: `GET /v1/tenant/:tenant_id/config` A tenant's config consists of two parts: tenant overrides (which can be changed via other management API requests) and the default part, which substitutes all missing overrides (constant, hardcoded in pageserver). The API returns the custom overrides and the final tenant config, after applying all the defaults. Along the way, the PR had to fix two things in the config: * allow shortening the JSON form by omitting all `null`s (the same way the TOML serializer behaves by default), and understand such a shortened format when deserializing. A unit test is added * fix a bug where the `PUT /v1/tenant/config` endpoint rewrote the local file with what had come in the request, but only updated (rather than replaced the old values of) the in-memory state. That was uncovered while adjusting the e2e test and is fixed by doing the replacement everywhere; otherwise there is no way to revert existing overrides. 
Fixes #3471 (commit https://github.com/neondatabase/neon/commit/dc688affe8f9daa601fb4af700998bcd6f21ea57) * fixes https://github.com/neondatabase/neon/issues/3472 by reordering the config saving operations --- libs/pageserver_api/src/models.rs | 1 - pageserver/src/http/openapi_spec.yml | 72 ++++++++++++++ pageserver/src/http/routes.rs | 46 +++++++-- pageserver/src/tenant.rs | 15 ++- pageserver/src/tenant/config.rs | 60 ++++++++++++ pageserver/src/tenant/mgr.rs | 13 ++- test_runner/fixtures/neon_fixtures.py | 18 ++++ test_runner/regress/test_tenant_conf.py | 120 ++++++++++++++++++++---- 8 files changed, 307 insertions(+), 38 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 6f53f4a01d..8827235d90 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -137,7 +137,6 @@ pub struct TenantConfigRequest { #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, #[serde(default)] - #[serde_as(as = "Option")] pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 23faff7ace..fc271fe83b 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -664,6 +664,55 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/config/: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: | + Returns tenant's config description: specific config overrides a tenant has + and the effective config. 
+ responses: + "200": + description: Tenant config, specific and effective + content: + application/json: + schema: + $ref: "#/components/schemas/TenantConfig" + "400": + description: Malformed get tenanant config request + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Tenand or timeline were not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" components: securitySchemes: JWT: @@ -724,10 +773,33 @@ components: type: integer checkpoint_timeout: type: string + compaction_target_size: + type: integer compaction_period: type: string compaction_threshold: type: string + image_creation_threshold: + type: integer + walreceiver_connect_timeout: + type: string + lagging_wal_timeout: + type: string + max_lsn_wal_lag: + type: integer + trace_read_requests: + type: boolean + TenantConfig: + type: object + properties: + tenant_specific_overrides: + type: object + schema: + $ref: "#/components/schemas/TenantConfigInfo" + effective_config: + type: object + schema: + $ref: "#/components/schemas/TenantConfigInfo" TimelineInfo: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f81f5da84c..b0b07428f9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::sync::Arc; use anyhow::{anyhow, Context, Result}; @@ -698,12 +699,40 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn get_tenant_config_handler(request: Request) -> Result, 
ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = mgr::get_tenant(tenant_id, false) + .await + .map_err(ApiError::NotFound)?; + + let response = HashMap::from([ + ( + "tenant_specific_overrides", + serde_json::to_value(tenant.tenant_specific_overrides()) + .context("serializing tenant specific overrides") + .map_err(ApiError::InternalServerError)?, + ), + ( + "effective_config", + serde_json::to_value(tenant.effective_config()) + .context("serializing effective config") + .map_err(ApiError::InternalServerError)?, + ), + ]); + + json_response(StatusCode::OK, response) +} + +async fn update_tenant_config_handler( + mut request: Request, +) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; - let mut tenant_conf: TenantConfOpt = Default::default(); + let mut tenant_conf = TenantConfOpt::default(); if let Some(gc_period) = request_data.gc_period { tenant_conf.gc_period = Some( humantime::parse_duration(&gc_period) @@ -738,12 +767,8 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result TenantConfOpt { + *self.tenant_conf.read().unwrap() + } + + pub fn effective_config(&self) -> TenantConf { + self.tenant_specific_overrides() + .merge(self.conf.default_tenant_conf) + } + pub fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -1690,8 +1699,8 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } - pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().update(&new_tenant_conf); + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + *self.tenant_conf.write().unwrap() = new_tenant_conf; } fn create_timeline_data( diff --git a/pageserver/src/tenant/config.rs 
b/pageserver/src/tenant/config.rs index e66ee0ae36..087cff2537 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -51,6 +51,7 @@ pub struct TenantConf { pub checkpoint_distance: u64, // Inmemory layer is also flushed at least once in checkpoint_timeout to // eventually upload WAL after activity is stopped. + #[serde(with = "humantime_serde")] pub checkpoint_timeout: Duration, // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. @@ -96,23 +97,61 @@ pub struct TenantConf { /// which parameters are set and which are not. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TenantConfOpt { + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub checkpoint_distance: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub checkpoint_timeout: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub compaction_target_size: Option, + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] + #[serde(default)] pub compaction_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub compaction_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub gc_horizon: Option, + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] + #[serde(default)] pub gc_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub image_creation_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] + #[serde(default)] pub pitr_interval: Option, + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] + #[serde(default)] pub walreceiver_connect_timeout: Option, + + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = 
"humantime_serde")] + #[serde(default)] pub lagging_wal_timeout: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub max_lsn_wal_lag: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] pub trace_read_requests: Option, } @@ -225,3 +264,24 @@ impl Default for TenantConf { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn de_serializing_pageserver_config_omits_empty_values() { + let small_conf = TenantConfOpt { + gc_horizon: Some(42), + ..TenantConfOpt::default() + }; + + let toml_form = toml_edit::easy::to_string(&small_conf).unwrap(); + assert_eq!(toml_form, "gc_horizon = 42\n"); + assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap()); + + let json_form = serde_json::to_string(&small_conf).unwrap(); + assert_eq!(json_form, "{\"gc_horizon\":42}"); + assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a9edee3794..a74dfdea04 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -285,17 +285,22 @@ pub async fn create_tenant( }).await } -pub async fn update_tenant_config( +pub async fn set_new_tenant_config( conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, + new_tenant_conf: TenantConfOpt, tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; - tenant.update_tenant_config(tenant_conf); let tenant_config_path = conf.tenant_config_path(tenant_id); - Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?; + Tenant::persist_tenant_config( + &tenant.tenant_id(), + &tenant_config_path, + new_tenant_conf, + false, + )?; + tenant.set_new_tenant_config(new_tenant_conf); Ok(()) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 58e374eb3c..b35252243e 100644 --- 
a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1205,6 +1205,11 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def tenant_config(self, tenant_id: TenantId) -> TenantConfig: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") + self.verbose_error(res) + return TenantConfig.from_json(res.json()) + def tenant_size(self, tenant_id: TenantId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] @@ -1500,6 +1505,19 @@ class PageserverHttpClient(requests.Session): assert res.status_code == 200 +@dataclass +class TenantConfig: + tenant_specific_overrides: Dict[str, Any] + effective_config: Dict[str, Any] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> TenantConfig: + return TenantConfig( + tenant_specific_overrides=d["tenant_specific_overrides"], + effective_config=d["effective_config"], + ) + + @dataclass class LayerMapInfo: in_memory_layers: List[InMemoryLayerInfo] diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index cbbf01a285..e087891bba 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -22,6 +22,7 @@ wait_lsn_timeout='111 s'; tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" env = neon_env_builder.init_start() + http_client = env.pageserver.http_client() # Check that we raise on misspelled configs invalid_conf_key = "some_invalid_setting_name_blah_blah_123" @@ -36,12 +37,11 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" else: raise AssertionError("Expected validation error") - tenant, _ = env.neon_cli.create_tenant( - conf={ - "checkpoint_distance": "20000", - "gc_period": "30sec", - } - ) + new_conf = { + "checkpoint_distance": "20000", + "gc_period": "30sec", + } + tenant, _ = env.neon_cli.create_tenant(conf=new_conf) 
env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant) env.postgres.create_start( @@ -69,7 +69,20 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "image_creation_threshold": 3, "pitr_interval": 604800, # 7 days }.items() - ) + ), f"Unexpected res: {res}" + default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant) + assert ( + not default_tenant_config.tenant_specific_overrides + ), "Should have no specific settings yet" + effective_config = default_tenant_config.effective_config + assert effective_config["checkpoint_distance"] == 10000 + assert effective_config["compaction_target_size"] == 1048576 + assert effective_config["compaction_period"] == "20s" + assert effective_config["compaction_threshold"] == 10 + assert effective_config["gc_horizon"] == 67108864 + assert effective_config["gc_period"] == "1h" + assert effective_config["image_creation_threshold"] == 3 + assert effective_config["pitr_interval"] == "7days" # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -89,15 +102,37 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "image_creation_threshold": 3, "pitr_interval": 604800, }.items() - ) + ), f"Unexpected res: {res}" + new_tenant_config = http_client.tenant_config(tenant_id=tenant) + new_specific_config = new_tenant_config.tenant_specific_overrides + assert new_specific_config["checkpoint_distance"] == 20000 + assert new_specific_config["gc_period"] == "30s" + assert len(new_specific_config) == len( + new_conf + ), f"No more specific properties were expected, but got: {new_specific_config}" + new_effective_config = new_tenant_config.effective_config + assert ( + new_effective_config["checkpoint_distance"] == 20000 + ), "Specific 'checkpoint_distance' config should override the default value" + assert ( + new_effective_config["gc_period"] == "30s" + ), "Specific 'gc_period' config should override the default 
value" + assert new_effective_config["compaction_target_size"] == 1048576 + assert new_effective_config["compaction_period"] == "20s" + assert new_effective_config["compaction_threshold"] == 10 + assert new_effective_config["gc_horizon"] == 67108864 + assert new_effective_config["image_creation_threshold"] == 3 + assert new_effective_config["pitr_interval"] == "7days" # update the config and ensure that it has changed + conf_update = { + "checkpoint_distance": "15000", + "gc_period": "80sec", + "compaction_period": "80sec", + } env.neon_cli.config_tenant( tenant_id=tenant, - conf={ - "checkpoint_distance": "15000", - "gc_period": "80sec", - }, + conf=conf_update, ) with closing(env.pageserver.connect()) as psconn: @@ -110,14 +145,37 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, - "compaction_period": 20, + "compaction_period": 80, "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, "pitr_interval": 604800, }.items() - ) + ), f"Unexpected res: {res}" + updated_tenant_config = http_client.tenant_config(tenant_id=tenant) + updated_specific_config = updated_tenant_config.tenant_specific_overrides + assert updated_specific_config["checkpoint_distance"] == 15000 + assert updated_specific_config["gc_period"] == "1m 20s" + assert updated_specific_config["compaction_period"] == "1m 20s" + assert len(updated_specific_config) == len( + conf_update + ), f"No more specific properties were expected, but got: {updated_specific_config}" + updated_effective_config = updated_tenant_config.effective_config + assert ( + updated_effective_config["checkpoint_distance"] == 15000 + ), "Specific 'checkpoint_distance' config should override the default value" + assert ( + updated_effective_config["gc_period"] == "1m 20s" + ), "Specific 'gc_period' config should override the default value" + assert ( + 
updated_effective_config["compaction_period"] == "1m 20s" + ), "Specific 'compaction_period' config should override the default value" + assert updated_effective_config["compaction_target_size"] == 1048576 + assert updated_effective_config["compaction_threshold"] == 10 + assert updated_effective_config["gc_horizon"] == 67108864 + assert updated_effective_config["image_creation_threshold"] == 3 + assert updated_effective_config["pitr_interval"] == "7days" # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -133,22 +191,44 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, - "compaction_period": 20, + "compaction_period": 80, "compaction_threshold": 10, "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, "pitr_interval": 604800, }.items() - ) + ), f"Unexpected res: {res}" + restarted_tenant_config = http_client.tenant_config(tenant_id=tenant) + assert ( + restarted_tenant_config == updated_tenant_config + ), "Updated config should not change after the restart" # update the config with very short config and make sure no trailing chars are left from previous config + final_conf = { + "pitr_interval": "1 min", + } env.neon_cli.config_tenant( tenant_id=tenant, - conf={ - "pitr_interval": "1 min", - }, + conf=final_conf, ) + final_tenant_config = http_client.tenant_config(tenant_id=tenant) + final_specific_config = final_tenant_config.tenant_specific_overrides + assert final_specific_config["pitr_interval"] == "1m" + assert len(final_specific_config) == len( + final_conf + ), f"No more specific properties were expected, but got: {final_specific_config}" + final_effective_config = final_tenant_config.effective_config + assert ( + final_effective_config["pitr_interval"] == "1m" + ), "Specific 'pitr_interval' config should override the default value" + assert final_effective_config["checkpoint_distance"] == 
10000 + assert final_effective_config["compaction_target_size"] == 1048576 + assert final_effective_config["compaction_period"] == "20s" + assert final_effective_config["compaction_threshold"] == 10 + assert final_effective_config["gc_horizon"] == 67108864 + assert final_effective_config["gc_period"] == "1h" + assert final_effective_config["image_creation_threshold"] == 3 # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -165,7 +245,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_period": 20, "pitr_interval": 60, }.items() - ) + ), f"Unexpected res: {res}" def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):