Dump current tenant config (#3534)

The PR adds an endpoint to show tenant's current config: `GET
/v1/tenant/:tenant_id/config`

Tenant's config consists of two parts: tenant overrides (could be
changed via other management API requests) and the default part,
substituting all missing overrides (constant, hardcoded in pageserver).
The API returns the custom overrides and the final tenant config, after
applying all the defaults.

Along the way, it had to fix a few things in the config:

* allow shortening the JSON form by omitting all `null`s (the same way the TOML
serializer behaves by default), and accept such a shortened format when
deserializing. A unit test is added.
* fix a bug where the `PUT /v1/tenant/config` endpoint rewrote the local
file with what had come in the request, but merely updated the in-memory
state (keeping the old values that were absent from the request) instead of replacing it.
That got uncovered during adjusting the e2e test and fixed to do the
replacement everywhere, otherwise there's no way to revert existing
overrides. Fixes #3471 (commit
dc688affe8)
* fixes https://github.com/neondatabase/neon/issues/3472 by reordering
the config saving operations
This commit is contained in:
Kirill Bulatov
2023-02-04 01:32:29 +02:00
committed by GitHub
parent 87cd2bae77
commit ec3a3aed37
8 changed files with 307 additions and 38 deletions

View File

@@ -137,7 +137,6 @@ pub struct TenantConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,

View File

@@ -664,6 +664,55 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/config/:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
get:
description: |
Returns tenant's config description: specific config overrides a tenant has
and the effective config.
responses:
"200":
description: Tenant config, specific and effective
content:
application/json:
schema:
$ref: "#/components/schemas/TenantConfig"
"400":
description: Malformed get tenant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenant or timeline were not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
components:
securitySchemes:
JWT:
@@ -724,10 +773,33 @@ components:
type: integer
checkpoint_timeout:
type: string
compaction_target_size:
type: integer
compaction_period:
type: string
compaction_threshold:
type: string
image_creation_threshold:
type: integer
walreceiver_connect_timeout:
type: string
lagging_wal_timeout:
type: string
max_lsn_wal_lag:
type: integer
trace_read_requests:
type: boolean
# Response body of `GET /v1/tenant/{tenant_id}/config`: the overrides stored for
# the tenant and the effective config, both described by TenantConfigInfo.
# The `$ref` is placed directly on each property: `schema` is not a keyword of
# an OpenAPI Schema Object, so a `$ref` nested under it is not applied.
TenantConfig:
  type: object
  properties:
    tenant_specific_overrides:
      $ref: "#/components/schemas/TenantConfigInfo"
    effective_config:
      $ref: "#/components/schemas/TenantConfigInfo"
TimelineInfo:
type: object
required:

View File

@@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
@@ -698,12 +699,40 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
)
}
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
/// Handles `GET /v1/tenant/:tenant_id/config`.
///
/// Responds with `200 OK` and a JSON object holding two entries:
/// `tenant_specific_overrides` (the value of `Tenant::tenant_specific_overrides`)
/// and `effective_config` (the value of `Tenant::effective_config`).
///
/// # Errors
///
/// * Propagates whatever `parse_request_param` and `check_permission` return.
/// * `ApiError::NotFound` when `mgr::get_tenant` fails.
/// * `ApiError::InternalServerError` when either config cannot be turned into
///   a JSON value.
async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let tenant = mgr::get_tenant(tenant_id, false)
        .await
        .map_err(ApiError::NotFound)?;

    // Serialize both views up front, overrides first, so that a failure is
    // reported before any part of the response is assembled.
    let overrides_json = serde_json::to_value(tenant.tenant_specific_overrides())
        .context("serializing tenant specific overrides")
        .map_err(ApiError::InternalServerError)?;
    let effective_json = serde_json::to_value(tenant.effective_config())
        .context("serializing effective config")
        .map_err(ApiError::InternalServerError)?;

    let mut response = HashMap::new();
    response.insert("tenant_specific_overrides", overrides_json);
    response.insert("effective_config", effective_json);

    json_response(StatusCode::OK, response)
}
async fn update_tenant_config_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantConfigRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let mut tenant_conf: TenantConfOpt = Default::default();
let mut tenant_conf = TenantConfOpt::default();
if let Some(gc_period) = request_data.gc_period {
tenant_conf.gc_period = Some(
humantime::parse_duration(&gc_period)
@@ -738,12 +767,8 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
.map_err(ApiError::BadRequest)?,
);
}
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
tenant_conf.trace_read_requests = request_data.trace_read_requests;
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -765,7 +790,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
}
let state = get_state(&request);
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
.await
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
@@ -979,7 +1004,8 @@ pub fn make_router(
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id", tenant_status)
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
.put("/v1/tenant/config", tenant_config_handler)
.put("/v1/tenant/config", update_tenant_config_handler)
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)

View File

@@ -45,6 +45,7 @@ use std::sync::MutexGuard;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
use self::config::TenantConf;
use self::metadata::TimelineMetadata;
use self::remote_timeline_client::RemoteTimelineClient;
use crate::config::PageServerConf;
@@ -1618,8 +1619,16 @@ fn tree_sort_timelines(
Ok(result)
}
/// Private functions
impl Tenant {
/// Returns a copy of the overrides currently stored in `tenant_conf`.
///
/// The value is copied out while the read lock is held, so the lock is
/// released before the caller sees the result.
///
/// # Panics
///
/// Panics if the read lock on `tenant_conf` cannot be acquired.
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
    let overrides_guard = self.tenant_conf.read().unwrap();
    *overrides_guard
}
/// Returns the config obtained by merging the tenant's overrides with
/// `self.conf.default_tenant_conf`.
pub fn effective_config(&self) -> TenantConf {
    let overrides = self.tenant_specific_overrides();
    overrides.merge(self.conf.default_tenant_conf)
}
pub fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
@@ -1690,8 +1699,8 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().update(&new_tenant_conf);
/// Replaces the stored tenant overrides with `new_tenant_conf` as a whole.
///
/// Nothing from the previous value is kept: a field that is `None` in
/// `new_tenant_conf` is `None` afterwards.
///
/// # Panics
///
/// Panics if the write lock on `tenant_conf` cannot be acquired.
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
    let mut stored_conf = self.tenant_conf.write().unwrap();
    *stored_conf = new_tenant_conf;
}
fn create_timeline_data(

View File

@@ -51,6 +51,7 @@ pub struct TenantConf {
pub checkpoint_distance: u64,
// Inmemory layer is also flushed at least once in checkpoint_timeout to
// eventually upload WAL after activity is stopped.
#[serde(with = "humantime_serde")]
pub checkpoint_timeout: Duration,
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
@@ -96,23 +97,61 @@ pub struct TenantConf {
/// which parameters are set and which are not.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct TenantConfOpt {
    // Serde options shared by every field below:
    // * `default` lets a missing key deserialize via `Default` (`None` for an `Option`);
    // * `skip_serializing_if = "Option::is_none"` leaves unset fields out of the output.
    // Fields marked `with = "humantime_serde"` delegate (de)serialization of
    // their `Duration` to that module.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub checkpoint_distance: Option<u64>,

    // NOTE(review): unlike every other `Duration` field here, this one has no
    // `with = "humantime_serde"` — confirm whether that is intentional.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub checkpoint_timeout: Option<Duration>,

    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub compaction_target_size: Option<u64>,

    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        with = "humantime_serde"
    )]
    pub compaction_period: Option<Duration>,

    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub compaction_threshold: Option<usize>,

    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gc_horizon: Option<u64>,

    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        with = "humantime_serde"
    )]
    pub gc_period: Option<Duration>,

    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub image_creation_threshold: Option<usize>,

    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        with = "humantime_serde"
    )]
    pub pitr_interval: Option<Duration>,

    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        with = "humantime_serde"
    )]
    pub walreceiver_connect_timeout: Option<Duration>,

    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        with = "humantime_serde"
    )]
    pub lagging_wal_timeout: Option<Duration>,

    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_lsn_wal_lag: Option<NonZeroU64>,

    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub trace_read_requests: Option<bool>,
}
@@ -225,3 +264,24 @@ impl Default for TenantConf {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A config with a single field set must serialize to just that field in
    /// both TOML and JSON, and the shortened form must deserialize back to an
    /// equal value.
    #[test]
    fn de_serializing_pageserver_config_omits_empty_values() {
        let small_conf = TenantConfOpt {
            gc_horizon: Some(42),
            ..Default::default()
        };

        // TOML round trip.
        let toml_form = toml_edit::easy::to_string(&small_conf).unwrap();
        assert_eq!(toml_form, "gc_horizon = 42\n");
        let from_toml: TenantConfOpt = toml_edit::easy::from_str(&toml_form).unwrap();
        assert_eq!(small_conf, from_toml);

        // JSON round trip.
        let json_form = serde_json::to_string(&small_conf).unwrap();
        assert_eq!(json_form, "{\"gc_horizon\":42}");
        let from_json: TenantConfOpt = serde_json::from_str(&json_form).unwrap();
        assert_eq!(small_conf, from_json);
    }
}

View File

@@ -285,17 +285,22 @@ pub async fn create_tenant(
}).await
}
pub async fn update_tenant_config(
pub async fn set_new_tenant_config(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> anyhow::Result<()> {
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_id, true).await?;
tenant.update_tenant_config(tenant_conf);
let tenant_config_path = conf.tenant_config_path(tenant_id);
Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?;
Tenant::persist_tenant_config(
&tenant.tenant_id(),
&tenant_config_path,
new_tenant_conf,
false,
)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}

View File

@@ -1205,6 +1205,11 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
    """
    Fetch `/v1/tenant/{tenant_id}/config` from the pageserver on localhost and
    return the JSON response parsed into a `TenantConfig`.

    The response is passed through `verbose_error` before it is parsed.
    """
    config_url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config"
    response = self.get(config_url)
    self.verbose_error(response)
    payload = response.json()
    return TenantConfig.from_json(payload)
def tenant_size(self, tenant_id: TenantId) -> int:
return self.tenant_size_and_modelinputs(tenant_id)[0]
@@ -1500,6 +1505,19 @@ class PageserverHttpClient(requests.Session):
assert res.status_code == 200
@dataclass
class TenantConfig:
    """
    The two parts of the tenant config response: the overrides set
    specifically for the tenant, and the effective config.
    """

    tenant_specific_overrides: Dict[str, Any]
    effective_config: Dict[str, Any]

    @classmethod
    def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
        """
        Build a `TenantConfig` from a decoded JSON object.

        Raises `KeyError` if either `tenant_specific_overrides` or
        `effective_config` is missing from `d`.
        """
        overrides = d["tenant_specific_overrides"]
        effective = d["effective_config"]
        return TenantConfig(tenant_specific_overrides=overrides, effective_config=effective)
@dataclass
class LayerMapInfo:
in_memory_layers: List[InMemoryLayerInfo]

View File

@@ -22,6 +22,7 @@ wait_lsn_timeout='111 s';
tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
env = neon_env_builder.init_start()
http_client = env.pageserver.http_client()
# Check that we raise on misspelled configs
invalid_conf_key = "some_invalid_setting_name_blah_blah_123"
@@ -36,12 +37,11 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
else:
raise AssertionError("Expected validation error")
tenant, _ = env.neon_cli.create_tenant(
conf={
"checkpoint_distance": "20000",
"gc_period": "30sec",
}
)
new_conf = {
"checkpoint_distance": "20000",
"gc_period": "30sec",
}
tenant, _ = env.neon_cli.create_tenant(conf=new_conf)
env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant)
env.postgres.create_start(
@@ -69,7 +69,20 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"image_creation_threshold": 3,
"pitr_interval": 604800, # 7 days
}.items()
)
), f"Unexpected res: {res}"
default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant)
assert (
not default_tenant_config.tenant_specific_overrides
), "Should have no specific settings yet"
effective_config = default_tenant_config.effective_config
assert effective_config["checkpoint_distance"] == 10000
assert effective_config["compaction_target_size"] == 1048576
assert effective_config["compaction_period"] == "20s"
assert effective_config["compaction_threshold"] == 10
assert effective_config["gc_horizon"] == 67108864
assert effective_config["gc_period"] == "1h"
assert effective_config["image_creation_threshold"] == 3
assert effective_config["pitr_interval"] == "7days"
# check the configuration of the new tenant
with closing(env.pageserver.connect()) as psconn:
@@ -89,15 +102,37 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"image_creation_threshold": 3,
"pitr_interval": 604800,
}.items()
)
), f"Unexpected res: {res}"
new_tenant_config = http_client.tenant_config(tenant_id=tenant)
new_specific_config = new_tenant_config.tenant_specific_overrides
assert new_specific_config["checkpoint_distance"] == 20000
assert new_specific_config["gc_period"] == "30s"
assert len(new_specific_config) == len(
new_conf
), f"No more specific properties were expected, but got: {new_specific_config}"
new_effective_config = new_tenant_config.effective_config
assert (
new_effective_config["checkpoint_distance"] == 20000
), "Specific 'checkpoint_distance' config should override the default value"
assert (
new_effective_config["gc_period"] == "30s"
), "Specific 'gc_period' config should override the default value"
assert new_effective_config["compaction_target_size"] == 1048576
assert new_effective_config["compaction_period"] == "20s"
assert new_effective_config["compaction_threshold"] == 10
assert new_effective_config["gc_horizon"] == 67108864
assert new_effective_config["image_creation_threshold"] == 3
assert new_effective_config["pitr_interval"] == "7days"
# update the config and ensure that it has changed
conf_update = {
"checkpoint_distance": "15000",
"gc_period": "80sec",
"compaction_period": "80sec",
}
env.neon_cli.config_tenant(
tenant_id=tenant,
conf={
"checkpoint_distance": "15000",
"gc_period": "80sec",
},
conf=conf_update,
)
with closing(env.pageserver.connect()) as psconn:
@@ -110,14 +145,37 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
for i in {
"checkpoint_distance": 15000,
"compaction_target_size": 1048576,
"compaction_period": 20,
"compaction_period": 80,
"compaction_threshold": 10,
"gc_horizon": 67108864,
"gc_period": 80,
"image_creation_threshold": 3,
"pitr_interval": 604800,
}.items()
)
), f"Unexpected res: {res}"
updated_tenant_config = http_client.tenant_config(tenant_id=tenant)
updated_specific_config = updated_tenant_config.tenant_specific_overrides
assert updated_specific_config["checkpoint_distance"] == 15000
assert updated_specific_config["gc_period"] == "1m 20s"
assert updated_specific_config["compaction_period"] == "1m 20s"
assert len(updated_specific_config) == len(
conf_update
), f"No more specific properties were expected, but got: {updated_specific_config}"
updated_effective_config = updated_tenant_config.effective_config
assert (
updated_effective_config["checkpoint_distance"] == 15000
), "Specific 'checkpoint_distance' config should override the default value"
assert (
updated_effective_config["gc_period"] == "1m 20s"
), "Specific 'gc_period' config should override the default value"
assert (
updated_effective_config["compaction_period"] == "1m 20s"
), "Specific 'compaction_period' config should override the default value"
assert updated_effective_config["compaction_target_size"] == 1048576
assert updated_effective_config["compaction_threshold"] == 10
assert updated_effective_config["gc_horizon"] == 67108864
assert updated_effective_config["image_creation_threshold"] == 3
assert updated_effective_config["pitr_interval"] == "7days"
# restart the pageserver and ensure that the config is still correct
env.pageserver.stop()
@@ -133,22 +191,44 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
for i in {
"checkpoint_distance": 15000,
"compaction_target_size": 1048576,
"compaction_period": 20,
"compaction_period": 80,
"compaction_threshold": 10,
"gc_horizon": 67108864,
"gc_period": 80,
"image_creation_threshold": 3,
"pitr_interval": 604800,
}.items()
)
), f"Unexpected res: {res}"
restarted_tenant_config = http_client.tenant_config(tenant_id=tenant)
assert (
restarted_tenant_config == updated_tenant_config
), "Updated config should not change after the restart"
# update the config with very short config and make sure no trailing chars are left from previous config
final_conf = {
"pitr_interval": "1 min",
}
env.neon_cli.config_tenant(
tenant_id=tenant,
conf={
"pitr_interval": "1 min",
},
conf=final_conf,
)
final_tenant_config = http_client.tenant_config(tenant_id=tenant)
final_specific_config = final_tenant_config.tenant_specific_overrides
assert final_specific_config["pitr_interval"] == "1m"
assert len(final_specific_config) == len(
final_conf
), f"No more specific properties were expected, but got: {final_specific_config}"
final_effective_config = final_tenant_config.effective_config
assert (
final_effective_config["pitr_interval"] == "1m"
), "Specific 'pitr_interval' config should override the default value"
assert final_effective_config["checkpoint_distance"] == 10000
assert final_effective_config["compaction_target_size"] == 1048576
assert final_effective_config["compaction_period"] == "20s"
assert final_effective_config["compaction_threshold"] == 10
assert final_effective_config["gc_horizon"] == 67108864
assert final_effective_config["gc_period"] == "1h"
assert final_effective_config["image_creation_threshold"] == 3
# restart the pageserver and ensure that the config is still correct
env.pageserver.stop()
@@ -165,7 +245,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"compaction_period": 20,
"pitr_interval": 60,
}.items()
)
), f"Unexpected res: {res}"
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):