mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 05:52:55 +00:00
## Problem
Cplane and storage controller tenant config changes are not additive.
Any change overrides all existing tenant configs. This would be fine if
both did client side patching, but that's not the case.
Once this merges, we must update cplane to use the PATCH endpoint.
## Summary of changes
### High Level
Allow for patching of tenant configuration with a `PATCH
/v1/tenant/config` endpoint.
It takes the same data as it's PUT counterpart. For example the payload
below will update `gc_period` and unset `compaction_period`. All other
fields are left in their original state.
```
{
"tenant_id": "1234",
"gc_period": "10s",
"compaction_period": null
}
```
### Low Level
* PS and storcon gain `PATCH /v1/tenant/config` endpoints. PS endpoint
is only used for cplane managed instances.
* `storcon_cli` is updated to have separate commands for
`set-tenant-config` and `patch-tenant-config`
Related https://github.com/neondatabase/cloud/issues/21043
414 lines
18 KiB
Python
414 lines
18 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import TYPE_CHECKING
|
|
|
|
import pytest
|
|
from fixtures.common_types import Lsn
|
|
from fixtures.neon_fixtures import (
|
|
NeonEnvBuilder,
|
|
)
|
|
from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
|
|
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
|
from fixtures.utils import run_only_on_default_postgres, wait_until
|
|
from fixtures.workload import Workload
|
|
|
|
if TYPE_CHECKING:
|
|
from typing import Any
|
|
|
|
|
|
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
|
|
"""Test per tenant configuration"""
|
|
|
|
def set_some_nondefault_global_config(ps_cfg: dict[str, Any]):
|
|
ps_cfg["page_cache_size"] = 444
|
|
ps_cfg["wait_lsn_timeout"] = "111 s"
|
|
|
|
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
|
tenant_config["checkpoint_distance"] = 10000
|
|
tenant_config["compaction_target_size"] = 1048576
|
|
tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
|
|
tenant_config["eviction_policy"] = {
|
|
"kind": "LayerAccessThreshold",
|
|
"period": "20s",
|
|
"threshold": "23 hours",
|
|
}
|
|
|
|
neon_env_builder.pageserver_config_override = set_some_nondefault_global_config
|
|
|
|
env = neon_env_builder.init_start()
|
|
# we configure eviction but no remote storage, there might be error lines
|
|
env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*")
|
|
http_client = env.pageserver.http_client()
|
|
|
|
# Check that we raise on misspelled configs
|
|
invalid_conf_key = "some_invalid_setting_name_blah_blah_123"
|
|
try:
|
|
env.create_tenant(
|
|
conf={
|
|
invalid_conf_key: "20000",
|
|
}
|
|
)
|
|
except Exception as e:
|
|
assert invalid_conf_key in str(e)
|
|
else:
|
|
raise AssertionError("Expected validation error")
|
|
|
|
new_conf = {
|
|
"checkpoint_distance": "20000",
|
|
"gc_period": "30sec",
|
|
"evictions_low_residence_duration_metric_threshold": "42s",
|
|
"eviction_policy": json.dumps({"kind": "NoEviction"}),
|
|
}
|
|
tenant, _ = env.create_tenant(conf=new_conf)
|
|
|
|
env.create_timeline("test_tenant_conf", tenant_id=tenant)
|
|
env.endpoints.create_start("test_tenant_conf", "main", tenant)
|
|
|
|
# check the configuration of the default tenant
|
|
# it should match global configuration
|
|
default_tenant_config = http_client.tenant_config(tenant_id=env.initial_tenant)
|
|
assert (
|
|
not default_tenant_config.tenant_specific_overrides
|
|
), "Should have no specific settings yet"
|
|
effective_config = default_tenant_config.effective_config
|
|
assert effective_config["checkpoint_distance"] == 10000
|
|
assert effective_config["compaction_target_size"] == 1048576
|
|
assert effective_config["compaction_period"] == "20s"
|
|
assert effective_config["compaction_threshold"] == 10
|
|
assert effective_config["gc_horizon"] == 67108864
|
|
assert effective_config["gc_period"] == "1h"
|
|
assert effective_config["image_creation_threshold"] == 3
|
|
assert effective_config["pitr_interval"] == "7days"
|
|
assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days"
|
|
assert effective_config["eviction_policy"] == {
|
|
"kind": "LayerAccessThreshold",
|
|
"period": "20s",
|
|
"threshold": "23h",
|
|
}
|
|
|
|
# check the configuration of the new tenant
|
|
new_tenant_config = http_client.tenant_config(tenant_id=tenant)
|
|
new_specific_config = new_tenant_config.tenant_specific_overrides
|
|
assert new_specific_config["checkpoint_distance"] == 20000
|
|
assert new_specific_config["gc_period"] == "30s"
|
|
assert len(new_specific_config) == len(
|
|
new_conf
|
|
), f"No more specific properties were expected, but got: {new_specific_config}"
|
|
new_effective_config = new_tenant_config.effective_config
|
|
assert (
|
|
new_effective_config["checkpoint_distance"] == 20000
|
|
), "Specific 'checkpoint_distance' config should override the default value"
|
|
assert (
|
|
new_effective_config["gc_period"] == "30s"
|
|
), "Specific 'gc_period' config should override the default value"
|
|
assert (
|
|
new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s"
|
|
), "Should override default value"
|
|
assert new_effective_config["eviction_policy"] == {
|
|
"kind": "NoEviction"
|
|
}, "Specific 'eviction_policy' config should override the default value"
|
|
assert new_effective_config["compaction_target_size"] == 1048576
|
|
assert new_effective_config["compaction_period"] == "20s"
|
|
assert new_effective_config["compaction_threshold"] == 10
|
|
assert new_effective_config["gc_horizon"] == 67108864
|
|
assert new_effective_config["image_creation_threshold"] == 3
|
|
assert new_effective_config["pitr_interval"] == "7days"
|
|
|
|
# update the config and ensure that it has changed
|
|
conf_update = {
|
|
"checkpoint_distance": "15000",
|
|
"gc_period": "80sec",
|
|
"compaction_period": "80sec",
|
|
"image_creation_threshold": "2",
|
|
"evictions_low_residence_duration_metric_threshold": "23h",
|
|
"eviction_policy": json.dumps(
|
|
{"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"}
|
|
),
|
|
"max_lsn_wal_lag": "13000000",
|
|
}
|
|
env.config_tenant(tenant_id=tenant, conf=conf_update)
|
|
|
|
updated_tenant_config = http_client.tenant_config(tenant_id=tenant)
|
|
updated_specific_config = updated_tenant_config.tenant_specific_overrides
|
|
assert updated_specific_config["checkpoint_distance"] == 15000
|
|
assert updated_specific_config["gc_period"] == "1m 20s"
|
|
assert updated_specific_config["compaction_period"] == "1m 20s"
|
|
assert len(updated_specific_config) == len(
|
|
conf_update
|
|
), f"No more specific properties were expected, but got: {updated_specific_config}"
|
|
updated_effective_config = updated_tenant_config.effective_config
|
|
assert (
|
|
updated_effective_config["checkpoint_distance"] == 15000
|
|
), "Specific 'checkpoint_distance' config should override the default value"
|
|
assert (
|
|
updated_effective_config["gc_period"] == "1m 20s"
|
|
), "Specific 'gc_period' config should override the default value"
|
|
assert (
|
|
updated_effective_config["compaction_period"] == "1m 20s"
|
|
), "Specific 'compaction_period' config should override the default value"
|
|
assert (
|
|
updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h"
|
|
), "Should override default value"
|
|
assert updated_effective_config["eviction_policy"] == {
|
|
"kind": "LayerAccessThreshold",
|
|
"period": "1m 20s",
|
|
"threshold": "1day 18h",
|
|
}, "Specific 'eviction_policy' config should override the default value"
|
|
assert updated_effective_config["compaction_target_size"] == 1048576
|
|
assert updated_effective_config["compaction_threshold"] == 10
|
|
assert updated_effective_config["gc_horizon"] == 67108864
|
|
assert updated_effective_config["image_creation_threshold"] == 2
|
|
assert updated_effective_config["pitr_interval"] == "7days"
|
|
assert updated_effective_config["max_lsn_wal_lag"] == 13000000
|
|
|
|
# restart the pageserver and ensure that the config is still correct
|
|
env.pageserver.stop()
|
|
env.pageserver.start()
|
|
|
|
restarted_tenant_config = http_client.tenant_config(tenant_id=tenant)
|
|
assert (
|
|
restarted_tenant_config == updated_tenant_config
|
|
), "Updated config should not change after the restart"
|
|
|
|
# update the config with very short config and make sure no trailing chars are left from previous config
|
|
final_conf = {
|
|
"pitr_interval": "1 min",
|
|
}
|
|
env.config_tenant(tenant_id=tenant, conf=final_conf)
|
|
|
|
final_tenant_config = http_client.tenant_config(tenant_id=tenant)
|
|
final_specific_config = final_tenant_config.tenant_specific_overrides
|
|
assert final_specific_config["pitr_interval"] == "1m"
|
|
assert len(final_specific_config) == len(
|
|
final_conf
|
|
), f"No more specific properties were expected, but got: {final_specific_config}"
|
|
final_effective_config = final_tenant_config.effective_config
|
|
assert (
|
|
final_effective_config["pitr_interval"] == "1m"
|
|
), "Specific 'pitr_interval' config should override the default value"
|
|
assert final_effective_config["checkpoint_distance"] == 10000
|
|
assert final_effective_config["compaction_target_size"] == 1048576
|
|
assert final_effective_config["compaction_period"] == "20s"
|
|
assert final_effective_config["compaction_threshold"] == 10
|
|
assert final_effective_config["gc_horizon"] == 67108864
|
|
assert final_effective_config["gc_period"] == "1h"
|
|
assert final_effective_config["image_creation_threshold"] == 3
|
|
assert final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days"
|
|
assert final_effective_config["eviction_policy"] == {
|
|
"kind": "LayerAccessThreshold",
|
|
"period": "20s",
|
|
"threshold": "23h",
|
|
}
|
|
assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
|
|
|
|
# restart the pageserver and ensure that the config is still correct
|
|
env.pageserver.stop()
|
|
env.pageserver.start()
|
|
|
|
restarted_final_tenant_config = http_client.tenant_config(tenant_id=tenant)
|
|
assert (
|
|
restarted_final_tenant_config == final_tenant_config
|
|
), "Updated config should not change after the restart"
|
|
|
|
|
|
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
|
|
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
|
|
|
env = neon_env_builder.init_start()
|
|
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
|
|
|
# tenant is created with defaults, as in without config file
|
|
(tenant_id, timeline_id) = env.create_tenant()
|
|
config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1"
|
|
|
|
http_client = env.pageserver.http_client()
|
|
|
|
detail = http_client.timeline_detail(tenant_id, timeline_id)
|
|
last_record_lsn = Lsn(detail["last_record_lsn"])
|
|
assert last_record_lsn.lsn_int != 0, "initdb must have executed"
|
|
|
|
wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn)
|
|
|
|
http_client.tenant_detach(tenant_id)
|
|
|
|
assert not config_path.exists(), "detach did not remove config file"
|
|
|
|
env.pageserver.tenant_attach(tenant_id)
|
|
wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active"))
|
|
|
|
env.config_tenant(tenant_id, {"gc_horizon": "1000000"})
|
|
contents_first = config_path.read_text()
|
|
env.config_tenant(tenant_id, {"gc_horizon": "0"})
|
|
contents_later = config_path.read_text()
|
|
|
|
# dont test applying the setting here, we have that another test case to show it
|
|
# we just care about being able to create the file
|
|
assert len(contents_first) > len(contents_later)
|
|
|
|
|
|
def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
|
neon_env_builder: NeonEnvBuilder,
|
|
):
|
|
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
|
|
|
env = neon_env_builder.init_start(
|
|
initial_tenant_conf={
|
|
# disable compaction so that it will not download the layer for repartitioning
|
|
"compaction_period": "0s"
|
|
}
|
|
)
|
|
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
|
|
|
(tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline
|
|
ps_http = env.pageserver.http_client()
|
|
|
|
# When we evict/download layers, we will use this Workload to generate getpage requests
|
|
# that touch some layers, as otherwise the pageserver doesn't report totally unused layers
|
|
# as problems when they have short residence duration.
|
|
workload = Workload(env, tenant_id, timeline_id)
|
|
workload.init()
|
|
workload.write_rows(100)
|
|
|
|
def get_metric():
|
|
metrics = ps_http.get_metrics()
|
|
metric = metrics.query_one(
|
|
"pageserver_evictions_with_low_residence_duration_total",
|
|
{
|
|
"tenant_id": str(tenant_id),
|
|
"timeline_id": str(timeline_id),
|
|
},
|
|
)
|
|
return metric
|
|
|
|
default_value = ps_http.tenant_config(tenant_id).effective_config[
|
|
"evictions_low_residence_duration_metric_threshold"
|
|
]
|
|
metric = get_metric()
|
|
assert int(metric.value) == 0, "metric is present with default value"
|
|
|
|
assert default_value == "1day"
|
|
|
|
ps_http.download_all_layers(tenant_id, timeline_id)
|
|
workload.validate()
|
|
ps_http.evict_all_layers(tenant_id, timeline_id)
|
|
metric = get_metric()
|
|
assert int(metric.value) > 0, "metric is updated"
|
|
|
|
env.config_tenant(
|
|
tenant_id, {"evictions_low_residence_duration_metric_threshold": default_value}
|
|
)
|
|
updated_metric = get_metric()
|
|
assert int(updated_metric.value) == int(
|
|
metric.value
|
|
), "metric is unchanged when setting same value"
|
|
|
|
env.config_tenant(tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"})
|
|
metric = get_metric()
|
|
assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60
|
|
assert int(metric.value) == 0
|
|
|
|
ps_http.download_all_layers(tenant_id, timeline_id)
|
|
workload.validate()
|
|
ps_http.evict_all_layers(tenant_id, timeline_id)
|
|
metric = get_metric()
|
|
assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60
|
|
assert int(metric.value) > 0
|
|
|
|
env.config_tenant(tenant_id, {"evictions_low_residence_duration_metric_threshold": "2h"})
|
|
metric = get_metric()
|
|
assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60
|
|
assert int(metric.value) == 0, "value resets if label changes"
|
|
|
|
ps_http.download_all_layers(tenant_id, timeline_id)
|
|
workload.validate()
|
|
ps_http.evict_all_layers(tenant_id, timeline_id)
|
|
metric = get_metric()
|
|
assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60
|
|
assert int(metric.value) > 0, "set a non-zero value for next step"
|
|
|
|
env.config_tenant(tenant_id, {})
|
|
metric = get_metric()
|
|
assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
|
|
assert int(metric.value) == 0, "value resets to default"
|
|
|
|
|
|
@run_only_on_default_postgres("Test does not start a compute")
|
|
@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
|
|
def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
|
|
"""
|
|
Test tenant config patching (i.e. additive updates)
|
|
|
|
The flow is different for storage controller and cplane managed pageserver.
|
|
1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
|
|
location_config calls containing the update to the pageserver
|
|
2. Cplane managed: /v1/tenant/config is called directly on the pageserver
|
|
"""
|
|
|
|
def assert_tenant_conf_semantically_equal(lhs, rhs):
|
|
"""
|
|
Storcon returns None for fields that are not set while the pageserver does not.
|
|
Compare two tenant's config overrides semantically, by dropping the None values.
|
|
"""
|
|
lhs = {k: v for k, v in lhs.items() if v is not None}
|
|
rhs = {k: v for k, v in rhs.items() if v is not None}
|
|
|
|
assert lhs == rhs
|
|
|
|
env = neon_env_builder.init_start()
|
|
|
|
if ps_managed_by == "storcon":
|
|
api = env.storage_controller.pageserver_api()
|
|
elif ps_managed_by == "cplane":
|
|
# Disallow storcon from sending location_configs to the pageserver.
|
|
# These would overwrite the manually set tenant configs.
|
|
env.storage_controller.reconcile_until_idle()
|
|
env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"})
|
|
env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")
|
|
|
|
api = env.pageserver.http_client()
|
|
else:
|
|
raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")
|
|
|
|
crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
|
|
patch: dict[str, Any | None] = {
|
|
"gc_period": "3h",
|
|
"wal_receiver_protocol_override": {
|
|
"type": "interpreted",
|
|
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
|
|
},
|
|
}
|
|
api.patch_tenant_config(env.initial_tenant, patch)
|
|
tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
if ps_managed_by == "storcon":
|
|
# Check that the config was propagated to the PS.
|
|
overrides_on_ps = (
|
|
env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
)
|
|
assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
|
|
assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
|
|
crnt_tenant_conf = tenant_conf_after_patch
|
|
|
|
patch = {"gc_period": "5h", "wal_receiver_protocol_override": None}
|
|
api.patch_tenant_config(env.initial_tenant, patch)
|
|
tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
if ps_managed_by == "storcon":
|
|
overrides_on_ps = (
|
|
env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
)
|
|
assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
|
|
assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
|
|
crnt_tenant_conf = tenant_conf_after_patch
|
|
|
|
put = {"pitr_interval": "1m 1s"}
|
|
api.set_tenant_config(env.initial_tenant, put)
|
|
tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
if ps_managed_by == "storcon":
|
|
overrides_on_ps = (
|
|
env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
|
|
)
|
|
assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_put)
|
|
assert_tenant_conf_semantically_equal(tenant_conf_after_put, put)
|
|
crnt_tenant_conf = tenant_conf_after_put
|