mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 13:32:57 +00:00
pageserver/storcon: add patch endpoints for tenant config metrics (#10020)
## Problem
Cplane and storage controller tenant config changes are not additive.
Any change overrides all existing tenant configs. This would be fine if
both did client side patching, but that's not the case.
Once this merges, we must update cplane to use the PATCH endpoint.
## Summary of changes
### High Level
Allow for patching of tenant configuration with a `PATCH
/v1/tenant/config` endpoint.
It takes the same data as its PUT counterpart. For example, the payload
below will update `gc_period` and unset `compaction_period`. All other
fields are left in their original state.
```
{
"tenant_id": "1234",
"gc_period": "10s",
"compaction_period": null
}
```
### Low Level
* PS and storcon gain `PATCH /v1/tenant/config` endpoints. PS endpoint
is only used for cplane managed instances.
* `storcon_cli` is updated to have separate commands for
`set-tenant-config` and `patch-tenant-config`
Related https://github.com/neondatabase/cloud/issues/21043
This commit is contained in:
@@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def patch_tenant_config_client_side(
|
||||
def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]):
    """
    Additively update a tenant's config via `PATCH /v1/tenant/config`.

    Only use this via storage_controller.pageserver_api().

    See `set_tenant_config` for more information.
    """
    # The tenant id is carried inside the JSON payload, so callers must not
    # smuggle it in through `updates`.
    assert "tenant_id" not in updates
    payload = dict(updates)
    payload["tenant_id"] = str(tenant_id)
    res = self.patch(
        f"http://localhost:{self.port}/v1/tenant/config",
        json=payload,
    )
    self.verbose_error(res)
|
||||
|
||||
def update_tenant_config(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
inserts: dict[str, Any] | None = None,
|
||||
@@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
|
||||
See `set_tenant_config` for more information.
|
||||
"""
|
||||
current = self.tenant_config(tenant_id).tenant_specific_overrides
|
||||
if inserts is not None:
|
||||
current.update(inserts)
|
||||
if removes is not None:
|
||||
for key in removes:
|
||||
del current[key]
|
||||
self.set_tenant_config(tenant_id, current)
|
||||
if inserts is None:
|
||||
inserts = {}
|
||||
if removes is None:
|
||||
removes = []
|
||||
|
||||
patch = inserts | {remove: None for remove in removes}
|
||||
self.patch_tenant_config(tenant_id, patch)
|
||||
|
||||
def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
|
||||
return self.tenant_size_and_modelinputs(tenant_id)[0]
|
||||
|
||||
@@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
|
||||
assert (
|
||||
du_by_timeline[large_tenant] > min_resident_size
|
||||
), "ensure the larger tenant will get a haircut"
|
||||
env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
|
||||
env.neon_env.storage_controller.pageserver_api().update_tenant_config(
|
||||
small_tenant[0], {"min_resident_size_override": min_resident_size}
|
||||
)
|
||||
env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
|
||||
env.neon_env.storage_controller.pageserver_api().update_tenant_config(
|
||||
large_tenant[0], {"min_resident_size_override": min_resident_size}
|
||||
)
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
|
||||
print_layer_size_histogram(post_ingest)
|
||||
|
||||
# since all we have are L0s, we should be getting nice L1s and images out of them now
|
||||
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
|
||||
env.storage_controller.pageserver_api().update_tenant_config(
|
||||
env.initial_tenant,
|
||||
{
|
||||
"compaction_threshold": 1,
|
||||
|
||||
@@ -132,7 +132,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
|
||||
), "sanity check for what above loop is supposed to do"
|
||||
|
||||
# create the image layer from the future
|
||||
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
|
||||
env.storage_controller.pageserver_api().update_tenant_config(
|
||||
tenant_id, {"image_creation_threshold": image_creation_threshold}, None
|
||||
)
|
||||
assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
|
||||
|
||||
@@ -46,7 +46,7 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
|
||||
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
|
||||
env.storage_controller.pageserver_api().update_tenant_config(
|
||||
tenant_id, {"compaction_threshold": 3}
|
||||
)
|
||||
# hit the exit failpoint
|
||||
|
||||
@@ -1768,7 +1768,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
|
||||
# Modify a tenant's config
|
||||
storcon_cli(
|
||||
[
|
||||
"tenant-config",
|
||||
"patch-tenant-config",
|
||||
"--tenant-id",
|
||||
str(env.initial_tenant),
|
||||
"--config",
|
||||
@@ -2403,7 +2403,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Make a change to the tenant config to trigger a slow reconcile
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
|
||||
virtual_ps_http.update_tenant_config(tid, {"compaction_threshold": 5}, None)
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
".*Accepted configuration update but reconciliation failed.*",
|
||||
|
||||
@@ -3,13 +3,14 @@ from __future__ import annotations
|
||||
import json
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.utils import run_only_on_default_postgres, wait_until
|
||||
from fixtures.workload import Workload
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -330,3 +331,83 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
|
||||
metric = get_metric()
|
||||
assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
|
||||
assert int(metric.value) == 0, "value resets to default"
|
||||
|
||||
|
||||
@run_only_on_default_postgres("Test does not start a compute")
@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
    """
    Test tenant config patching (i.e. additive updates)

    The flow is different for storage controller and cplane managed pageserver.
    1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
    location_config calls containing the update to the pageserver
    2. Cplane managed: /v1/tenant/config is called directly on the pageserver
    """

    def assert_tenant_conf_semantically_equal(lhs, rhs):
        """
        Storcon returns None for fields that are not set while the pageserver does not.
        Compare two tenant's config overrides semantically, by dropping the None values.
        """
        assert {k: v for k, v in lhs.items() if v is not None} == {
            k: v for k, v in rhs.items() if v is not None
        }

    env = neon_env_builder.init_start()

    if ps_managed_by == "storcon":
        api = env.storage_controller.pageserver_api()
    elif ps_managed_by == "cplane":
        # Disallow storcon from sending location_configs to the pageserver.
        # These would overwrite the manually set tenant configs.
        env.storage_controller.reconcile_until_idle()
        env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"})
        env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")

        api = env.pageserver.http_client()
    else:
        raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")

    def overrides_on_pageserver():
        # The tenant specific overrides as seen directly on the pageserver.
        return (
            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
        )

    crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides

    # First patch: set two fields additively.
    patch: dict[str, Any | None] = {
        "gc_period": "3h",
        "wal_receiver_protocol_override": {
            "type": "interpreted",
            "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
        },
    }
    api.patch_tenant_config(env.initial_tenant, patch)
    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
    if ps_managed_by == "storcon":
        # Check that the config was propagated to the PS.
        assert_tenant_conf_semantically_equal(overrides_on_pageserver(), tenant_conf_after_patch)
    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
    crnt_tenant_conf = tenant_conf_after_patch

    # Second patch: update one field and unset the other via an explicit null.
    patch = {"gc_period": "5h", "wal_receiver_protocol_override": None}
    api.patch_tenant_config(env.initial_tenant, patch)
    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
    if ps_managed_by == "storcon":
        assert_tenant_conf_semantically_equal(overrides_on_pageserver(), tenant_conf_after_patch)
    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
    crnt_tenant_conf = tenant_conf_after_patch

    # Finally, a PUT replaces the whole config rather than merging into it.
    put = {"pitr_interval": "1m 1s"}
    api.set_tenant_config(env.initial_tenant, put)
    tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides
    if ps_managed_by == "storcon":
        assert_tenant_conf_semantically_equal(overrides_on_pageserver(), tenant_conf_after_put)
    assert_tenant_conf_semantically_equal(tenant_conf_after_put, put)
    crnt_tenant_conf = tenant_conf_after_put
|
||||
|
||||
@@ -81,7 +81,7 @@ def test_threshold_based_eviction(
|
||||
|
||||
# create a bunch of L1s, only the least of which will need to be resident
|
||||
compaction_threshold = 3 # create L1 layers quickly
|
||||
vps_http.patch_tenant_config_client_side(
|
||||
vps_http.update_tenant_config(
|
||||
tenant_id,
|
||||
inserts={
|
||||
# Disable gc and compaction to avoid on-demand downloads from their side.
|
||||
|
||||
@@ -514,7 +514,7 @@ def test_compaction_induced_by_detaches_in_history(
|
||||
|
||||
assert len(delta_layers(branch_timeline_id)) == 5
|
||||
|
||||
env.storage_controller.pageserver_api().patch_tenant_config_client_side(
|
||||
env.storage_controller.pageserver_api().update_tenant_config(
|
||||
env.initial_tenant, {"compaction_threshold": 5}, None
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user