pageserver/storcon: add patch endpoints for tenant config metrics (#10020)

## Problem

Cplane and storage controller tenant config changes are not additive: any change overrides the entire existing tenant config. This would be fine if both did client-side patching, but that's not the case.

Once this merges, we must update cplane to use the PATCH endpoint.

## Summary of changes

### High Level

Allow patching of tenant configuration with a `PATCH
/v1/tenant/config` endpoint.
It takes the same data as its PUT counterpart. For example, the payload
below will update `gc_period` and unset `compaction_period`; all other
fields are left in their original state.
```json
{
  "tenant_id": "1234",
  "gc_period": "10s",
  "compaction_period": null
}
```
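
For illustration, a minimal client-side sketch of such a request (host, port, and tenant id are placeholders; auth is omitted), mirroring what the updated Python test fixture below does:

```python
import requests

# Hypothetical storage controller / pageserver HTTP address and tenant id.
BASE_URL = "http://localhost:9898"
TENANT_ID = "1234"

# PATCH only touches the listed keys: "gc_period" is updated,
# "compaction_period" is unset (null); all other overrides are untouched.
res = requests.patch(
    f"{BASE_URL}/v1/tenant/config",
    json={"tenant_id": TENANT_ID, "gc_period": "10s", "compaction_period": None},
)
res.raise_for_status()
```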

### Low Level
* PS and storcon gain `PATCH /v1/tenant/config` endpoints. The PS endpoint
is only used for cplane-managed instances.
* `storcon_cli` is updated to have separate `set-tenant-config` and
`patch-tenant-config` commands (see the example below).
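
For illustration, the updated `test_storcon_cli` test drives the new subcommand roughly as `patch-tenant-config --tenant-id <tenant id> --config <json>`; the exact binary invocation and `--config` payload format are not shown in this excerpt, and `set-tenant-config` keeps the previous overwrite semantics.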

Related https://github.com/neondatabase/cloud/issues/21043
Author: Vlad Lazar (committed by GitHub)
Date: 2024-12-11 19:16:33 +00:00
Commit: a3e80448e8 (parent: ef233e91ef)
22 changed files with 800 additions and 50 deletions


@@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
 
-    def patch_tenant_config_client_side(
+    def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]):
+        """
+        Only use this via storage_controller.pageserver_api().
+
+        See `set_tenant_config` for more information.
+        """
+        assert "tenant_id" not in updates.keys()
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/config",
+            json={**updates, "tenant_id": str(tenant_id)},
+        )
+        self.verbose_error(res)
+
+    def update_tenant_config(
         self,
         tenant_id: TenantId,
         inserts: dict[str, Any] | None = None,
@@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
         See `set_tenant_config` for more information.
         """
-        current = self.tenant_config(tenant_id).tenant_specific_overrides
-        if inserts is not None:
-            current.update(inserts)
-        if removes is not None:
-            for key in removes:
-                del current[key]
-        self.set_tenant_config(tenant_id, current)
+        if inserts is None:
+            inserts = {}
+        if removes is None:
+            removes = []
+
+        patch = inserts | {remove: None for remove in removes}
+        self.patch_tenant_config(tenant_id, patch)
 
     def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
         return self.tenant_size_and_modelinputs(tenant_id)[0]

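For reference, a rough sketch of how tests use the two helpers above (`env` and `tenant_id` are placeholders provided by the test fixtures; not a verbatim excerpt):

```python
api = env.storage_controller.pageserver_api()

# Server-side patch: only the listed keys change; a None value unsets a key.
api.patch_tenant_config(tenant_id, {"gc_period": "10s", "compaction_period": None})

# Same effect expressed as inserts/removes; builds the patch dict internally.
api.update_tenant_config(tenant_id, inserts={"gc_period": "10s"}, removes=["compaction_period"])
```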

@@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
     assert (
         du_by_timeline[large_tenant] > min_resident_size
     ), "ensure the larger tenant will get a haircut"
 
-    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().update_tenant_config(
         small_tenant[0], {"min_resident_size_override": min_resident_size}
     )
-    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().update_tenant_config(
         large_tenant[0], {"min_resident_size_override": min_resident_size}
     )


@@ -74,7 +74,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
     print_layer_size_histogram(post_ingest)
 
     # since all we have are L0s, we should be getting nice L1s and images out of them now
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         env.initial_tenant,
         {
             "compaction_threshold": 1,


@@ -132,7 +132,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
     ), "sanity check for what above loop is supposed to do"
 
     # create the image layer from the future
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         tenant_id, {"image_creation_threshold": image_creation_threshold}, None
     )
     assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1


@@ -46,7 +46,7 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
     for sk in env.safekeepers:
         sk.stop()
 
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         tenant_id, {"compaction_threshold": 3}
     )
     # hit the exit failpoint
# hit the exit failpoint


@@ -1768,7 +1768,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
     # Modify a tenant's config
     storcon_cli(
         [
-            "tenant-config",
+            "patch-tenant-config",
             "--tenant-id",
             str(env.initial_tenant),
             "--config",
@@ -2403,7 +2403,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     # Make a change to the tenant config to trigger a slow reconcile
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
-    virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
+    virtual_ps_http.update_tenant_config(tid, {"compaction_threshold": 5}, None)
 
     env.storage_controller.allowed_errors.extend(
         [
             ".*Accepted configuration update but reconciliation failed.*",


@@ -3,13 +3,14 @@ from __future__ import annotations
 import json
 from typing import TYPE_CHECKING
 
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until
 from fixtures.workload import Workload
 
 if TYPE_CHECKING:
@@ -330,3 +331,83 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
     metric = get_metric()
     assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
     assert int(metric.value) == 0, "value resets to default"
+
+
+@run_only_on_default_postgres("Test does not start a compute")
+@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
+def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
+    """
+    Test tenant config patching (i.e. additive updates)
+
+    The flow is different for storage controller and cplane managed pageserver.
+    1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
+       location_config calls containing the update to the pageserver
+    2. Cplane managed: /v1/tenant/config is called directly on the pageserver
+    """
+
+    def assert_tenant_conf_semantically_equal(lhs, rhs):
+        """
+        Storcon returns None for fields that are not set while the pageserver does not.
+        Compare two tenant's config overrides semantically, by dropping the None values.
+        """
+        lhs = {k: v for k, v in lhs.items() if v is not None}
+        rhs = {k: v for k, v in rhs.items() if v is not None}
+
+        assert lhs == rhs
+
+    env = neon_env_builder.init_start()
+
+    if ps_managed_by == "storcon":
+        api = env.storage_controller.pageserver_api()
+    elif ps_managed_by == "cplane":
+        # Disallow storcon from sending location_configs to the pageserver.
+        # These would overwrite the manually set tenant configs.
+        env.storage_controller.reconcile_until_idle()
+        env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"})
+        env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")
+
+        api = env.pageserver.http_client()
+    else:
+        raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")
+
+    crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+
+    patch: dict[str, Any | None] = {
+        "gc_period": "3h",
+        "wal_receiver_protocol_override": {
+            "type": "interpreted",
+            "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
+        },
+    }
+    api.patch_tenant_config(env.initial_tenant, patch)
+    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        # Check that the config was propagated to the PS.
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
+    crnt_tenant_conf = tenant_conf_after_patch
+
+    patch = {"gc_period": "5h", "wal_receiver_protocol_override": None}
+    api.patch_tenant_config(env.initial_tenant, patch)
+    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
+    crnt_tenant_conf = tenant_conf_after_patch
+
+    put = {"pitr_interval": "1m 1s"}
+    api.set_tenant_config(env.initial_tenant, put)
+    tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_put)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_put, put)
+    crnt_tenant_conf = tenant_conf_after_put


@@ -81,7 +81,7 @@ def test_threshold_based_eviction(
 
     # create a bunch of L1s, only the least of which will need to be resident
     compaction_threshold = 3  # create L1 layers quickly
-    vps_http.patch_tenant_config_client_side(
+    vps_http.update_tenant_config(
         tenant_id,
         inserts={
             # Disable gc and compaction to avoid on-demand downloads from their side.


@@ -514,7 +514,7 @@ def test_compaction_induced_by_detaches_in_history(
     assert len(delta_layers(branch_timeline_id)) == 5
 
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         env.initial_tenant, {"compaction_threshold": 5}, None
     )