Files
neon/test_runner/regress/test_tenant_conf.py
Vlad Lazar 868f194a3b pageserver: remove handling of vanilla protocol (#12126)
## Problem

We support two ingest protocols on the pageserver: vanilla and
interpreted.
Interpreted has been the only protocol in use for a long time.

## Summary of changes

* Remove the ingest handling of the vanilla protocol
* Remove tenant and pageserver configuration for it
* Update all tests that tweaked the ingest protocol

## Compatibility

Backward compatibility:
* The new pageserver version can read the existing pageserver
configuration and it will ignore the unknown field.
* When the tenant config is read from the storcon db or from the
pageserver disk, the extra field will be ignored.

Forward compatibility:
* Both the pageserver config and the tenant config map missing fields to
their default value.

I'm not aware of any tenant level override that was made for this knob.
2025-06-05 11:43:04 +00:00

411 lines
17 KiB
Python

from __future__ import annotations
import json
from typing import TYPE_CHECKING
import pytest
from fixtures.common_types import Lsn
from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.utils import run_only_on_default_postgres, wait_until
from fixtures.workload import Workload
if TYPE_CHECKING:
from typing import Any
from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
    """
    Check per-tenant configuration handling on the pageserver.

    The pageserver is started with a few non-default global settings. A tenant is
    then created with its own overrides, reconfigured twice, and the pageserver is
    restarted after each reconfiguration. At every step both the tenant-specific
    overrides and the effective configuration returned by the HTTP API are verified.
    """

    def set_some_nondefault_global_config(ps_cfg: dict[str, Any]):
        # Pageserver-level settings.
        ps_cfg["page_cache_size"] = 444
        ps_cfg["wait_lsn_timeout"] = "111 s"

        # Settings placed under the pageserver's "tenant_config" section.
        tenant_defaults = ps_cfg.setdefault("tenant_config", {})
        tenant_defaults.update(
            {
                "checkpoint_distance": 10000,
                "compaction_target_size": 1048576,
                "evictions_low_residence_duration_metric_threshold": "2 days",
                "eviction_policy": {
                    "kind": "LayerAccessThreshold",
                    "period": "20s",
                    "threshold": "23 hours",
                },
            }
        )

    def expect_values(config, expected):
        # Plain equality checks (no custom message), performed in the order given.
        for key, want in expected.items():
            assert config[key] == want

    neon_env_builder.pageserver_config_override = set_some_nondefault_global_config
    env = neon_env_builder.init_start()

    # Eviction is configured without remote storage, so related error lines may show up.
    env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*")
    http_client = env.pageserver.http_client()

    # A misspelled config key must be rejected, and the error must name the key.
    bogus_key = "some_invalid_setting_name_blah_blah_123"
    rejection = None
    try:
        env.create_tenant(conf={bogus_key: "20000"})
    except Exception as err:
        rejection = err
    if rejection is None:
        raise AssertionError("Expected validation error")
    assert bogus_key in str(rejection)

    creation_conf = {
        "checkpoint_distance": "20000",
        "gc_period": "30sec",
        "evictions_low_residence_duration_metric_threshold": "42s",
        "eviction_policy": json.dumps({"kind": "NoEviction"}),
    }
    tenant, _ = env.create_tenant(conf=creation_conf)
    env.create_timeline("test_tenant_conf", tenant_id=tenant)
    env.endpoints.create_start("test_tenant_conf", "main", tenant)

    # --- Default tenant: no overrides, effective config mirrors the global one ---
    initial_tenant_cfg = http_client.tenant_config(tenant_id=env.initial_tenant)
    assert not initial_tenant_cfg.tenant_specific_overrides, "Should have no specific settings yet"
    expect_values(
        initial_tenant_cfg.effective_config,
        {
            "checkpoint_distance": 10000,
            "compaction_target_size": 1048576,
            "compaction_period": "20s",
            "compaction_threshold": 10,
            "gc_horizon": 67108864,
            "gc_period": "1h",
            "image_creation_threshold": 3,
            "pitr_interval": "7days",
            "evictions_low_residence_duration_metric_threshold": "2days",
            "eviction_policy": {
                "kind": "LayerAccessThreshold",
                "period": "20s",
                "threshold": "23h",
            },
        },
    )

    # --- Freshly created tenant: overrides given at creation time ---
    created_cfg = http_client.tenant_config(tenant_id=tenant)
    created_overrides = created_cfg.tenant_specific_overrides
    expect_values(created_overrides, {"checkpoint_distance": 20000, "gc_period": "30s"})
    assert len(created_overrides) == len(creation_conf), (
        f"No more specific properties were expected, but got: {created_overrides}"
    )

    created_effective = created_cfg.effective_config
    assert created_effective["checkpoint_distance"] == 20000, (
        "Specific 'checkpoint_distance' config should override the default value"
    )
    assert created_effective["gc_period"] == "30s", (
        "Specific 'gc_period' config should override the default value"
    )
    assert created_effective["evictions_low_residence_duration_metric_threshold"] == "42s", (
        "Should override default value"
    )
    assert created_effective["eviction_policy"] == {"kind": "NoEviction"}, (
        "Specific 'eviction_policy' config should override the default value"
    )
    expect_values(
        created_effective,
        {
            "compaction_target_size": 1048576,
            "compaction_period": "20s",
            "compaction_threshold": 10,
            "gc_horizon": 67108864,
            "image_creation_threshold": 3,
            "pitr_interval": "7days",
        },
    )

    # --- Reconfigure the tenant and verify that the new values are reported ---
    reconfigure_conf = {
        "checkpoint_distance": "15000",
        "gc_period": "80sec",
        "compaction_period": "80sec",
        "image_creation_threshold": "2",
        "evictions_low_residence_duration_metric_threshold": "23h",
        "eviction_policy": json.dumps(
            {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"}
        ),
        "max_lsn_wal_lag": "13000000",
    }
    env.config_tenant(tenant_id=tenant, conf=reconfigure_conf)

    reconfigured_cfg = http_client.tenant_config(tenant_id=tenant)
    reconfigured_overrides = reconfigured_cfg.tenant_specific_overrides
    expect_values(
        reconfigured_overrides,
        {
            "checkpoint_distance": 15000,
            "gc_period": "1m 20s",
            "compaction_period": "1m 20s",
        },
    )
    assert len(reconfigured_overrides) == len(reconfigure_conf), (
        f"No more specific properties were expected, but got: {reconfigured_overrides}"
    )

    reconfigured_effective = reconfigured_cfg.effective_config
    assert reconfigured_effective["checkpoint_distance"] == 15000, (
        "Specific 'checkpoint_distance' config should override the default value"
    )
    assert reconfigured_effective["gc_period"] == "1m 20s", (
        "Specific 'gc_period' config should override the default value"
    )
    assert reconfigured_effective["compaction_period"] == "1m 20s", (
        "Specific 'compaction_period' config should override the default value"
    )
    assert reconfigured_effective["evictions_low_residence_duration_metric_threshold"] == "23h", (
        "Should override default value"
    )
    assert reconfigured_effective["eviction_policy"] == {
        "kind": "LayerAccessThreshold",
        "period": "1m 20s",
        "threshold": "1day 18h",
    }, "Specific 'eviction_policy' config should override the default value"
    expect_values(
        reconfigured_effective,
        {
            "compaction_target_size": 1048576,
            "compaction_threshold": 10,
            "gc_horizon": 67108864,
            "image_creation_threshold": 2,
            "pitr_interval": "7days",
            "max_lsn_wal_lag": 13000000,
        },
    )

    # The reported config must be the same after a pageserver restart.
    env.pageserver.stop()
    env.pageserver.start()
    after_first_restart = http_client.tenant_config(tenant_id=tenant)
    assert after_first_restart == reconfigured_cfg, (
        "Updated config should not change after the restart"
    )

    # --- Replace the config with a much shorter one; nothing of the previous
    # --- config (not even trailing characters) may be left behind.
    minimal_conf = {
        "pitr_interval": "1 min",
    }
    env.config_tenant(tenant_id=tenant, conf=minimal_conf)

    minimal_cfg = http_client.tenant_config(tenant_id=tenant)
    minimal_overrides = minimal_cfg.tenant_specific_overrides
    assert minimal_overrides["pitr_interval"] == "1m"
    assert len(minimal_overrides) == len(minimal_conf), (
        f"No more specific properties were expected, but got: {minimal_overrides}"
    )

    minimal_effective = minimal_cfg.effective_config
    assert minimal_effective["pitr_interval"] == "1m", (
        "Specific 'pitr_interval' config should override the default value"
    )
    expect_values(
        minimal_effective,
        {
            "checkpoint_distance": 10000,
            "compaction_target_size": 1048576,
            "compaction_period": "20s",
            "compaction_threshold": 10,
            "gc_horizon": 67108864,
            "gc_period": "1h",
            "image_creation_threshold": 3,
            "evictions_low_residence_duration_metric_threshold": "2days",
            "eviction_policy": {
                "kind": "LayerAccessThreshold",
                "period": "20s",
                "threshold": "23h",
            },
            "max_lsn_wal_lag": 1024 * 1024 * 1024,
        },
    )

    # Once more: the reported config must be the same after a pageserver restart.
    env.pageserver.stop()
    env.pageserver.start()
    after_second_restart = http_client.tenant_config(tenant_id=tenant)
    assert after_second_restart == minimal_cfg, (
        "Updated config should not change after the restart"
    )
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
    """
    Check that a tenant config file can be written after a detach/attach cycle.

    The tenant is detached (which must remove its `config-v1` file), attached again,
    and then configured twice. The test only compares the sizes of the two resulting
    config files.
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
    env = neon_env_builder.init_start()
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

    # The tenant is created with defaults, i.e. without a config file.
    tenant_id, timeline_id = env.create_tenant()
    tenant_conf_file = env.pageserver.tenant_dir(tenant_id) / "config-v1"

    ps_http = env.pageserver.http_client()

    # Wait for everything up to the last record LSN to be uploaded before detaching.
    uploaded_up_to = Lsn(ps_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
    assert uploaded_up_to.lsn_int != 0, "initdb must have executed"
    wait_for_upload(ps_http, tenant_id, timeline_id, uploaded_up_to)

    ps_http.tenant_detach(tenant_id)
    assert not tenant_conf_file.exists(), "detach did not remove config file"

    env.pageserver.tenant_attach(tenant_id)

    def tenant_is_active():
        return assert_tenant_state(ps_http, tenant_id, "Active")

    wait_until(tenant_is_active)

    env.config_tenant(tenant_id, {"gc_horizon": "1000000"})
    long_contents = tenant_conf_file.read_text()

    env.config_tenant(tenant_id, {"gc_horizon": "0"})
    short_contents = tenant_conf_file.read_text()

    # Whether the setting is actually applied is not checked here; all that matters
    # is that the file could be created and rewritten.
    assert len(short_contents) < len(long_contents)
def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Check live reconfiguration of `evictions_low_residence_duration_metric_threshold`.

    After each change of the tenant setting, the
    `pageserver_evictions_with_low_residence_duration_total` metric is inspected: its
    `low_threshold_secs` label must follow the configured value and its counter must
    start again from zero whenever the label changes.
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    # Compaction is disabled so that it will not download the layer for repartitioning.
    initial_conf = {"compaction_period": "0s"}
    env = neon_env_builder.init_start(initial_tenant_conf=initial_conf)
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    ps_http = env.pageserver.http_client()

    # The workload issues getpage requests that touch some layers while we evict and
    # download them; otherwise the pageserver doesn't report totally unused layers
    # as problems when they have short residence duration.
    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
    workload.write_rows(100)

    threshold_key = "evictions_low_residence_duration_metric_threshold"

    def get_metric():
        # Fetch the current sample of the metric for this tenant/timeline.
        labels = {
            "tenant_id": str(tenant_id),
            "timeline_id": str(timeline_id),
        }
        return ps_http.get_metrics().query_one(
            "pageserver_evictions_with_low_residence_duration_total",
            labels,
        )

    def churn_layers():
        # Download every layer, read through the workload, then evict every layer.
        ps_http.download_all_layers(tenant_id, timeline_id)
        workload.validate()
        ps_http.evict_all_layers(tenant_id, timeline_id)

    def threshold_secs(sample):
        return int(sample.labels["low_threshold_secs"])

    one_hour = 60 * 60
    one_day = 24 * one_hour

    # --- Starting point: default threshold, counter at zero ---
    default_value = ps_http.tenant_config(tenant_id).effective_config[threshold_key]
    metric = get_metric()
    assert int(metric.value) == 0, "metric is present with default value"
    assert default_value == "1day"

    churn_layers()
    metric = get_metric()
    assert int(metric.value) > 0, "metric is updated"

    # --- Setting the same value explicitly must not touch the counter ---
    env.config_tenant(tenant_id, {threshold_key: default_value})
    updated_metric = get_metric()
    assert int(updated_metric.value) == int(metric.value), (
        "metric is unchanged when setting same value"
    )

    # --- Raise the threshold to two days ---
    env.config_tenant(tenant_id, {threshold_key: "2day"})
    metric = get_metric()
    assert threshold_secs(metric) == 2 * one_day
    assert int(metric.value) == 0

    churn_layers()
    metric = get_metric()
    assert threshold_secs(metric) == 2 * one_day
    assert int(metric.value) > 0

    # --- Lower the threshold to two hours ---
    env.config_tenant(tenant_id, {threshold_key: "2h"})
    metric = get_metric()
    assert threshold_secs(metric) == 2 * one_hour
    assert int(metric.value) == 0, "value resets if label changes"

    churn_layers()
    metric = get_metric()
    assert threshold_secs(metric) == 2 * one_hour
    assert int(metric.value) > 0, "set a non-zero value for next step"

    # --- Drop all overrides: label and counter go back to their defaults ---
    env.config_tenant(tenant_id, {})
    metric = get_metric()
    assert threshold_secs(metric) == one_day, "label resets to default"
    assert int(metric.value) == 0, "value resets to default"
@run_only_on_default_postgres("Test does not start a compute")
@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
    """
    Test tenant config patching (i.e. additive updates).

    The request takes a different path depending on who manages the pageserver:

    1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
       location_config calls containing the update to the pageserver
    2. Cplane managed: /v1/tenant/config is called directly on the pageserver
    """

    def assert_tenant_conf_semantically_equal(lhs, rhs):
        """
        Compare two tenant config override mappings, ignoring entries whose value is None.
        """

        def without_nones(conf):
            return {key: value for key, value in conf.items() if value is not None}

        assert without_nones(lhs) == without_nones(rhs)

    env = neon_env_builder.init_start()

    def overrides_via(client):
        # Tenant-specific overrides of the initial tenant, as reported by `client`.
        return client.tenant_config(env.initial_tenant).tenant_specific_overrides

    def check_propagated_to_pageserver(expected):
        # Only in storcon mode does `api` differ from the pageserver's own API, so
        # only then do we check that the config was propagated to the PS.
        if ps_managed_by == "storcon":
            overrides_on_ps = overrides_via(env.pageserver.http_client())
            assert_tenant_conf_semantically_equal(overrides_on_ps, expected)

    if ps_managed_by == "storcon":
        api = env.storage_controller.pageserver_api()
    elif ps_managed_by == "cplane":
        # Disallow storcon from sending location_configs to the pageserver.
        # These would overwrite the manually set tenant configs.
        env.storage_controller.reconcile_until_idle()
        env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"})
        env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")
        api = env.pageserver.http_client()
    else:
        raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")

    crnt_tenant_conf = overrides_via(api)

    # --- Step 1: a patch that adds two overrides ---
    patch: dict[str, Any | None] = {
        "gc_period": "3h",
        "gc_compaction_ratio_percent": 10,
    }
    api.patch_tenant_config(env.initial_tenant, patch)
    tenant_conf_after_patch = overrides_via(api)
    check_propagated_to_pageserver(tenant_conf_after_patch)
    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
    crnt_tenant_conf = tenant_conf_after_patch

    # --- Step 2: a patch that changes one override and sets the other to None ---
    patch = {"gc_period": "5h", "gc_compaction_ratio_percent": None}
    api.patch_tenant_config(env.initial_tenant, patch)
    tenant_conf_after_patch = overrides_via(api)
    check_propagated_to_pageserver(tenant_conf_after_patch)
    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
    crnt_tenant_conf = tenant_conf_after_patch

    # --- Step 3: a full set; the overrides must now equal exactly what was put ---
    put = {"pitr_interval": "1m 1s"}
    api.set_tenant_config(env.initial_tenant, put)
    tenant_conf_after_put = overrides_via(api)
    check_propagated_to_pageserver(tenant_conf_after_put)
    assert_tenant_conf_semantically_equal(tenant_conf_after_put, put)
    # Keep tracking the latest overrides, as after the previous steps.
    crnt_tenant_conf = tenant_conf_after_put