mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 03:52:56 +00:00
## Problem We support two ingest protocols on the pageserver: vanilla and interpreted. Interpreted has been the only protocol in use for a long time. ## Summary of changes * Remove the ingest handling of the vanilla protocol * Remove tenant and pageserver configuration for it * Update all tests that tweaked the ingest protocol ## Compatibility Backward compatibility: * The new pageserver version can read the existing pageserver configuration and it will ignore the unknown field. * When the tenant config is read from the storcon db or from the pageserver disk, the extra field will be ignored. Forward compatibility: * Both the pageserver config and the tenant config map missing fields to their default value. I'm not aware of any tenant level override that was made for this knob.
411 lines
17 KiB
Python
411 lines
17 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import TYPE_CHECKING
|
|
|
|
import pytest
|
|
from fixtures.common_types import Lsn
|
|
from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
|
|
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
|
from fixtures.utils import run_only_on_default_postgres, wait_until
|
|
from fixtures.workload import Workload
|
|
|
|
if TYPE_CHECKING:
|
|
from typing import Any
|
|
|
|
from fixtures.neon_fixtures import (
|
|
NeonEnvBuilder,
|
|
)
|
|
|
|
|
|
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
    """
    Test per-tenant configuration.

    Steps: reject an unknown config key, create a tenant with overrides,
    update the overrides, replace them with a much shorter set, and check
    after each of the last two steps that a pageserver restart keeps the config.
    """

    def override_global_config(ps_cfg: dict[str, Any]):
        # Pageserver-wide settings. Tenants without overrides are expected to
        # report the tenant_config values below as their effective config.
        ps_cfg["page_cache_size"] = 444
        ps_cfg["wait_lsn_timeout"] = "111 s"
        ps_cfg.setdefault("tenant_config", {}).update(
            {
                "checkpoint_distance": 10000,
                "compaction_target_size": 1048576,
                "evictions_low_residence_duration_metric_threshold": "2 days",
                "eviction_policy": {
                    "kind": "LayerAccessThreshold",
                    "period": "20s",
                    "threshold": "23 hours",
                },
            }
        )

    def pick(config, keys):
        # Project `config` onto `keys` so it can be compared against an expected dict.
        return {key: config[key] for key in keys}

    neon_env_builder.pageserver_config_override = override_global_config
    env = neon_env_builder.init_start()

    # Eviction is configured but remote storage is not, so this error may be logged.
    env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*")
    ps_http = env.pageserver.http_client()

    # The global eviction policy as the assertions below expect it to be reported
    # ("23 hours" is expected back as "23h").
    global_eviction_policy = {
        "kind": "LayerAccessThreshold",
        "period": "20s",
        "threshold": "23h",
    }

    # --- A misspelled / unknown config key must be rejected ---
    bogus_key = "some_invalid_setting_name_blah_blah_123"
    try:
        env.create_tenant(conf={bogus_key: "20000"})
    except Exception as err:
        assert bogus_key in str(err)
    else:
        raise AssertionError("Expected validation error")

    # --- Create a tenant that carries its own overrides ---
    creation_conf = {
        "checkpoint_distance": "20000",
        "gc_period": "30sec",
        "evictions_low_residence_duration_metric_threshold": "42s",
        "eviction_policy": json.dumps({"kind": "NoEviction"}),
    }
    tenant_id, _ = env.create_tenant(conf=creation_conf)

    env.create_timeline("test_tenant_conf", tenant_id=tenant_id)
    env.endpoints.create_start("test_tenant_conf", "main", tenant_id)

    # --- The initial tenant has no overrides and must mirror the global config ---
    initial_cfg = ps_http.tenant_config(tenant_id=env.initial_tenant)
    assert not initial_cfg.tenant_specific_overrides, (
        "Should have no specific settings yet"
    )
    expected_initial = {
        "checkpoint_distance": 10000,
        "compaction_target_size": 1048576,
        "compaction_period": "20s",
        "compaction_threshold": 10,
        "gc_horizon": 67108864,
        "gc_period": "1h",
        "image_creation_threshold": 3,
        "pitr_interval": "7days",
        "evictions_low_residence_duration_metric_threshold": "2days",
        "eviction_policy": global_eviction_policy,
    }
    assert pick(initial_cfg.effective_config, expected_initial) == expected_initial

    # --- The new tenant reports its creation-time overrides ---
    created_cfg = ps_http.tenant_config(tenant_id=tenant_id)
    created_overrides = created_cfg.tenant_specific_overrides
    assert created_overrides["checkpoint_distance"] == 20000
    assert created_overrides["gc_period"] == "30s"
    assert len(created_overrides) == len(creation_conf), (
        f"No more specific properties were expected, but got: {created_overrides}"
    )

    created_effective = created_cfg.effective_config
    assert created_effective["checkpoint_distance"] == 20000, (
        "Specific 'checkpoint_distance' config should override the default value"
    )
    assert created_effective["gc_period"] == "30s", (
        "Specific 'gc_period' config should override the default value"
    )
    assert created_effective["evictions_low_residence_duration_metric_threshold"] == "42s", (
        "Should override default value"
    )
    assert created_effective["eviction_policy"] == {"kind": "NoEviction"}, (
        "Specific 'eviction_policy' config should override the default value"
    )
    # Everything that was not overridden still comes from the global config.
    expected_created_rest = {
        "compaction_target_size": 1048576,
        "compaction_period": "20s",
        "compaction_threshold": 10,
        "gc_horizon": 67108864,
        "image_creation_threshold": 3,
        "pitr_interval": "7days",
    }
    assert pick(created_effective, expected_created_rest) == expected_created_rest

    # --- Update the overrides and check that the change is visible ---
    update_conf = {
        "checkpoint_distance": "15000",
        "gc_period": "80sec",
        "compaction_period": "80sec",
        "image_creation_threshold": "2",
        "evictions_low_residence_duration_metric_threshold": "23h",
        "eviction_policy": json.dumps(
            {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"}
        ),
        "max_lsn_wal_lag": "13000000",
    }
    env.config_tenant(tenant_id=tenant_id, conf=update_conf)

    updated_cfg = ps_http.tenant_config(tenant_id=tenant_id)
    updated_overrides = updated_cfg.tenant_specific_overrides
    assert updated_overrides["checkpoint_distance"] == 15000
    assert updated_overrides["gc_period"] == "1m 20s"
    assert updated_overrides["compaction_period"] == "1m 20s"
    assert len(updated_overrides) == len(update_conf), (
        f"No more specific properties were expected, but got: {updated_overrides}"
    )

    updated_effective = updated_cfg.effective_config
    assert updated_effective["checkpoint_distance"] == 15000, (
        "Specific 'checkpoint_distance' config should override the default value"
    )
    assert updated_effective["gc_period"] == "1m 20s", (
        "Specific 'gc_period' config should override the default value"
    )
    assert updated_effective["compaction_period"] == "1m 20s", (
        "Specific 'compaction_period' config should override the default value"
    )
    assert updated_effective["evictions_low_residence_duration_metric_threshold"] == "23h", (
        "Should override default value"
    )
    assert updated_effective["eviction_policy"] == {
        "kind": "LayerAccessThreshold",
        "period": "1m 20s",
        "threshold": "1day 18h",
    }, "Specific 'eviction_policy' config should override the default value"
    expected_updated_rest = {
        "compaction_target_size": 1048576,
        "compaction_threshold": 10,
        "gc_horizon": 67108864,
        "image_creation_threshold": 2,
        "pitr_interval": "7days",
        "max_lsn_wal_lag": 13000000,
    }
    assert pick(updated_effective, expected_updated_rest) == expected_updated_rest

    # --- The updated config must survive a pageserver restart ---
    env.pageserver.stop()
    env.pageserver.start()

    cfg_after_restart = ps_http.tenant_config(tenant_id=tenant_id)
    assert cfg_after_restart == updated_cfg, (
        "Updated config should not change after the restart"
    )

    # --- Replace the overrides with a very short config: nothing from the
    # previous, longer config may be left behind ---
    final_conf = {"pitr_interval": "1 min"}
    env.config_tenant(tenant_id=tenant_id, conf=final_conf)

    final_cfg = ps_http.tenant_config(tenant_id=tenant_id)
    final_overrides = final_cfg.tenant_specific_overrides
    assert final_overrides["pitr_interval"] == "1m"
    assert len(final_overrides) == len(final_conf), (
        f"No more specific properties were expected, but got: {final_overrides}"
    )

    final_effective = final_cfg.effective_config
    assert final_effective["pitr_interval"] == "1m", (
        "Specific 'pitr_interval' config should override the default value"
    )
    # All previously overridden settings are expected back at their global values.
    expected_final_rest = {
        "checkpoint_distance": 10000,
        "compaction_target_size": 1048576,
        "compaction_period": "20s",
        "compaction_threshold": 10,
        "gc_horizon": 67108864,
        "gc_period": "1h",
        "image_creation_threshold": 3,
        "evictions_low_residence_duration_metric_threshold": "2days",
        "eviction_policy": global_eviction_policy,
        "max_lsn_wal_lag": 1024 * 1024 * 1024,
    }
    assert pick(final_effective, expected_final_rest) == expected_final_rest

    # --- The shortened config must survive a pageserver restart too ---
    env.pageserver.stop()
    env.pageserver.start()

    final_cfg_after_restart = ps_http.tenant_config(tenant_id=tenant_id)
    assert final_cfg_after_restart == final_cfg, (
        "Updated config should not change after the restart"
    )
|
|
|
|
|
|
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
    """
    Check that a tenant config file can be written for a tenant that was
    detached and attached again.

    The tenant starts without overrides; detach is expected to remove the
    on-disk config file, and configuring the tenant after re-attach must
    produce a readable file again.
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    env = neon_env_builder.init_start()
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

    # The tenant is created with defaults only, i.e. without any config overrides.
    tenant_id, timeline_id = env.create_tenant()
    config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1"

    ps_http = env.pageserver.http_client()

    last_record_lsn = Lsn(ps_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
    assert last_record_lsn.lsn_int != 0, "initdb must have executed"

    # Wait for the upload up to that LSN before detaching.
    wait_for_upload(ps_http, tenant_id, timeline_id, last_record_lsn)

    ps_http.tenant_detach(tenant_id)

    assert not config_path.exists(), "detach did not remove config file"

    def tenant_is_active():
        return assert_tenant_state(ps_http, tenant_id, "Active")

    env.pageserver.tenant_attach(tenant_id)
    wait_until(tenant_is_active)

    def set_gc_horizon_and_read_file(value: str) -> str:
        # Apply a gc_horizon override, then return what is in the config file.
        env.config_tenant(tenant_id, {"gc_horizon": value})
        return config_path.read_text()

    contents_first = set_gc_horizon_and_read_file("1000000")
    contents_later = set_gc_horizon_and_read_file("0")

    # Whether the setting takes effect is covered by another test case; here we
    # only care that the file could be created.
    assert len(contents_first) > len(contents_later)
|
|
|
|
|
|
def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
    neon_env_builder: NeonEnvBuilder,
):
    """
    Reconfigure `evictions_low_residence_duration_metric_threshold` on a live
    tenant and check the `pageserver_evictions_with_low_residence_duration_total`
    metric after each change: its `low_threshold_secs` label and its value.
    """
    threshold_key = "evictions_low_residence_duration_metric_threshold"
    hour_secs = 60 * 60
    day_secs = 24 * hour_secs

    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    # Compaction is disabled so that it will not download the layer for repartitioning.
    env = neon_env_builder.init_start(initial_tenant_conf={"compaction_period": "0s"})
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    ps_http = env.pageserver.http_client()

    # The workload generates getpage requests that touch some layers between
    # download and eviction; otherwise the pageserver doesn't report totally
    # unused layers as problems when they have short residence duration.
    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
    workload.write_rows(100)

    def read_metric():
        # Fetch the current sample of the low-residence eviction counter for this timeline.
        return ps_http.get_metrics().query_one(
            "pageserver_evictions_with_low_residence_duration_total",
            {
                "tenant_id": str(tenant_id),
                "timeline_id": str(timeline_id),
            },
        )

    def threshold_label_secs(sample) -> int:
        return int(sample.labels["low_threshold_secs"])

    def download_touch_and_evict():
        # Bring all layers in, read through them, then evict them again.
        ps_http.download_all_layers(tenant_id, timeline_id)
        workload.validate()
        ps_http.evict_all_layers(tenant_id, timeline_id)

    def set_threshold(value: str):
        env.config_tenant(tenant_id, {threshold_key: value})

    # --- Default threshold: the metric exists and starts at zero ---
    default_value = ps_http.tenant_config(tenant_id).effective_config[threshold_key]
    metric = read_metric()
    assert int(metric.value) == 0, "metric is present with default value"

    assert default_value == "1day"

    download_touch_and_evict()
    metric = read_metric()
    assert int(metric.value) > 0, "metric is updated"

    # --- Setting the very same threshold must not disturb the counter ---
    set_threshold(default_value)
    updated_metric = read_metric()
    assert int(updated_metric.value) == int(metric.value), (
        "metric is unchanged when setting same value"
    )

    # --- A different threshold changes the label and the value is back at zero ---
    set_threshold("2day")
    metric = read_metric()
    assert threshold_label_secs(metric) == 2 * day_secs
    assert int(metric.value) == 0

    download_touch_and_evict()
    metric = read_metric()
    assert threshold_label_secs(metric) == 2 * day_secs
    assert int(metric.value) > 0

    set_threshold("2h")
    metric = read_metric()
    assert threshold_label_secs(metric) == 2 * hour_secs
    assert int(metric.value) == 0, "value resets if label changes"

    download_touch_and_evict()
    metric = read_metric()
    assert threshold_label_secs(metric) == 2 * hour_secs
    assert int(metric.value) > 0, "set a non-zero value for next step"

    # --- Dropping all overrides brings back the default label and a zero value ---
    env.config_tenant(tenant_id, {})
    metric = read_metric()
    assert threshold_label_secs(metric) == day_secs, "label resets to default"
    assert int(metric.value) == 0, "value resets to default"
|
|
|
|
|
|
@run_only_on_default_postgres("Test does not start a compute")
@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
    """
    Test tenant config patching (i.e. additive updates)

    Depending on who manages the pageserver, the request takes a different path:
    1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
       location_config calls containing the update to the pageserver
    2. Cplane managed: /v1/tenant/config is called directly on the pageserver
    """

    def without_nones(conf):
        return {key: value for key, value in conf.items() if value is not None}

    def assert_same_overrides(lhs, rhs):
        """
        Semantic comparison of two tenant config overrides: None values are ignored.
        """
        assert without_nones(lhs) == without_nones(rhs)

    env = neon_env_builder.init_start()
    tenant_id = env.initial_tenant

    if ps_managed_by == "cplane":
        # Stop storcon from sending location_configs to the pageserver;
        # they would overwrite the tenant configs set manually below.
        env.storage_controller.reconcile_until_idle()
        env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
        env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")

        api = env.pageserver.http_client()
    elif ps_managed_by == "storcon":
        api = env.storage_controller.pageserver_api()
    else:
        raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")

    def overrides_seen_by(client):
        return client.tenant_config(tenant_id).tenant_specific_overrides

    def assert_propagated_to_pageserver(expected):
        # Only relevant when going through storcon: the pageserver itself must
        # report the same overrides as the API that was called.
        if ps_managed_by == "storcon":
            assert_same_overrides(overrides_seen_by(env.pageserver.http_client()), expected)

    crnt_tenant_conf = overrides_seen_by(api)

    # First patch: adds two overrides on top of whatever is already there.
    patch: dict[str, Any | None] = {
        "gc_period": "3h",
        "gc_compaction_ratio_percent": 10,
    }
    api.patch_tenant_config(tenant_id, patch)
    tenant_conf_after_patch = overrides_seen_by(api)
    assert_propagated_to_pageserver(tenant_conf_after_patch)
    assert_same_overrides(tenant_conf_after_patch, crnt_tenant_conf | patch)
    crnt_tenant_conf = tenant_conf_after_patch

    # Second patch: changes one override and passes None for the other one.
    patch = {"gc_period": "5h", "gc_compaction_ratio_percent": None}
    api.patch_tenant_config(tenant_id, patch)
    tenant_conf_after_patch = overrides_seen_by(api)
    assert_propagated_to_pageserver(tenant_conf_after_patch)
    assert_same_overrides(tenant_conf_after_patch, crnt_tenant_conf | patch)
    crnt_tenant_conf = tenant_conf_after_patch

    # A full set (as opposed to a patch) is expected to leave exactly what was sent.
    put = {"pitr_interval": "1m 1s"}
    api.set_tenant_config(tenant_id, put)
    tenant_conf_after_put = overrides_seen_by(api)
    assert_propagated_to_pageserver(tenant_conf_after_put)
    assert_same_overrides(tenant_conf_after_put, put)
    crnt_tenant_conf = tenant_conf_after_put
|