test_suite: patch, don't replace, the tenant_config field, where appropriate (#7771)

Before this PR, the changed tests would overwrite the entire
`tenant_config` because `pageserver_config_override` is merged
non-recursively into the `ps_cfg`.

This meant they would override the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our
matrix build for `compaction_algorithm=Tiered|Legacy` in
https://github.com/neondatabase/neon/pull/7748.

I found the tests fixed in this PR using the
`NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that
I added in #7748. Therefore, I think this is an exhaustive fix. This is
better than just searching the code base for `tenant_config`, which is
what I had sketched in #7747.

refs #7749
This commit is contained in:
Christian Schwarz
2024-05-17 12:24:02 +02:00
committed by GitHub
parent 4b8809b280
commit 6d951e69d6
11 changed files with 70 additions and 46 deletions

View File

@@ -455,7 +455,7 @@ class NeonEnvBuilder:
test_overlay_dir: Optional[Path] = None,
pageserver_remote_storage: Optional[RemoteStorage] = None,
# toml that will be decomposed into `--config-override` flags during `pageserver --init`
pageserver_config_override: Optional[str] = None,
pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
num_safekeepers: int = 1,
num_pageservers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
@@ -1127,10 +1127,14 @@ class NeonEnv:
)
if config.pageserver_config_override is not None:
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value
if callable(config.pageserver_config_override):
config.pageserver_config_override(ps_cfg)
else:
assert isinstance(config.pageserver_config_override, str)
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value
# Create a corresponding NeonPageserver object
self.pageservers.append(

View File

@@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_branch_behind(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
error_regexes = [
".*invalid branch start lsn.*",

View File

@@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
#
def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
endpoint = env.endpoints.create_start("test_gc_aggressive")
@@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
#
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
endpoint = env.endpoints.create_start("test_gc_index_upload")

View File

@@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
env.neon_cli.create_branch("test_old_request_lsn", "main")
endpoint = env.endpoints.create_start("test_old_request_lsn")

View File

@@ -42,6 +42,8 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
"listen_http_addr",
"pg_auth_type",
"http_auth_type",
# TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
# "tenant_config",
]
required_config_overrides = [
f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys

View File

@@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
# Set pitr interval such that we need to keep the data
neon_env_builder.pageserver_config_override = (
"tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
env = neon_env_builder.init_start(
initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"}
)
env = neon_env_builder.init_start()
endpoint_main = env.endpoints.create_start("main")
main_pg_conn = endpoint_main.connect()

View File

@@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
# Override default checkpointer settings to run it more often
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "1048576",
}
)
env.pageserver.is_testing_enabled_or_skip()
# We expect the pageserver to exit, which will cause storage storage controller

View File

@@ -1,5 +1,6 @@
import json
from contextlib import closing
from typing import Any, Dict
import psycopg2.extras
from fixtures.common_types import Lsn
@@ -14,16 +15,22 @@ from fixtures.utils import wait_until
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
"""Test per tenant configuration"""
# set some non-default global config
neon_env_builder.pageserver_config_override = """
page_cache_size=444;
wait_lsn_timeout='111 s';
[tenant_config]
checkpoint_distance = 10000
compaction_target_size = 1048576
evictions_low_residence_duration_metric_threshold = "2 days"
eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
"""
def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]):
ps_cfg["page_cache_size"] = 444
ps_cfg["wait_lsn_timeout"] = "111 s"
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["checkpoint_distance"] = 10000
tenant_config["compaction_target_size"] = 1048576
tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
tenant_config["eviction_policy"] = {
"kind": "LayerAccessThreshold",
"period": "20s",
"threshold": "23 hours",
}
neon_env_builder.pageserver_config_override = set_some_nondefault_global_config
env = neon_env_builder.init_start()
# we configure eviction but no remote storage, there might be error lines

View File

@@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(
gc_horizon = 128 * 1024
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "0sec",
"gc_horizon": gc_horizon,
}
)
# FIXME: we have a race condition between GC and delete timeline. GC might fail with this
# error. Similar to https://github.com/neondatabase/neon/issues/2671

View File

@@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = (
"tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "100000",
"compaction_period": "10m",
}
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
@@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": "100000",
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "1s",
}
)
pageserver_http = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")

View File

@@ -1,4 +1,5 @@
import time
from typing import Any, Dict
from fixtures.common_types import Lsn, TenantId
from fixtures.log_helper import log
@@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = """
wait_lsn_timeout = "1s"
tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
"""
def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
ps_cfg["wait_lsn_timeout"] = "1s"
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["walreceiver_connect_timeout"] = "2s"
tenant_config["lagging_wal_timeout"] = "2s"
neon_env_builder.pageserver_config_override = customize_pageserver_toml
# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
neon_env_builder.safekeepers_id_start = 12345
neon_env_builder.num_safekeepers = 3