mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-24 16:40:38 +00:00
NeonEnv.from_repo_dir: use storage_controller_db instead of attachments.json (#8382)
When `NeonEnv.from_repo_dir` was introduced, storage controller stored its state exclusively `attachments.json`. Since then, it has moved to using Postgres, which stores its state in `storage_controller_db`. But `NeonEnv.from_repo_dir` wasn't adjusted to do this. This PR rectifies the situation. Context for this is failures in `test_pageserver_characterize_throughput_with_n_tenants` CF: https://neondb.slack.com/archives/C033RQ5SPDH/p1721035799502239?thread_ts=1720901332.293769&cid=C033RQ5SPDH Notably, `from_repo_dir` is also used by the backwards- and forwards-compatibility. Thus, the changes in this PR affect those tests as well. However, it turns out that the compatibility snapshot already contains the `storage_controller_db`. Thus, it should just work and in fact we can remove hacks like `fixup_storage_controller`. Follow-ups created as part of this work: * https://github.com/neondatabase/neon/issues/8399 * https://github.com/neondatabase/neon/issues/8400
This commit is contained in:
committed by
GitHub
parent
1303d47778
commit
a2d170b6d0
@@ -31,6 +31,7 @@ import backoff
|
||||
import httpx
|
||||
import jwt
|
||||
import psycopg2
|
||||
import psycopg2.sql
|
||||
import pytest
|
||||
import requests
|
||||
import toml
|
||||
@@ -727,8 +728,30 @@ class NeonEnvBuilder:
|
||||
self.repo_dir / "local_fs_remote_storage",
|
||||
)
|
||||
|
||||
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
|
||||
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
|
||||
# restore storage controller (the db is small, don't bother with overlayfs)
|
||||
storcon_db_from_dir = repo_dir / "storage_controller_db"
|
||||
storcon_db_to_dir = self.repo_dir / "storage_controller_db"
|
||||
log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}")
|
||||
assert storcon_db_from_dir.is_dir()
|
||||
assert not storcon_db_to_dir.exists()
|
||||
|
||||
def ignore_postgres_log(path: str, _names):
|
||||
if Path(path) == storcon_db_from_dir:
|
||||
return {"postgres.log"}
|
||||
return set()
|
||||
|
||||
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
|
||||
assert not (storcon_db_to_dir / "postgres.log").exists()
|
||||
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
|
||||
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
|
||||
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
|
||||
# So, from_repo_dir patches up the the storcon database.
|
||||
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
|
||||
assert not patch_script_path.exists()
|
||||
patch_script = ""
|
||||
for ps in self.env.pageservers:
|
||||
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
|
||||
patch_script_path.write_text(patch_script)
|
||||
|
||||
# Update the config with info about tenants and timelines
|
||||
with (self.repo_dir / "config").open("r") as f:
|
||||
|
||||
@@ -255,11 +255,3 @@ def run_pagebench_benchmark(
|
||||
unit="ms",
|
||||
report=MetricReport.LOWER_IS_BETTER,
|
||||
)
|
||||
|
||||
env.storage_controller.allowed_errors.append(
|
||||
# The test setup swaps NeonEnv instances, hence different
|
||||
# pg instances are used for the storage controller db. This means
|
||||
# the storage controller doesn't know about the nodes mentioned
|
||||
# in attachments.json at start-up.
|
||||
".* Scheduler missing node 1",
|
||||
)
|
||||
|
||||
@@ -93,29 +93,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
|
||||
)
|
||||
|
||||
|
||||
def fixup_storage_controller(env: NeonEnv):
|
||||
"""
|
||||
After importing a repo_dir, we need to massage the storage controller's state a bit: it will have
|
||||
initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled
|
||||
anywhere.
|
||||
|
||||
After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get
|
||||
the storage controller into a good state.
|
||||
|
||||
This function should go away once compat tests carry the controller database in their snapshots, so
|
||||
that the controller properly remembers nodes between creating + restoring the snapshot.
|
||||
"""
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
".*Tenant shard .+ references non-existent node.*",
|
||||
".*Failed to schedule tenant .+ at startup.*",
|
||||
]
|
||||
)
|
||||
env.storage_controller.stop()
|
||||
env.storage_controller.start()
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
|
||||
@pytest.mark.xdist_group("compatibility")
|
||||
@pytest.mark.order(before="test_forward_compatibility")
|
||||
def test_create_snapshot(
|
||||
@@ -198,7 +175,6 @@ def test_backward_compatibility(
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
|
||||
neon_env_builder.start()
|
||||
fixup_storage_controller(env)
|
||||
|
||||
check_neon_works(
|
||||
env,
|
||||
@@ -287,7 +263,6 @@ def test_forward_compatibility(
|
||||
assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
|
||||
|
||||
neon_env_builder.start()
|
||||
fixup_storage_controller(env)
|
||||
|
||||
# ensure the specified pageserver is running
|
||||
assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
|
||||
|
||||
Reference in New Issue
Block a user