NeonEnv.from_repo_dir: use storage_controller_db instead of attachments.json (#8382)

When `NeonEnv.from_repo_dir` was introduced, storage controller stored
its
state exclusively `attachments.json`.
Since then, it has moved to using Postgres, which stores its state in
`storage_controller_db`.

But `NeonEnv.from_repo_dir` wasn't adjusted to do this.
This PR rectifies the situation.

Context for this is failures in
`test_pageserver_characterize_throughput_with_n_tenants`
CF:
https://neondb.slack.com/archives/C033RQ5SPDH/p1721035799502239?thread_ts=1720901332.293769&cid=C033RQ5SPDH

Notably, `from_repo_dir` is also used by the backwards- and
forwards-compatibility.
Thus, the changes in this PR affect those tests as well.
However, it turns out that the compatibility snapshot already contains
the `storage_controller_db`.
Thus, it should just work and in fact we can remove hacks like
`fixup_storage_controller`.

Follow-ups created as part of this work:
* https://github.com/neondatabase/neon/issues/8399
* https://github.com/neondatabase/neon/issues/8400
This commit is contained in:
Christian Schwarz
2024-07-18 10:56:07 +02:00
committed by GitHub
parent 1303d47778
commit a2d170b6d0
9 changed files with 133 additions and 162 deletions

View File

@@ -31,6 +31,7 @@ import backoff
import httpx
import jwt
import psycopg2
import psycopg2.sql
import pytest
import requests
import toml
@@ -727,8 +728,30 @@ class NeonEnvBuilder:
self.repo_dir / "local_fs_remote_storage",
)
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
# restore storage controller (the db is small, don't bother with overlayfs)
storcon_db_from_dir = repo_dir / "storage_controller_db"
storcon_db_to_dir = self.repo_dir / "storage_controller_db"
log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}")
assert storcon_db_from_dir.is_dir()
assert not storcon_db_to_dir.exists()
def ignore_postgres_log(path: str, _names):
if Path(path) == storcon_db_from_dir:
return {"postgres.log"}
return set()
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
assert not (storcon_db_to_dir / "postgres.log").exists()
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
for ps in self.env.pageservers:
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
patch_script_path.write_text(patch_script)
# Update the config with info about tenants and timelines
with (self.repo_dir / "config").open("r") as f:

View File

@@ -255,11 +255,3 @@ def run_pagebench_benchmark(
unit="ms",
report=MetricReport.LOWER_IS_BETTER,
)
env.storage_controller.allowed_errors.append(
# The test setup swaps NeonEnv instances, hence different
# pg instances are used for the storage controller db. This means
# the storage controller doesn't know about the nodes mentioned
# in attachments.json at start-up.
".* Scheduler missing node 1",
)

View File

@@ -93,29 +93,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
)
def fixup_storage_controller(env: NeonEnv):
"""
After importing a repo_dir, we need to massage the storage controller's state a bit: it will have
initially started up with no nodes, but some tenants, and thereby those tenants won't be scheduled
anywhere.
After NeonEnv.start() is done (i.e. nodes are started + registered), call this function to get
the storage controller into a good state.
This function should go away once compat tests carry the controller database in their snapshots, so
that the controller properly remembers nodes between creating + restoring the snapshot.
"""
env.storage_controller.allowed_errors.extend(
[
".*Tenant shard .+ references non-existent node.*",
".*Failed to schedule tenant .+ at startup.*",
]
)
env.storage_controller.stop()
env.storage_controller.start()
env.storage_controller.reconcile_until_idle()
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
def test_create_snapshot(
@@ -198,7 +175,6 @@ def test_backward_compatibility(
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
neon_env_builder.start()
fixup_storage_controller(env)
check_neon_works(
env,
@@ -287,7 +263,6 @@ def test_forward_compatibility(
assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
neon_env_builder.start()
fixup_storage_controller(env)
# ensure the specified pageserver is running
assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)