mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 13:02:55 +00:00
## Problem In many places in test code, paths are built manually from what NeonEnv.tenant_dir and NeonEnv.timeline_dir could do. ## Summary of changes 1. NeonEnv.tenant_dir and NeonEnv.timeline_dir moved under class NeonPageserver as the path they use is per-pageserver instance. 2. Used these everywhere to replace manual path building Closes #5258 --------- Signed-off-by: Rahul Modpur <rmodpur2@gmail.com>
733 lines
27 KiB
Python
733 lines
27 KiB
Python
import os
|
|
import shutil
|
|
import threading
|
|
import time
|
|
from contextlib import closing, contextmanager
|
|
from pathlib import Path
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
import pytest
|
|
from fixtures.broker import NeonBroker
|
|
from fixtures.log_helper import log
|
|
from fixtures.neon_fixtures import (
|
|
Endpoint,
|
|
NeonEnv,
|
|
NeonEnvBuilder,
|
|
)
|
|
from fixtures.pageserver.http import PageserverHttpClient
|
|
from fixtures.pageserver.utils import (
|
|
assert_tenant_state,
|
|
wait_for_last_record_lsn,
|
|
wait_for_upload,
|
|
wait_tenant_status_404,
|
|
)
|
|
from fixtures.port_distributor import PortDistributor
|
|
from fixtures.remote_storage import (
|
|
LocalFsStorage,
|
|
RemoteStorageKind,
|
|
available_remote_storages,
|
|
)
|
|
from fixtures.types import Lsn, TenantId, TimelineId
|
|
from fixtures.utils import (
|
|
query_scalar,
|
|
start_in_background,
|
|
subprocess_capture,
|
|
wait_until,
|
|
)
|
|
|
|
|
|
def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
|
|
assert abs(a - b) / a < margin_ratio, abs(a - b) / a
|
|
|
|
|
|
@contextmanager
|
|
def new_pageserver_service(
|
|
new_pageserver_dir: Path,
|
|
pageserver_bin: Path,
|
|
remote_storage_mock_path: Path,
|
|
pg_port: int,
|
|
http_port: int,
|
|
broker: Optional[NeonBroker],
|
|
pg_distrib_dir: Path,
|
|
):
|
|
"""
|
|
cannot use NeonPageserver yet because it depends on neon cli
|
|
which currently lacks support for multiple pageservers
|
|
"""
|
|
# actually run new pageserver
|
|
cmd = [
|
|
str(pageserver_bin),
|
|
"--workdir",
|
|
str(new_pageserver_dir),
|
|
"--update-config",
|
|
f"-c listen_pg_addr='localhost:{pg_port}'",
|
|
f"-c listen_http_addr='localhost:{http_port}'",
|
|
f"-c pg_distrib_dir='{pg_distrib_dir}'",
|
|
"-c id=2",
|
|
f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}",
|
|
]
|
|
if broker is not None:
|
|
cmd.append(
|
|
f"-c broker_endpoint='{broker.client_url()}'",
|
|
)
|
|
pageserver_client = PageserverHttpClient(
|
|
port=http_port,
|
|
auth_token=None,
|
|
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled
|
|
)
|
|
try:
|
|
pageserver_process = start_in_background(
|
|
cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status
|
|
)
|
|
except Exception as e:
|
|
log.error(e)
|
|
pageserver_process.kill()
|
|
raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") from e
|
|
|
|
log.info("new pageserver started")
|
|
try:
|
|
yield pageserver_process
|
|
finally:
|
|
log.info("stopping new pageserver")
|
|
pageserver_process.kill()
|
|
|
|
|
|
@contextmanager
|
|
def pg_cur(endpoint):
|
|
with closing(endpoint.connect()) as conn:
|
|
with conn.cursor() as cur:
|
|
yield cur
|
|
|
|
|
|
def load(endpoint: Endpoint, stop_event: threading.Event, load_ok_event: threading.Event):
|
|
log.info("load started")
|
|
|
|
inserted_ctr = 0
|
|
failed = False
|
|
while not stop_event.is_set():
|
|
try:
|
|
with pg_cur(endpoint) as cur:
|
|
cur.execute("INSERT INTO load VALUES ('some payload')")
|
|
inserted_ctr += 1
|
|
except: # noqa: E722
|
|
if not failed:
|
|
log.info("load failed")
|
|
failed = True
|
|
load_ok_event.clear()
|
|
else:
|
|
if failed:
|
|
with pg_cur(endpoint) as cur:
|
|
# if we recovered after failure verify that we have correct number of rows
|
|
log.info("recovering at %s", inserted_ctr)
|
|
cur.execute("SELECT count(*) FROM load")
|
|
# it seems that sometimes transaction gets committed before we can acknowledge
|
|
# the result, so sometimes selected value is larger by one than we expect
|
|
assert cur.fetchone()[0] - inserted_ctr <= 1
|
|
log.info("successfully recovered %s", inserted_ctr)
|
|
failed = False
|
|
load_ok_event.set()
|
|
log.info("load thread stopped")
|
|
|
|
|
|
def populate_branch(
|
|
endpoint: Endpoint,
|
|
tenant_id: TenantId,
|
|
ps_http: PageserverHttpClient,
|
|
create_table: bool,
|
|
expected_sum: Optional[int],
|
|
) -> Tuple[TimelineId, Lsn]:
|
|
# insert some data
|
|
with pg_cur(endpoint) as cur:
|
|
cur.execute("SHOW neon.timeline_id")
|
|
timeline_id = TimelineId(cur.fetchone()[0])
|
|
log.info("timeline to relocate %s", timeline_id)
|
|
|
|
log.info(
|
|
"pg_current_wal_flush_lsn(): %s",
|
|
Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")),
|
|
)
|
|
log.info(
|
|
"timeline detail %s",
|
|
ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id),
|
|
)
|
|
|
|
# we rely upon autocommit after each statement
|
|
# as waiting for acceptors happens there
|
|
if create_table:
|
|
cur.execute("CREATE TABLE t(key int, value text)")
|
|
cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'")
|
|
if expected_sum is not None:
|
|
cur.execute("SELECT sum(key) FROM t")
|
|
assert cur.fetchone() == (expected_sum,)
|
|
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
|
|
|
return timeline_id, current_lsn
|
|
|
|
|
|
def ensure_checkpoint(
|
|
pageserver_http: PageserverHttpClient,
|
|
tenant_id: TenantId,
|
|
timeline_id: TimelineId,
|
|
current_lsn: Lsn,
|
|
):
|
|
# run checkpoint manually to be sure that data landed in remote storage
|
|
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
|
|
|
# wait until pageserver successfully uploaded a checkpoint to remote storage
|
|
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
|
|
|
|
|
|
def check_timeline_attached(
|
|
new_pageserver_http_client: PageserverHttpClient,
|
|
tenant_id: TenantId,
|
|
timeline_id: TimelineId,
|
|
old_timeline_detail: Dict[str, Any],
|
|
old_current_lsn: Lsn,
|
|
):
|
|
# new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint
|
|
new_timeline_detail = new_pageserver_http_client.timeline_detail(tenant_id, timeline_id)
|
|
|
|
# when load is active these checks can break because lsns are not static
|
|
# so let's check with some margin
|
|
assert_abs_margin_ratio(
|
|
int(Lsn(new_timeline_detail["disk_consistent_lsn"])),
|
|
int(Lsn(old_timeline_detail["disk_consistent_lsn"])),
|
|
0.03,
|
|
)
|
|
|
|
assert_abs_margin_ratio(
|
|
int(Lsn(new_timeline_detail["disk_consistent_lsn"])), int(old_current_lsn), 0.03
|
|
)
|
|
|
|
|
|
def switch_pg_to_new_pageserver(
|
|
env: NeonEnv,
|
|
endpoint: Endpoint,
|
|
new_pageserver_port: int,
|
|
tenant_id: TenantId,
|
|
timeline_id: TimelineId,
|
|
) -> Path:
|
|
endpoint.stop()
|
|
|
|
pg_config_file_path = Path(endpoint.config_file_path())
|
|
pg_config_file_path.open("a").write(
|
|
f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'"
|
|
)
|
|
|
|
endpoint.start()
|
|
|
|
timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
|
|
files_before_detach = os.listdir(timeline_to_detach_local_path)
|
|
assert (
|
|
"metadata" in files_before_detach
|
|
), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\
|
|
but got: {files_before_detach}"
|
|
assert (
|
|
len(files_before_detach) >= 2
|
|
), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\
|
|
but got {files_before_detach}"
|
|
|
|
return timeline_to_detach_local_path
|
|
|
|
|
|
def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_local_path: Path):
|
|
with pg_cur(endpoint) as cur:
|
|
# check that data is still there
|
|
cur.execute("SELECT sum(key) FROM t")
|
|
assert cur.fetchone() == (sum_before_migration,)
|
|
# check that we can write new data
|
|
cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'")
|
|
cur.execute("SELECT sum(key) FROM t")
|
|
assert cur.fetchone() == (sum_before_migration + 1500500,)
|
|
|
|
assert not os.path.exists(
|
|
old_local_path
|
|
), f"After detach, local timeline dir {old_local_path} should be removed"
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"method",
|
|
[
|
|
# A minor migration involves no storage breaking changes.
|
|
# It is done by attaching the tenant to a new pageserver.
|
|
"minor",
|
|
# A major migration involves exporting a postgres datadir
|
|
# basebackup and importing it into the new pageserver.
|
|
# This kind of migration can tolerate breaking changes
|
|
# to storage format
|
|
"major",
|
|
],
|
|
)
|
|
@pytest.mark.parametrize("with_load", ["with_load", "without_load"])
|
|
def test_tenant_relocation(
|
|
neon_env_builder: NeonEnvBuilder,
|
|
port_distributor: PortDistributor,
|
|
test_output_dir: Path,
|
|
neon_binpath: Path,
|
|
base_dir: Path,
|
|
method: str,
|
|
with_load: str,
|
|
):
|
|
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
|
|
|
env = neon_env_builder.init_start()
|
|
|
|
tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
|
|
|
|
# FIXME: Is this expected?
|
|
env.pageserver.allowed_errors.append(
|
|
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
|
)
|
|
|
|
# Needed for detach polling.
|
|
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
|
|
|
|
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
|
remote_storage_mock_path = env.pageserver_remote_storage.root
|
|
|
|
# we use two branches to check that they are both relocated
|
|
# first branch is used for load, compute for second one is used to
|
|
# check that data is not lost
|
|
|
|
pageserver_http = env.pageserver.http_client()
|
|
|
|
_, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
|
|
log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)
|
|
|
|
env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id)
|
|
ep_main = env.endpoints.create_start(
|
|
branch_name="test_tenant_relocation_main", tenant_id=tenant_id
|
|
)
|
|
|
|
timeline_id_main, current_lsn_main = populate_branch(
|
|
ep_main,
|
|
tenant_id=tenant_id,
|
|
ps_http=pageserver_http,
|
|
create_table=True,
|
|
expected_sum=500500,
|
|
)
|
|
|
|
env.neon_cli.create_branch(
|
|
new_branch_name="test_tenant_relocation_second",
|
|
ancestor_branch_name="test_tenant_relocation_main",
|
|
ancestor_start_lsn=current_lsn_main,
|
|
tenant_id=tenant_id,
|
|
)
|
|
ep_second = env.endpoints.create_start(
|
|
branch_name="test_tenant_relocation_second", tenant_id=tenant_id
|
|
)
|
|
|
|
timeline_id_second, current_lsn_second = populate_branch(
|
|
ep_second,
|
|
tenant_id=tenant_id,
|
|
ps_http=pageserver_http,
|
|
create_table=False,
|
|
expected_sum=1001000,
|
|
)
|
|
|
|
# wait until pageserver receives that data
|
|
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main)
|
|
timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main)
|
|
|
|
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second)
|
|
timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second)
|
|
|
|
if with_load == "with_load":
|
|
# create load table
|
|
with pg_cur(ep_main) as cur:
|
|
cur.execute("CREATE TABLE load(value text)")
|
|
|
|
load_stop_event = threading.Event()
|
|
load_ok_event = threading.Event()
|
|
load_thread = threading.Thread(
|
|
target=load,
|
|
args=(ep_main, load_stop_event, load_ok_event),
|
|
daemon=True, # To make sure the child dies when the parent errors
|
|
)
|
|
load_thread.start()
|
|
|
|
# this requirement introduces a problem
|
|
# if user creates a branch during migration
|
|
# it wont appear on the new pageserver
|
|
ensure_checkpoint(
|
|
pageserver_http=pageserver_http,
|
|
tenant_id=tenant_id,
|
|
timeline_id=timeline_id_main,
|
|
current_lsn=current_lsn_main,
|
|
)
|
|
|
|
ensure_checkpoint(
|
|
pageserver_http=pageserver_http,
|
|
tenant_id=tenant_id,
|
|
timeline_id=timeline_id_second,
|
|
current_lsn=current_lsn_second,
|
|
)
|
|
|
|
log.info("inititalizing new pageserver")
|
|
# bootstrap second pageserver
|
|
new_pageserver_dir = env.repo_dir / "new_pageserver"
|
|
new_pageserver_dir.mkdir()
|
|
|
|
new_pageserver_pg_port = port_distributor.get_port()
|
|
new_pageserver_http_port = port_distributor.get_port()
|
|
log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
|
|
pageserver_bin = neon_binpath / "pageserver"
|
|
|
|
new_pageserver_http = PageserverHttpClient(
|
|
port=new_pageserver_http_port,
|
|
auth_token=None,
|
|
is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
|
|
)
|
|
|
|
with new_pageserver_service(
|
|
new_pageserver_dir,
|
|
pageserver_bin,
|
|
remote_storage_mock_path,
|
|
new_pageserver_pg_port,
|
|
new_pageserver_http_port,
|
|
neon_env_builder.broker,
|
|
neon_env_builder.pg_distrib_dir,
|
|
):
|
|
# Migrate either by attaching from s3 or import/export basebackup
|
|
if method == "major":
|
|
cmd = [
|
|
"poetry",
|
|
"run",
|
|
"python",
|
|
str(base_dir / "scripts/export_import_between_pageservers.py"),
|
|
"--tenant-id",
|
|
str(tenant_id),
|
|
"--from-host",
|
|
"localhost",
|
|
"--from-http-port",
|
|
str(pageserver_http.port),
|
|
"--from-pg-port",
|
|
str(env.pageserver.service_port.pg),
|
|
"--to-host",
|
|
"localhost",
|
|
"--to-http-port",
|
|
str(new_pageserver_http_port),
|
|
"--to-pg-port",
|
|
str(new_pageserver_pg_port),
|
|
"--pg-distrib-dir",
|
|
str(neon_env_builder.pg_distrib_dir),
|
|
"--work-dir",
|
|
str(test_output_dir),
|
|
"--tmp-pg-port",
|
|
str(port_distributor.get_port()),
|
|
]
|
|
subprocess_capture(test_output_dir, cmd, check=True)
|
|
elif method == "minor":
|
|
# call to attach timeline to new pageserver
|
|
new_pageserver_http.tenant_attach(tenant_id)
|
|
|
|
# wait for tenant to finish attaching
|
|
wait_until(
|
|
number_of_iterations=10,
|
|
interval=1,
|
|
func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"),
|
|
)
|
|
|
|
check_timeline_attached(
|
|
new_pageserver_http,
|
|
tenant_id,
|
|
timeline_id_main,
|
|
timeline_detail_main,
|
|
current_lsn_main,
|
|
)
|
|
|
|
check_timeline_attached(
|
|
new_pageserver_http,
|
|
tenant_id,
|
|
timeline_id_second,
|
|
timeline_detail_second,
|
|
current_lsn_second,
|
|
)
|
|
|
|
# rewrite neon cli config to use new pageserver for basebackup to start new compute
|
|
lines = (env.repo_dir / "config").read_text().splitlines()
|
|
for i, line in enumerate(lines):
|
|
if line.startswith("listen_http_addr"):
|
|
lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
|
|
if line.startswith("listen_pg_addr"):
|
|
lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
|
|
(env.repo_dir / "config").write_text("\n".join(lines))
|
|
|
|
old_local_path_main = switch_pg_to_new_pageserver(
|
|
env,
|
|
ep_main,
|
|
new_pageserver_pg_port,
|
|
tenant_id,
|
|
timeline_id_main,
|
|
)
|
|
|
|
old_local_path_second = switch_pg_to_new_pageserver(
|
|
env,
|
|
ep_second,
|
|
new_pageserver_pg_port,
|
|
tenant_id,
|
|
timeline_id_second,
|
|
)
|
|
|
|
# detach tenant from old pageserver before we check
|
|
# that all the data is there to be sure that old pageserver
|
|
# is no longer involved, and if it is, we will see the error
|
|
pageserver_http.tenant_detach(tenant_id)
|
|
|
|
# Wait a little, so that the detach operation has time to finish.
|
|
wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
|
|
|
|
post_migration_check(ep_main, 500500, old_local_path_main)
|
|
post_migration_check(ep_second, 1001000, old_local_path_second)
|
|
|
|
# ensure that we can successfully read all relations on the new pageserver
|
|
with pg_cur(ep_second) as cur:
|
|
cur.execute(
|
|
"""
|
|
DO $$
|
|
DECLARE
|
|
r RECORD;
|
|
BEGIN
|
|
FOR r IN
|
|
SELECT relname FROM pg_class WHERE relkind='r'
|
|
LOOP
|
|
RAISE NOTICE '%', r.relname;
|
|
EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
|
|
END LOOP;
|
|
END$$;
|
|
"""
|
|
)
|
|
|
|
if with_load == "with_load":
|
|
assert load_ok_event.wait(3)
|
|
log.info("stopping load thread")
|
|
load_stop_event.set()
|
|
load_thread.join(timeout=10)
|
|
log.info("load thread stopped")
|
|
|
|
# bring old pageserver back for clean shutdown via neon cli
|
|
# new pageserver will be shut down by the context manager
|
|
lines = (env.repo_dir / "config").read_text().splitlines()
|
|
for i, line in enumerate(lines):
|
|
if line.startswith("listen_http_addr"):
|
|
lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
|
|
if line.startswith("listen_pg_addr"):
|
|
lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
|
|
(env.repo_dir / "config").write_text("\n".join(lines))
|
|
|
|
|
|
# Simulate hard crash of pageserver and re-attach a tenant with a branch
|
|
#
|
|
# This exercises a race condition after tenant attach, where the
|
|
# branch point on the ancestor timeline is greater than the ancestor's
|
|
# last-record LSN. We had a bug where GetPage incorrectly followed the
|
|
# timeline to the ancestor without waiting for the missing WAL to
|
|
# arrive.
|
|
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
|
def test_emergency_relocate_with_branches_slow_replay(
|
|
neon_env_builder: NeonEnvBuilder,
|
|
remote_storage_kind: RemoteStorageKind,
|
|
):
|
|
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
|
|
|
env = neon_env_builder.init_start()
|
|
env.pageserver.is_testing_enabled_or_skip()
|
|
pageserver_http = env.pageserver.http_client()
|
|
|
|
# Prepare for the test:
|
|
#
|
|
# - Main branch, with a table and two inserts to it.
|
|
# - A logical replication message between the inserts, so that we can conveniently
|
|
# pause the WAL ingestion between the two inserts.
|
|
# - Child branch, created after the inserts
|
|
tenant_id, _ = env.neon_cli.create_tenant()
|
|
|
|
main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
|
with main_endpoint.cursor() as cur:
|
|
cur.execute("CREATE TABLE test_reattach (t text)")
|
|
cur.execute("INSERT INTO test_reattach VALUES ('before pause')")
|
|
|
|
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
|
|
|
|
cur.execute("INSERT INTO test_reattach VALUES ('after pause')")
|
|
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
|
|
|
main_endpoint.stop()
|
|
env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
|
|
|
|
# Now kill the pageserver, remove the tenant directory, and restart. This simulates
|
|
# the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate
|
|
# the tenant to a different pageserver. We reuse the same pageserver because it's
|
|
# simpler than initializing a new one from scratch, but the effect on the single tenant
|
|
# is the same.
|
|
env.pageserver.stop(immediate=True)
|
|
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
|
|
env.pageserver.start()
|
|
|
|
# This fail point will pause the WAL ingestion on the main branch, after the
|
|
# the first insert
|
|
pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
|
|
|
|
# Attach and wait a few seconds to give it time to load the tenants, attach to the
|
|
# safekeepers, and to stream and ingest the WAL up to the pause-point.
|
|
before_attach_time = time.time()
|
|
pageserver_http.tenant_attach(tenant_id)
|
|
time.sleep(3)
|
|
|
|
# The wal ingestion on the main timeline should now be paused at the fail point.
|
|
# Run a query on the child branch. The GetPage requests for this should recurse to the
|
|
# parent timeline, and wait for the WAL to be ingested there. Otherwise it won't see
|
|
# the second insert.
|
|
child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
|
|
with child_endpoint.cursor() as cur:
|
|
cur.execute("SELECT * FROM test_reattach")
|
|
assert cur.fetchall() == [("before pause",), ("after pause",)]
|
|
|
|
# Sanity check that the failpoint was reached
|
|
assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
|
|
assert time.time() - before_attach_time > 5
|
|
|
|
# Clean up
|
|
pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
|
|
|
|
|
|
# Simulate hard crash of pageserver and re-attach a tenant with a branch
|
|
#
|
|
# This exercises the same race condition after as
|
|
# 'test_emergency_relocate_with_branches_slow_replay', but this test case
|
|
# is closer to the original scenario where we originally found the
|
|
# issue.
|
|
#
|
|
# In this scenario, the incorrect result to get-request leads to
|
|
# *permanent damage* in the child timeline, because ingesting the WAL
|
|
# on the child timeline depended on incorrect view of the parent. This
|
|
# test reproduced one such case; the symptom was an error on the child, when
|
|
# trying to connect to the child endpoint after re-attaching the tenant:
|
|
#
|
|
# FATAL: database "neondb" does not exist
|
|
#
|
|
# In the original case where we bumped into this, the error was slightly
|
|
# different:
|
|
#
|
|
# FATAL: "base/16385" is not a valid data directory
|
|
# DETAIL: File "base/16385/PG_VERSION" is missing.
|
|
#
|
|
### Detailed explanation of the original bug and why it lead to that error:
|
|
#
|
|
# The WAL on the main and the child branches look like this:
|
|
#
|
|
# Main Child
|
|
# 1. CREATE DATABASE
|
|
# <child branch is created>
|
|
# 2. CREATE TABLE AS SELECT ... 3. CREATE TABLE AS SELECT ...
|
|
#
|
|
# None of these WAL records have been flushed to disk or uploaded to remote
|
|
# storage in the pageserver yet, when the tenant is detached.
|
|
#
|
|
# After detach and re-attach, a walreceiver is spawned on both timelines.
|
|
# They will connect to the safekeepers and start ingesting the WAL
|
|
# from their respective IndexParts' `disk_consistent_lsn` onward.
|
|
#
|
|
# The bug occurs if the child branch's walreceiver runs before the
|
|
# main's. It receives the SMGR_CREATE WAL record emitted by the
|
|
# CREATE TABLE statement (3.), and applies it, without seeing the
|
|
# effect of the CREATE DATABASE statement.
|
|
#
|
|
# To understand why that leads to a 'File "base/16385/PG_VERSION" is
|
|
# missing' error, let's look at what the handlers for the WAL records
|
|
# do:
|
|
#
|
|
# CREATE DATABASE WAL record is handled by ingest_xlog_dbase_create:
|
|
#
|
|
# ingest_xlog_dbase_create:
|
|
# put_relmap_file()
|
|
# // NOTE 'true': It means that there is a relmapper and PG_VERSION file
|
|
# 1: let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
|
|
#
|
|
#
|
|
# CREATE TABLE emits an SMGR_CREATE record, which is handled by:
|
|
#
|
|
# ingest_xlog_smgr_create:
|
|
# put_rel_creation:
|
|
# ...
|
|
# let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
|
|
# 2: // Didn't exist. Update dbdir
|
|
# dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
|
# let buf = DbDirectory::ser(&dbdir)?;
|
|
# self.put(DBDIR_KEY, Value::Image(buf.into()));
|
|
#
|
|
# // and create the RelDirectory
|
|
# RelDirectory::default()
|
|
# } else {
|
|
# 3: // reldir already exists, fetch it
|
|
# RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
|
# };
|
|
#
|
|
#
|
|
# In the correct ordering, the SMGR_CREATE record is applied after the
|
|
# CREATE DATABASE record. The CREATE DATABASE creates the entry in the
|
|
# 'dbdir', with the 'true' flag that indicates that PG_VERSION exists
|
|
# (1). The SMGR_CREATE handler calls put_rel_creation, which finds the
|
|
# dbdir entry that the CREATE DATABASE record created, and takes the
|
|
# "reldir already exists, fetch it" else-branch at the if statement (3).
|
|
#
|
|
# In the incorrect ordering, the child walreceiver applies the
|
|
# SMGR_CREATE record without seeing the effects of the CREATE
|
|
# DATABASE. In that case, put_rel_creation takes the "Didn't
|
|
# exist. Update dbir" path (2), and inserts an entry in the
|
|
# DbDirectory with 'false' to indicate there is no PG_VERSION file.
|
|
#
|
|
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
|
def test_emergency_relocate_with_branches_createdb(
|
|
neon_env_builder: NeonEnvBuilder,
|
|
remote_storage_kind: RemoteStorageKind,
|
|
):
|
|
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
|
|
|
env = neon_env_builder.init_start()
|
|
pageserver_http = env.pageserver.http_client()
|
|
|
|
# create new nenant
|
|
tenant_id, _ = env.neon_cli.create_tenant()
|
|
|
|
main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
|
with main_endpoint.cursor() as cur:
|
|
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
|
|
|
|
cur.execute("CREATE DATABASE neondb")
|
|
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
|
env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
|
|
|
|
with main_endpoint.cursor(dbname="neondb") as cur:
|
|
cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)")
|
|
main_endpoint.stop()
|
|
|
|
child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
|
|
with child_endpoint.cursor(dbname="neondb") as cur:
|
|
cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)")
|
|
child_endpoint.stop()
|
|
|
|
# Kill the pageserver, remove the tenant directory, and restart
|
|
env.pageserver.stop(immediate=True)
|
|
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
|
|
env.pageserver.start()
|
|
|
|
# Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original
|
|
# bug reproduced easily even without this, as there is always some delay between
|
|
# loading the timeline and establishing the connection to the safekeeper to stream and
|
|
# ingest the WAL, but let's make this less dependent on accidental timing.
|
|
pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
|
|
before_attach_time = time.time()
|
|
pageserver_http.tenant_attach(tenant_id)
|
|
|
|
child_endpoint.start()
|
|
with child_endpoint.cursor(dbname="neondb") as cur:
|
|
assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
|
|
|
|
# Sanity check that the failpoint was reached
|
|
assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
|
|
assert time.time() - before_attach_time > 5
|
|
|
|
# Clean up
|
|
pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
|