Merge remote-tracking branch 'upstream/main' into jcsp/deletion-queue

This commit is contained in:
John Spray
2023-08-22 10:09:23 +01:00
43 changed files with 1222 additions and 697 deletions

View File

@@ -427,6 +427,7 @@ class NeonEnvBuilder:
default_branch_name: str = DEFAULT_BRANCH_NAME,
preserve_database_files: bool = False,
initial_tenant: Optional[TenantId] = None,
initial_timeline: Optional[TimelineId] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -452,6 +453,7 @@ class NeonEnvBuilder:
self.pg_version = pg_version
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
def init_configs(self) -> NeonEnv:
# Cannot create more than one environment from one builder
@@ -473,9 +475,10 @@ class NeonEnvBuilder:
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=initial_tenant_conf
tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
)
env.initial_timeline = initial_timeline
assert env.initial_tenant == initial_tenant
assert env.initial_timeline == initial_timeline
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
return env
@@ -784,7 +787,7 @@ class NeonEnv:
# generate initial tenant ID here instead of letting 'neon init' generate it,
# so that we don't need to dig it out of the config file afterwards.
self.initial_tenant = config.initial_tenant
self.initial_timeline: Optional[TimelineId] = None
self.initial_timeline = config.initial_timeline
# Create a config file corresponding to the options
toml = textwrap.dedent(

View File

@@ -315,4 +315,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10
return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15

View File

@@ -1,4 +1,3 @@
import shutil
import time
from dataclasses import dataclass
from typing import Dict, Tuple
@@ -14,7 +13,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -138,22 +137,14 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
env = neon_env_builder.init_start()
# initial tenant will not be present on this pageserver
env = neon_env_builder.init_configs()
env.start()
pageserver_http = env.pageserver.http_client()
# allow because we are invoking this manually; we always warn on executing disk based eviction
env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
# remove the initial tenant
assert env.initial_timeline
pageserver_http.tenant_detach(env.initial_tenant)
assert isinstance(env.remote_storage, LocalFsStorage)
tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
assert tenant_remote_storage.is_dir()
shutil.rmtree(tenant_remote_storage)
env.initial_tenant = TenantId("0" * 32)
env.initial_timeline = None
# Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
# Large count of layers and small layer size is good for testing because it makes evictions predictable.
# Predictable in the sense that many layer evictions will be required to reach the eviction target, because

View File

@@ -11,8 +11,7 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
from fixtures.types import TimelineId
# Test configuration
#
@@ -71,13 +70,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_gc_aggressive", "main")
timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
endpoint = env.endpoints.create_start("test_gc_aggressive")
log.info("postgres is running on test_gc_aggressive branch")
with endpoint.cursor() as cur:
timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
# Create table, and insert the first 100 rows
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
@@ -109,7 +106,8 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
)
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_gc_index_upload", "main")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
endpoint = env.endpoints.create_start("test_gc_index_upload")
pageserver_http = env.pageserver.http_client()
@@ -117,9 +115,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""

View File

@@ -12,13 +12,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
# test anyway, so it doesn't need any special attention here.
@pytest.mark.timeout(600)
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
tenant_id, _ = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "10 s",
"gc_horizon": f"{1024 ** 2}",
"checkpoint_distance": f"{1024 ** 2}",
@@ -29,6 +24,11 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"image_creation_threshold": "2",
}
)
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
tenant_id = env.initial_tenant
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
connstr = endpoint.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
@@ -39,5 +39,4 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
with pytest.raises(subprocess.SubprocessError):
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
env.pageserver.stop()
env.pageserver.start()
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})

View File

@@ -74,9 +74,9 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid")
# Check layer file sizes
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id)
timeline_path = "{}/tenants/{}/timelines/{}/".format(
env.repo_dir, env.initial_tenant, env.initial_timeline
)
for filename in os.listdir(timeline_path):
if filename.startswith("00000"):
log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")

View File

@@ -8,7 +8,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn
from fixtures.utils import query_scalar
@@ -34,8 +34,8 @@ def test_basic_eviction(
client = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Create a number of layers in the tenant
with endpoint.cursor() as cur:

View File

@@ -18,8 +18,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
from fixtures.types import TenantId
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -115,15 +114,13 @@ def test_metric_collection(
# Order of fixtures shutdown is not specified, and if http server gets down
# before pageserver, pageserver log might contain such errors in the end.
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
env.neon_cli.create_branch("test_metric_collection")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_metric_collection")
endpoint = env.endpoints.create_start("test_metric_collection")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""

View File

@@ -78,8 +78,8 @@ def test_ondemand_download_large_rel(
client = env.pageserver.http_client()
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# We want to make sure that the data is large enough that the keyspace is partitioned.
num_rows = 1000000
@@ -183,8 +183,8 @@ def test_ondemand_download_timetravel(
client = env.pageserver.http_client()
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
lsns = []
@@ -342,8 +342,8 @@ def test_download_remote_layers_api(
client = env.pageserver.http_client()
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
table_len = 10000
with endpoint.cursor() as cur:
@@ -516,7 +516,6 @@ def test_compaction_downloads_on_demand_without_image_creation(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
with env.endpoints.create_start("main") as endpoint:
# no particular reason to create the layers like this, but we are sure
@@ -590,7 +589,6 @@ def test_compaction_downloads_on_demand_with_image_creation(
env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
pageserver_http = env.pageserver.http_client()

View File

@@ -2,7 +2,7 @@ from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn
from fixtures.utils import query_scalar
@@ -12,24 +12,21 @@ from fixtures.utils import query_scalar
# Additionally, tests that pageserver is able to create tenants with custom configs.
def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
env = neon_env_builder.init_start()
tenant, _ = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
"trace_read_requests": "true",
}
)
timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant)
endpoint = env.endpoints.create_start("test_trace_replay", "main", tenant)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main")
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table t (i integer);")
cur.execute(f"insert into t values (generate_series(1,{10000}));")
cur.execute("select count(*) from t;")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# wait until pageserver receives that data
pageserver_http = env.pageserver.http_client()
@@ -38,5 +35,5 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
# Stop postgres so we drop the connection and flush the traces
endpoint.stop()
trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline)
trace_path = env.repo_dir / "traces" / str(tenant_id) / str(timeline_id)
assert trace_path.exists()

View File

@@ -98,12 +98,12 @@ def test_remote_storage_backup_and_restore(
client = env.pageserver.http_client()
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# That's because of UnreliableWrapper's injected failures
env.pageserver.allowed_errors.append(
f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
f".*failed to fetch tenant deletion mark at tenants/{tenant_id}/deleted attempt 1.*"
)
checkpoint_numbers = range(1, 3)
@@ -458,8 +458,7 @@ def test_remote_timeline_client_calls_started_metric(
)
tenant_id = env.initial_tenant
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
timeline_id = env.initial_timeline
client = env.pageserver.http_client()
@@ -596,8 +595,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
}
)
tenant_id = env.initial_tenant
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
timeline_id = env.initial_timeline
timeline_path = env.timeline_dir(tenant_id, timeline_id)
@@ -862,8 +860,7 @@ def test_compaction_delete_before_upload(
)
tenant_id = env.initial_tenant
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
timeline_id = env.initial_timeline
client = env.pageserver.http_client()

View File

@@ -57,6 +57,11 @@ def test_tenant_delete_smoke(
]
)
# lucky race with stopping from flushing a layer we fail to schedule any uploads
env.pageserver.allowed_errors.append(
".*layer flush task.+: could not flush frozen layer: update_metadata_file"
)
ps_http = env.pageserver.http_client()
# first try to delete non existing tenant
@@ -309,9 +314,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
)
# TODO resume deletion (https://github.com/neondatabase/neon/issues/5006)
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_deleted_tenant_ignored_on_attach(
def test_tenant_delete_is_resumed_on_attach(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_bin: PgBin,
@@ -353,6 +357,8 @@ def test_deleted_tenant_ignored_on_attach(
(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
@@ -398,20 +404,17 @@ def test_deleted_tenant_ignored_on_attach(
env.pageserver.start()
# now we call attach
with pytest.raises(
PageserverApiException, match="Tenant is marked as deleted on remote storage"
):
ps_http.tenant_attach(tenant_id=tenant_id)
ps_http.tenant_attach(tenant_id=tenant_id)
# delete should be resumed (not yet)
# wait_tenant_status_404(ps_http, tenant_id, iterations)
# delete should be resumed
wait_tenant_status_404(ps_http, tenant_id, iterations)
# we shouldn't have created the tenant dir on disk
tenant_path = env.tenant_dir(tenant_id=tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(

View File

@@ -463,8 +463,8 @@ def test_detach_while_attaching(
client = env.pageserver.http_client()
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -615,8 +615,8 @@ def test_ignored_tenant_download_missing_layers(
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -679,10 +679,10 @@ def test_ignored_tenant_stays_broken_without_metadata(
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
env.endpoints.create_start("main")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -723,9 +723,9 @@ def test_load_attach_negatives(
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
env.endpoints.create_start("main")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
tenant_id = env.initial_tenant
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -773,8 +773,8 @@ def test_ignore_while_attaching(
pageserver_http = env.pageserver.http_client()
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.

View File

@@ -142,8 +142,8 @@ def test_tenants_attached_after_download(
client = env.pageserver.http_client()
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# That's because of UnreliableWrapper's injected failures
env.pageserver.allowed_errors.append(
@@ -252,8 +252,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');")

View File

@@ -10,7 +10,6 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import LayerMapInfo
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TimelineId
from pytest_httpserver import HTTPServer
# NB: basic config change tests are in test_tenant_conf.py
@@ -45,7 +44,6 @@ def test_threshold_based_eviction(
)
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
assert isinstance(timeline_id, TimelineId)
ps_http = env.pageserver.http_client()
assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {

View File

@@ -17,6 +17,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
MANY_SMALL_LAYERS_TENANT_CONFIG,
assert_prefix_empty,
assert_prefix_not_empty,
poll_for_remote_storage_iterations,
@@ -34,7 +35,7 @@ from fixtures.remote_storage import (
available_s3_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -208,7 +209,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
timeline_id = env.neon_cli.create_timeline("delete")
with env.endpoints.create_start("delete") as endpoint:
# generate enough layers
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
run_pg_bench_small(pg_bin, endpoint.connstr())
if remote_storage_kind is RemoteStorageKind.NOOP:
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
else:
@@ -358,8 +359,8 @@ def test_timeline_resurrection_on_attach(
ps_http = env.pageserver.http_client()
pg = env.endpoints.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
main_timeline_id = env.initial_timeline
with pg.cursor() as cur:
cur.execute("CREATE TABLE f (i integer);")
@@ -496,16 +497,6 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
interval=0.5,
)
try:
data = ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
log.debug(f"detail {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code != 404:
raise
else:
raise Exception("detail succeeded (it should return 404)")
assert (
not leaf_timeline_path.exists()
), "timeline load procedure should have resumed the deletion interrupted by the failpoint"
@@ -528,8 +519,6 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
),
)
assert env.initial_timeline is not None
for timeline_id in (intermediate_timeline_id, env.initial_timeline):
timeline_delete_wait_completed(
ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id
@@ -732,13 +721,9 @@ def test_timeline_delete_works_for_remote_smoke(
ps_http = env.pageserver.http_client()
pg = env.endpoints.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert tenant_id == env.initial_tenant
assert main_timeline_id == env.initial_timeline
assert env.initial_timeline is not None
timeline_ids = [env.initial_timeline]
for i in range(2):
branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
@@ -759,9 +744,8 @@ def test_timeline_delete_works_for_remote_smoke(
log.info("waiting for checkpoint upload")
wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
log.info("upload of checkpoint is done")
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
timeline_ids.append(timeline_id)
timeline_ids.append(branch_timeline_id)
for timeline_id in timeline_ids:
assert_prefix_not_empty(
@@ -825,7 +809,7 @@ def test_delete_orphaned_objects(
timeline_id = env.neon_cli.create_timeline("delete")
with env.endpoints.create_start("delete") as endpoint:
# generate enough layers
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
run_pg_bench_small(pg_bin, endpoint.connstr())
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
# write orphaned file that is missing from the index
@@ -863,3 +847,121 @@ def test_delete_orphaned_objects(
)
assert env.remote_storage.index_path(env.initial_tenant, timeline_id).exists()
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_timeline_delete_resumed_on_attach(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
    pg_bin: PgBin,
):
    """
    Check that an interrupted timeline deletion is resumed when the tenant is attached.

    Steps:
    1. Create a timeline, fill it with data and upload it to remote storage.
    2. Arm the "timeline-delete-during-rm" failpoint and request deletion, which
       leaves the timeline in the Broken state.
    3. Stop the pageserver, wipe the local "tenants" directory and start it again.
    4. Attach the tenant and wait until the timeline is reported as 404, then verify
       that neither the local timeline directory nor (for S3 kinds) the remote
       timeline prefix is left behind.
    """
    # NOTE(review): test_name below is another test's name; it looks like a
    # copy-paste leftover -- confirm whether it should match this test's name.
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_deleted_tenant_ignored_on_attach",
    )

    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)

    tenant_id = env.initial_tenant

    ps_http = env.pageserver.http_client()

    timeline_id = env.neon_cli.create_timeline("delete")
    with env.endpoints.create_start("delete") as endpoint:
        # generate enough layers
        run_pg_bench_small(pg_bin, endpoint.connstr())
        last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)

        # Sanity check: the upload must have produced remote objects for this timeline.
        if remote_storage_kind in available_s3_storages():
            assert_prefix_not_empty(
                neon_env_builder,
                prefix="/".join(
                    (
                        "tenants",
                        str(env.initial_tenant),
                        "timelines",
                        str(timeline_id),
                    )
                ),
            )

    # failpoint before we remove index_part from s3
    failpoint = "timeline-delete-during-rm"
    ps_http.configure_failpoints((failpoint, "return"))

    env.pageserver.allowed_errors.extend(
        (
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            # Polling after attach may fail with this
            f".*InternalServerError\\(Tenant {tenant_id} is not active.*",
            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
        )
    )

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    ps_http.timeline_delete(tenant_id, timeline_id)

    # The failpoint interrupts the deletion, so the timeline is expected to end up Broken.
    timeline_info = wait_until_timeline_state(
        pageserver_http=ps_http,
        tenant_id=env.initial_tenant,
        timeline_id=timeline_id,
        expected_state="Broken",
        iterations=iterations,
    )

    reason = timeline_info["state"]["Broken"]["reason"]
    log.info(f"timeline broken: {reason}")

    # failpoint may not be the only error in the stack
    assert reason.endswith(f"failpoint: {failpoint}"), reason

    # The interrupted deletion must not have emptied the remote timeline prefix yet.
    if remote_storage_kind in available_s3_storages():
        assert_prefix_not_empty(
            neon_env_builder,
            prefix="/".join(
                (
                    "tenants",
                    str(tenant_id),
                    "timelines",
                    str(timeline_id),
                )
            ),
        )

    # now we stop pageserver and remove local tenant state
    env.endpoints.stop_all()
    env.pageserver.stop()

    dir_to_clear = Path(env.repo_dir) / "tenants"
    shutil.rmtree(dir_to_clear)
    os.mkdir(dir_to_clear)

    env.pageserver.start()

    # now we call attach
    ps_http.tenant_attach(tenant_id=tenant_id)

    # delete should be resumed
    wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)

    # The resumed deletion must not leave a local timeline directory behind.
    timeline_path = env.timeline_dir(tenant_id=tenant_id, timeline_id=timeline_id)
    assert not timeline_path.exists()

    if remote_storage_kind in available_s3_storages():
        # The prefix has to be built from the tenant ID: putting the timeline ID
        # into the "tenants" segment would check a prefix that does not belong to
        # this tenant, so the assertion would not verify the deletion at all.
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
                (
                    "tenants",
                    str(tenant_id),
                    "timelines",
                    str(timeline_id),
                )
            ),
        )

View File

@@ -270,7 +270,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_local_fs_remote_storage()
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_broker", "main")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_broker", "main")
# FIXME: Is this expected?
env.pageserver.allowed_errors.append(
@@ -280,10 +281,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
endpoint = env.endpoints.create_start("test_broker")
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# wait until remote_consistent_lsn gets advanced on all safekeepers
clients = [sk.http_client() for sk in env.safekeepers]
stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
@@ -325,7 +322,8 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
)
env.neon_cli.create_branch("test_safekeepers_wal_removal")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal")
endpoint = env.endpoints.create_start("test_safekeepers_wal_removal")
# Note: it is important to insert at least two segments, as currently
@@ -338,9 +336,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
]
)
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# force checkpoint to advance remote_consistent_lsn
pageserver_conn_options = {}
if auth_enabled:
@@ -451,13 +446,10 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_safekeepers_wal_backup")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup")
endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("create table t(key int, value text)")
@@ -505,14 +497,11 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_s3_wal_replay")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_s3_wal_replay")
endpoint = env.endpoints.create_start("test_s3_wal_replay")
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
expected_sum = 0
with closing(endpoint.connect()) as conn:
@@ -796,15 +785,12 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
neon_env_builder.auth_enabled = auth_enabled
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_timeline_status")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_timeline_status")
endpoint = env.endpoints.create_start("test_timeline_status")
wa = env.safekeepers[0]
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
if not auth_enabled:
wa_http_cli = wa.http_client()
wa_http_cli.check_status()
@@ -887,15 +873,12 @@ def test_start_replication_term(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_start_replication_term")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_start_replication_term")
endpoint = env.endpoints.create_start("test_start_replication_term")
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
sk = env.safekeepers[0]
sk_http_cli = sk.http_client()
tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id)
@@ -922,15 +905,12 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_sk_auth")
endpoint = env.endpoints.create_start("test_sk_auth")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_sk_auth")
env.endpoints.create_start("test_sk_auth")
sk = env.safekeepers[0]
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
tenant_token = env.auth_keys.generate_tenant_token(tenant_id)
full_token = env.auth_keys.generate_safekeeper_token()
@@ -981,6 +961,35 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder):
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=tenant_token)
# Restart an endpoint several times while auth is enabled.
def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
    """
    Exercise endpoint restarts against three safekeepers with auth turned on.

    Each round stops one randomly chosen safekeeper, inserts a random range of
    rows through the endpoint, stops the endpoint, brings the safekeeper back
    and finally starts the endpoint again.
    """
    neon_env_builder.auth_enabled = True
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    branch_name = "test_sk_auth_restart_endpoint"
    env.neon_cli.create_branch(branch_name)
    endpoint = env.endpoints.create_start(branch_name)

    with closing(endpoint.connect()) as setup_conn, setup_conn.cursor() as setup_cur:
        setup_cur.execute("create table t(i int)")

    # Three rounds of safekeeper + endpoint restarts, to trigger recovery.
    for _round in range(3):
        victim = random.choice(env.safekeepers)
        victim.stop()

        # Write some rows while the chosen safekeeper is down.
        with closing(endpoint.connect()) as write_conn, write_conn.cursor() as write_cur:
            first = random.randint(1, 100000)
            last = first + random.randint(1, 10000)
            write_cur.execute("insert into t select generate_series(%s,%s)", (first, last))

        # The endpoint is stopped before the safekeeper returns and started after it.
        endpoint.stop()
        victim.start()
        endpoint.start()
class SafekeeperEnv:
def __init__(
self,
@@ -1156,7 +1165,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 4
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_replace_safekeeper")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_replace_safekeeper")
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()
@@ -1164,10 +1174,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
endpoint.active_safekeepers = [1, 2, 3]
endpoint.start()
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)
@@ -1419,7 +1425,8 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 4
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_pull_timeline")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_pull_timeline")
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()
@@ -1427,10 +1434,6 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
endpoint.active_safekeepers = [1, 2, 3]
endpoint.start()
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)