From 3122f3282f1a7f4639141f5a5a451cefae53a43d Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 8 Dec 2022 15:52:02 +0200 Subject: [PATCH] Ignore backup files (ones with .n.old suffix) in download_missing This is rather a hack to resolve immediate issue: https://github.com/neondatabase/neon/issues/3024 Properly cleaning this file from index part requires changes to initialization of remote queue. Because we need to clean it up earlier than we start warking around files. With on-demand there will be no walk around layer files becase download_missing is no longer needed, so I believe it will be natural to unify this with load_layer_map --- pageserver/src/tenant/timeline.rs | 6 ++ test_runner/fixtures/neon_fixtures.py | 1 + .../test_tenants_with_remote_storage.py | 85 +++++++++++++++++-- 3 files changed, 83 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec8049bcea..aab5d6f1d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1126,6 +1126,12 @@ impl Timeline { .unwrap() .insert_historic(Arc::new(delta_layer)); self.metrics.current_physical_size_gauge.add(sz); + } else if layer_name.ends_with(".old") { + // For details see https://github.com/neondatabase/neon/issues/3024 + warn!( + "got backup file on the remote storage, ignoring it {file}", + file = layer_name + ) } else { bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}"); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cda518f26c..d20e591e9b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2896,6 +2896,7 @@ def assert_no_in_progress_downloads_for_tenant( ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status + assert tenant_status["state"] == "Active" def remote_consistent_lsn( diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 76639e4055..1228f8b86e 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -248,15 +248,6 @@ def test_tenant_upgrades_index_json_from_v0( # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade env = neon_env_builder.init_start() - # FIXME: Are these expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - env.pageserver.allowed_errors.append( - ".*Failed to get local tenant state: Tenant .* not found in the local state.*" - ) - pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -352,6 +343,82 @@ def test_tenant_upgrades_index_json_from_v0( # FIXME: test index_part.json getting downgraded from imaginary new version +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_ignores_backup_file( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # getting a too eager compaction happening for this test would not play + # well with the strict assertions. + neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" + + neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file") + + # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, + # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*") + + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # flush, wait until in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + # change the remote file to have entry with .0.old suffix + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r+") as timeline_file: + # keep the deserialized for later inspection + orig_index_part = json.load(timeline_file) + backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old" + orig_index_part["timeline_layers"].append(backup_layer_name) + + timeline_file.seek(0) + json.dump(orig_index_part, timeline_file) + + env.pageserver.start() + pageserver_http = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + ) + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t0 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # not needed anymore + env.postgres.stop_all() + env.pageserver.stop() + + # file is still mentioned in the index. Removing it requires more hacking on remote queue initialization + # Will be easier to do once there will be no .download_missing so it will be only one cycle through the layers + # in load_layer_map + new_index_part = local_fs_index_part(env, tenant_id, timeline_id) + backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"]) + assert len(list(backup_layers)) == 1 + + @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_tenant_redownloads_truncated_file_on_startup( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind