Ignore backup files (ones with .n.old suffix) in download_missing

This is rather a hack to resolve immediate issue:
https://github.com/neondatabase/neon/issues/3024

Properly cleaning this file from index part requires changes to
initialization of remote queue. Because we need to clean it up earlier
than we start warking around files.

With on-demand there will be no walk around layer files becase
download_missing is no longer needed, so I believe it will be
natural to unify this with load_layer_map
This commit is contained in:
Dmitry Rodionov
2022-12-08 15:52:02 +02:00
committed by Dmitry Rodionov
parent 4752385470
commit 3122f3282f
3 changed files with 83 additions and 9 deletions

View File

@@ -1126,6 +1126,12 @@ impl Timeline {
.unwrap()
.insert_historic(Arc::new(delta_layer));
self.metrics.current_physical_size_gauge.add(sz);
} else if layer_name.ends_with(".old") {
// For details see https://github.com/neondatabase/neon/issues/3024
warn!(
"got backup file on the remote storage, ignoring it {file}",
file = layer_name
)
} else {
bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}");
}

View File

@@ -2896,6 +2896,7 @@ def assert_no_in_progress_downloads_for_tenant(
):
tenant_status = pageserver_http_client.tenant_status(tenant)
assert tenant_status["has_in_progress_downloads"] is False, tenant_status
assert tenant_status["state"] == "Active"
def remote_consistent_lsn(

View File

@@ -248,15 +248,6 @@ def test_tenant_upgrades_index_json_from_v0(
# then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
env = neon_env_builder.init_start()
# FIXME: Are these expected?
env.pageserver.allowed_errors.append(
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
)
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
env.pageserver.allowed_errors.append(
".*Failed to get local tenant state: Tenant .* not found in the local state.*"
)
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
@@ -352,6 +343,82 @@ def test_tenant_upgrades_index_json_from_v0(
# FIXME: test index_part.json getting downgraded from imaginary new version
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_tenant_ignores_backup_file(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
# getting a too eager compaction happening for this test would not play
# well with the strict assertions.
neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file")
# launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
# then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*")
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
with pg.cursor() as cur:
cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# flush, wait until in remote storage
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
env.postgres.stop_all()
env.pageserver.stop()
# change the remote file to have entry with .0.old suffix
timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
with open(timeline_path, "r+") as timeline_file:
# keep the deserialized for later inspection
orig_index_part = json.load(timeline_file)
backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old"
orig_index_part["timeline_layers"].append(backup_layer_name)
timeline_file.seek(0)
json.dump(orig_index_part, timeline_file)
env.pageserver.start()
pageserver_http = env.pageserver.http_client()
wait_until(
number_of_iterations=5,
interval=1,
func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
)
pg = env.postgres.create_start("main")
with pg.cursor() as cur:
cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
# not needed anymore
env.postgres.stop_all()
env.pageserver.stop()
# file is still mentioned in the index. Removing it requires more hacking on remote queue initialization
# Will be easier to do once there will be no .download_missing so it will be only one cycle through the layers
# in load_layer_map
new_index_part = local_fs_index_part(env, tenant_id, timeline_id)
backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"])
assert len(list(backup_layers)) == 1
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_tenant_redownloads_truncated_file_on_startup(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind