From b4e00b8b220b6443c117fbaf9746ab4ef9c38e55 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 13 Nov 2024 18:07:39 +0000 Subject: [PATCH] pageserver: refuse to load tenants with suspiciously old indices in old generations (#9719) ## Problem Historically, if a control component passed a pageserver "generation: 1" this could be a quick way to corrupt a tenant by loading a historic index. Follows https://github.com/neondatabase/neon/pull/9383 Closes #6951 ## Summary of changes - Introduce a Fatal variant to DownloadError, to enable index downloads to signal when they have encountered a scary enough situation that we shouldn't proceed to load the tenant. - Handle this variant by putting the tenant into a broken state (no matter which timeline within the tenant reported it) - Add a test for this case In the event that this behavior fires when we don't want it to, we have ways to intervene: - "Touch" an affected index to update its mtime (download+upload S3 object) - If this behavior is triggered, it indicates we're attaching in some old generation, so we should be able to fix that by manually bumping generation numbers in the storage controller database (this should never happen, but it's an option if it does) --- libs/remote_storage/src/error.rs | 6 +- pageserver/src/tenant.rs | 6 ++ .../src/tenant/remote_timeline_client.rs | 8 ++- .../regress/test_pageserver_generations.py | 68 ++++++++++++++++++- 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 17790e9f70..ec9f868998 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -15,6 +15,9 @@ pub enum DownloadError { /// /// Concurrency control is not timed within timeout. Timeout, + /// Some integrity/consistency check failed during download. This is used during + /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption. + Fatal(String), /// The file was found in the remote storage, but the download failed. Other(anyhow::Error), } @@ -29,6 +32,7 @@ impl std::fmt::Display for DownloadError { DownloadError::Unmodified => write!(f, "File was not modified"), DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } @@ -41,7 +45,7 @@ impl DownloadError { pub fn is_permanent(&self) -> bool { use DownloadError::*; match self { - BadInput(_) | NotFound | Unmodified | Cancelled => true, + BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true, Timeout | Other(_) => false, } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d0a96e78a6..61bb1fe40c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1433,6 +1433,12 @@ impl Tenant { info!(%timeline_id, "index_part not found on remote"); continue; } + Err(DownloadError::Fatal(why)) => { + // If, while loading one remote timeline, we saw an indication that our generation + // number is likely invalid, then we should not load the whole tenant. + error!(%timeline_id, "Fatal error loading timeline: {why}"); + anyhow::bail!(why.to_string()); + } Err(e) => { // Some (possibly ephemeral) error happened during index_part download. // Pretend the timeline exists to not delete the timeline directory, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b37c16e133..600583f6b5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -574,12 +574,18 @@ impl RemoteTimelineClient { if latest_index_generation > index_generation { // Unexpected! Why are we loading such an old index if a more recent one exists? - tracing::warn!( + // We will refuse to proceed, as there is no reasonable scenario where this should happen, but + // there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation + // backwards). + tracing::error!( ?index_generation, ?latest_index_generation, ?latest_index_mtime, "Found a newer index while loading an old one" ); + return Err(DownloadError::Fatal( + "Index age exceeds threshold and a newer index exists".into(), + )); } } diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 8f6c9f16fd..4f59efb8b3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -35,9 +35,10 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import ( + LocalFsStorage, RemoteStorageKind, ) -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -728,3 +729,68 @@ def test_upgrade_generationless_local_file_paths( ) # We should download into the same local path we started with assert os.path.exists(victim_path) + + +@run_only_on_default_postgres("Only tests index logic") +def test_old_index_time_threshold( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise pageserver's detection of trying to load an ancient non-latest index. + (see https://github.com/neondatabase/neon/issues/6951) + """ + + # Run with local_fs because we will interfere with mtimes by local filesystem access + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(32) + + # Remember generation 1's index path + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + index_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id) + + # Increment generation by detaching+attaching, and write+flush some data to get a new remote index + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + workload.churn_rows(32) + + # A new index should have been written + assert env.pageserver_remote_storage.index_path(tenant_id, timeline_id) != index_path + + # Hack the mtime on the generation 1 index + log.info(f"Setting old mtime on {index_path}") + os.utime(index_path, times=(time.time(), time.time() - 30 * 24 * 3600)) + env.pageserver.allowed_errors.extend( + [ + ".*Found a newer index while loading an old one.*", + ".*Index age exceeds threshold and a newer index exists.*", + ] + ) + + # Detach from storage controller + attach in an old generation directly on the pageserver. + workload.stop() + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy") + + # The controller would not do this (attach in an old generation): we are doing it to simulate + # a hypothetical profound bug in the controller. + env.pageserver.http_client().tenant_location_conf( + tenant_id, {"generation": 1, "mode": "AttachedSingle", "tenant_conf": {}} + ) + + # The pageserver should react to this situation by refusing to attach the tenant and putting + # it into Broken state + env.pageserver.allowed_errors.append(".*tenant is broken.*") + with pytest.raises( + PageserverApiException, + match="tenant is broken: Index age exceeds threshold and a newer index exists", + ): + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)