Handle broken timelines on startup (#1809)

Resolves #1663.

## Changes

- Ignore a "broken" [1] timeline on page server startup.
- Fix a race condition when creating multiple timelines in parallel for a tenant.
- Add tests for both changes.

[1]: a timeline is marked as "broken" if either:
- loading the timeline's metadata fails, or
- the timeline's disk consistent LSN is zero (a condensed sketch of the check follows below).
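
In essence, the classification boils down to the following. This is an illustrative boolean rephrasing of the `check_broken_timeline` helper added in the diff below; `is_broken` itself is not part of the commit (the real helper returns `anyhow::Result<()>`):

```rust
// Sketch only: condensed from check_broken_timeline in the diff below.
fn is_broken(repo: &LayeredRepository, timeline_id: ZTimelineId) -> bool {
    match load_metadata(repo.conf, timeline_id, repo.tenant_id()) {
        // Condition 1: the timeline's metadata cannot be loaded.
        Err(_) => true,
        // Condition 2: the disk consistent LSN never advanced past zero
        // (Lsn::INVALID), e.g. because a checkpoint failed during import.
        Ok(metadata) => metadata.disk_consistent_lsn() == Lsn::INVALID,
    }
}
```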
Thang Pham · 2022-05-27 11:43:06 -04:00 · committed by GitHub
commit 75f71a6380 (parent 54b75248ff)
4 changed files with 74 additions and 5 deletions


```diff
@@ -2518,7 +2518,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
     bail!("couldn't find an unused backup number for {:?}", path)
 }
 
-fn load_metadata(
+pub fn load_metadata(
     conf: &'static PageServerConf,
     timeline_id: ZTimelineId,
     tenant_id: ZTenantId,
```
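
`load_metadata` is made `pub` here so that the startup code in the next file can reuse it for the broken-timeline check.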


```diff
@@ -2,7 +2,7 @@
 //! page server.
 
 use crate::config::PageServerConf;
-use crate::layered_repository::LayeredRepository;
+use crate::layered_repository::{load_metadata, LayeredRepository};
 use crate::pgdatadir_mapping::DatadirTimeline;
 use crate::repository::{Repository, TimelineSyncStatusUpdate};
 use crate::storage_sync::index::RemoteIndex;
@@ -22,6 +22,7 @@ use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
 use tracing::*;
+use utils::lsn::Lsn;
 use utils::zid::{ZTenantId, ZTimelineId};
@@ -399,6 +400,26 @@ pub fn list_tenants() -> Vec<TenantInfo> {
         .collect()
 }
 
+/// Check whether a given timeline is "broken" [1].
+/// The function returns an error if the timeline is "broken".
+///
+/// [1]: it is not yet clear how we should classify a timeline as broken.
+/// For now, a timeline is categorized as broken when any of the following is true:
+/// - the timeline's metadata failed to load
+/// - the timeline's disk consistent LSN is zero
+fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
+    let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id())
+        .context("failed to load metadata")?;
+
+    // A timeline with a zero disk consistent LSN can happen when the page server
+    // failed to checkpoint the timeline's imported data while creating it.
+    if metadata.disk_consistent_lsn() == Lsn::INVALID {
+        bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
+    }
+
+    Ok(())
+}
+
 fn init_local_repository(
     conf: &'static PageServerConf,
     tenant_id: ZTenantId,
@@ -414,7 +435,13 @@ fn init_local_repository(
     match init_status {
         LocalTimelineInitStatus::LocallyComplete => {
             debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
-            status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
+            if let Err(err) = check_broken_timeline(&repo, timeline_id) {
+                info!(
+                    "Found a broken timeline {timeline_id} (err={err:?}), skipping registration in the repository"
+                );
+            } else {
+                status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
+            }
         }
         LocalTimelineInitStatus::NeedsSync => {
             debug!(
```
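
Note that a broken timeline is only excluded from registration: its data on disk is left untouched, and the tenant's remaining healthy timelines still load as usual.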


```diff
@@ -285,7 +285,9 @@ fn bootstrap_timeline<R: Repository>(
 ) -> Result<()> {
     let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
 
-    let initdb_path = conf.tenant_path(&tenantid).join("tmp");
+    let initdb_path = conf
+        .tenant_path(&tenantid)
+        .join(format!("tmp-timeline-{}", tli));
 
     // Init a temporary repo to get bootstrap data
     run_initdb(conf, &initdb_path)?;
@@ -300,6 +302,11 @@ fn bootstrap_timeline<R: Repository>(
     let timeline = repo.create_empty_timeline(tli, lsn)?;
     let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
     import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
+
+    fail::fail_point!("before-checkpoint-new-timeline", |_| {
+        bail!("failpoint before-checkpoint-new-timeline");
+    });
+
     page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
 
     info!(
```
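
The first hunk fixes the race: every bootstrap previously shared the same `tmp` directory under the tenant path, so parallel timeline creations clobbered each other; now each timeline gets its own `tmp-timeline-{id}` directory. The second hunk uses the `fail_point!` macro from the `fail` crate, which is a no-op until the named failpoint is activated with an action such as `return`, at which point the closure runs and aborts timeline creation with an error. A minimal sketch of the mechanism, assuming a `fail` crate dependency; `create_timeline_stub` is illustrative, not from the commit:

```rust
use anyhow::bail;

fn create_timeline_stub() -> anyhow::Result<()> {
    // No-op unless the failpoint is active; with the "return" action the
    // closure runs, and its Err value is returned from this function.
    fail::fail_point!("before-checkpoint-new-timeline", |_| {
        bail!("failpoint before-checkpoint-new-timeline")
    });
    Ok(())
}

fn main() {
    // Activate the failpoint programmatically; the test below does the same
    // thing through the pageserver's `failpoints ...=return` command.
    fail::cfg("before-checkpoint-new-timeline", "return").unwrap();
    assert!(create_timeline_stub().is_err());
}
```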


```diff
@@ -1,6 +1,7 @@
 import pytest
+import concurrent.futures
 from contextlib import closing
-from fixtures.zenith_fixtures import ZenithEnvBuilder
+from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv
 from fixtures.log_helper import log
 import os
@@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder):
         with pytest.raises(Exception, match="Cannot load local timeline") as err:
             pg.start()
         log.info(f'compute startup failed as expected: {err}')
+
+
+def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+
+    tenant_id, _ = env.zenith_cli.create_tenant()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        futures = [
+            executor.submit(env.zenith_cli.create_timeline,
+                            f"test-create-multiple-timelines-{i}", tenant_id)
+            for i in range(4)
+        ]
+        for future in futures:
+            future.result()
+
+
+def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv):
+    env = zenith_simple_env
+
+    tenant_id, _ = env.zenith_cli.create_tenant()
+
+    # Introduce a failpoint when creating a new timeline
+    env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return")
+
+    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
+        _ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id)
+
+    # Restart the page server
+    env.zenith_cli.pageserver_stop(immediate=True)
+    env.zenith_cli.pageserver_start()
+
+    # Check that the "broken" timeline is not loaded on startup
+    timelines = env.zenith_cli.list_timelines(tenant_id)
+    assert len(timelines) == 1
```
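
The final assertion expects exactly one timeline: the one created together with the tenant. The timeline that hit the failpoint was never checkpointed, so its disk consistent LSN stayed at zero and it is skipped when the page server restarts.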