mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 12:02:55 +00:00
Handle broken timelines on startup (#1809)
Resolve #1663. ## Changes - ignore a "broken" [1] timeline on page server startup - fix the race condition when creating multiple timelines in parallel for a tenant - added tests for the above changes [1]: a timeline is marked as "broken" if either - failed to load the timeline's metadata or - the timeline's disk consistent LSN is zero
This commit is contained in:
@@ -2518,7 +2518,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
bail!("couldn't find an unused backup number for {:?}", path)
|
||||
}
|
||||
|
||||
fn load_metadata(
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! page server.
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::layered_repository::{load_metadata, LayeredRepository};
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
@@ -22,6 +22,7 @@ use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
@@ -399,6 +400,26 @@ pub fn list_tenants() -> Vec<TenantInfo> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Check if a given timeline is "broken" \[1\].
|
||||
/// The function returns an error if the timeline is "broken".
|
||||
///
|
||||
/// \[1\]: it's not clear now how should we classify a timeline as broken.
|
||||
/// A timeline is categorized as broken when any of following conditions is true:
|
||||
/// - failed to load the timeline's metadata
|
||||
/// - the timeline's disk consistent LSN is zero
|
||||
fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id())
|
||||
.context("failed to load metadata")?;
|
||||
|
||||
// A timeline with zero disk consistent LSN can happen when the page server
|
||||
// failed to checkpoint the timeline import data when creating that timeline.
|
||||
if metadata.disk_consistent_lsn() == Lsn::INVALID {
|
||||
bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_local_repository(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
@@ -414,7 +435,13 @@ fn init_local_repository(
|
||||
match init_status {
|
||||
LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||
if let Err(err) = check_broken_timeline(&repo, timeline_id) {
|
||||
info!(
|
||||
"Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository"
|
||||
);
|
||||
} else {
|
||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||
}
|
||||
}
|
||||
LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
|
||||
@@ -285,7 +285,9 @@ fn bootstrap_timeline<R: Repository>(
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||
|
||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
||||
let initdb_path = conf
|
||||
.tenant_path(&tenantid)
|
||||
.join(format!("tmp-timeline-{}", tli));
|
||||
|
||||
// Init temporarily repo to get bootstrap data
|
||||
run_initdb(conf, &initdb_path)?;
|
||||
@@ -300,6 +302,11 @@ fn bootstrap_timeline<R: Repository>(
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
|
||||
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
info!(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import pytest
|
||||
import concurrent.futures
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
import os
|
||||
|
||||
@@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder):
|
||||
with pytest.raises(Exception, match="Cannot load local timeline") as err:
|
||||
pg.start()
|
||||
log.info(f'compute startup failed as expected: {err}')
|
||||
|
||||
|
||||
def test_create_multiple_timelines_parallel(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
|
||||
tenant_id, _ = env.zenith_cli.create_tenant()
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
||||
futures = [
|
||||
executor.submit(env.zenith_cli.create_timeline,
|
||||
f"test-create-multiple-timelines-{i}",
|
||||
tenant_id) for i in range(4)
|
||||
]
|
||||
for future in futures:
|
||||
future.result()
|
||||
|
||||
|
||||
def test_fix_broken_timelines_on_startup(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
|
||||
tenant_id, _ = env.zenith_cli.create_tenant()
|
||||
|
||||
# Introduce failpoint when creating a new timeline
|
||||
env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return")
|
||||
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
|
||||
_ = env.zenith_cli.create_timeline("test_fix_broken_timelines", tenant_id)
|
||||
|
||||
# Restart the page server
|
||||
env.zenith_cli.pageserver_stop(immediate=True)
|
||||
env.zenith_cli.pageserver_start()
|
||||
|
||||
# Check that the "broken" timeline is not loaded
|
||||
timelines = env.zenith_cli.list_timelines(tenant_id)
|
||||
assert len(timelines) == 1
|
||||
|
||||
Reference in New Issue
Block a user