Pull timeline from peer safekeepers (#4089)

Add an HTTP endpoint to initialize a safekeeper timeline from peer
safekeepers. This is useful for bringing up a new safekeeper to replace
a failed one. The approach is not fully "correct" in all cases, but it
should work in most.

This code is not yet suitable for production workloads, but it can be
tested on staging to get started. The new endpoint is separate from the
usual code paths and should not affect anything unless it is called
explicitly. We can roll back this commit in case of issues.
Arthur Petukhovsky
2023-04-28 17:20:46 +03:00
committed by GitHub
parent ec53c5ca2e
commit 8543485e92
10 changed files with 424 additions and 7 deletions
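
As a usage illustration (not part of the diff), a client could invoke the new endpoint roughly like the following Rust sketch. The host, port, and tenant/timeline ids are placeholders, and auth is assumed to be disabled; the path /v1/pull_timeline and the request/response JSON fields come from pull_timeline::Request and Response in this commit.

use serde_json::json;

// Minimal sketch: ask a fresh safekeeper (placeholder address) to pull a
// timeline from two peers; the most advanced peer is chosen as the donor.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    let resp = client
        .post("http://localhost:7676/v1/pull_timeline")
        .json(&json!({
            "tenant_id": "9e4c8f36063c6c6e93bc20d65a820f3d",   // placeholder id
            "timeline_id": "de200bd42b49cc1814412c7e592dd6e9", // placeholder id
            "http_hosts": [
                "http://safekeeper-1:7676",
                "http://safekeeper-3:7676"
            ]
        }))
        .send()
        .await?
        .error_for_status()?;
    // On success the body looks like {"safekeeper_host": "http://safekeeper-1:7676"}
    println!("{}", resp.text().await?);
    Ok(())
}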

Cargo.lock

@@ -3657,6 +3657,7 @@ dependencies = [
"const_format",
"crc32c",
"fs2",
"futures",
"git-version",
"hex",
"humantime",
@@ -3671,6 +3672,7 @@ dependencies = [
"pq_proto",
"regex",
"remote_storage",
"reqwest",
"safekeeper_api",
"serde",
"serde_json",


@@ -19,11 +19,13 @@ git-version.workspace = true
hex.workspace = true
humantime.workspace = true
hyper.workspace = true
futures.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
postgres.workspace = true
postgres-protocol.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
@@ -33,6 +35,7 @@ tokio = { workspace = true, features = ["fs"] }
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
toml_edit.workspace = true
tempfile.workspace = true
tracing.workspace = true
url.workspace = true
metrics.workspace = true
@@ -45,6 +48,3 @@ storage_broker.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
tempfile.workspace = true


@@ -11,11 +11,13 @@ use std::str::FromStr;
use std::sync::Arc;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use tokio::task::JoinError;
use crate::debug_dump;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
use crate::{debug_dump, pull_timeline};
use crate::timelines_global_map::TimelineDeleteForceResult;
use crate::GlobalTimelines;
@@ -177,6 +179,49 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
json_response(StatusCode::OK, ())
}
/// Pull timeline from peer safekeeper instances.
async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let data: pull_timeline::Request = json_request(&mut request).await?;
let resp = pull_timeline::handle_request(data)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
}
/// Download a file from the timeline directory.
// TODO: figure out a better way to copy files between safekeepers
async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "timeline_id")?,
);
check_permission(&request, Some(ttid.tenant_id))?;
let filename: String = parse_request_param(&request, "filename")?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let filepath = tli.timeline_dir.join(filename);
let mut file = File::open(&filepath)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let mut content = Vec::new();
// TODO: don't store files in memory
file.read_to_end(&mut content)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Response::builder()
.status(StatusCode::OK)
.header("Content-Type", "application/octet-stream")
.body(Body::from(content))
.map_err(|e| ApiError::InternalServerError(e.into()))
}
/// Deactivates the timeline and removes its data directory.
async fn timeline_delete_force_handler(
mut request: Request<Body>,
@@ -353,6 +398,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
timeline_delete_force_handler,
)
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
.post("/v1/pull_timeline", timeline_pull_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
timeline_files_handler,
)
// for tests
.post(
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",


@@ -15,6 +15,7 @@ pub mod handler;
pub mod http;
pub mod json_ctrl;
pub mod metrics;
pub mod pull_timeline;
pub mod receive_wal;
pub mod remove_wal;
pub mod safekeeper;


@@ -0,0 +1,240 @@
use serde::{Deserialize, Serialize};
use anyhow::{bail, Context, Result};
use tokio::io::AsyncWriteExt;
use tracing::info;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use serde_with::{serde_as, DisplayFromStr};
use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
wal_storage::{self, Storage},
GlobalTimelines,
};
/// Request to pull a timeline from peer safekeepers.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct Request {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub http_hosts: Vec<String>,
}
#[derive(Debug, Serialize)]
pub struct Response {
// Donor safekeeper host
pub safekeeper_host: String,
// TODO: add more fields?
}
/// Find the most advanced safekeeper and pull timeline from it.
pub async fn handle_request(request: Request) -> Result<Response> {
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
request.tenant_id,
request.timeline_id,
));
if existing_tli.is_ok() {
bail!("Timeline {} already exists", request.timeline_id);
}
let client = reqwest::Client::new();
let http_hosts = request.http_hosts.clone();
// Send request to /v1/tenant/:tenant_id/timeline/:timeline_id
let responses = futures::future::join_all(http_hosts.iter().map(|url| {
let url = format!(
"{}/v1/tenant/{}/timeline/{}",
url, request.tenant_id, request.timeline_id
);
client.get(url).send()
}))
.await;
let mut statuses = Vec::new();
for (i, response) in responses.into_iter().enumerate() {
let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?;
let status: crate::http::routes::TimelineStatus = response.json().await?;
statuses.push((status, i));
}
// Find the most advanced safekeeper
// TODO: current logic may be wrong, fix it later
let (status, i) = statuses
.into_iter()
.max_by_key(|(status, _)| {
(
status.acceptor_state.epoch,
status.flush_lsn,
status.commit_lsn,
)
})
.unwrap();
let safekeeper_host = http_hosts[i].clone();
assert!(status.tenant_id == request.tenant_id);
assert!(status.timeline_id == request.timeline_id);
pull_timeline(status, safekeeper_host).await
}
async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response> {
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
info!(
"Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
ttid,
host,
status.commit_lsn,
status.flush_lsn,
status.acceptor_state.term,
status.acceptor_state.epoch
);
let conf = &GlobalTimelines::get_global_config();
let client = reqwest::Client::new();
// TODO: don't use debug dump, it should be used only in tests.
// This is a proof of concept, we should figure out a way
// to use scp without implementing it manually.
// Implementing our own scp over HTTP.
// At first, we need to fetch list of files from safekeeper.
let dump: debug_dump::Response = client
.get(format!(
"{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
host, status.tenant_id, status.timeline_id
))
.send()
.await?
.json()
.await?;
if dump.timelines.len() != 1 {
bail!(
"Expected to fetch single timeline, got {} timelines",
dump.timelines.len()
);
}
let timeline = dump.timelines.into_iter().next().unwrap();
let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!(
"Timeline {} doesn't have disk content",
ttid
))?;
let mut filenames = disk_content
.files
.iter()
.map(|file| file.name.clone())
.collect::<Vec<_>>();
// Sort filenames to make sure we pull files in correct order
// After sorting, we should have:
// - 000000010000000000000001
// - ...
// - 000000010000000000000002.partial
// - safekeeper.control
filenames.sort();
// safekeeper.control should be the first file, so we need to move it to the beginning
let control_file_index = filenames
.iter()
.position(|name| name == "safekeeper.control")
.ok_or(anyhow::anyhow!("safekeeper.control not found"))?;
filenames.remove(control_file_index);
filenames.insert(0, "safekeeper.control".to_string());
info!(
"Downloading {} files from safekeeper {}",
filenames.len(),
host
);
// Creating temp directory for a new timeline. It needs to be
// located on the same filesystem as the rest of the timelines.
// conf.workdir is usually /storage/safekeeper/data
// will try to transform it into /storage/safekeeper/tmp
let temp_base = conf
.workdir
.parent()
.ok_or(anyhow::anyhow!("workdir has no parent"))?
.join("tmp");
tokio::fs::create_dir_all(&temp_base).await?;
let tli_dir = tempfile::Builder::new()
.suffix("_temptli")
.prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
.tempdir_in(temp_base)?;
let tli_dir_path = tli_dir.path().to_owned();
// Note: some time passes between fetching the list of files and fetching the files themselves.
// It's possible that some files will be removed from safekeeper and we will fail to fetch them.
// This function will fail in this case, should be retried by the caller.
for filename in filenames {
let file_path = tli_dir_path.join(&filename);
// /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename
let http_url = format!(
"{}/v1/tenant/{}/timeline/{}/file/{}",
host, status.tenant_id, status.timeline_id, filename
);
let mut file = tokio::fs::File::create(&file_path).await?;
let mut response = client.get(&http_url).send().await?;
while let Some(chunk) = response.chunk().await? {
file.write_all(&chunk).await?;
}
}
// TODO: fsync?
// Let's create timeline from temp directory and verify that it's correct
let control_path = tli_dir_path.join("safekeeper.control");
let control_store = control_file::FileStorage::load_control_file(control_path)?;
if control_store.server.wal_seg_size == 0 {
bail!("wal_seg_size is not set");
}
let wal_store =
wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?;
let commit_lsn = status.commit_lsn;
let flush_lsn = wal_store.flush_lsn();
info!(
"Finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
ttid, commit_lsn, flush_lsn
);
assert!(status.commit_lsn <= status.flush_lsn);
// Move timeline dir to the correct location
let timeline_path = conf.timeline_dir(&ttid);
info!(
"Moving timeline {} from {} to {}",
ttid,
tli_dir_path.display(),
timeline_path.display()
);
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
tokio::fs::rename(tli_dir_path, &timeline_path).await?;
let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;
info!(
"Loaded timeline {}, flush_lsn={}",
ttid,
tli.get_flush_lsn()
);
Ok(Response {
safekeeper_host: host,
})
}
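
The note above that a failed transfer "should be retried by the caller" suggests a retry wrapper. The following is a hypothetical sketch (illustration only, not in the diff), assuming it lives inside the safekeeper crate; the attempt count and the fixed one-second pause are arbitrary choices.

use std::time::Duration;

use crate::pull_timeline;

// Hypothetical caller-side retry around pull_timeline::handle_request.
// Request is not Clone, so the caller passes a closure that rebuilds it.
async fn pull_with_retries(
    make_request: impl Fn() -> pull_timeline::Request,
    max_attempts: usize,
) -> anyhow::Result<pull_timeline::Response> {
    let mut last_err = None;
    for attempt in 1..=max_attempts {
        match pull_timeline::handle_request(make_request()).await {
            Ok(resp) => return Ok(resp),
            Err(e) => {
                tracing::warn!("pull_timeline attempt {} failed: {:#}", attempt, e);
                last_err = Some(e);
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
    }
    Err(last_err.expect("max_attempts must be greater than zero"))
}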


@@ -129,7 +129,8 @@ impl SharedState {
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
Ok(Self {
@@ -149,7 +150,8 @@ impl SharedState {
bail!(TimelineError::UninitializedWalSegSize(*ttid));
}
let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
Ok(Self {
sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,


@@ -159,6 +159,26 @@ impl GlobalTimelines {
Ok(())
}
/// Load timeline from disk to the memory.
pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
Ok(timeline) => {
let tli = Arc::new(timeline);
// TODO: prevent concurrent timeline creation/loading
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
Ok(tli)
}
// If we can't load a timeline, it's bad. Caller will figure it out.
Err(e) => bail!("failed to load timeline {}, reason: {:?}", ttid, e),
}
}
/// Get the number of timelines in the map.
pub fn timelines_count() -> usize {
TIMELINES_STATE.lock().unwrap().timelines.len()


@@ -112,10 +112,10 @@ impl PhysicalStorage {
/// the disk. Otherwise, all LSNs are set to zero.
pub fn new(
ttid: &TenantTimelineId,
timeline_dir: PathBuf,
conf: &SafeKeeperConf,
state: &SafeKeeperState,
) -> Result<PhysicalStorage> {
let timeline_dir = conf.timeline_dir(ttid);
let wal_seg_size = state.server.wal_seg_size as usize;
// Find out where stored WAL ends, starting at commit_lsn which is a


@@ -2661,6 +2661,13 @@ class SafekeeperHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def timeline_create(
self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn
):


@@ -1254,3 +1254,98 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
with closing(endpoint_other.connect()) as conn:
with conn.cursor() as cur:
cur.execute("INSERT INTO t (key) VALUES (123)")
def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str:
return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names])
def execute_payload(endpoint: Endpoint):
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)")
cur.execute("INSERT INTO t VALUES (0, 'something')")
sum_before = query_scalar(cur, "SELECT SUM(key) FROM t")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
sum_after = query_scalar(cur, "SELECT SUM(key) FROM t")
assert sum_after == sum_before + 5000050000
def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
for sk in safekeepers:
http_cli = sk.http_client()
try:
status = http_cli.timeline_status(tenant_id, timeline_id)
log.info(f"Safekeeper {sk.id} status: {status}")
except Exception as e:
log.info(f"Safekeeper {sk.id} status error: {e}")
neon_env_builder.num_safekeepers = 4
env = neon_env_builder.init_start()
env.neon_cli.create_branch("test_pull_timeline")
log.info("Use only first 3 safekeepers")
env.safekeepers[3].stop()
active_safekeepers = [1, 2, 3]
endpoint = env.endpoints.create("test_pull_timeline")
endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
endpoint.start()
# learn neon timeline from compute
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Kill safekeeper 2, continue with payload")
env.safekeepers[1].stop(immediate=True)
execute_payload(endpoint)
log.info("Initialize new safekeeper 4, pull data from 1 & 3")
env.safekeepers[3].start()
res = (
env.safekeepers[3]
.http_client()
.pull_timeline(
{
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"http_hosts": [
f"http://localhost:{env.safekeepers[0].port.http}",
f"http://localhost:{env.safekeepers[2].port.http}",
],
}
)
)
log.info("Finished pulling timeline")
log.info(res)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Restarting compute with new config to verify that it works")
active_safekeepers = [1, 3, 4]
endpoint.stop_and_destroy().create("test_pull_timeline")
endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers))
endpoint.start()
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Stop sk1 (simulate failure) and use only quorum of sk3 and sk4")
env.safekeepers[0].stop(immediate=True)
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)
log.info("Restart sk4 and and use quorum of sk1 and sk4")
env.safekeepers[3].stop()
env.safekeepers[2].stop()
env.safekeepers[0].start()
env.safekeepers[3].start()
execute_payload(endpoint)
show_statuses(env.safekeepers, tenant_id, timeline_id)