mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 14:40:37 +00:00
Compare commits
27 Commits
actorsssss
...
problame/b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6aefbb1aa5 | ||
|
|
a7f8032d28 | ||
|
|
dac2f40b26 | ||
|
|
eb2088264f | ||
|
|
d36623ad74 | ||
|
|
689ad72e92 | ||
|
|
fd4cce9417 | ||
|
|
d52b81340f | ||
|
|
8dee9908f8 | ||
|
|
19ed230708 | ||
|
|
463b6a26b5 | ||
|
|
c9b1657e4c | ||
|
|
b92be77e19 | ||
|
|
8cb8c8d7b5 | ||
|
|
210700d0d9 | ||
|
|
a0a3ba85e7 | ||
|
|
d820aa1d08 | ||
|
|
996abc9563 | ||
|
|
a72af29d12 | ||
|
|
4f51824820 | ||
|
|
743f6dfb9b | ||
|
|
faf275d4a2 | ||
|
|
001f0d6db7 | ||
|
|
42c17a6fc6 | ||
|
|
37638fce79 | ||
|
|
50288c16b1 | ||
|
|
e03f8abba9 |
20
.github/workflows/build_and_test.yml
vendored
20
.github/workflows/build_and_test.yml
vendored
@@ -21,6 +21,9 @@ env:
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
NEXTEST_RETRIES: 3
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
@@ -44,6 +47,20 @@ jobs:
|
||||
|
||||
exit 1
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Cancel previous e2e-tests runs for this PR
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
gh workflow --repo neondatabase/cloud \
|
||||
run cancel-previous-in-concurrency-group.yml \
|
||||
--field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -695,7 +712,8 @@ jobs:
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
|
||||
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
|
||||
6
Cargo.lock
generated
6
Cargo.lock
generated
@@ -286,6 +286,7 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
@@ -1341,6 +1342,7 @@ dependencies = [
|
||||
"regex",
|
||||
"reqwest",
|
||||
"safekeeper_api",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
@@ -5112,9 +5114,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
|
||||
|
||||
[[package]]
|
||||
name = "smol_str"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
|
||||
checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -19,6 +19,7 @@ hex.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
|
||||
@@ -14,6 +14,7 @@ hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -17,6 +17,8 @@ enum PlacementPolicy {
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Do not attach to any pageservers
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
|
||||
@@ -66,7 +66,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
jwt_token: args.jwt_token,
|
||||
};
|
||||
|
||||
let persistence = Arc::new(Persistence::new(&args.path).await);
|
||||
let persistence = Arc::new(Persistence::spawn(&args.path).await);
|
||||
|
||||
let service = Service::spawn(config, persistence).await?;
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ use pageserver_api::{
|
||||
};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
@@ -20,46 +21,28 @@ use crate::{node::Node, PlacementPolicy};
|
||||
|
||||
/// Placeholder for storage. This will be replaced with a database client.
|
||||
pub struct Persistence {
|
||||
state: std::sync::Mutex<PersistentState>,
|
||||
inner: std::sync::Mutex<Inner>,
|
||||
}
|
||||
|
||||
struct Inner {
|
||||
state: PersistentState,
|
||||
write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
|
||||
}
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PersistentState {
|
||||
tenants: HashMap<TenantShardId, TenantShardPersistence>,
|
||||
|
||||
#[serde(skip)]
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
/// A convenience for serializing the state inside a sync lock, and then
|
||||
/// writing it to disk outside of the lock. This will go away when switching
|
||||
/// to a database backend.
|
||||
struct PendingWrite {
|
||||
bytes: Vec<u8>,
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl PendingWrite {
|
||||
async fn commit(&self) -> anyhow::Result<()> {
|
||||
tokio::fs::write(&self.path, &self.bytes).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
done_tx: tokio::sync::oneshot::Sender<()>,
|
||||
}
|
||||
|
||||
impl PersistentState {
|
||||
fn save(&self) -> PendingWrite {
|
||||
PendingWrite {
|
||||
bytes: serde_json::to_vec(self).expect("Serialization error"),
|
||||
path: self.path.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
|
||||
let bytes = tokio::fs::read(path).await?;
|
||||
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
|
||||
decoded.path = path.to_owned();
|
||||
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
@@ -88,7 +71,6 @@ impl PersistentState {
|
||||
tracing::info!("Will create state file at {}", path);
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -99,13 +81,74 @@ impl PersistentState {
|
||||
}
|
||||
|
||||
impl Persistence {
|
||||
pub async fn new(path: &Utf8Path) -> Self {
|
||||
pub async fn spawn(path: &Utf8Path) -> Self {
|
||||
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let state = PersistentState::load_or_new(path).await;
|
||||
tokio::spawn(Self::writer_task(rx, path.to_owned()));
|
||||
Self {
|
||||
state: std::sync::Mutex::new(state),
|
||||
inner: std::sync::Mutex::new(Inner {
|
||||
state,
|
||||
write_queue_tx: tx,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
async fn writer_task(
|
||||
mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
|
||||
path: Utf8PathBuf,
|
||||
) {
|
||||
scopeguard::defer! {
|
||||
info!("persistence writer task exiting");
|
||||
};
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Some(write) => {
|
||||
tokio::task::spawn_blocking({
|
||||
let path = path.clone();
|
||||
move || {
|
||||
let tmp_path =
|
||||
utils::crashsafe::path_with_suffix_extension(&path, "___new");
|
||||
utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking")
|
||||
.expect("write file");
|
||||
let _ = write.done_tx.send(()); // receiver may lose interest any time
|
||||
}
|
||||
None => {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform a modification on our [`PersistentState`].
|
||||
/// Return a future that completes once our modification has been persisted.
|
||||
/// The output of the future is the return value of the `txn`` closure.
|
||||
async fn mutating_transaction<F, R>(&self, txn: F) -> R
|
||||
where
|
||||
F: FnOnce(&mut PersistentState) -> R,
|
||||
{
|
||||
let (ret, done_rx) = {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let ret = txn(&mut inner.state);
|
||||
let (done_tx, done_rx) = tokio::sync::oneshot::channel();
|
||||
let write = PendingWrite {
|
||||
bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
|
||||
done_tx,
|
||||
};
|
||||
inner
|
||||
.write_queue_tx
|
||||
.send(write)
|
||||
.expect("writer task always outlives self");
|
||||
(ret, done_rx)
|
||||
};
|
||||
// the write task can go away once we start .await'ing
|
||||
let _: () = done_rx.await.expect("writer task dead, check logs");
|
||||
ret
|
||||
}
|
||||
|
||||
/// When registering a node, persist it so that on next start we will be able to
|
||||
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
|
||||
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
|
||||
@@ -149,8 +192,8 @@ impl Persistence {
|
||||
|
||||
/// At startup, we populate our map of tenant shards from persistent storage.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
|
||||
let locked = self.state.lock().unwrap();
|
||||
Ok(locked.tenants.values().cloned().collect())
|
||||
let inner = self.inner.lock().unwrap();
|
||||
Ok(inner.state.tenants.values().cloned().collect())
|
||||
}
|
||||
|
||||
/// Tenants must be persisted before we schedule them for the first time. This enables us
|
||||
@@ -159,8 +202,7 @@ impl Persistence {
|
||||
&self,
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> anyhow::Result<()> {
|
||||
let write = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
self.mutating_transaction(|locked| {
|
||||
for shard in shards {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
|
||||
@@ -170,12 +212,9 @@ impl Persistence {
|
||||
|
||||
locked.tenants.insert(tenant_shard_id, shard);
|
||||
}
|
||||
locked.save()
|
||||
};
|
||||
|
||||
write.commit().await?;
|
||||
|
||||
Ok(())
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
|
||||
@@ -186,8 +225,7 @@ impl Persistence {
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<Generation> {
|
||||
let (write, gen) = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
self.mutating_transaction(|locked| {
|
||||
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
anyhow::bail!("Tried to increment generation of unknown shard");
|
||||
};
|
||||
@@ -196,45 +234,38 @@ impl Persistence {
|
||||
shard.generation_pageserver = Some(node_id);
|
||||
|
||||
let gen = Generation::new(shard.generation);
|
||||
(locked.save(), gen)
|
||||
};
|
||||
|
||||
write.commit().await?;
|
||||
Ok(gen)
|
||||
Ok(gen)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
let write = {
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
self.mutating_transaction(|locked| {
|
||||
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||
anyhow::bail!("Tried to increment generation of unknown shard");
|
||||
};
|
||||
shard.generation_pageserver = None;
|
||||
locked.save()
|
||||
};
|
||||
write.commit().await?;
|
||||
Ok(())
|
||||
shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn re_attach(
|
||||
&self,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
|
||||
let (write, result) = {
|
||||
self.mutating_transaction(|locked| {
|
||||
let mut result = HashMap::new();
|
||||
let mut locked = self.state.lock().unwrap();
|
||||
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
|
||||
if shard.generation_pageserver == Some(node_id) {
|
||||
shard.generation += 1;
|
||||
result.insert(*tenant_shard_id, Generation::new(shard.generation));
|
||||
}
|
||||
}
|
||||
|
||||
(locked.save(), result)
|
||||
};
|
||||
|
||||
write.commit().await?;
|
||||
Ok(result)
|
||||
Ok(result)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
// TODO: when we start shard splitting, we must durably mark the tenant so that
|
||||
|
||||
@@ -381,6 +381,11 @@ impl Service {
|
||||
|
||||
if let Some(new_generation) = new_generation {
|
||||
tenant_state.generation = new_generation;
|
||||
} else {
|
||||
// This is a detach notification. We must update placement policy to avoid re-attaching
|
||||
// during background scheduling/reconciliation, or during attachment service restart.
|
||||
assert!(attach_req.node_id.is_none());
|
||||
tenant_state.policy = PlacementPolicy::Detached;
|
||||
}
|
||||
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
@@ -870,7 +875,6 @@ impl Service {
|
||||
} else {
|
||||
let old_attached = shard.intent.attached;
|
||||
|
||||
shard.intent.attached = Some(migrate_req.node_id);
|
||||
match shard.policy {
|
||||
PlacementPolicy::Single => {
|
||||
shard.intent.secondary.clear();
|
||||
@@ -884,7 +888,13 @@ impl Service {
|
||||
shard.intent.secondary.push(old_attached);
|
||||
}
|
||||
}
|
||||
PlacementPolicy::Detached => {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
|
||||
)))
|
||||
}
|
||||
}
|
||||
shard.intent.attached = Some(migrate_req.node_id);
|
||||
|
||||
tracing::info!("Migrating: new intent {:?}", shard.intent);
|
||||
shard.sequence = shard.sequence.next();
|
||||
|
||||
@@ -312,6 +312,18 @@ impl TenantState {
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.attached = None;
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if modified {
|
||||
|
||||
@@ -9,7 +9,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{path::PathBuf, process::Child, str::FromStr};
|
||||
use std::{path::PathBuf, str::FromStr};
|
||||
use tracing::instrument;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
@@ -220,7 +220,7 @@ impl AttachmentService {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<Child> {
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
let path_str = self.path.to_string_lossy();
|
||||
|
||||
let mut args = vec!["-l", &self.listen, "-p", &path_str]
|
||||
@@ -254,6 +254,7 @@ impl AttachmentService {
|
||||
)
|
||||
.await;
|
||||
|
||||
// TODO: shouldn't we bail if we fail to spawn the process?
|
||||
for ps_conf in &self.env.pageservers {
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Child, Command};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{fs, io, thread};
|
||||
|
||||
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<Child>
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
F: Fn() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<bool>>,
|
||||
@@ -98,7 +98,7 @@ where
|
||||
InitialPidFile::Expect(path) => path,
|
||||
};
|
||||
|
||||
let mut spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
let spawned_process = filled_cmd.spawn().with_context(|| {
|
||||
format!("Could not spawn {process_name}, see console output and log files for details.")
|
||||
})?;
|
||||
let pid = spawned_process.id();
|
||||
@@ -106,12 +106,26 @@ where
|
||||
i32::try_from(pid)
|
||||
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
|
||||
);
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
|
||||
spawned_process.kill().context("SIGKILL child")?;
|
||||
spawned_process.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started, pid: {pid}");
|
||||
return Ok(spawned_process);
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
// leak the child process, it'll outlive this neon_local invocation
|
||||
drop(scopeguard::ScopeGuard::into_inner(spawned_process));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(false) => {
|
||||
if retries == NOTICE_AFTER_RETRIES {
|
||||
@@ -126,16 +140,15 @@ where
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} failed to start: {e:#}");
|
||||
if let Err(e) = spawned_process.kill() {
|
||||
println!("Could not stop {process_name} subprocess: {e:#}")
|
||||
};
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Child, Command};
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
@@ -161,7 +161,7 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false).await
|
||||
}
|
||||
|
||||
@@ -207,7 +207,7 @@ impl PageServerNode {
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
) -> anyhow::Result<Child> {
|
||||
) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -104,7 +103,7 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
|
||||
@@ -104,6 +104,7 @@ pub struct KeySpaceAccum {
|
||||
accum: Option<Range<Key>>,
|
||||
|
||||
ranges: Vec<Range<Key>>,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
impl KeySpaceAccum {
|
||||
@@ -111,6 +112,7 @@ impl KeySpaceAccum {
|
||||
Self {
|
||||
accum: None,
|
||||
ranges: Vec::new(),
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +123,8 @@ impl KeySpaceAccum {
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
self.size += key_range_size(&range) as u64;
|
||||
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
if range.start == accum.end {
|
||||
@@ -146,6 +150,23 @@ impl KeySpaceAccum {
|
||||
ranges: self.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
let mut prev_accum = KeySpaceAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
KeySpace {
|
||||
ranges: prev_accum.ranges,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -254,6 +275,30 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_consume() {
|
||||
let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
|
||||
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
|
||||
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
|
||||
assert_eq!(accum.size(), expected_size);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
||||
assert_eq!(accum.size(), 0);
|
||||
|
||||
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
||||
assert_eq!(accum.size(), 0);
|
||||
|
||||
for range in &ranges {
|
||||
accum.add_range(range.clone());
|
||||
}
|
||||
assert_ks_eq(&accum.to_keyspace(), ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keyspace_add_range() {
|
||||
// two separate ranges
|
||||
|
||||
@@ -111,7 +111,19 @@ impl RelTag {
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
Hash,
|
||||
Serialize,
|
||||
Deserialize,
|
||||
PartialEq,
|
||||
Eq,
|
||||
PartialOrd,
|
||||
Ord,
|
||||
strum_macros::EnumIter,
|
||||
)]
|
||||
pub enum SlruKind {
|
||||
Clog,
|
||||
MultiXactMembers,
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
@@ -23,6 +24,7 @@ use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
@@ -183,7 +185,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -371,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
copy_status = status;
|
||||
}
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// TODO use Azure point in time recovery feature for this
|
||||
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
|
||||
Err(anyhow::anyhow!(
|
||||
"time travel recovery for azure blob storage is not implemented"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -25,6 +25,7 @@ use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
@@ -142,7 +143,7 @@ pub struct Listing {
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Resets the content of everything with the given prefix to the given state
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
@@ -262,14 +272,15 @@ impl std::error::Error for DownloadError {}
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
pub enum GenericRemoteStorage {
|
||||
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
|
||||
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
AzureBlob(Arc<AzureBlobStorage>),
|
||||
Unreliable(Arc<UnreliableWrapper>),
|
||||
Unreliable(Other),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -386,6 +397,33 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -673,6 +711,7 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//! This storage used in tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
|
||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
@@ -14,7 +14,7 @@ use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
@@ -157,7 +157,6 @@ impl LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -423,6 +422,17 @@ impl RemoteStorage for LocalFs {
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::diverging_sub_expression)]
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
_timestamp: SystemTime,
|
||||
_done_if_after: SystemTime,
|
||||
_cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -6,12 +6,14 @@
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
time::SystemTime,
|
||||
};
|
||||
|
||||
use anyhow::Context as _;
|
||||
use anyhow::{anyhow, Context as _};
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
@@ -27,17 +29,19 @@ use aws_sdk_s3::{
|
||||
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::body::SdkBody;
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
@@ -270,6 +274,59 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_oids(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
delete_objects: &[ObjectIdentifier],
|
||||
) -> anyhow::Result<()> {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp?;
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
@@ -373,7 +430,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
async fn list(
|
||||
&self,
|
||||
@@ -569,64 +625,168 @@ impl RemoteStorage for S3Bucket {
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
self.delete_oids(kind, &delete_objects).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
|
||||
let list = backoff::retry(
|
||||
|| async {
|
||||
Ok(self
|
||||
.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.send()
|
||||
.await?)
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if list.is_truncated().unwrap_or_default() {
|
||||
anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
|
||||
}
|
||||
|
||||
let mut versions_deletes = list
|
||||
.versions()
|
||||
.iter()
|
||||
.map(VerOrDelete::Version)
|
||||
.chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in versions_deletes {
|
||||
let last_modified = vd.last_modified();
|
||||
let version_id = vd.version_id();
|
||||
let key = vd.key();
|
||||
let (Some(last_modified), Some(version_id), Some(key)) =
|
||||
(last_modified, version_id, key)
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"One (or more) of last_modified, key, and id is None. \
|
||||
Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
|
||||
last_modified, key, version_id,
|
||||
);
|
||||
};
|
||||
if version_id == "null" {
|
||||
anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values");
|
||||
}
|
||||
tracing::trace!(
|
||||
"Parsing version key={key} version_id={version_id} is_delete={}",
|
||||
matches!(vd, VerOrDelete::DeleteMarker(_))
|
||||
);
|
||||
|
||||
vds_for_key
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((vd, last_modified, version_id));
|
||||
}
|
||||
for (key, versions) in vds_for_key {
|
||||
let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
|
||||
if last_last_modified > &&done_if_after {
|
||||
tracing::trace!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| *tpl.1) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
if version_to_restore_to == versions.len() {
|
||||
tracing::trace!("Key {key} has no changes since timestamp, skipping");
|
||||
continue;
|
||||
}
|
||||
let mut do_delete = false;
|
||||
if version_to_restore_to == 0 {
|
||||
// All versions more recent, so the key didn't exist at the specified time point.
|
||||
tracing::trace!(
|
||||
"All {} versions more recent for {key}, deleting",
|
||||
versions.len()
|
||||
);
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
(VerOrDelete::Version(_), _last_modified, version_id) => {
|
||||
tracing::trace!("Copying old version {version_id} for {key}...");
|
||||
// Restore the state to the last version by copying
|
||||
let source_id =
|
||||
format!("{}/{key}?versionId={version_id}", self.bucket_name);
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
Ok(self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(key)
|
||||
.copy_source(&source_id)
|
||||
.send()
|
||||
.await?)
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
(VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::trace!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
tracing::trace!("Deleting {key}...");
|
||||
|
||||
let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
|
||||
self.delete_oids(kind, &[oid]).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
@@ -651,6 +811,32 @@ fn start_measuring_requests(
|
||||
})
|
||||
}
|
||||
|
||||
enum VerOrDelete<'a> {
|
||||
Version(&'a ObjectVersion),
|
||||
DeleteMarker(&'a DeleteMarkerEntry),
|
||||
}
|
||||
|
||||
impl<'a> VerOrDelete<'a> {
|
||||
fn last_modified(&self) -> Option<&'a DateTime> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.last_modified(),
|
||||
VerOrDelete::DeleteMarker(v) => v.last_modified(),
|
||||
}
|
||||
}
|
||||
fn version_id(&self) -> Option<&'a str> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.version_id(),
|
||||
VerOrDelete::DeleteMarker(v) => v.version_id(),
|
||||
}
|
||||
}
|
||||
fn key(&self) -> Option<&'a str> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.key(),
|
||||
VerOrDelete::DeleteMarker(v) => v.key(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
@@ -24,6 +25,7 @@ impl RequestKind {
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -31,7 +33,7 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 5]);
|
||||
pub(super) struct RequestTyped<C>([C; 6]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy].into_iter();
|
||||
let arr = std::array::from_fn::<C, 5, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -3,16 +3,19 @@
|
||||
//! testing purposes.
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::time::SystemTime;
|
||||
use std::{collections::hash_map::Entry, sync::Arc};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
inner: crate::GenericRemoteStorage,
|
||||
inner: GenericRemoteStorage<Arc<VoidStorage>>,
|
||||
|
||||
// This many attempts of each operation will fail, then we let it succeed.
|
||||
attempts_to_fail: u64,
|
||||
@@ -29,11 +32,21 @@ enum RemoteOp {
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
DeleteObjects(Vec<RemotePath>),
|
||||
TimeTravelRecover(Option<RemotePath>),
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
|
||||
assert!(attempts_to_fail > 0);
|
||||
let inner = match inner {
|
||||
GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
|
||||
GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
|
||||
GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
|
||||
// We could also make this a no-op, as in, extract the inner of the passed generic remote storage
|
||||
GenericRemoteStorage::Unreliable(_s) => {
|
||||
panic!("Can't wrap unreliable wrapper unreliably")
|
||||
}
|
||||
};
|
||||
UnreliableWrapper {
|
||||
inner,
|
||||
attempts_to_fail,
|
||||
@@ -84,7 +97,9 @@ impl UnreliableWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
@@ -169,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
timestamp: SystemTime,
|
||||
done_if_after: SystemTime,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
|
||||
self.inner
|
||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,19 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
use std::time::{Duration, UNIX_EPOCH};
|
||||
use std::{collections::HashSet, time::SystemTime};
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
mod common;
|
||||
@@ -23,6 +27,121 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorage::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
// Our test depends on discrepancies in the clock between S3 and the environment the tests
|
||||
// run in. Therefore, wait a little bit before and after. The alternative would be
|
||||
// to take the time from S3 response headers.
|
||||
const WAIT_TIME: Duration = Duration::from_millis(3_000);
|
||||
|
||||
async fn time_point() -> SystemTime {
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
let ret = SystemTime::now();
|
||||
tokio::time::sleep(WAIT_TIME).await;
|
||||
ret
|
||||
}
|
||||
|
||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
}
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None).await?;
|
||||
|
||||
let t0_files = list_files(&ctx.client).await?;
|
||||
let t0 = time_point().await;
|
||||
println!("at t0: {t0_files:?}");
|
||||
|
||||
let old_data = "remote blob data2";
|
||||
let (data, len) = upload_stream(old_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
|
||||
let t1_files = list_files(&ctx.client).await?;
|
||||
let t1 = time_point().await;
|
||||
println!("at t1: {t1_files:?}");
|
||||
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = ctx.client.download(&path2).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
if !(t0_hwt..=t1_hwt).contains(&last_modified) {
|
||||
panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
|
||||
This likely means a large lock discrepancy between S3 and the local clock.");
|
||||
}
|
||||
}
|
||||
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None).await?;
|
||||
|
||||
let new_data = "new remote blob data2";
|
||||
let (data, len) = upload_stream(new_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
|
||||
ctx.client.delete(&path1).await?;
|
||||
|
||||
let t2_files = list_files(&ctx.client).await?;
|
||||
let t2 = time_point().await;
|
||||
println!("at t2: {t2_files:?}");
|
||||
|
||||
// No changes after recovery to t2 (no-op)
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
assert_eq!(t2_files, t2_files_recovered);
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t2, new_data.as_bytes());
|
||||
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
assert_eq!(t1_files, t1_files_recovered);
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
assert_eq!(path2_recovered_t1, old_data.as_bytes());
|
||||
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
let t_final = time_point().await;
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, CancellationToken::new())
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
assert_eq!(t0_files, t0_files_recovered);
|
||||
|
||||
// cleanup
|
||||
ctx.client.delete_objects(&[path1, path2, path3]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
fs::{self, File},
|
||||
io,
|
||||
io::{self, Write},
|
||||
};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use tracing::info;
|
||||
|
||||
/// Similar to [`std::fs::create_dir`], except we fsync the
|
||||
/// created directory and its parent.
|
||||
@@ -81,6 +82,22 @@ pub fn path_with_suffix_extension(
|
||||
original_path.as_ref().with_extension(new_extension)
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub fn fsync_file_and_parent_log(file_path: &Utf8Path) -> io::Result<()> {
|
||||
let parent = file_path.parent().ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
format!("File {file_path:?} has no parent"),
|
||||
)
|
||||
})?;
|
||||
info!("fsync file");
|
||||
fsync(file_path)?;
|
||||
info!("fsync parent");
|
||||
fsync(parent)?;
|
||||
info!("done");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
|
||||
let parent = file_path.parent().ok_or_else(|| {
|
||||
io::Error::new(
|
||||
@@ -88,7 +105,6 @@ pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
|
||||
format!("File {file_path:?} has no parent"),
|
||||
)
|
||||
})?;
|
||||
|
||||
fsync(file_path)?;
|
||||
fsync(parent)?;
|
||||
Ok(())
|
||||
@@ -112,6 +128,48 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
|
||||
tokio::fs::File::open(path.as_ref()).await?.sync_all().await
|
||||
}
|
||||
|
||||
/// Writes a file to the specified `final_path` in a crash safe fasion
|
||||
///
|
||||
/// The file is first written to the specified tmp_path, and in a second
|
||||
/// step, the tmp path is renamed to the final path. As renames are
|
||||
/// atomic, a crash during the write operation will never leave behind a
|
||||
/// partially written file.
|
||||
///
|
||||
/// NB: an async variant of this code exists in Pageserver's VirtualFile.
|
||||
pub fn overwrite(
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
) -> std::io::Result<()> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
nix::errno::Errno::EINVAL as i32,
|
||||
));
|
||||
};
|
||||
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
|
||||
let mut file = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
// Use `create_new` so that, if we race with ourselves or something else,
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true)
|
||||
.open(tmp_path)?;
|
||||
file.write_all(content)?;
|
||||
file.sync_all()?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
std::fs::rename(tmp_path, final_path)?;
|
||||
// Only open final path parent dirfd now, so that this operation only
|
||||
// ever holds one VirtualFile fd at a time. That's important because
|
||||
// the current `find_victim_slot` impl might pick the same slot for both
|
||||
// VirtualFile., and it eventually does a blocking write lock instead of
|
||||
// try_lock.
|
||||
let final_parent_dirfd = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(final_path_parent)?;
|
||||
final_parent_dirfd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||
_ => error!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
|
||||
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
|
||||
_ => info!("Error processing HTTP request: {api_error:#}"),
|
||||
}
|
||||
|
||||
api_error.into_response()
|
||||
|
||||
@@ -16,6 +16,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use nix::{errno::Errno::EAGAIN, fcntl};
|
||||
use tracing::info;
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
@@ -41,14 +42,19 @@ impl Deref for LockFileGuard {
|
||||
|
||||
impl UnwrittenLockFile {
|
||||
/// Replace the content of this lock file with the byte representation of `contents`.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
|
||||
info!("truncate");
|
||||
self.file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")?;
|
||||
info!("write_all");
|
||||
self.file
|
||||
.write_all(contents.as_bytes())
|
||||
.with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
|
||||
crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
|
||||
info!("fsync file and parent");
|
||||
crashsafe::fsync_file_and_parent_log(&self.path).context("fsync lockfile")?;
|
||||
info!("done");
|
||||
Ok(LockFileGuard(self.file))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,6 +54,7 @@ use std::ops::Deref;
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use nix::unistd::Pid;
|
||||
use tracing::info;
|
||||
|
||||
use crate::lock_file::{self, LockFileRead};
|
||||
|
||||
@@ -85,12 +86,16 @@ impl Deref for PidFileGuard {
|
||||
/// The claim ends as soon as the returned guard object is dropped.
|
||||
/// To maintain the claim for the remaining lifetime of the current process,
|
||||
/// use [`std::mem::forget`] or similar.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
|
||||
info!("create_exclusive");
|
||||
let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
|
||||
// if any of the next steps fail, we drop the file descriptor and thereby release the lock
|
||||
info!("write_content");
|
||||
let guard = unwritten_lock_file
|
||||
.write_content(Pid::this().to_string())
|
||||
.context("write pid to lock file")?;
|
||||
info!("done");
|
||||
Ok(PidFileGuard(guard))
|
||||
}
|
||||
|
||||
|
||||
@@ -423,8 +423,8 @@ async fn client(
|
||||
tokio::select! {
|
||||
res = do_requests => { res },
|
||||
_ = cancel.cancelled() => {
|
||||
client.shutdown().await;
|
||||
return;
|
||||
// fallthrough to shutdown
|
||||
}
|
||||
}
|
||||
client.shutdown().await;
|
||||
}
|
||||
|
||||
@@ -11,8 +11,9 @@
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -133,6 +134,87 @@ where
|
||||
ctx: &'a RequestContext,
|
||||
}
|
||||
|
||||
/// A sink that accepts SLRU blocks ordered by key and forwards
|
||||
/// full segments to the archive.
|
||||
struct SlruSegmentsBuilder<'a, 'b, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
ar: &'a mut Builder<&'b mut W>,
|
||||
buf: Vec<u8>,
|
||||
current_segment: Option<(SlruKind, u32)>,
|
||||
}
|
||||
|
||||
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
|
||||
Self {
|
||||
ar,
|
||||
buf: Vec::new(),
|
||||
current_segment: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
|
||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||
|
||||
match kind {
|
||||
SlruKind::Clog => {
|
||||
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
|
||||
}
|
||||
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
|
||||
ensure!(block.len() == BLCKSZ as usize);
|
||||
}
|
||||
}
|
||||
|
||||
let segment = (kind, segno);
|
||||
match self.current_segment {
|
||||
None => {
|
||||
self.current_segment = Some(segment);
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
Some(current_seg) if current_seg == segment => {
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
Some(_) => {
|
||||
self.flush().await?;
|
||||
|
||||
self.current_segment = Some(segment);
|
||||
self.buf
|
||||
.extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> anyhow::Result<()> {
|
||||
let nblocks = self.buf.len() / BLCKSZ as usize;
|
||||
let (kind, segno) = self.current_segment.take().unwrap();
|
||||
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
|
||||
self.buf.clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn finish(mut self) -> anyhow::Result<()> {
|
||||
if self.current_segment.is_none() || self.buf.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.flush().await
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -168,20 +250,27 @@ where
|
||||
}
|
||||
|
||||
// Gather non-relational files from object storage pages.
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in self
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
|
||||
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self
|
||||
.timeline
|
||||
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
.get_vectored(&part.ranges, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
slru_builder.add_block(&key, block?).await?;
|
||||
}
|
||||
}
|
||||
|
||||
slru_builder.finish().await?;
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
@@ -305,39 +394,6 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
|
||||
} else {
|
||||
ensure!(img.len() == BLCKSZ as usize);
|
||||
}
|
||||
|
||||
slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
|
||||
}
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
|
||||
self.ar.append(&header, slru_buf.as_slice()).await?;
|
||||
|
||||
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Include database/tablespace directories.
|
||||
//
|
||||
|
||||
@@ -293,6 +293,7 @@ fn start_pageserver(
|
||||
// Create and lock PID file. This ensures that there cannot be more than one
|
||||
// pageserver process running at the same time.
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
info!("Claiming pid file at {lock_file_path:?}");
|
||||
let lock_file =
|
||||
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
@@ -877,6 +877,56 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Marks the initdb archive for preservation upon deletion of the timeline or tenant.
|
||||
This is meant to be part of the disaster recovery process.
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"404":
|
||||
description: No tenant or timeline found for the specified ids
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
|
||||
@@ -187,6 +187,7 @@ impl From<TenantSlotUpsertError> for ApiError {
|
||||
match e {
|
||||
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
|
||||
MapState(e) => e.into(),
|
||||
ShuttingDown(_) => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -495,6 +496,10 @@ async fn timeline_create_handler(
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
Err(_) if tenant.cancel.is_cancelled() => {
|
||||
// In case we get some ugly error type during shutdown, cast it into a clean 503.
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
|
||||
json_response(StatusCode::CONFLICT, ())
|
||||
}
|
||||
@@ -561,6 +566,43 @@ async fn timeline_list_handler(
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
|
||||
async fn timeline_preserve_initdb_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||
// then the initdb archive is deleted. This endpoint copies it to a different
|
||||
// location where timeline recreation cand find it.
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
timeline
|
||||
.preserve_initdb_archive()
|
||||
.await
|
||||
.context("preserving initdb archive")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_preserve_initdb_archive",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
%timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1220,19 +1262,9 @@ async fn tenant_create_handler(
|
||||
};
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = new_tenant
|
||||
new_tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
{
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in upsert_location, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
// calls will fail.
|
||||
res.context("created tenant failed to become active")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
.await?;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
@@ -1943,6 +1975,10 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
api_handler(r, tenant_ignore_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_detail_handler)
|
||||
})
|
||||
|
||||
@@ -322,8 +322,8 @@ enum PageStreamError {
|
||||
Shutdown,
|
||||
|
||||
/// Something went wrong reading a page: this likely indicates a pageserver bug
|
||||
#[error("Read error: {0}")]
|
||||
Read(PageReconstructError),
|
||||
#[error("Read error")]
|
||||
Read(#[source] PageReconstructError),
|
||||
|
||||
/// Ran out of time waiting for an LSN
|
||||
#[error("LSN timeout: {0}")]
|
||||
@@ -332,11 +332,11 @@ enum PageStreamError {
|
||||
/// The entity required to serve the request (tenant or timeline) is not found,
|
||||
/// or is not found in a suitable state to serve a request.
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
NotFound(Cow<'static, str>),
|
||||
|
||||
/// Request asked for something that doesn't make sense, like an invalid LSN
|
||||
#[error("Bad request: {0}")]
|
||||
BadRequest(std::borrow::Cow<'static, str>),
|
||||
BadRequest(Cow<'static, str>),
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for PageStreamError {
|
||||
@@ -667,7 +667,10 @@ impl PageServerHandler {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
let full = utils::error::report_compact_sources(&e);
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
|
||||
@@ -27,6 +27,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
@@ -533,6 +534,33 @@ impl Timeline {
|
||||
Ok(Default::default())
|
||||
}
|
||||
|
||||
pub(crate) async fn get_slru_keyspace(
|
||||
&self,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<KeySpace, PageReconstructError> {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
|
||||
for kind in SlruKind::iter() {
|
||||
let mut segments: Vec<u32> = self
|
||||
.list_slru_segments(kind, version, ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
segments.sort_unstable();
|
||||
|
||||
for seg in segments {
|
||||
let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
|
||||
|
||||
accum.add_range(
|
||||
slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(accum.to_keyspace())
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub(crate) async fn list_slru_segments(
|
||||
&self,
|
||||
|
||||
@@ -91,7 +91,6 @@ use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::ops::Bound::Included;
|
||||
use std::process::Stdio;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -628,9 +627,15 @@ impl Tenant {
|
||||
deletion_queue_client,
|
||||
));
|
||||
|
||||
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
||||
// we shut down while attaching.
|
||||
let Ok(attach_gate_guard) = tenant.gate.enter() else {
|
||||
// We just created the Tenant: nothing else can have shut it down yet
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
// Do all the hard work in the background
|
||||
let tenant_clone = Arc::clone(&tenant);
|
||||
|
||||
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
@@ -640,6 +645,8 @@ impl Tenant {
|
||||
"attach tenant",
|
||||
false,
|
||||
async move {
|
||||
let _gate_guard = attach_gate_guard;
|
||||
|
||||
// Is this tenant being spawned as part of process startup?
|
||||
let starting_up = init_order.is_some();
|
||||
scopeguard::defer! {
|
||||
@@ -814,7 +821,7 @@ impl Tenant {
|
||||
SpawnMode::Create => None,
|
||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||
};
|
||||
match tenant_clone.attach(preload, &ctx).await {
|
||||
match tenant_clone.attach(preload, mode, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
if let Some(t)= attach_timer {t.observe_duration();}
|
||||
@@ -901,15 +908,20 @@ impl Tenant {
|
||||
async fn attach(
|
||||
self: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match preload {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
let preload = match (preload, mode) {
|
||||
(Some(p), _) => p,
|
||||
(None, SpawnMode::Create) => TenantPreload {
|
||||
deleting: false,
|
||||
timelines: HashMap::new(),
|
||||
},
|
||||
(None, SpawnMode::Normal) => {
|
||||
// Deprecated dev mode: load from local disk state instead of remote storage
|
||||
// https://github.com/neondatabase/neon/issues/5624
|
||||
return self.load_local(ctx).await;
|
||||
@@ -1017,7 +1029,10 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
|
||||
fail::fail_point!("attach-before-activate", |_| {
|
||||
anyhow::bail!("attach-before-activate");
|
||||
});
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
|
||||
|
||||
info!("Done");
|
||||
|
||||
@@ -1681,9 +1696,13 @@ impl Tenant {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
if !self.is_active() {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
if matches!(self.current_state(), TenantState::Stopping { .. }) {
|
||||
return Err(CreateTimelineError::ShuttingDown);
|
||||
} else {
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
"Cannot create timelines on inactive tenant"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let _gate = self
|
||||
@@ -3759,27 +3778,25 @@ async fn run_initdb(
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
// If the `select!` below doesn't finish the `wait_with_output`,
|
||||
// let the task get `wait()`ed for asynchronously by tokio.
|
||||
// This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
|
||||
// TODO: fix for this is non-trivial, see
|
||||
// https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
|
||||
//
|
||||
.kill_on_drop(true)
|
||||
.spawn()?;
|
||||
|
||||
tokio::select! {
|
||||
initdb_output = initdb_command.wait_with_output() => {
|
||||
let initdb_output = initdb_output?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
initdb_output.stderr,
|
||||
));
|
||||
}
|
||||
|
||||
// This isn't true cancellation support, see above. Still return an error to
|
||||
// excercise the cancellation code path.
|
||||
if cancel.is_cancelled() {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4035,7 +4052,7 @@ pub(crate) mod harness {
|
||||
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
tenant
|
||||
.attach(Some(preload), ctx)
|
||||
.attach(Some(preload), SpawnMode::Normal, ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -409,7 +409,10 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant.attach(preload, ctx).await.context("attach")?;
|
||||
tenant
|
||||
.attach(preload, super::SpawnMode::Normal, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
|
||||
@@ -283,15 +283,15 @@ impl LayerMap {
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
|
||||
pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
|
||||
if key.is_empty() {
|
||||
// Vacuously true. There's a newer image for all 0 of the kerys in the range.
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(false),
|
||||
None => return false,
|
||||
};
|
||||
|
||||
let start = key.start.to_i128();
|
||||
@@ -304,17 +304,17 @@ impl LayerMap {
|
||||
|
||||
// Check the start is covered
|
||||
if !layer_covers(version.image_coverage.query(start)) {
|
||||
return Ok(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check after all changes of coverage
|
||||
for (_, change_val) in version.image_coverage.range(start..end) {
|
||||
if !layer_covers(change_val) {
|
||||
return Ok(false);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
true
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
@@ -325,18 +325,14 @@ impl LayerMap {
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range at the specified lsn (inclusive).
|
||||
/// This is used when creating new image layers.
|
||||
///
|
||||
// FIXME: clippy complains that the result type is very complex. She's probably
|
||||
// right...
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn image_coverage(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
|
||||
) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.0) {
|
||||
Some(v) => v,
|
||||
None => return Ok(vec![]),
|
||||
None => return vec![],
|
||||
};
|
||||
|
||||
let start = key_range.start.to_i128();
|
||||
@@ -359,7 +355,7 @@ impl LayerMap {
|
||||
let kr = Key::from_i128(current_key)..Key::from_i128(end);
|
||||
coverage.push((kr, current_val.take()));
|
||||
|
||||
Ok(coverage)
|
||||
coverage
|
||||
}
|
||||
|
||||
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
|
||||
@@ -410,24 +406,19 @@ impl LayerMap {
|
||||
/// This number is used to compute the largest number of deltas that
|
||||
/// we'll need to visit for any page reconstruction in this region.
|
||||
/// We use this heuristic to decide whether to create an image layer.
|
||||
pub fn count_deltas(
|
||||
&self,
|
||||
key: &Range<Key>,
|
||||
lsn: &Range<Lsn>,
|
||||
limit: Option<usize>,
|
||||
) -> Result<usize> {
|
||||
pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
|
||||
// We get the delta coverage of the region, and for each part of the coverage
|
||||
// we recurse right underneath the delta. The recursion depth is limited by
|
||||
// the largest result this function could return, which is in practice between
|
||||
// 3 and 10 (since we usually try to create an image when the number gets larger).
|
||||
|
||||
if lsn.is_empty() || key.is_empty() || limit == Some(0) {
|
||||
return Ok(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(0),
|
||||
None => return 0,
|
||||
};
|
||||
|
||||
let start = key.start.to_i128();
|
||||
@@ -448,8 +439,7 @@ impl LayerMap {
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath =
|
||||
self.count_deltas(&kr, &lr, new_limit)?;
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
max_stacked_deltas,
|
||||
base_count + max_stacked_deltas_underneath,
|
||||
@@ -471,7 +461,7 @@ impl LayerMap {
|
||||
if !kr.is_empty() {
|
||||
let base_count = Self::is_reimage_worthy(&val, key) as usize;
|
||||
let new_limit = limit.map(|l| l - base_count);
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
|
||||
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
|
||||
max_stacked_deltas = std::cmp::max(
|
||||
max_stacked_deltas,
|
||||
base_count + max_stacked_deltas_underneath,
|
||||
@@ -480,7 +470,7 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(max_stacked_deltas)
|
||||
max_stacked_deltas
|
||||
}
|
||||
|
||||
/// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
|
||||
@@ -592,10 +582,7 @@ impl LayerMap {
|
||||
if limit == Some(difficulty) {
|
||||
break;
|
||||
}
|
||||
for (img_range, last_img) in self
|
||||
.image_coverage(range, lsn)
|
||||
.expect("why would this err?")
|
||||
{
|
||||
for (img_range, last_img) in self.image_coverage(range, lsn) {
|
||||
if limit == Some(difficulty) {
|
||||
break;
|
||||
}
|
||||
@@ -606,9 +593,7 @@ impl LayerMap {
|
||||
};
|
||||
|
||||
if img_lsn < lsn {
|
||||
let num_deltas = self
|
||||
.count_deltas(&img_range, &(img_lsn..lsn), limit)
|
||||
.expect("why would this err lol?");
|
||||
let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
|
||||
difficulty = std::cmp::max(difficulty, num_deltas);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
TenantConfOpt,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||
if let LocationMode::Attached(attached) = &location_conf.mode {
|
||||
if attached.generation > *gen {
|
||||
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
attached.generation
|
||||
);
|
||||
|
||||
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
|
||||
// local disk content: demote to secondary rather than detaching.
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
tokio::select! {
|
||||
Some(joined) = join_set.join_next() => {
|
||||
match joined {
|
||||
Ok(()) => {}
|
||||
Ok(()) => {},
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the tasks");
|
||||
}
|
||||
@@ -882,7 +904,7 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_location_config: LocationConf,
|
||||
flush: Option<Duration>,
|
||||
spawn_mode: SpawnMode,
|
||||
mut spawn_mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
@@ -902,19 +924,29 @@ impl TenantManager {
|
||||
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
|
||||
match (&new_location_config.mode, peek_slot) {
|
||||
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
|
||||
if attach_conf.generation == tenant.generation {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
match attach_conf.generation.cmp(&tenant.generation) {
|
||||
Ordering::Equal => {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
} else {
|
||||
// Different generations, fall through to general case
|
||||
None
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
}
|
||||
Ordering::Less => {
|
||||
return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
|
||||
"Generation {:?} is less than existing {:?}",
|
||||
attach_conf.generation,
|
||||
tenant.generation
|
||||
)));
|
||||
}
|
||||
Ordering::Greater => {
|
||||
// Generation advanced, fall through to general case of replacing `Tenant` object
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
(
|
||||
@@ -1019,6 +1051,12 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
slot_guard.drop_old_value().expect("We just shut it down");
|
||||
|
||||
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
|
||||
// the caller thinks they're creating but the tenant already existed. We must switch to
|
||||
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
|
||||
// rather than assuming it to be empty.
|
||||
spawn_mode = SpawnMode::Normal;
|
||||
}
|
||||
Some(TenantSlot::Secondary(state)) => {
|
||||
info!("Shutting down secondary tenant");
|
||||
@@ -1102,14 +1140,46 @@ impl TenantManager {
|
||||
None
|
||||
};
|
||||
|
||||
slot_guard.upsert(new_slot).map_err(|e| match e {
|
||||
TenantSlotUpsertError::InternalError(e) => {
|
||||
UpsertLocationError::Other(anyhow::anyhow!(e))
|
||||
match slot_guard.upsert(new_slot) {
|
||||
Err(TenantSlotUpsertError::InternalError(e)) => {
|
||||
Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
|
||||
}
|
||||
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
|
||||
})?;
|
||||
Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
|
||||
Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
|
||||
// If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
|
||||
// we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
|
||||
// are shutdown.
|
||||
//
|
||||
// We must shut it down inline here.
|
||||
match new_slot {
|
||||
TenantSlot::InProgress(_) => {
|
||||
// Unreachable because we never insert an InProgress
|
||||
unreachable!()
|
||||
}
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {
|
||||
info!("Finished shutting down just-spawned tenant");
|
||||
}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
TenantSlot::Secondary(secondary_tenant) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(attached_tenant)
|
||||
Err(UpsertLocationError::Unavailable(
|
||||
TenantMapError::ShuttingDown,
|
||||
))
|
||||
}
|
||||
Ok(()) => Ok(attached_tenant),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
|
||||
@@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError {
|
||||
|
||||
/// Superset of TenantMapError: issues that can occur when using a SlotGuard
|
||||
/// to insert a new value.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum TenantSlotUpsertError {
|
||||
#[derive(thiserror::Error)]
|
||||
pub(crate) enum TenantSlotUpsertError {
|
||||
/// An error where the slot is in an unexpected state, indicating a code bug
|
||||
#[error("Internal error updating Tenant")]
|
||||
InternalError(Cow<'static, str>),
|
||||
|
||||
#[error(transparent)]
|
||||
MapState(#[from] TenantMapError),
|
||||
MapState(TenantMapError),
|
||||
|
||||
// If we encounter TenantManager shutdown during upsert, we must carry the Completion
|
||||
// from the SlotGuard, so that the caller can hold it while they clean up: otherwise
|
||||
// TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
|
||||
// was protected by the SlotGuard.
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown((TenantSlot, utils::completion::Completion)),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TenantSlotUpsertError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
|
||||
Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
|
||||
Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1784,7 +1871,7 @@ pub struct SlotGuard {
|
||||
|
||||
/// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
|
||||
/// release any waiters as soon as this SlotGuard is dropped.
|
||||
_completion: utils::completion::Completion,
|
||||
completion: utils::completion::Completion,
|
||||
}
|
||||
|
||||
impl SlotGuard {
|
||||
@@ -1797,7 +1884,7 @@ impl SlotGuard {
|
||||
tenant_shard_id,
|
||||
old_value,
|
||||
upserted: false,
|
||||
_completion: completion,
|
||||
completion,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1830,9 +1917,16 @@ impl SlotGuard {
|
||||
}
|
||||
|
||||
let m = match &mut *locked {
|
||||
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
|
||||
TenantsMap::Initializing => {
|
||||
return Err(TenantSlotUpsertError::MapState(
|
||||
TenantMapError::StillInitializing,
|
||||
))
|
||||
}
|
||||
TenantsMap::ShuttingDown(_) => {
|
||||
return Err(TenantMapError::ShuttingDown.into());
|
||||
return Err(TenantSlotUpsertError::ShuttingDown((
|
||||
new_value,
|
||||
self.completion.clone(),
|
||||
)));
|
||||
}
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
@@ -1880,7 +1974,9 @@ impl SlotGuard {
|
||||
Err(TenantSlotUpsertError::InternalError(_)) => {
|
||||
// We already logged the error, nothing else we can do.
|
||||
}
|
||||
Err(TenantSlotUpsertError::MapState(_)) => {
|
||||
Err(
|
||||
TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
|
||||
) => {
|
||||
// If the map is shutting down, we need not replace anything
|
||||
}
|
||||
Ok(()) => {}
|
||||
@@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
mode: TenantSlotPeekMode,
|
||||
) -> Result<Option<&'a TenantSlot>, TenantMapError> {
|
||||
let m = match tenants.deref() {
|
||||
TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
|
||||
match tenants.deref() {
|
||||
TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
|
||||
TenantsMap::ShuttingDown(m) => match mode {
|
||||
TenantSlotPeekMode::Read => m,
|
||||
TenantSlotPeekMode::Write => {
|
||||
return Err(TenantMapError::ShuttingDown);
|
||||
}
|
||||
TenantSlotPeekMode::Read => Ok(Some(
|
||||
// When reading in ShuttingDown state, we must translate None results
|
||||
// into a ShuttingDown error, because absence of a tenant shard ID in the map
|
||||
// isn't a reliable indicator of the tenant being gone: it might have been
|
||||
// InProgress when shutdown started, and cleaned up from that state such
|
||||
// that it's now no longer in the map. Callers will have to wait until
|
||||
// we next start up to get a proper answer. This avoids incorrect 404 API responses.
|
||||
m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
|
||||
)),
|
||||
TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
|
||||
},
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
|
||||
Ok(m.get(tenant_shard_id))
|
||||
TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
|
||||
}
|
||||
}
|
||||
|
||||
enum TenantSlotAcquireMode {
|
||||
|
||||
@@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
|
||||
|
||||
pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
||||
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
@@ -1066,6 +1068,28 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
self: &Arc<Self>,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"preserve_initdb_tar_zst",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("backing up initdb archive")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
|
||||
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
|
||||
/// deletes leaked files if any and proceeds with deletion of index file at the end.
|
||||
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient {
|
||||
let layer_deletion_count = layers.len();
|
||||
self.deletion_queue_client.push_immediate(layers).await?;
|
||||
|
||||
// Delete the initdb.tar.zst, which is not always present, but deletion attempts of
|
||||
// inexistant objects are not considered errors.
|
||||
let initdb_path =
|
||||
remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
|
||||
self.deletion_queue_client
|
||||
.push_immediate(vec![initdb_path])
|
||||
.await?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
||||
@@ -1148,10 +1180,8 @@ impl RemoteTimelineClient {
|
||||
if p == &latest_index {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = p.object_name() {
|
||||
if name == INITDB_PATH {
|
||||
return false;
|
||||
}
|
||||
if p.object_name() == Some(INITDB_PRESERVED_PATH) {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
@@ -1724,6 +1754,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_initdb_preserved_archive_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> RemotePath {
|
||||
RemotePath::from_string(&format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
|
||||
))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_index_path(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
|
||||
@@ -32,7 +32,8 @@ use utils::id::TimelineId;
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
|
||||
INITDB_PATH,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
|
||||
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let remote_preserved_path =
|
||||
remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let timeline_path = conf.timelines_path(tenant_shard_id);
|
||||
|
||||
if !timeline_path.exists() {
|
||||
@@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download =
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
|
||||
let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
|
||||
.await
|
||||
{
|
||||
Ok(dl) => dl,
|
||||
Err(DownloadError::NotFound) => {
|
||||
download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
|
||||
.await?
|
||||
}
|
||||
Err(other) => Err(other)?,
|
||||
};
|
||||
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
|
||||
|
||||
|
||||
@@ -13,8 +13,8 @@ use super::Generation;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
|
||||
upload_cancellable,
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
|
||||
},
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -144,3 +144,16 @@ pub(crate) async fn upload_initdb_dir(
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
|
||||
.await
|
||||
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
log_compaction_error(
|
||||
&e,
|
||||
error_run_count,
|
||||
&wait_duration,
|
||||
cancel.is_cancelled(),
|
||||
);
|
||||
wait_duration
|
||||
} else {
|
||||
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
}
|
||||
|
||||
fn log_compaction_error(
|
||||
e: &CompactionError,
|
||||
error_run_count: u32,
|
||||
sleep_duration: &std::time::Duration,
|
||||
task_cancelled: bool,
|
||||
) {
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use CompactionError::*;
|
||||
|
||||
enum LooksLike {
|
||||
Info,
|
||||
Error,
|
||||
}
|
||||
|
||||
let decision = match e {
|
||||
ShuttingDown => None,
|
||||
_ if task_cancelled => Some(LooksLike::Info),
|
||||
Other(e) => {
|
||||
let root_cause = e.root_cause();
|
||||
|
||||
let is_stopping = {
|
||||
let upload_queue = root_cause
|
||||
.downcast_ref::<NotInitialized>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
let timeline = root_cause
|
||||
.downcast_ref::<PageReconstructError>()
|
||||
.is_some_and(|e| e.is_stopping());
|
||||
|
||||
upload_queue || timeline
|
||||
};
|
||||
|
||||
if is_stopping {
|
||||
Some(LooksLike::Info)
|
||||
} else {
|
||||
Some(LooksLike::Error)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match decision {
|
||||
Some(LooksLike::Info) => info!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
|
||||
),
|
||||
Some(LooksLike::Error) => error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
|
||||
),
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
|
||||
@@ -14,6 +14,7 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
keyspace::{key_range_size, KeySpaceAccum},
|
||||
models::{
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
LayerMapInfo, TimelineState,
|
||||
@@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
@@ -391,8 +392,7 @@ pub(crate) enum PageReconstructError {
|
||||
#[error("Ancestor LSN wait error: {0}")]
|
||||
AncestorLsnTimeout(#[from] WaitLsnError),
|
||||
|
||||
/// The operation was cancelled
|
||||
#[error("Cancelled")]
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
/// The ancestor of this is being stopped
|
||||
@@ -404,6 +404,34 @@ pub(crate) enum PageReconstructError {
|
||||
WalRedo(anyhow::Error),
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use PageReconstructError::*;
|
||||
match self {
|
||||
Other(_) => false,
|
||||
AncestorLsnTimeout(_) => false,
|
||||
Cancelled | AncestorStopping(_) => true,
|
||||
WalRedo(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum CreateImageLayersError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
GetVectoredError(GetVectoredError),
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(PageReconstructError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum FlushLayerError {
|
||||
/// Timeline cancellation token was cancelled
|
||||
@@ -411,12 +439,24 @@ enum FlushLayerError {
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
PageReconstructError(#[from] PageReconstructError),
|
||||
CreateImageLayersError(CreateImageLayersError),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetVectoredError {
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
|
||||
#[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
|
||||
Oversized(u64),
|
||||
|
||||
#[error("Requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum LogicalSizeCalculationCause {
|
||||
Initial,
|
||||
@@ -456,6 +496,45 @@ pub(crate) enum WaitLsnError {
|
||||
Timeout(String),
|
||||
}
|
||||
|
||||
// The impls below achieve cancellation mapping for errors.
|
||||
// Perhaps there's a way of achieving this with less cruft.
|
||||
|
||||
impl From<CreateImageLayersError> for CompactionError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
|
||||
_ => CompactionError::Other(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CreateImageLayersError> for FlushLayerError {
|
||||
fn from(e: CreateImageLayersError) -> Self {
|
||||
match e {
|
||||
CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
|
||||
any => FlushLayerError::CreateImageLayersError(any),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CreateImageLayersError {
|
||||
fn from(e: PageReconstructError) -> Self {
|
||||
match e {
|
||||
PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
|
||||
_ => CreateImageLayersError::PageReconstructError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetVectoredError> for CreateImageLayersError {
|
||||
fn from(e: GetVectoredError) -> Self {
|
||||
match e {
|
||||
GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
|
||||
_ => CreateImageLayersError::GetVectoredError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -575,6 +654,53 @@ impl Timeline {
|
||||
res
|
||||
}
|
||||
|
||||
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
|
||||
|
||||
/// Look up multiple page versions at a given LSN
|
||||
///
|
||||
/// This naive implementation will be replaced with a more efficient one
|
||||
/// which actually vectorizes the read path.
|
||||
pub(crate) async fn get_vectored(
|
||||
&self,
|
||||
key_ranges: &[Range<Key>],
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
|
||||
let key_count = key_ranges
|
||||
.iter()
|
||||
.map(|range| key_range_size(range) as u64)
|
||||
.sum();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
|
||||
let mut values = BTreeMap::new();
|
||||
for range in key_ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
let block = self.get(key, lsn, ctx).await;
|
||||
|
||||
if matches!(
|
||||
block,
|
||||
Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
) {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
values.insert(key, block);
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(values)
|
||||
}
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
pub fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
@@ -2582,7 +2708,7 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
err @ Err(
|
||||
FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
|
||||
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
|
||||
) => {
|
||||
error!("could not flush frozen layer: {err:?}");
|
||||
break err;
|
||||
@@ -2859,6 +2985,21 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client
|
||||
.preserve_initdb_archive(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&self.timeline_id,
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
|
||||
// in layer map immediately. The caller is responsible to put it into the layer map.
|
||||
async fn create_delta_layer(
|
||||
@@ -2950,11 +3091,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
async fn time_for_new_image_layer(
|
||||
&self,
|
||||
partition: &KeySpace,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<bool> {
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
@@ -2974,20 +3111,20 @@ impl Timeline {
|
||||
// but the range is already covered by image layers at more recent LSNs. Before we
|
||||
// create a new image layer, check if the range is already covered at more recent LSNs.
|
||||
if !layers
|
||||
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
|
||||
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
|
||||
{
|
||||
debug!(
|
||||
"Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
|
||||
img_range.start, img_range.end, cutoff_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for part_range in &partition.ranges {
|
||||
let image_coverage = layers.image_coverage(part_range, lsn)?;
|
||||
let image_coverage = layers.image_coverage(part_range, lsn);
|
||||
for (img_range, last_img) in image_coverage {
|
||||
let img_lsn = if let Some(last_img) = last_img {
|
||||
last_img.get_lsn_range().end
|
||||
@@ -3008,7 +3145,7 @@ impl Timeline {
|
||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||
if img_lsn < lsn {
|
||||
let num_deltas =
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));
|
||||
|
||||
max_deltas = max_deltas.max(num_deltas);
|
||||
if num_deltas >= threshold {
|
||||
@@ -3016,7 +3153,7 @@ impl Timeline {
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3026,7 +3163,7 @@ impl Timeline {
|
||||
max_deltas,
|
||||
"none of the partitioned ranges had >= {threshold} deltas"
|
||||
);
|
||||
Ok(false)
|
||||
false
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%lsn, %force))]
|
||||
@@ -3036,7 +3173,7 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<ResidentLayer>, PageReconstructError> {
|
||||
) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
let mut image_layers = Vec::new();
|
||||
|
||||
@@ -3054,7 +3191,7 @@ impl Timeline {
|
||||
for partition in partitioning.parts.iter() {
|
||||
let img_range = start..partition.ranges.last().unwrap().end;
|
||||
start = img_range.end;
|
||||
if force || self.time_for_new_image_layer(partition, lsn).await? {
|
||||
if force || self.time_for_new_image_layer(partition, lsn).await {
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -3065,10 +3202,12 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
let mut key_request_accum = KeySpaceAccum::new();
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
@@ -3081,34 +3220,55 @@ impl Timeline {
|
||||
key = key.next();
|
||||
continue;
|
||||
}
|
||||
let img = match self.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
|
||||
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
image_layer_writer.put_image(key, &img).await?;
|
||||
key_request_accum.add_key(key);
|
||||
if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| key.next() == range.end
|
||||
{
|
||||
let results = self
|
||||
.get_vectored(
|
||||
&key_request_accum.consume_keyspace().ranges,
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for (img_key, img) in results {
|
||||
let img = match img {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but the Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(img_key)
|
||||
|| is_rel_vm_block_key(img_key)
|
||||
{
|
||||
warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(
|
||||
CreateImageLayersError::PageReconstructError(err),
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
image_layer_writer.put_image(img_key, &img).await?;
|
||||
}
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
@@ -3484,7 +3644,7 @@ impl Timeline {
|
||||
// has not so much sense, because largest holes will corresponds field1/field2 changes.
|
||||
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
|
||||
// That is why it is better to measure size of hole as number of covering image layers.
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
|
||||
let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
|
||||
if coverage_size >= min_hole_coverage_size {
|
||||
heap.push(Hole {
|
||||
key_range,
|
||||
@@ -4110,7 +4270,7 @@ impl Timeline {
|
||||
// we cannot remove C, even though it's older than 2500, because
|
||||
// the delta layer 2000-3000 depends on it.
|
||||
if !layers
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
|
||||
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
|
||||
{
|
||||
debug!("keeping {} because it is the latest layer", l.filename());
|
||||
// Collect delta key ranges that need image layers to allow garbage
|
||||
@@ -4240,7 +4400,7 @@ impl Timeline {
|
||||
.walredo_mgr
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.await
|
||||
.context("Failed to reconstruct a page image:")
|
||||
.context("reconstruct a page image")
|
||||
{
|
||||
Ok(img) => img,
|
||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
||||
|
||||
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
|
||||
pub(super) deleted_at: SetDeletedFlagProgress,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum NotInitialized {
|
||||
#[error("queue is in state Uninitialized")]
|
||||
Uninitialized,
|
||||
#[error("queue is in state Stopping")]
|
||||
Stopped,
|
||||
#[error("queue is shutting down")]
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
impl NotInitialized {
|
||||
pub(crate) fn is_stopping(&self) -> bool {
|
||||
use NotInitialized::*;
|
||||
match self {
|
||||
Uninitialized => false,
|
||||
Stopped => true,
|
||||
ShuttingDown => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UploadQueue {
|
||||
pub(crate) fn initialize_empty_remote(
|
||||
&mut self,
|
||||
@@ -214,17 +235,17 @@ impl UploadQueue {
|
||||
}
|
||||
|
||||
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
use UploadQueue::*;
|
||||
match self {
|
||||
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(x) => {
|
||||
if !x.shutting_down {
|
||||
Ok(x)
|
||||
Uninitialized => Err(NotInitialized::Uninitialized.into()),
|
||||
Initialized(x) => {
|
||||
if x.shutting_down {
|
||||
Err(NotInitialized::ShuttingDown.into())
|
||||
} else {
|
||||
anyhow::bail!("queue is shutting down")
|
||||
Ok(x)
|
||||
}
|
||||
}
|
||||
Stopped(_) => Err(NotInitialized::Stopped.into()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -356,12 +356,7 @@ impl VirtualFile {
|
||||
Ok(vfile)
|
||||
}
|
||||
|
||||
/// Writes a file to the specified `final_path` in a crash safe fasion
|
||||
///
|
||||
/// The file is first written to the specified tmp_path, and in a second
|
||||
/// step, the tmp path is renamed to the final path. As renames are
|
||||
/// atomic, a crash during the write operation will never leave behind a
|
||||
/// partially written file.
|
||||
/// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
|
||||
pub async fn crashsafe_overwrite(
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "fmgr.h"
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/libpq.h"
|
||||
@@ -38,17 +39,6 @@
|
||||
#define MIN_RECONNECT_INTERVAL_USEC 1000
|
||||
#define MAX_RECONNECT_INTERVAL_USEC 1000000
|
||||
|
||||
bool connected = false;
|
||||
PGconn *pageserver_conn = NULL;
|
||||
|
||||
/*
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on pageserver_conn,
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *pageserver_conn_wes = NULL;
|
||||
|
||||
/* GUCs */
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
@@ -59,17 +49,25 @@ char *neon_auth_token;
|
||||
int readahead_buffer_size = 128;
|
||||
int flush_every_n_requests = 8;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
|
||||
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
|
||||
typedef struct
|
||||
{
|
||||
char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
size_t num_shards;
|
||||
} ShardMap;
|
||||
|
||||
/*
|
||||
* PagestoreShmemState is kept in shared memory. It contains the connection
|
||||
* strings for each shard.
|
||||
*
|
||||
* The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
|
||||
* allowing it to be changed using pg_reload_conf(). The control plane can
|
||||
* update the connection string if the pageserver crashes, is relocated, or
|
||||
* new shards are added. A copy of the current value of the GUC is kept in
|
||||
* shared memory, updated by the postmaster, because regular backends don't
|
||||
* new shards are added. A parsed copy of the current value of the GUC is kept
|
||||
* in shared memory, updated by the postmaster, because regular backends don't
|
||||
* reload the config during query execution, but we might need to re-establish
|
||||
* the pageserver connection with the new connection string even in the middle
|
||||
* of a query.
|
||||
@@ -84,7 +82,7 @@ typedef struct
|
||||
{
|
||||
pg_atomic_uint64 begin_update_counter;
|
||||
pg_atomic_uint64 end_update_counter;
|
||||
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
ShardMap shard_map;
|
||||
} PagestoreShmemState;
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
@@ -94,10 +92,25 @@ static void walproposer_shmem_request(void);
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
static PagestoreShmemState *pagestore_shared;
|
||||
static uint64 pagestore_local_counter = 0;
|
||||
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
|
||||
static bool pageserver_flush(void);
|
||||
static void pageserver_disconnect(void);
|
||||
/* This backend's per-shard connections */
|
||||
typedef struct
|
||||
{
|
||||
PGconn *conn;
|
||||
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
* - WL_LATCH_SET on MyLatch, and
|
||||
* - WL_EXIT_ON_PM_DEATH.
|
||||
*/
|
||||
WaitEventSet *wes;
|
||||
} PageServer;
|
||||
|
||||
static PageServer page_servers[MAX_SHARDS];
|
||||
|
||||
static bool pageserver_flush(shardno_t shard_no);
|
||||
static void pageserver_disconnect(shardno_t shard_no);
|
||||
|
||||
static bool
|
||||
PagestoreShmemIsValid(void)
|
||||
@@ -105,89 +118,213 @@ PagestoreShmemIsValid(void)
|
||||
return pagestore_shared && UsedShmemSegAddr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse a comma-separated list of connection strings into a ShardMap.
|
||||
*
|
||||
* If 'result' is NULL, just checks that the input is valid. If the input is
|
||||
* not valid, returns false. The contents of *result are undefined in
|
||||
* that case, and must not be relied on.
|
||||
*/
|
||||
static bool
|
||||
ParseShardMap(const char *connstr, ShardMap *result)
|
||||
{
|
||||
const char *p;
|
||||
int nshards = 0;
|
||||
|
||||
if (result)
|
||||
memset(result, 0, sizeof(ShardMap));
|
||||
|
||||
p = connstr;
|
||||
nshards = 0;
|
||||
for (;;)
|
||||
{
|
||||
const char *sep;
|
||||
size_t connstr_len;
|
||||
|
||||
sep = strchr(p, ',');
|
||||
connstr_len = sep != NULL ? sep - p : strlen(p);
|
||||
|
||||
if (connstr_len == 0 && sep == NULL)
|
||||
break; /* ignore trailing comma */
|
||||
|
||||
if (nshards >= MAX_SHARDS)
|
||||
{
|
||||
neon_log(LOG, "Too many shards");
|
||||
return false;
|
||||
}
|
||||
if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
|
||||
{
|
||||
neon_log(LOG, "Connection string too long");
|
||||
return false;
|
||||
}
|
||||
if (result)
|
||||
{
|
||||
memcpy(result->connstring[nshards], p, connstr_len);
|
||||
result->connstring[nshards][connstr_len] = '\0';
|
||||
}
|
||||
nshards++;
|
||||
|
||||
if (sep == NULL)
|
||||
break;
|
||||
p = sep + 1;
|
||||
}
|
||||
if (result)
|
||||
result->num_shards = nshards;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
|
||||
{
|
||||
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
|
||||
char *p = *newval;
|
||||
|
||||
return ParseShardMap(p, NULL);
|
||||
}
|
||||
|
||||
static void
|
||||
AssignPageserverConnstring(const char *newval, void *extra)
|
||||
{
|
||||
ShardMap shard_map;
|
||||
|
||||
/*
|
||||
* Only postmaster updates the copy in shared memory.
|
||||
*/
|
||||
if (!PagestoreShmemIsValid() || IsUnderPostmaster)
|
||||
return;
|
||||
|
||||
pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
|
||||
pg_write_barrier();
|
||||
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||
pg_write_barrier();
|
||||
pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
|
||||
}
|
||||
|
||||
static bool
|
||||
CheckConnstringUpdated(void)
|
||||
{
|
||||
if (!PagestoreShmemIsValid())
|
||||
return false;
|
||||
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
|
||||
if (!ParseShardMap(newval, &shard_map))
|
||||
{
|
||||
/*
|
||||
* shouldn't happen, because we already checked the value in
|
||||
* CheckPageserverConnstring
|
||||
*/
|
||||
elog(ERROR, "could not parse shard map");
|
||||
}
|
||||
|
||||
if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
|
||||
{
|
||||
pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
|
||||
pg_write_barrier();
|
||||
memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
|
||||
pg_write_barrier();
|
||||
pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* no change */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the current number of shards, and/or the connection string for a
|
||||
* particular shard from the shard map in shared memory.
|
||||
*
|
||||
* If num_shards_p is not NULL, it is set to the current number of shards.
|
||||
*
|
||||
* If connstr_p is not NULL, the connection string for 'shard_no' is copied to
|
||||
* it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
|
||||
* long.
|
||||
*
|
||||
* As a side-effect, if the shard map in shared memory had changed since the
|
||||
* last call, terminates all existing connections to all pageservers.
|
||||
*/
|
||||
static void
|
||||
ReloadConnstring(void)
|
||||
load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
|
||||
{
|
||||
uint64 begin_update_counter;
|
||||
uint64 end_update_counter;
|
||||
|
||||
if (!PagestoreShmemIsValid())
|
||||
return;
|
||||
ShardMap *shard_map = &pagestore_shared->shard_map;
|
||||
shardno_t num_shards;
|
||||
|
||||
/*
|
||||
* Copy the current settnig from shared to local memory. Postmaster can
|
||||
* update the value concurrently, in which case we would copy a garbled
|
||||
* mix of the old and new values. We will detect it because the counter's
|
||||
* won't match, and retry. But it's important that we don't do anything
|
||||
* within the retry-loop that would depend on the string having valid
|
||||
* contents.
|
||||
* Postmaster can update the shared memory values concurrently, in which
|
||||
* case we would copy a garbled mix of the old and new values. We will
|
||||
* detect it because the counter's won't match, and retry. But it's
|
||||
* important that we don't do anything within the retry-loop that would
|
||||
* depend on the string having valid contents.
|
||||
*/
|
||||
do
|
||||
{
|
||||
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
|
||||
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
|
||||
pg_read_barrier();
|
||||
|
||||
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
|
||||
pg_read_barrier();
|
||||
num_shards = shard_map->num_shards;
|
||||
if (connstr_p && shard_no < MAX_SHARDS)
|
||||
strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||
pg_memory_barrier();
|
||||
}
|
||||
while (begin_update_counter != end_update_counter
|
||||
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|
||||
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
|
||||
|
||||
pagestore_local_counter = end_update_counter;
|
||||
if (connstr_p && shard_no >= num_shards)
|
||||
neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
|
||||
shard_no, num_shards);
|
||||
|
||||
/*
|
||||
* If any of the connection strings changed, reset all connections.
|
||||
*/
|
||||
if (pagestore_local_counter != end_update_counter)
|
||||
{
|
||||
for (shardno_t i = 0; i < MAX_SHARDS; i++)
|
||||
{
|
||||
if (page_servers[i].conn)
|
||||
pageserver_disconnect(i);
|
||||
}
|
||||
pagestore_local_counter = end_update_counter;
|
||||
}
|
||||
|
||||
if (num_shards_p)
|
||||
*num_shards_p = num_shards;
|
||||
}
|
||||
|
||||
#define MB (1024*1024)
|
||||
|
||||
shardno_t
|
||||
get_shard_number(BufferTag *tag)
|
||||
{
|
||||
shardno_t n_shards;
|
||||
uint32 hash;
|
||||
|
||||
load_shard_map(0, NULL, &n_shards);
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
hash = murmurhash32(tag->rnode.relNode);
|
||||
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
|
||||
#else
|
||||
hash = murmurhash32(tag->relNumber);
|
||||
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
|
||||
#endif
|
||||
|
||||
return hash % n_shards;
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_connect(int elevel)
|
||||
pageserver_connect(shardno_t shard_no, int elevel)
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
PGconn *conn;
|
||||
WaitEventSet *wes;
|
||||
char connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
|
||||
static TimestampTz last_connect_time = 0;
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
|
||||
Assert(!connected);
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
if (CheckConnstringUpdated())
|
||||
{
|
||||
ReloadConnstring();
|
||||
}
|
||||
/*
|
||||
* Get the connection string for this shard. If the shard map has been
|
||||
* updated since we last looked, this will also disconnect any existing
|
||||
* pageserver connections as a side effect.
|
||||
*/
|
||||
load_shard_map(shard_no, connstr, NULL);
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
@@ -223,76 +360,84 @@ pageserver_connect(int elevel)
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = local_pageserver_connstring;
|
||||
values[n] = connstr;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
pageserver_conn = PQconnectdbParams(keywords, values, 1);
|
||||
conn = PQconnectdbParams(keywords, values, 1);
|
||||
|
||||
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
if (PQstatus(conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
PQfinish(conn);
|
||||
|
||||
ereport(elevel,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "could not establish connection to pageserver"),
|
||||
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
|
||||
errdetail_internal("%s", msg)));
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
ret = PQsendQuery(pageserver_conn, query);
|
||||
ret = PQsendQuery(conn, query);
|
||||
pfree(query);
|
||||
if (ret != 1)
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
neon_log(elevel, "could not send pagestream command to pageserver");
|
||||
PQfinish(conn);
|
||||
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
}
|
||||
|
||||
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
|
||||
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
|
||||
|
||||
while (PQisBusy(pageserver_conn))
|
||||
PG_TRY();
|
||||
{
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
while (PQisBusy(conn))
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
if (!PQconsumeInput(conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
|
||||
neon_log(elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
PQfinish(conn);
|
||||
FreeWaitEventSet(wes);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
connected = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -300,9 +445,10 @@ pageserver_connect(int elevel)
|
||||
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
|
||||
*/
|
||||
static int
|
||||
call_PQgetCopyData(char **buffer)
|
||||
call_PQgetCopyData(shardno_t shard_no, char **buffer)
|
||||
{
|
||||
int ret;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
retry:
|
||||
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
|
||||
@@ -312,7 +458,7 @@ retry:
|
||||
WaitEvent event;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
@@ -324,7 +470,7 @@ retry:
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
neon_log(LOG, "could not get response from pageserver: %s", msg);
|
||||
neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
|
||||
pfree(msg);
|
||||
return -1;
|
||||
}
|
||||
@@ -338,7 +484,7 @@ retry:
|
||||
|
||||
|
||||
static void
|
||||
pageserver_disconnect(void)
|
||||
pageserver_disconnect(shardno_t shard_no)
|
||||
{
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not clear
|
||||
@@ -347,38 +493,38 @@ pageserver_disconnect(void)
|
||||
* time later after we have already sent a new unrelated request. Close
|
||||
* the connection to avoid getting confused.
|
||||
*/
|
||||
if (connected)
|
||||
if (page_servers[shard_no].conn)
|
||||
{
|
||||
neon_log(LOG, "dropping connection to page server due to error");
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||
PQfinish(page_servers[shard_no].conn);
|
||||
page_servers[shard_no].conn = NULL;
|
||||
|
||||
/*
|
||||
* If the connection to any pageserver is lost, we throw away the
|
||||
* whole prefetch queue, even for other pageservers. It should not
|
||||
* cause big problems, because connection loss is supposed to be a
|
||||
* rare event.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
}
|
||||
if (pageserver_conn_wes != NULL)
|
||||
if (page_servers[shard_no].wes != NULL)
|
||||
{
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
FreeWaitEventSet(page_servers[shard_no].wes);
|
||||
page_servers[shard_no].wes = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_send(NeonRequest *request)
|
||||
pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
|
||||
if (CheckConnstringUpdated())
|
||||
{
|
||||
pageserver_disconnect();
|
||||
ReloadConnstring();
|
||||
}
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
neon_log(LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect();
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
|
||||
pageserver_disconnect(shard_no);
|
||||
}
|
||||
|
||||
req_buff = nm_pack_request(request);
|
||||
@@ -392,9 +538,9 @@ pageserver_send(NeonRequest *request)
|
||||
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
|
||||
* connection in case of failure.
|
||||
*/
|
||||
if (!connected)
|
||||
if (!page_servers[shard_no].conn)
|
||||
{
|
||||
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||
{
|
||||
HandleMainLoopInterrupts();
|
||||
n_reconnect_attempts += 1;
|
||||
@@ -402,6 +548,8 @@ pageserver_send(NeonRequest *request)
|
||||
n_reconnect_attempts = 0;
|
||||
}
|
||||
|
||||
pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
*
|
||||
@@ -414,8 +562,8 @@ pageserver_send(NeonRequest *request)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
pfree(msg);
|
||||
pfree(req_buff.data);
|
||||
return false;
|
||||
@@ -427,19 +575,20 @@ pageserver_send(NeonRequest *request)
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) request);
|
||||
|
||||
neon_log(PageStoreTrace, "sent request: %s", msg);
|
||||
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static NeonResponse *
|
||||
pageserver_receive(void)
|
||||
pageserver_receive(shardno_t shard_no)
|
||||
{
|
||||
StringInfoData resp_buff;
|
||||
NeonResponse *resp;
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
if (!connected)
|
||||
if (!pageserver_conn)
|
||||
return NULL;
|
||||
|
||||
PG_TRY();
|
||||
@@ -447,7 +596,7 @@ pageserver_receive(void)
|
||||
/* read response */
|
||||
int rc;
|
||||
|
||||
rc = call_PQgetCopyData(&resp_buff.data);
|
||||
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
|
||||
if (rc >= 0)
|
||||
{
|
||||
resp_buff.len = rc;
|
||||
@@ -459,33 +608,33 @@ pageserver_receive(void)
|
||||
{
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_log(PageStoreTrace, "got response: %s", msg);
|
||||
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
else if (rc == -1)
|
||||
{
|
||||
neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect();
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
|
||||
pageserver_disconnect(shard_no);
|
||||
resp = NULL;
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
neon_log(LOG, "pageserver_receive disconnect due to caught exception");
|
||||
pageserver_disconnect();
|
||||
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
|
||||
pageserver_disconnect(shard_no);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
@@ -495,11 +644,13 @@ pageserver_receive(void)
|
||||
|
||||
|
||||
static bool
|
||||
pageserver_flush(void)
|
||||
pageserver_flush(shardno_t shard_no)
|
||||
{
|
||||
if (!connected)
|
||||
PGconn *pageserver_conn = page_servers[shard_no].conn;
|
||||
|
||||
if (!pageserver_conn)
|
||||
{
|
||||
neon_log(WARNING, "Tried to flush while disconnected");
|
||||
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -507,8 +658,8 @@ pageserver_flush(void)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
|
||||
pageserver_disconnect(shard_no);
|
||||
neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
|
||||
pfree(msg);
|
||||
return false;
|
||||
}
|
||||
@@ -550,6 +701,7 @@ PagestoreShmemInit(void)
|
||||
{
|
||||
pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
|
||||
pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
|
||||
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
}
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
@@ -624,6 +776,15 @@ pg_init_libpagestore(void)
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.stripe_size",
|
||||
"sharding stripe size",
|
||||
NULL,
|
||||
&stripe_size,
|
||||
32768, 1, INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_BLOCKS,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.max_cluster_size",
|
||||
"cluster size limit",
|
||||
NULL,
|
||||
|
||||
@@ -20,9 +20,13 @@
|
||||
#include "lib/stringinfo.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "utils/memutils.h"
|
||||
|
||||
#define MAX_SHARDS 128
|
||||
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
|
||||
|
||||
typedef enum
|
||||
{
|
||||
/* pagestore_client -> pagestore */
|
||||
@@ -51,6 +55,9 @@ typedef struct
|
||||
#define neon_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
|
||||
|
||||
/*
|
||||
* supertype of all the Neon*Request structs below
|
||||
@@ -141,11 +148,13 @@ extern char *nm_to_string(NeonMessage *msg);
|
||||
* API
|
||||
*/
|
||||
|
||||
typedef unsigned shardno_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
bool (*send) (NeonRequest *request);
|
||||
NeonResponse *(*receive) (void);
|
||||
bool (*flush) (void);
|
||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||
NeonResponse *(*receive) (shardno_t shard_no);
|
||||
bool (*flush) (shardno_t shard_no);
|
||||
} page_server_api;
|
||||
|
||||
extern void prefetch_on_ps_disconnect(void);
|
||||
@@ -159,6 +168,8 @@ extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
extern int32 max_cluster_size;
|
||||
|
||||
extern shardno_t get_shard_number(BufferTag* tag);
|
||||
|
||||
extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
|
||||
extern void smgr_init_neon(void);
|
||||
extern void readahead_buffer_resize(int newsize, void *extra);
|
||||
|
||||
@@ -172,6 +172,7 @@ typedef struct PrefetchRequest
|
||||
XLogRecPtr actual_request_lsn;
|
||||
NeonResponse *response; /* may be null */
|
||||
PrefetchStatus status;
|
||||
shardno_t shard_no;
|
||||
uint64 my_ring_index;
|
||||
} PrefetchRequest;
|
||||
|
||||
@@ -239,10 +240,17 @@ typedef struct PrefetchState
|
||||
* also unused */
|
||||
|
||||
/* the buffers */
|
||||
prfh_hash *prf_hash;
|
||||
prfh_hash *prf_hash;
|
||||
int max_shard_no;
|
||||
/* Mark shards involved in prefetch */
|
||||
uint8 shard_bitmap[(MAX_SHARDS + 7)/8];
|
||||
PrefetchRequest prf_buffer[]; /* prefetch buffers */
|
||||
} PrefetchState;
|
||||
|
||||
#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
|
||||
#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
|
||||
#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
|
||||
|
||||
static PrefetchState *MyPState;
|
||||
|
||||
#define GetPrfSlot(ring_index) ( \
|
||||
@@ -327,6 +335,7 @@ compact_prefetch_buffers(void)
|
||||
Assert(target_slot->status == PRFS_UNUSED);
|
||||
|
||||
target_slot->buftag = source_slot->buftag;
|
||||
target_slot->shard_no = source_slot->shard_no;
|
||||
target_slot->status = source_slot->status;
|
||||
target_slot->response = source_slot->response;
|
||||
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
|
||||
@@ -494,6 +503,23 @@ prefetch_cleanup_trailing_unused(void)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
prefetch_flush_requests(void)
|
||||
{
|
||||
for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
|
||||
{
|
||||
if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
|
||||
{
|
||||
if (!page_server->flush(shard_no))
|
||||
return false;
|
||||
BITMAP_CLR(MyPState->shard_bitmap, shard_no);
|
||||
}
|
||||
}
|
||||
MyPState->max_shard_no = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for slot of ring_index to have received its response.
|
||||
* The caller is responsible for making sure the request buffer is flushed.
|
||||
@@ -509,7 +535,7 @@ prefetch_wait_for(uint64 ring_index)
|
||||
if (MyPState->ring_flush <= ring_index &&
|
||||
MyPState->ring_unused > MyPState->ring_flush)
|
||||
{
|
||||
if (!page_server->flush())
|
||||
if (!prefetch_flush_requests())
|
||||
return false;
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
}
|
||||
@@ -547,7 +573,7 @@ prefetch_read(PrefetchRequest *slot)
|
||||
Assert(slot->my_ring_index == MyPState->ring_receive);
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = (NeonResponse *) page_server->receive();
|
||||
response = (NeonResponse *) page_server->receive(slot->shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
if (response)
|
||||
{
|
||||
@@ -704,12 +730,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_unused);
|
||||
|
||||
while (!page_server->send((NeonRequest *) &request));
|
||||
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_requests_inflight += 1;
|
||||
MyPState->n_unused -= 1;
|
||||
MyPState->ring_unused += 1;
|
||||
BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
|
||||
MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
|
||||
|
||||
/* update slot state */
|
||||
slot->status = PRFS_REQUESTED;
|
||||
@@ -880,6 +908,7 @@ Retry:
|
||||
* function reads the buffer tag from the slot.
|
||||
*/
|
||||
slot->buftag = tag;
|
||||
slot->shard_no = get_shard_number(&tag);
|
||||
slot->my_ring_index = ring_index;
|
||||
|
||||
prefetch_do_request(slot, force_latest, force_lsn);
|
||||
@@ -890,7 +919,7 @@ Retry:
|
||||
if (flush_every_n_requests > 0 &&
|
||||
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
|
||||
{
|
||||
if (!page_server->flush())
|
||||
if (!prefetch_flush_requests())
|
||||
{
|
||||
/*
|
||||
* Prefetch set is reset in case of error, so we should try to
|
||||
@@ -908,13 +937,44 @@ static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
BufferTag tag = {0};
|
||||
shardno_t shard_no;
|
||||
|
||||
switch (((NeonRequest *) req)->tag)
|
||||
{
|
||||
case T_NeonExistsRequest:
|
||||
CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
|
||||
break;
|
||||
case T_NeonNblocksRequest:
|
||||
CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
|
||||
break;
|
||||
case T_NeonDbSizeRequest:
|
||||
NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
|
||||
break;
|
||||
case T_NeonGetPageRequest:
|
||||
CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
|
||||
tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
|
||||
break;
|
||||
default:
|
||||
neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
|
||||
}
|
||||
shard_no = get_shard_number(&tag);
|
||||
|
||||
|
||||
/*
|
||||
* Current sharding model assumes that all metadata is present only at shard 0.
|
||||
* We still need to call get_shard_no() to check if shard map is up-to-date.
|
||||
*/
|
||||
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
|
||||
{
|
||||
shard_no = 0;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
|
||||
consume_prefetch_responses();
|
||||
resp = page_server->receive();
|
||||
resp = page_server->receive(shard_no);
|
||||
} while (resp == NULL);
|
||||
return resp;
|
||||
|
||||
@@ -2098,8 +2158,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
blkno,
|
||||
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
slot->shard_no, blkno,
|
||||
RelFileInfoFmt(rinfo),
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
|
||||
@@ -4,7 +4,9 @@ pub mod backend;
|
||||
pub use backend::BackendType;
|
||||
|
||||
mod credentials;
|
||||
pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint};
|
||||
pub use credentials::{
|
||||
check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
|
||||
};
|
||||
|
||||
mod password_hack;
|
||||
pub use password_hack::parse_endpoint_param;
|
||||
|
||||
@@ -3,7 +3,6 @@ mod hacks;
|
||||
mod link;
|
||||
|
||||
pub use link::LinkAuthError;
|
||||
use smol_str::SmolStr;
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
|
||||
use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
@@ -16,7 +15,6 @@ use crate::context::RequestMonitoring;
|
||||
use crate::proxy::connect_compute::handle_try_wake;
|
||||
use crate::proxy::retry::retry_after;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::scram;
|
||||
use crate::stream::Stream;
|
||||
use crate::{
|
||||
auth::{self, ComputeUserInfoMaybeEndpoint},
|
||||
@@ -28,6 +26,7 @@ use crate::{
|
||||
},
|
||||
stream, url,
|
||||
};
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
||||
use futures::TryFutureExt;
|
||||
use std::borrow::Cow;
|
||||
use std::ops::ControlFlow;
|
||||
@@ -35,6 +34,8 @@ use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use super::IpPattern;
|
||||
|
||||
/// This type serves two purposes:
|
||||
///
|
||||
/// * When `T` is `()`, it's just a regular auth backend selector
|
||||
@@ -55,7 +56,7 @@ pub enum BackendType<'a, T> {
|
||||
|
||||
pub trait TestBackend: Send + Sync + 'static {
|
||||
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
|
||||
fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError>;
|
||||
fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError>;
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BackendType<'_, ()> {
|
||||
@@ -128,19 +129,19 @@ pub struct ComputeCredentials<T> {
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ComputeUserInfoNoEndpoint {
|
||||
pub user: SmolStr,
|
||||
pub user: RoleName,
|
||||
pub options: NeonOptions,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ComputeUserInfo {
|
||||
pub endpoint: SmolStr,
|
||||
pub user: SmolStr,
|
||||
pub endpoint: EndpointId,
|
||||
pub user: RoleName,
|
||||
pub options: NeonOptions,
|
||||
}
|
||||
|
||||
impl ComputeUserInfo {
|
||||
pub fn endpoint_cache_key(&self) -> SmolStr {
|
||||
pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
self.options.get_cache_key(&self.endpoint)
|
||||
}
|
||||
}
|
||||
@@ -156,7 +157,7 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
|
||||
type Error = ComputeUserInfoNoEndpoint;
|
||||
|
||||
fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
|
||||
match user_info.project {
|
||||
match user_info.endpoint_id {
|
||||
None => Err(ComputeUserInfoNoEndpoint {
|
||||
user: user_info.user,
|
||||
options: user_info.options,
|
||||
@@ -315,11 +316,11 @@ async fn auth_and_wake_compute(
|
||||
|
||||
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
||||
/// Get compute endpoint name from the credentials.
|
||||
pub fn get_endpoint(&self) -> Option<SmolStr> {
|
||||
pub fn get_endpoint(&self) -> Option<EndpointId> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(_, user_info) => user_info.project.clone(),
|
||||
Console(_, user_info) => user_info.endpoint_id.clone(),
|
||||
Link(_) => Some("link".into()),
|
||||
#[cfg(test)]
|
||||
Test(_) => Some("test".into()),
|
||||
@@ -353,7 +354,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
||||
Console(api, user_info) => {
|
||||
info!(
|
||||
user = &*user_info.user,
|
||||
project = user_info.project(),
|
||||
project = user_info.endpoint(),
|
||||
"performing authentication using the console"
|
||||
);
|
||||
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
use crate::{
|
||||
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
|
||||
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
|
||||
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use smol_str::SmolStr;
|
||||
use std::{collections::HashSet, net::IpAddr};
|
||||
use std::{collections::HashSet, net::IpAddr, str::FromStr};
|
||||
use thiserror::Error;
|
||||
use tracing::{info, warn};
|
||||
|
||||
@@ -21,7 +21,10 @@ pub enum ComputeUserInfoParseError {
|
||||
SNI ('{}') and project option ('{}').",
|
||||
.domain, .option,
|
||||
)]
|
||||
InconsistentProjectNames { domain: SmolStr, option: SmolStr },
|
||||
InconsistentProjectNames {
|
||||
domain: EndpointId,
|
||||
option: EndpointId,
|
||||
},
|
||||
|
||||
#[error(
|
||||
"Common name inferred from SNI ('{}') is not known",
|
||||
@@ -30,7 +33,7 @@ pub enum ComputeUserInfoParseError {
|
||||
UnknownCommonName { cn: String },
|
||||
|
||||
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
|
||||
MalformedProjectName(SmolStr),
|
||||
MalformedProjectName(EndpointId),
|
||||
}
|
||||
|
||||
impl UserFacingError for ComputeUserInfoParseError {}
|
||||
@@ -39,17 +42,15 @@ impl UserFacingError for ComputeUserInfoParseError {}
|
||||
/// Note that we don't store any kind of client key or password here.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ComputeUserInfoMaybeEndpoint {
|
||||
pub user: SmolStr,
|
||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||
pub project: Option<SmolStr>,
|
||||
|
||||
pub user: RoleName,
|
||||
pub endpoint_id: Option<EndpointId>,
|
||||
pub options: NeonOptions,
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
#[inline]
|
||||
pub fn project(&self) -> Option<&str> {
|
||||
self.project.as_deref()
|
||||
pub fn endpoint(&self) -> Option<&str> {
|
||||
self.endpoint_id.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,15 +80,15 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
|
||||
// Some parameters are stored in the startup message.
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let user: SmolStr = get_param("user")?.into();
|
||||
let user: RoleName = get_param("user")?.into();
|
||||
|
||||
// record the values if we have them
|
||||
ctx.set_application(params.get("application_name").map(SmolStr::from));
|
||||
ctx.set_user(user.clone());
|
||||
ctx.set_endpoint_id(sni.map(SmolStr::from));
|
||||
ctx.set_endpoint_id(sni.map(EndpointId::from));
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let project_option = params
|
||||
let endpoint_option = params
|
||||
.options_raw()
|
||||
.and_then(|options| {
|
||||
// We support both `project` (deprecated) and `endpoint` options for backward compatibility.
|
||||
@@ -100,9 +101,9 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
})
|
||||
.map(|name| name.into());
|
||||
|
||||
let project_from_domain = if let Some(sni_str) = sni {
|
||||
let endpoint_from_domain = if let Some(sni_str) = sni {
|
||||
if let Some(cn) = common_names {
|
||||
Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
|
||||
Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -110,7 +111,7 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
None
|
||||
};
|
||||
|
||||
let project = match (project_option, project_from_domain) {
|
||||
let endpoint = match (endpoint_option, endpoint_from_domain) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
(Some(option), Some(domain)) if option != domain => {
|
||||
Some(Err(InconsistentProjectNames { domain, option }))
|
||||
@@ -123,13 +124,13 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
}
|
||||
.transpose()?;
|
||||
|
||||
info!(%user, project = project.as_deref(), "credentials");
|
||||
info!(%user, project = endpoint.as_deref(), "credentials");
|
||||
if sni.is_some() {
|
||||
info!("Connection with sni");
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["sni"])
|
||||
.inc();
|
||||
} else if project.is_some() {
|
||||
} else if endpoint.is_some() {
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["no_sni"])
|
||||
.inc();
|
||||
@@ -145,36 +146,57 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
|
||||
Ok(Self {
|
||||
user,
|
||||
project,
|
||||
endpoint_id: endpoint.map(EndpointId::from),
|
||||
options,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<SmolStr>) -> bool {
|
||||
if ip_list.is_empty() {
|
||||
return true;
|
||||
}
|
||||
for ip in ip_list {
|
||||
// We expect that all ip addresses from control plane are correct.
|
||||
// However, if some of them are broken, we still can check the others.
|
||||
match parse_ip_pattern(ip) {
|
||||
Ok(pattern) => {
|
||||
if check_ip(peer_addr, &pattern) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err),
|
||||
}
|
||||
}
|
||||
false
|
||||
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
|
||||
ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern))
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
enum IpPattern {
|
||||
pub enum IpPattern {
|
||||
Subnet(ipnet::IpNet),
|
||||
Range(IpAddr, IpAddr),
|
||||
Single(IpAddr),
|
||||
None,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Deserialize<'de> for IpPattern {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct StrVisitor;
|
||||
impl<'de> serde::de::Visitor<'de> for StrVisitor {
|
||||
type Value = IpPattern;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Ok(parse_ip_pattern(v).unwrap_or_else(|e| {
|
||||
warn!("Cannot parse ip pattern {v}: {e}");
|
||||
IpPattern::None
|
||||
}))
|
||||
}
|
||||
}
|
||||
deserializer.deserialize_str(StrVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for IpPattern {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
parse_ip_pattern(s)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_ip_pattern(pattern: &str) -> anyhow::Result<IpPattern> {
|
||||
@@ -196,6 +218,7 @@ fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool {
|
||||
IpPattern::Subnet(subnet) => subnet.contains(ip),
|
||||
IpPattern::Range(start, end) => start <= ip && ip <= end,
|
||||
IpPattern::Single(addr) => addr == ip,
|
||||
IpPattern::None => false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,6 +229,7 @@ fn project_name_valid(name: &str) -> bool {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
use ComputeUserInfoParseError::*;
|
||||
|
||||
#[test]
|
||||
@@ -215,7 +239,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.project, None);
|
||||
assert_eq!(user_info.endpoint_id, None);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -230,7 +254,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.project, None);
|
||||
assert_eq!(user_info.endpoint_id, None);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -246,7 +270,7 @@ mod tests {
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.project.as_deref(), Some("foo"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
|
||||
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
|
||||
|
||||
Ok(())
|
||||
@@ -262,7 +286,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.project.as_deref(), Some("bar"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -277,7 +301,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.project.as_deref(), Some("bar"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -295,7 +319,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert!(user_info.project.is_none());
|
||||
assert!(user_info.endpoint_id.is_none());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -310,7 +334,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert!(user_info.project.is_none());
|
||||
assert!(user_info.endpoint_id.is_none());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -326,7 +350,7 @@ mod tests {
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.user, "john_doe");
|
||||
assert_eq!(user_info.project.as_deref(), Some("baz"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -340,14 +364,14 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.project.as_deref(), Some("p1"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.project.as_deref(), Some("p1"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -404,7 +428,7 @@ mod tests {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let user_info =
|
||||
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
|
||||
assert_eq!(user_info.project.as_deref(), Some("project"));
|
||||
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
|
||||
assert_eq!(
|
||||
user_info.options.get_cache_key("project"),
|
||||
"project endpoint_type:read_write lsn:0/2"
|
||||
@@ -415,21 +439,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_check_peer_addr_is_in_list() {
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
assert!(check_peer_addr_is_in_list(&peer_addr, &vec![]));
|
||||
assert!(check_peer_addr_is_in_list(
|
||||
&peer_addr,
|
||||
&vec!["127.0.0.1".into()]
|
||||
));
|
||||
assert!(!check_peer_addr_is_in_list(
|
||||
&peer_addr,
|
||||
&vec!["8.8.8.8".into()]
|
||||
));
|
||||
fn check(v: serde_json::Value) -> bool {
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let ip_list: Vec<IpPattern> = serde_json::from_value(v).unwrap();
|
||||
check_peer_addr_is_in_list(&peer_addr, &ip_list)
|
||||
}
|
||||
|
||||
assert!(check(json!([])));
|
||||
assert!(check(json!(["127.0.0.1"])));
|
||||
assert!(!check(json!(["8.8.8.8"])));
|
||||
// If there is an incorrect address, it will be skipped.
|
||||
assert!(check_peer_addr_is_in_list(
|
||||
&peer_addr,
|
||||
&vec!["88.8.8".into(), "127.0.0.1".into()]
|
||||
));
|
||||
assert!(check(json!(["88.8.8", "127.0.0.1"])));
|
||||
}
|
||||
#[test]
|
||||
fn test_parse_ip_v4() -> anyhow::Result<()> {
|
||||
|
||||
@@ -4,10 +4,11 @@
|
||||
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
|
||||
|
||||
use bstr::ByteSlice;
|
||||
use smol_str::SmolStr;
|
||||
|
||||
use crate::EndpointId;
|
||||
|
||||
pub struct PasswordHackPayload {
|
||||
pub endpoint: SmolStr,
|
||||
pub endpoint: EndpointId,
|
||||
pub password: Vec<u8>,
|
||||
}
|
||||
|
||||
|
||||
87
proxy/src/cache/project_info.rs
vendored
87
proxy/src/cache/project_info.rs
vendored
@@ -11,13 +11,16 @@ use smol_str::SmolStr;
|
||||
use tokio::time::Instant;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::{config::ProjectInfoCacheOptions, console::AuthSecret};
|
||||
use crate::{
|
||||
auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
|
||||
RoleName,
|
||||
};
|
||||
|
||||
use super::{Cache, Cached};
|
||||
|
||||
pub trait ProjectInfoCache {
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
|
||||
fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
|
||||
fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
|
||||
fn enable_ttl(&self);
|
||||
fn disable_ttl(&self);
|
||||
}
|
||||
@@ -44,8 +47,8 @@ impl<T> From<T> for Entry<T> {
|
||||
|
||||
#[derive(Default)]
|
||||
struct EndpointInfo {
|
||||
secret: std::collections::HashMap<SmolStr, Entry<Option<AuthSecret>>>,
|
||||
allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
|
||||
secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
|
||||
allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
|
||||
}
|
||||
|
||||
impl EndpointInfo {
|
||||
@@ -57,7 +60,7 @@ impl EndpointInfo {
|
||||
}
|
||||
pub fn get_role_secret(
|
||||
&self,
|
||||
role_name: &SmolStr,
|
||||
role_name: &RoleName,
|
||||
valid_since: Instant,
|
||||
ignore_cache_since: Option<Instant>,
|
||||
) -> Option<(Option<AuthSecret>, bool)> {
|
||||
@@ -76,7 +79,7 @@ impl EndpointInfo {
|
||||
&self,
|
||||
valid_since: Instant,
|
||||
ignore_cache_since: Option<Instant>,
|
||||
) -> Option<(Arc<Vec<SmolStr>>, bool)> {
|
||||
) -> Option<(Arc<Vec<IpPattern>>, bool)> {
|
||||
if let Some(allowed_ips) = &self.allowed_ips {
|
||||
if valid_since < allowed_ips.created_at {
|
||||
return Some((
|
||||
@@ -90,7 +93,7 @@ impl EndpointInfo {
|
||||
pub fn invalidate_allowed_ips(&mut self) {
|
||||
self.allowed_ips = None;
|
||||
}
|
||||
pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
|
||||
pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
|
||||
self.secret.remove(role_name);
|
||||
}
|
||||
}
|
||||
@@ -103,9 +106,9 @@ impl EndpointInfo {
|
||||
/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
|
||||
/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
|
||||
pub struct ProjectInfoCacheImpl {
|
||||
cache: DashMap<SmolStr, EndpointInfo>,
|
||||
cache: DashMap<EndpointId, EndpointInfo>,
|
||||
|
||||
project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
|
||||
project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
|
||||
config: ProjectInfoCacheOptions,
|
||||
|
||||
start_time: Instant,
|
||||
@@ -113,7 +116,7 @@ pub struct ProjectInfoCacheImpl {
|
||||
}
|
||||
|
||||
impl ProjectInfoCache for ProjectInfoCacheImpl {
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
|
||||
info!("invalidating allowed ips for project `{}`", project_id);
|
||||
let endpoints = self
|
||||
.project2ep
|
||||
@@ -126,7 +129,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
|
||||
}
|
||||
}
|
||||
}
|
||||
fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
|
||||
fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
|
||||
info!(
|
||||
"invalidating role secret for project_id `{}` and role_name `{}`",
|
||||
project_id, role_name
|
||||
@@ -167,8 +170,8 @@ impl ProjectInfoCacheImpl {
|
||||
|
||||
pub fn get_role_secret(
|
||||
&self,
|
||||
endpoint_id: &SmolStr,
|
||||
role_name: &SmolStr,
|
||||
endpoint_id: &EndpointId,
|
||||
role_name: &RoleName,
|
||||
) -> Option<Cached<&Self, Option<AuthSecret>>> {
|
||||
let (valid_since, ignore_cache_since) = self.get_cache_times();
|
||||
let endpoint_info = self.cache.get(endpoint_id)?;
|
||||
@@ -188,8 +191,8 @@ impl ProjectInfoCacheImpl {
|
||||
}
|
||||
pub fn get_allowed_ips(
|
||||
&self,
|
||||
endpoint_id: &SmolStr,
|
||||
) -> Option<Cached<&Self, Arc<Vec<SmolStr>>>> {
|
||||
endpoint_id: &EndpointId,
|
||||
) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
|
||||
let (valid_since, ignore_cache_since) = self.get_cache_times();
|
||||
let endpoint_info = self.cache.get(endpoint_id)?;
|
||||
let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
|
||||
@@ -205,9 +208,9 @@ impl ProjectInfoCacheImpl {
|
||||
}
|
||||
pub fn insert_role_secret(
|
||||
&self,
|
||||
project_id: &SmolStr,
|
||||
endpoint_id: &SmolStr,
|
||||
role_name: &SmolStr,
|
||||
project_id: &ProjectId,
|
||||
endpoint_id: &EndpointId,
|
||||
role_name: &RoleName,
|
||||
secret: Option<AuthSecret>,
|
||||
) {
|
||||
if self.cache.len() >= self.config.size {
|
||||
@@ -222,9 +225,9 @@ impl ProjectInfoCacheImpl {
|
||||
}
|
||||
pub fn insert_allowed_ips(
|
||||
&self,
|
||||
project_id: &SmolStr,
|
||||
endpoint_id: &SmolStr,
|
||||
allowed_ips: Arc<Vec<SmolStr>>,
|
||||
project_id: &ProjectId,
|
||||
endpoint_id: &EndpointId,
|
||||
allowed_ips: Arc<Vec<IpPattern>>,
|
||||
) {
|
||||
if self.cache.len() >= self.config.size {
|
||||
// If there are too many entries, wait until the next gc cycle.
|
||||
@@ -236,7 +239,7 @@ impl ProjectInfoCacheImpl {
|
||||
.or_default()
|
||||
.allowed_ips = Some(allowed_ips.into());
|
||||
}
|
||||
fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
|
||||
fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
|
||||
if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
|
||||
endpoints.insert(endpoint_id.clone());
|
||||
} else {
|
||||
@@ -297,18 +300,18 @@ impl ProjectInfoCacheImpl {
|
||||
/// This is used to invalidate cache entries.
|
||||
pub struct CachedLookupInfo {
|
||||
/// Search by this key.
|
||||
endpoint_id: SmolStr,
|
||||
endpoint_id: EndpointId,
|
||||
lookup_type: LookupType,
|
||||
}
|
||||
|
||||
impl CachedLookupInfo {
|
||||
pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
|
||||
pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
|
||||
Self {
|
||||
endpoint_id,
|
||||
lookup_type: LookupType::RoleSecret(role_name),
|
||||
}
|
||||
}
|
||||
pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
|
||||
pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
|
||||
Self {
|
||||
endpoint_id,
|
||||
lookup_type: LookupType::AllowedIps,
|
||||
@@ -317,7 +320,7 @@ impl CachedLookupInfo {
|
||||
}
|
||||
|
||||
enum LookupType {
|
||||
RoleSecret(SmolStr),
|
||||
RoleSecret(RoleName),
|
||||
AllowedIps,
|
||||
}
|
||||
|
||||
@@ -348,7 +351,6 @@ impl Cache for ProjectInfoCacheImpl {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{console::AuthSecret, scram::ServerSecret};
|
||||
use smol_str::SmolStr;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -362,14 +364,17 @@ mod tests {
|
||||
});
|
||||
let project_id = "project".into();
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: SmolStr = "user1".into();
|
||||
let user2: SmolStr = "user2".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = None;
|
||||
let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
]);
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
|
||||
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
|
||||
@@ -382,7 +387,7 @@ mod tests {
|
||||
assert_eq!(cached.value, secret2);
|
||||
|
||||
// Shouldn't add more than 2 roles.
|
||||
let user3: SmolStr = "user3".into();
|
||||
let user3: RoleName = "user3".into();
|
||||
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user3.as_str(),
|
||||
[3; 32],
|
||||
@@ -417,8 +422,8 @@ mod tests {
|
||||
|
||||
let project_id = "project".into();
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: SmolStr = "user1".into();
|
||||
let user2: SmolStr = "user2".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
@@ -427,7 +432,10 @@ mod tests {
|
||||
user2.as_str(),
|
||||
[2; 32],
|
||||
)));
|
||||
let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
]);
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
|
||||
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
|
||||
@@ -469,8 +477,8 @@ mod tests {
|
||||
|
||||
let project_id = "project".into();
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: SmolStr = "user1".into();
|
||||
let user2: SmolStr = "user2".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
@@ -479,7 +487,10 @@ mod tests {
|
||||
user2.as_str(),
|
||||
[2; 32],
|
||||
)));
|
||||
let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
]);
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
|
||||
cache.clone().disable_ttl();
|
||||
tokio::time::advance(Duration::from_millis(100)).await;
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
use serde::Deserialize;
|
||||
use smol_str::SmolStr;
|
||||
use std::fmt;
|
||||
|
||||
use crate::auth::IpPattern;
|
||||
|
||||
use crate::{BranchId, EndpointId, ProjectId};
|
||||
|
||||
/// Generic error response with human-readable description.
|
||||
/// Note that we can't always present it to user as is.
|
||||
#[derive(Debug, Deserialize)]
|
||||
@@ -14,8 +17,8 @@ pub struct ConsoleError {
|
||||
#[derive(Deserialize)]
|
||||
pub struct GetRoleSecret {
|
||||
pub role_secret: Box<str>,
|
||||
pub allowed_ips: Option<Vec<Box<str>>>,
|
||||
pub project_id: Option<Box<str>>,
|
||||
pub allowed_ips: Option<Vec<IpPattern>>,
|
||||
pub project_id: Option<ProjectId>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -92,9 +95,9 @@ impl fmt::Debug for DatabaseInfo {
|
||||
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
||||
#[derive(Debug, Deserialize, Clone, Default)]
|
||||
pub struct MetricsAuxInfo {
|
||||
pub endpoint_id: SmolStr,
|
||||
pub project_id: SmolStr,
|
||||
pub branch_id: SmolStr,
|
||||
pub endpoint_id: EndpointId,
|
||||
pub project_id: ProjectId,
|
||||
pub branch_id: BranchId,
|
||||
}
|
||||
|
||||
impl MetricsAuxInfo {
|
||||
|
||||
@@ -4,16 +4,15 @@ pub mod neon;
|
||||
|
||||
use super::messages::MetricsAuxInfo;
|
||||
use crate::{
|
||||
auth::backend::ComputeUserInfo,
|
||||
auth::{backend::ComputeUserInfo, IpPattern},
|
||||
cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
|
||||
compute,
|
||||
config::{CacheOptions, ProjectInfoCacheOptions},
|
||||
context::RequestMonitoring,
|
||||
scram,
|
||||
scram, EndpointCacheKey, ProjectId,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use smol_str::SmolStr;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
use tokio::time::Instant;
|
||||
@@ -212,9 +211,9 @@ pub enum AuthSecret {
|
||||
pub struct AuthInfo {
|
||||
pub secret: Option<AuthSecret>,
|
||||
/// List of IP addresses allowed for the autorization.
|
||||
pub allowed_ips: Vec<SmolStr>,
|
||||
pub allowed_ips: Vec<IpPattern>,
|
||||
/// Project ID. This is used for cache invalidation.
|
||||
pub project_id: Option<SmolStr>,
|
||||
pub project_id: Option<ProjectId>,
|
||||
}
|
||||
|
||||
/// Info for establishing a connection to a compute node.
|
||||
@@ -233,10 +232,10 @@ pub struct NodeInfo {
|
||||
pub allow_self_signed_compute: bool,
|
||||
}
|
||||
|
||||
pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
|
||||
pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
|
||||
pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
|
||||
pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
|
||||
pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<SmolStr>>>;
|
||||
pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
|
||||
|
||||
/// This will allocate per each call, but the http requests alone
|
||||
/// already require a few allocations, so it should be fine.
|
||||
@@ -345,7 +344,7 @@ impl ApiCaches {
|
||||
/// Various caches for [`console`](super).
|
||||
pub struct ApiLocks {
|
||||
name: &'static str,
|
||||
node_locks: DashMap<SmolStr, Arc<Semaphore>>,
|
||||
node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
|
||||
permits: usize,
|
||||
timeout: Duration,
|
||||
registered: prometheus::IntCounter,
|
||||
@@ -413,7 +412,7 @@ impl ApiLocks {
|
||||
|
||||
pub async fn get_wake_compute_permit(
|
||||
&self,
|
||||
key: &SmolStr,
|
||||
key: &EndpointCacheKey,
|
||||
) -> Result<WakeComputePermit, errors::WakeComputeError> {
|
||||
if self.permits == 0 {
|
||||
return Ok(WakeComputePermit { permit: None });
|
||||
|
||||
@@ -4,14 +4,13 @@ use super::{
|
||||
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
||||
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
|
||||
};
|
||||
use crate::cache::Cached;
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||
use crate::{auth::IpPattern, cache::Cached};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use smol_str::SmolStr;
|
||||
use std::sync::Arc;
|
||||
use std::{str::FromStr, sync::Arc};
|
||||
use thiserror::Error;
|
||||
use tokio_postgres::{config::SslMode, Client};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
@@ -88,7 +87,9 @@ impl Api {
|
||||
{
|
||||
Some(s) => {
|
||||
info!("got allowed_ips: {s}");
|
||||
s.split(',').map(String::from).collect()
|
||||
s.split(',')
|
||||
.map(|s| IpPattern::from_str(s).unwrap())
|
||||
.collect()
|
||||
}
|
||||
None => vec![],
|
||||
};
|
||||
@@ -100,7 +101,7 @@ impl Api {
|
||||
.await?;
|
||||
Ok(AuthInfo {
|
||||
secret,
|
||||
allowed_ips: allowed_ips.iter().map(SmolStr::from).collect(),
|
||||
allowed_ips,
|
||||
project_id: None,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -14,8 +14,6 @@ use crate::{
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use smol_str::SmolStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
@@ -94,17 +92,12 @@ impl Api {
|
||||
.ok_or(GetAuthInfoError::BadSecret)?;
|
||||
Some(secret)
|
||||
};
|
||||
let allowed_ips = body
|
||||
.allowed_ips
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(SmolStr::from)
|
||||
.collect_vec();
|
||||
let allowed_ips = body.allowed_ips.unwrap_or_default();
|
||||
ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
|
||||
Ok(AuthInfo {
|
||||
secret,
|
||||
allowed_ips,
|
||||
project_id: body.project_id.map(SmolStr::from),
|
||||
project_id: body.project_id,
|
||||
})
|
||||
}
|
||||
.map_err(crate::error::log_error)
|
||||
@@ -245,7 +238,7 @@ impl super::Api for Api {
|
||||
// for some time (highly depends on the console's scale-to-zero policy);
|
||||
// The connection info remains the same during that period of time,
|
||||
// which means that we might cache it to reduce the load and latency.
|
||||
if let Some(cached) = self.caches.node_info.get(&*key) {
|
||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||
info!(key = &*key, "found cached compute node info");
|
||||
return Ok(cached);
|
||||
}
|
||||
|
||||
@@ -7,7 +7,10 @@ use std::net::IpAddr;
|
||||
use tokio::sync::mpsc;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
|
||||
use crate::{
|
||||
console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
|
||||
EndpointId, ProjectId, RoleName,
|
||||
};
|
||||
|
||||
pub mod parquet;
|
||||
|
||||
@@ -26,10 +29,10 @@ pub struct RequestMonitoring {
|
||||
region: &'static str,
|
||||
|
||||
// filled in as they are discovered
|
||||
project: Option<SmolStr>,
|
||||
branch: Option<SmolStr>,
|
||||
endpoint_id: Option<SmolStr>,
|
||||
user: Option<SmolStr>,
|
||||
project: Option<ProjectId>,
|
||||
branch: Option<BranchId>,
|
||||
endpoint_id: Option<EndpointId>,
|
||||
user: Option<RoleName>,
|
||||
application: Option<SmolStr>,
|
||||
error_kind: Option<ErrorKind>,
|
||||
success: bool,
|
||||
@@ -86,7 +89,7 @@ impl RequestMonitoring {
|
||||
self.project = Some(x.project_id);
|
||||
}
|
||||
|
||||
pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
|
||||
pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
|
||||
self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
|
||||
}
|
||||
|
||||
@@ -94,7 +97,7 @@ impl RequestMonitoring {
|
||||
self.application = app.or_else(|| self.application.clone());
|
||||
}
|
||||
|
||||
pub fn set_user(&mut self, user: SmolStr) {
|
||||
pub fn set_user(&mut self, user: RoleName) {
|
||||
self.user = Some(user);
|
||||
}
|
||||
|
||||
|
||||
@@ -62,3 +62,79 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
|
||||
pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
|
||||
r.context("join error").and_then(|x| x)
|
||||
}
|
||||
|
||||
macro_rules! smol_str_wrapper {
|
||||
($name:ident) => {
|
||||
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
|
||||
pub struct $name(smol_str::SmolStr);
|
||||
|
||||
impl $name {
|
||||
pub fn as_str(&self) -> &str {
|
||||
self.0.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for $name {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::cmp::PartialEq<T> for $name
|
||||
where
|
||||
smol_str::SmolStr: std::cmp::PartialEq<T>,
|
||||
{
|
||||
fn eq(&self, other: &T) -> bool {
|
||||
self.0.eq(other)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> From<T> for $name
|
||||
where
|
||||
smol_str::SmolStr: From<T>,
|
||||
{
|
||||
fn from(x: T) -> Self {
|
||||
Self(x.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<str> for $name {
|
||||
fn as_ref(&self) -> &str {
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for $name {
|
||||
type Target = str;
|
||||
fn deref(&self) -> &str {
|
||||
&*self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Deserialize<'de> for $name {
|
||||
fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
|
||||
<smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for $name {
|
||||
fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
|
||||
self.0.serialize(s)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// 90% of role name strings are 20 characters or less.
|
||||
smol_str_wrapper!(RoleName);
|
||||
// 50% of endpoint strings are 23 characters or less.
|
||||
smol_str_wrapper!(EndpointId);
|
||||
// 50% of branch strings are 23 characters or less.
|
||||
smol_str_wrapper!(BranchId);
|
||||
// 90% of project strings are 23 characters or less.
|
||||
smol_str_wrapper!(ProjectId);
|
||||
|
||||
// will usually equal endpoint ID
|
||||
smol_str_wrapper!(EndpointCacheKey);
|
||||
|
||||
smol_str_wrapper!(DbName);
|
||||
|
||||
@@ -19,6 +19,7 @@ use crate::{
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
stream::{PqStream, Stream},
|
||||
usage_metrics::{Ids, USAGE_METRICS},
|
||||
EndpointCacheKey,
|
||||
};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::TryFutureExt;
|
||||
@@ -26,7 +27,7 @@ use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use regex::Regex;
|
||||
use smol_str::SmolStr;
|
||||
use smol_str::{format_smolstr, SmolStr};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -516,20 +517,21 @@ impl NeonOptions {
|
||||
Self(options)
|
||||
}
|
||||
|
||||
pub fn get_cache_key(&self, prefix: &str) -> SmolStr {
|
||||
pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
|
||||
// prefix + format!(" {k}:{v}")
|
||||
// kinda jank because SmolStr is immutable
|
||||
std::iter::once(prefix)
|
||||
.chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
|
||||
.collect()
|
||||
.collect::<SmolStr>()
|
||||
.into()
|
||||
}
|
||||
|
||||
/// <https://swagger.io/docs/specification/serialization/> DeepObject format
|
||||
/// `paramName[prop1]=value1¶mName[prop2]=value2&...`
|
||||
pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> {
|
||||
pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
|
||||
self.0
|
||||
.iter()
|
||||
.map(|(k, v)| (format!("options[{}]", k), v.clone()))
|
||||
.map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,13 +6,13 @@ use super::connect_compute::ConnectMechanism;
|
||||
use super::retry::ShouldRetry;
|
||||
use super::*;
|
||||
use crate::auth::backend::{ComputeUserInfo, TestBackend};
|
||||
use crate::auth::IpPattern;
|
||||
use crate::config::CertResolver;
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
|
||||
use crate::{auth, http, sasl, scram};
|
||||
use async_trait::async_trait;
|
||||
use rstest::rstest;
|
||||
use smol_str::SmolStr;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tokio_postgres::tls::{MakeTlsConnect, NoTls};
|
||||
use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
|
||||
@@ -471,7 +471,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError> {
|
||||
fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError> {
|
||||
unimplemented!("not used in tests")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,11 +11,12 @@ use anyhow::bail;
|
||||
use dashmap::DashMap;
|
||||
use itertools::Itertools;
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use smol_str::SmolStr;
|
||||
use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
|
||||
use tokio::time::{timeout, Duration, Instant};
|
||||
use tracing::info;
|
||||
|
||||
use crate::EndpointId;
|
||||
|
||||
use super::{
|
||||
limit_algorithm::{LimitAlgorithm, Sample},
|
||||
RateLimiterConfig,
|
||||
@@ -33,7 +34,7 @@ use super::{
|
||||
// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
|
||||
// I went with a more expensive way that yields user-friendlier error messages.
|
||||
pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
|
||||
map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
|
||||
info: &'static [RateBucketInfo],
|
||||
access_count: AtomicUsize,
|
||||
rand: Mutex<Rand>,
|
||||
@@ -146,7 +147,7 @@ impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
|
||||
}
|
||||
|
||||
/// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
pub fn check(&self, endpoint: SmolStr) -> bool {
|
||||
pub fn check(&self, endpoint: EndpointId) -> bool {
|
||||
// do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
|
||||
// worst case memory usage is about:
|
||||
// = 2 * 2048 * 64 * (48B + 72B)
|
||||
@@ -493,11 +494,13 @@ mod tests {
|
||||
use futures::{task::noop_waker_ref, Future};
|
||||
use rand::SeedableRng;
|
||||
use rustc_hash::FxHasher;
|
||||
use smol_str::SmolStr;
|
||||
use tokio::time;
|
||||
|
||||
use super::{EndpointRateLimiter, Limiter, Outcome};
|
||||
use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
|
||||
use crate::{
|
||||
rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
|
||||
EndpointId,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn it_works() {
|
||||
@@ -654,7 +657,7 @@ mod tests {
|
||||
RateBucketInfo::validate(&mut rates).unwrap();
|
||||
let limiter = EndpointRateLimiter::new(Vec::leak(rates));
|
||||
|
||||
let endpoint = SmolStr::from("ep-my-endpoint-1234");
|
||||
let endpoint = EndpointId::from("ep-my-endpoint-1234");
|
||||
|
||||
time::pause();
|
||||
|
||||
|
||||
@@ -3,9 +3,8 @@ use std::{convert::Infallible, sync::Arc};
|
||||
use futures::StreamExt;
|
||||
use redis::aio::PubSub;
|
||||
use serde::Deserialize;
|
||||
use smol_str::SmolStr;
|
||||
|
||||
use crate::cache::project_info::ProjectInfoCache;
|
||||
use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};
|
||||
|
||||
const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
|
||||
@@ -46,12 +45,12 @@ enum Notification {
|
||||
}
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
struct AllowedIpsUpdate {
|
||||
project_id: SmolStr,
|
||||
project_id: ProjectId,
|
||||
}
|
||||
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
|
||||
struct PasswordUpdate {
|
||||
project_id: SmolStr,
|
||||
role_name: SmolStr,
|
||||
project_id: ProjectId,
|
||||
role_name: RoleName,
|
||||
}
|
||||
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
|
||||
where
|
||||
|
||||
@@ -31,6 +31,7 @@ use crate::{
|
||||
metrics::NUM_DB_CONNECTIONS_GAUGE,
|
||||
proxy::connect_compute::ConnectMechanism,
|
||||
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
|
||||
DbName, EndpointCacheKey, RoleName,
|
||||
};
|
||||
use crate::{compute, config};
|
||||
|
||||
@@ -42,17 +43,17 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ConnInfo {
|
||||
pub user_info: ComputeUserInfo,
|
||||
pub dbname: SmolStr,
|
||||
pub dbname: DbName,
|
||||
pub password: SmolStr,
|
||||
}
|
||||
|
||||
impl ConnInfo {
|
||||
// hm, change to hasher to avoid cloning?
|
||||
pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
|
||||
pub fn db_and_user(&self) -> (DbName, RoleName) {
|
||||
(self.dbname.clone(), self.user_info.user.clone())
|
||||
}
|
||||
|
||||
pub fn endpoint_cache_key(&self) -> SmolStr {
|
||||
pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
self.user_info.endpoint_cache_key()
|
||||
}
|
||||
}
|
||||
@@ -79,14 +80,14 @@ struct ConnPoolEntry {
|
||||
// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
|
||||
// Number of open connections is limited by the `max_conns_per_endpoint`.
|
||||
pub struct EndpointConnPool {
|
||||
pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
|
||||
pools: HashMap<(DbName, RoleName), DbUserConnPool>,
|
||||
total_conns: usize,
|
||||
max_conns: usize,
|
||||
_guard: IntCounterPairGuard,
|
||||
}
|
||||
|
||||
impl EndpointConnPool {
|
||||
fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
|
||||
fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry> {
|
||||
let Self {
|
||||
pools, total_conns, ..
|
||||
} = self;
|
||||
@@ -95,7 +96,7 @@ impl EndpointConnPool {
|
||||
.and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
|
||||
}
|
||||
|
||||
fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
|
||||
fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
|
||||
let Self {
|
||||
pools, total_conns, ..
|
||||
} = self;
|
||||
@@ -196,7 +197,7 @@ pub struct GlobalConnPool {
|
||||
//
|
||||
// That should be a fairly conteded map, so return reference to the per-endpoint
|
||||
// pool as early as possible and release the lock.
|
||||
global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
|
||||
global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,
|
||||
|
||||
/// Number of endpoint-connection pools
|
||||
///
|
||||
@@ -440,7 +441,10 @@ impl GlobalConnPool {
|
||||
Ok(Client::new(new_client, conn_info, endpoint_pool).await)
|
||||
}
|
||||
|
||||
fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
|
||||
fn get_or_create_endpoint_pool(
|
||||
&self,
|
||||
endpoint: &EndpointCacheKey,
|
||||
) -> Arc<RwLock<EndpointConnPool>> {
|
||||
// fast path
|
||||
if let Some(pool) = self.global_pool.get(endpoint) {
|
||||
return pool.clone();
|
||||
|
||||
@@ -13,7 +13,6 @@ use hyper::{Body, HeaderMap, Request};
|
||||
use serde_json::json;
|
||||
use serde_json::Map;
|
||||
use serde_json::Value;
|
||||
use smol_str::SmolStr;
|
||||
use tokio_postgres::error::DbError;
|
||||
use tokio_postgres::error::ErrorPosition;
|
||||
use tokio_postgres::types::Kind;
|
||||
@@ -36,6 +35,8 @@ use crate::config::TlsConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::EndpointId;
|
||||
use crate::RoleName;
|
||||
|
||||
use super::conn_pool::ConnInfo;
|
||||
use super::conn_pool::GlobalConnPool;
|
||||
@@ -155,7 +156,7 @@ fn get_conn_info(
|
||||
.next()
|
||||
.ok_or(anyhow::anyhow!("invalid database name"))?;
|
||||
|
||||
let username = SmolStr::from(connection_url.username());
|
||||
let username = RoleName::from(connection_url.username());
|
||||
if username.is_empty() {
|
||||
return Err(anyhow::anyhow!("missing username"));
|
||||
}
|
||||
@@ -189,7 +190,7 @@ fn get_conn_info(
|
||||
|
||||
let endpoint = endpoint_sni(hostname, &tls.common_names)?;
|
||||
|
||||
let endpoint: SmolStr = endpoint.into();
|
||||
let endpoint: EndpointId = endpoint.into();
|
||||
ctx.set_endpoint_id(Some(endpoint.clone()));
|
||||
|
||||
let pairs = connection_url.query_pairs();
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
//! Periodically collect proxy consumption metrics
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::{config::MetricCollectionConfig, http};
|
||||
use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use dashmap::{mapref::entry::Entry, DashMap};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smol_str::SmolStr;
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
sync::{
|
||||
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: SmolStr,
|
||||
pub branch_id: SmolStr,
|
||||
pub endpoint_id: EndpointId,
|
||||
pub branch_id: BranchId,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -7,6 +7,8 @@ use tracing::*;
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
const ALLOW_INACTIVE_TIMELINES: bool = true;
|
||||
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let wal_removal_interval = Duration::from_millis(5000);
|
||||
loop {
|
||||
@@ -15,10 +17,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
|
||||
let tlis = GlobalTimelines::get_all();
|
||||
for tli in &tlis {
|
||||
if !tli.is_active().await {
|
||||
let is_active = tli.is_active().await;
|
||||
if is_active {
|
||||
active_timelines += 1;
|
||||
}
|
||||
if !ALLOW_INACTIVE_TIMELINES && !is_active {
|
||||
continue;
|
||||
}
|
||||
active_timelines += 1;
|
||||
let ttid = tli.ttid;
|
||||
async {
|
||||
if let Err(e) = tli.maybe_persist_control_file().await {
|
||||
|
||||
@@ -39,6 +39,9 @@ SETUP COMPLETE
|
||||
To run your local neon.git build on the instance store volume,
|
||||
run the following commands from the top of the neon.git checkout
|
||||
|
||||
# raise file descriptor limit of your shell and its child processes
|
||||
sudo prlimit -p $$ --nofile=800000:800000
|
||||
|
||||
# test suite run
|
||||
export TEST_OUTPUT="$TEST_OUTPUT"
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
|
||||
|
||||
@@ -10,16 +10,18 @@ import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import textwrap
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from fcntl import LOCK_EX, LOCK_UN, flock
|
||||
from functools import cached_property
|
||||
from itertools import chain, product
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import asyncpg
|
||||
@@ -49,7 +51,10 @@ from fixtures.pageserver.allowed_errors import (
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.types import IndexPartDump
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import (
|
||||
@@ -424,6 +429,7 @@ class NeonEnvBuilder:
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
test_name: str,
|
||||
top_output_dir: Path,
|
||||
test_output_dir: Path,
|
||||
test_overlay_dir: Optional[Path] = None,
|
||||
pageserver_remote_storage: Optional[RemoteStorage] = None,
|
||||
@@ -473,6 +479,7 @@ class NeonEnvBuilder:
|
||||
self.test_overlay_dir = test_overlay_dir
|
||||
self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
|
||||
self.config_init_force: Optional[str] = None
|
||||
self.top_output_dir = top_output_dir
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
@@ -526,6 +533,64 @@ class NeonEnvBuilder:
|
||||
|
||||
return env
|
||||
|
||||
def build_and_use_snapshot(
|
||||
self, global_ident: str, create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv]
|
||||
) -> NeonEnv:
|
||||
if os.getenv("CI", "false") == "true":
|
||||
log.info("do not use snapshots in ephemeral CI environment")
|
||||
env = create_env_for_snapshot(self)
|
||||
env.stop(immediate=True, ps_assert_metric_no_errors=False)
|
||||
return env
|
||||
|
||||
with shared_snapshot_dir(self.top_output_dir, global_ident) as snapshot_dir:
|
||||
if not snapshot_dir.is_initialized():
|
||||
self._build_and_use_snapshot_impl(snapshot_dir, create_env_for_snapshot)
|
||||
assert snapshot_dir.is_initialized()
|
||||
|
||||
return self.from_repo_dir(snapshot_dir.path)
|
||||
|
||||
def _build_and_use_snapshot_impl(
|
||||
self,
|
||||
snapshot_dir: SnapshotDirLocked,
|
||||
create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv],
|
||||
):
|
||||
if snapshot_dir.path.exists():
|
||||
shutil.rmtree(snapshot_dir.path)
|
||||
|
||||
if self.test_overlay_dir is not None:
|
||||
# Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir.
|
||||
# When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use
|
||||
# the upperdir as the snapshot. This is equivalent to docker `FROM scratch`.
|
||||
assert not self.repo_dir.exists()
|
||||
assert self.repo_dir.parent.exists()
|
||||
snapshot_dir.path.mkdir()
|
||||
self.overlay_mount("create-snapshot-repo-dir", snapshot_dir.path, self.repo_dir)
|
||||
self.config_init_force = "empty-dir-ok"
|
||||
|
||||
env = create_env_for_snapshot(self)
|
||||
assert self.env is not None
|
||||
assert self.env == env
|
||||
|
||||
# shut down everything for snapshot
|
||||
env.stop(immediate=True, ps_assert_metric_no_errors=True)
|
||||
|
||||
# TODO: all kinds of assertions to ensure the env is unused
|
||||
|
||||
if self.test_overlay_dir is None:
|
||||
log.info("take snapshot by moving repo dir")
|
||||
env.repo_dir.rename(snapshot_dir.path)
|
||||
else:
|
||||
log.info("take snapshot by using overlayfs upperdir")
|
||||
self.overlay_unmount_and_move("create-snapshot-repo-dir", snapshot_dir.path)
|
||||
log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount")
|
||||
env.repo_dir.rmdir()
|
||||
# TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized()
|
||||
log.info("make repo_dir an overlayfs mount of the snapshot we just created")
|
||||
assert not env.repo_dir.exists(), "both branches above should remove it"
|
||||
snapshot_dir.set_initialized()
|
||||
|
||||
self.env = None # so that from_repo_dir works again
|
||||
|
||||
def from_repo_dir(
|
||||
self,
|
||||
repo_dir: Path,
|
||||
@@ -557,10 +622,15 @@ class NeonEnvBuilder:
|
||||
tenants_from_dir = ps_dir / "tenants"
|
||||
tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
|
||||
|
||||
log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
|
||||
if self.test_overlay_dir is None:
|
||||
log.info(
|
||||
f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
|
||||
)
|
||||
shutil.copytree(tenants_from_dir, tenants_to_dir)
|
||||
else:
|
||||
log.info(
|
||||
f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
|
||||
)
|
||||
self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)
|
||||
|
||||
for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
|
||||
@@ -571,10 +641,12 @@ class NeonEnvBuilder:
|
||||
|
||||
shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
|
||||
if self.test_overlay_dir is None:
|
||||
log.info("Copying local_fs_remote_storage directory from snapshot")
|
||||
shutil.copytree(
|
||||
repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
|
||||
)
|
||||
else:
|
||||
log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot")
|
||||
self.overlay_mount(
|
||||
"local_fs_remote_storage",
|
||||
repo_dir / "local_fs_remote_storage",
|
||||
@@ -631,6 +703,54 @@ class NeonEnvBuilder:
|
||||
)
|
||||
self.overlay_mounts_created_by_us.append((ident, dstdir))
|
||||
|
||||
def _overlay_umount(self, mountpoint: Path):
|
||||
cmd = ["sudo", "umount", str(mountpoint)]
|
||||
assert mountpoint.is_mount()
|
||||
subprocess_capture(
|
||||
self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
|
||||
)
|
||||
|
||||
def overlay_unmount_and_move(self, ident: str, dst: Path):
|
||||
"""
|
||||
Unmount previously established overlayfs mount at `dstdir` and move the upperdir contents to `dst`.
|
||||
If `dst` is an empty directory, it gets replaced.
|
||||
Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts.
|
||||
|
||||
Raises exception if self.test_overlay_dir is None
|
||||
"""
|
||||
assert self.test_overlay_dir is not None
|
||||
# not mutating state yet, make checks
|
||||
ident_state_dir = self.test_overlay_dir / ident
|
||||
assert ident_state_dir.is_dir()
|
||||
upper = ident_state_dir / "upper"
|
||||
work = ident_state_dir / "work"
|
||||
assert upper.is_dir()
|
||||
assert work.is_dir()
|
||||
assert (
|
||||
self.test_overlay_dir not in dst.parents
|
||||
), "otherwise workdir cleanup below wouldn't work"
|
||||
# find index, still not mutating state
|
||||
idxmap = {
|
||||
existing_ident: idx
|
||||
for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us)
|
||||
}
|
||||
idx = idxmap.get(ident)
|
||||
if idx is None:
|
||||
raise RuntimeError(f"cannot find mount for ident {ident}")
|
||||
|
||||
if dst.is_dir():
|
||||
dst.rmdir() # raises exception if not empty, which is what we want
|
||||
|
||||
_, mountpoint = self.overlay_mounts_created_by_us.pop(idx)
|
||||
self._overlay_umount(mountpoint)
|
||||
upper.rename(dst)
|
||||
# we moved the upperdir, clean up workdir and then its parent ident_state_dir
|
||||
cmd = ["sudo", "rm", "-rf", str(work)]
|
||||
subprocess_capture(
|
||||
self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
|
||||
)
|
||||
ident_state_dir.rmdir() # should be empty since we moved `upper` out
|
||||
|
||||
def overlay_cleanup_teardown(self):
|
||||
"""
|
||||
Unmount the overlayfs mounts created by `self.overlay_mount()`.
|
||||
@@ -641,13 +761,10 @@ class NeonEnvBuilder:
|
||||
while len(self.overlay_mounts_created_by_us) > 0:
|
||||
(ident, mountpoint) = self.overlay_mounts_created_by_us.pop()
|
||||
ident_state_dir = self.test_overlay_dir / ident
|
||||
cmd = ["sudo", "umount", str(mountpoint)]
|
||||
log.info(
|
||||
f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}"
|
||||
)
|
||||
subprocess_capture(
|
||||
self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
|
||||
f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}"
|
||||
)
|
||||
self._overlay_umount(mountpoint)
|
||||
log.info(
|
||||
f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}"
|
||||
)
|
||||
@@ -725,8 +842,15 @@ class NeonEnvBuilder:
|
||||
if self.preserve_database_files:
|
||||
return
|
||||
|
||||
overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us}
|
||||
|
||||
directories_to_clean: List[Path] = []
|
||||
for test_entry in Path(self.repo_dir).glob("**/*"):
|
||||
if test_entry in overlayfs_mounts:
|
||||
continue
|
||||
for parent in test_entry.parents:
|
||||
if parent in overlayfs_mounts:
|
||||
continue
|
||||
if test_entry.is_file():
|
||||
test_file = test_entry
|
||||
if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name):
|
||||
@@ -775,13 +899,6 @@ class NeonEnvBuilder:
|
||||
log.error(f"Error during remote storage scrub: {e}")
|
||||
cleanup_error = e
|
||||
|
||||
try:
|
||||
self.overlay_cleanup_teardown()
|
||||
except Exception as e:
|
||||
log.error(f"Error cleaning up overlay state: {e}")
|
||||
if cleanup_error is not None:
|
||||
cleanup_error = e
|
||||
|
||||
try:
|
||||
self.cleanup_remote_storage()
|
||||
except Exception as e:
|
||||
@@ -802,6 +919,13 @@ class NeonEnvBuilder:
|
||||
for pageserver in self.env.pageservers:
|
||||
pageserver.assert_no_errors()
|
||||
|
||||
try:
|
||||
self.overlay_cleanup_teardown()
|
||||
except Exception as e:
|
||||
log.error(f"Error cleaning up overlay state: {e}")
|
||||
if cleanup_error is not None:
|
||||
cleanup_error = e
|
||||
|
||||
|
||||
class NeonEnv:
|
||||
"""
|
||||
@@ -971,7 +1095,9 @@ class NeonEnv:
|
||||
assert that there is only one. Tests with multiple pageservers should always use
|
||||
get_pageserver with an explicit ID.
|
||||
"""
|
||||
assert len(self.pageservers) == 1
|
||||
assert (
|
||||
len(self.pageservers) == 1
|
||||
), "env.pageserver must only be used with single pageserver NeonEnv"
|
||||
return self.pageservers[0]
|
||||
|
||||
def get_pageserver(self, id: Optional[int]) -> NeonPageserver:
|
||||
@@ -1082,6 +1208,7 @@ def _shared_simple_env(
|
||||
shutil.rmtree(repo_dir, ignore_errors=True)
|
||||
|
||||
with NeonEnvBuilder(
|
||||
top_output_dir=top_output_dir,
|
||||
repo_dir=repo_dir,
|
||||
port_distributor=port_distributor,
|
||||
broker=default_broker,
|
||||
@@ -1130,6 +1257,7 @@ def neon_env_builder(
|
||||
run_id: uuid.UUID,
|
||||
request: FixtureRequest,
|
||||
test_overlay_dir: Path,
|
||||
top_output_dir: Path,
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1149,6 +1277,7 @@ def neon_env_builder(
|
||||
|
||||
# Return the builder to the caller
|
||||
with NeonEnvBuilder(
|
||||
top_output_dir=top_output_dir,
|
||||
repo_dir=Path(repo_dir),
|
||||
port_distributor=port_distributor,
|
||||
mock_s3_server=mock_s3_server,
|
||||
@@ -3487,6 +3616,10 @@ def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
return _get_test_dir(request, top_output_dir, "overlay-")
|
||||
|
||||
|
||||
def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path:
|
||||
return top_output_dir / "shared-snapshots" / snapshot_name
|
||||
|
||||
|
||||
def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
return get_test_output_dir(request, top_output_dir) / "repo"
|
||||
|
||||
@@ -3533,6 +3666,75 @@ def test_output_dir(
|
||||
allure_attach_from_dir(test_dir)
|
||||
|
||||
|
||||
class FileAndThreadLock:
|
||||
def __init__(self, path: Path):
|
||||
self.path = path
|
||||
self.thread_lock = threading.Lock()
|
||||
self.fd: Optional[int] = None
|
||||
|
||||
def __enter__(self):
|
||||
self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY)
|
||||
# lock thread lock before file lock so that there's no race
|
||||
# around flocking / funlocking the file lock
|
||||
self.thread_lock.acquire()
|
||||
flock(self.fd, LOCK_EX)
|
||||
|
||||
def __exit__(self, exc_type, exc_value, exc_traceback):
|
||||
assert self.fd is not None
|
||||
assert self.thread_lock.locked() # ... by us
|
||||
flock(self.fd, LOCK_UN)
|
||||
self.thread_lock.release()
|
||||
os.close(self.fd)
|
||||
self.fd = None
|
||||
|
||||
|
||||
class SnapshotDirLocked:
|
||||
def __init__(self, parent: SnapshotDir):
|
||||
self._parent = parent
|
||||
|
||||
def is_initialized(self):
|
||||
# TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized.
|
||||
# Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed.
|
||||
return self._parent._marker_file_path.exists()
|
||||
|
||||
def set_initialized(self):
|
||||
self._parent._marker_file_path.write_text("")
|
||||
|
||||
@property
|
||||
def path(self) -> Path:
|
||||
return self._parent._path / "snapshot"
|
||||
|
||||
|
||||
class SnapshotDir:
|
||||
_path: Path
|
||||
|
||||
def __init__(self, path: Path):
|
||||
self._path = path
|
||||
assert self._path.is_dir()
|
||||
self._lock = FileAndThreadLock(self._lock_file_path)
|
||||
|
||||
@property
|
||||
def _lock_file_path(self) -> Path:
|
||||
return self._path / "initializing.flock"
|
||||
|
||||
@property
|
||||
def _marker_file_path(self) -> Path:
|
||||
return self._path / "initialized.marker"
|
||||
|
||||
def __enter__(self) -> SnapshotDirLocked:
|
||||
self._lock.__enter__()
|
||||
return SnapshotDirLocked(self)
|
||||
|
||||
def __exit__(self, exc_type, exc_value, exc_traceback):
|
||||
self._lock.__exit__(exc_type, exc_value, exc_traceback)
|
||||
|
||||
|
||||
def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir:
|
||||
snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident)
|
||||
snapshot_dir_path.mkdir(exist_ok=True, parents=True)
|
||||
return SnapshotDir(snapshot_dir_path)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
|
||||
"""
|
||||
@@ -3542,7 +3744,7 @@ def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[
|
||||
The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc).
|
||||
"""
|
||||
|
||||
if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None:
|
||||
if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None:
|
||||
return None
|
||||
|
||||
overlay_dir = get_test_overlay_dir(request, top_output_dir)
|
||||
|
||||
@@ -20,6 +20,7 @@ from fixtures.utils import Fn
|
||||
class PageserverApiException(Exception):
|
||||
def __init__(self, message, status_code: int):
|
||||
super().__init__(message)
|
||||
self.message = message
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
@@ -261,12 +262,18 @@ class PageserverHttpClient(requests.Session):
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
|
||||
def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None):
|
||||
params = {}
|
||||
if detach_ignored:
|
||||
params["detach_ignored"] = "true"
|
||||
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
kwargs = {}
|
||||
if timeout_secs is not None:
|
||||
kwargs["timeout"] = timeout_secs
|
||||
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
|
||||
@@ -526,6 +533,17 @@ class PageserverHttpClient(requests.Session):
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
|
||||
def timeline_preserve_initdb_archive(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
|
||||
):
|
||||
log.info(
|
||||
f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}"
|
||||
)
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def timeline_get_lsn_by_timestamp(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
|
||||
85
test_runner/fixtures/pageserver/many_tenants.py
Normal file
85
test_runner/fixtures/pageserver/many_tenants.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import concurrent.futures
|
||||
import time
|
||||
from typing import Any, Callable, Dict, Tuple
|
||||
|
||||
import fixtures.pageserver.remote_storage
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
|
||||
|
||||
def single_timeline(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
|
||||
ncopies: int,
|
||||
) -> NeonEnv:
|
||||
"""
|
||||
Create `ncopies` duplicates of a template tenant that has a single timeline.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
remote_storage = env.pageserver_remote_storage
|
||||
assert isinstance(remote_storage, LocalFsStorage)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
# clean up the useless default tenant
|
||||
ps_http.tenant_delete(env.initial_tenant)
|
||||
|
||||
log.info("invoking callback to create template tenant")
|
||||
template_tenant, template_timeline, template_config = setup_template(env)
|
||||
log.info(
|
||||
f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}"
|
||||
)
|
||||
|
||||
log.info("detach template tenant form pageserver")
|
||||
env.pageserver.tenant_detach(template_tenant)
|
||||
env.pageserver.allowed_errors.append(
|
||||
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
|
||||
".*Dropped remote consistent LSN updates.*",
|
||||
)
|
||||
|
||||
log.info(f"duplicating template tenant {ncopies} times in S3")
|
||||
tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
|
||||
|
||||
log.info("attach duplicated tenants to pageserver")
|
||||
# In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
|
||||
# However, on-demand downloads are quite slow ATM.
|
||||
# => do the on-demand downloads in Python.
|
||||
assert ps_http.tenant_list() == []
|
||||
# make the attach fail after it created enough on-disk state to retry loading
|
||||
# the tenant next startup, but before it can start background loops that would start download
|
||||
ps_http.configure_failpoints(("attach-before-activate", "return"))
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*attach failed, setting tenant state to Broken: attach-before-activate.*"
|
||||
)
|
||||
|
||||
def attach_broken(tenant):
|
||||
env.pageserver.tenant_attach(
|
||||
tenant,
|
||||
config=template_config.copy(),
|
||||
)
|
||||
time.sleep(0.1)
|
||||
wait_until_tenant_state(ps_http, tenant, "Broken", 10)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
|
||||
executor.map(attach_broken, tenants)
|
||||
|
||||
env.pageserver.stop(
|
||||
immediate=True
|
||||
) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
|
||||
tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
|
||||
log.info("python-side on-demand download the layer files into local tenant dir")
|
||||
fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
|
||||
env, tenant_timelines
|
||||
)
|
||||
|
||||
return env
|
||||
116
test_runner/fixtures/pageserver/remote_storage.py
Normal file
116
test_runner/fixtures/pageserver/remote_storage.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import concurrent.futures
|
||||
import os
|
||||
import queue
|
||||
import shutil
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv, Pagectl
|
||||
from fixtures.pageserver.types import (
|
||||
InvalidFileName,
|
||||
parse_layer_file_name,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
|
||||
|
||||
def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
|
||||
remote_storage = env.pageserver_remote_storage
|
||||
assert isinstance(remote_storage, LocalFsStorage)
|
||||
|
||||
src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
|
||||
assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
|
||||
|
||||
assert isinstance(remote_storage, LocalFsStorage)
|
||||
dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines"
|
||||
dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
|
||||
dst_timelines_dir.mkdir(parents=False, exist_ok=False)
|
||||
|
||||
for tl in src_timelines_dir.iterdir():
|
||||
src_tl_dir = src_timelines_dir / tl.name
|
||||
assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
|
||||
dst_tl_dir = dst_timelines_dir / tl.name
|
||||
dst_tl_dir.mkdir(parents=False, exist_ok=False)
|
||||
for file in tl.iterdir():
|
||||
shutil.copy2(file, dst_tl_dir)
|
||||
if "__" in file.name:
|
||||
Pagectl(env).raw_cli(
|
||||
[
|
||||
"layer",
|
||||
"rewrite-summary",
|
||||
str(dst_tl_dir / file.name),
|
||||
"--new-tenant-id",
|
||||
str(new_tenant),
|
||||
]
|
||||
)
|
||||
else:
|
||||
# index_part etc need no patching
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def duplicate_tenant(env: NeonEnv, template_tenant: TenantId, ncopies: int) -> List[TenantId]:
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
|
||||
def work(tenant_id):
|
||||
duplicate_one_tenant(env, template_tenant, tenant_id)
|
||||
|
||||
new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, ncopies)]
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
|
||||
executor.map(work, new_tenants)
|
||||
return new_tenants
|
||||
|
||||
|
||||
def local_layer_name_from_remote_name(remote_name: str) -> str:
|
||||
try:
|
||||
return parse_layer_file_name(remote_name).to_str()
|
||||
except InvalidFileName as e:
|
||||
comps = remote_name.rsplit("-", 1)
|
||||
if len(comps) == 1:
|
||||
raise InvalidFileName("no generation suffix found") from e
|
||||
else:
|
||||
assert len(comps) == 2
|
||||
layer_file_name, _generation = comps
|
||||
try:
|
||||
return parse_layer_file_name(layer_file_name).to_str()
|
||||
except InvalidFileName:
|
||||
raise
|
||||
|
||||
|
||||
def copy_all_remote_layer_files_to_local_tenant_dir(
|
||||
env: NeonEnv, tenant_timelines: List[Tuple[TenantId, TimelineId]]
|
||||
):
|
||||
remote_storage = env.pageserver_remote_storage
|
||||
assert isinstance(remote_storage, LocalFsStorage)
|
||||
work: queue.Queue[Any] = queue.Queue()
|
||||
for tenant, timeline in tenant_timelines:
|
||||
remote_timeline_path = remote_storage.timeline_path(tenant, timeline)
|
||||
local_timeline_path = env.pageserver.timeline_dir(tenant, timeline)
|
||||
local_timeline_path.mkdir(parents=True, exist_ok=True)
|
||||
downloads = {}
|
||||
for remote_layer in remote_timeline_path.glob("*__*"):
|
||||
local_name = local_layer_name_from_remote_name(remote_layer.name)
|
||||
assert local_name not in downloads, "remote storage must have had split brain"
|
||||
downloads[local_name] = remote_layer
|
||||
for local_name, remote_path in downloads.items():
|
||||
work.put((remote_path, local_timeline_path / local_name))
|
||||
|
||||
def copy_layer_worker(queue):
|
||||
while True:
|
||||
item = queue.get()
|
||||
if item is None:
|
||||
return
|
||||
remote_path, local_path = item
|
||||
# not copy2, so it looks like a recent download, in case that's relevant to e.g. eviction
|
||||
shutil.copy(remote_path, local_path, follow_symlinks=False)
|
||||
|
||||
workers = []
|
||||
n_threads = os.cpu_count() or 1
|
||||
for _ in range(0, n_threads):
|
||||
w = threading.Thread(target=copy_layer_worker, args=[work])
|
||||
workers.append(w)
|
||||
w.start()
|
||||
work.put(None)
|
||||
for w in workers:
|
||||
w.join()
|
||||
@@ -31,10 +31,10 @@ class DeltaLayerFileName:
|
||||
key_start: Key
|
||||
key_end: Key
|
||||
|
||||
def is_l0(self):
|
||||
def is_l0(self) -> bool:
|
||||
return self.key_start == KEY_MIN and self.key_end == KEY_MAX
|
||||
|
||||
def to_str(self):
|
||||
def to_str(self) -> str:
|
||||
ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
|
||||
assert self == parse_layer_file_name(ret)
|
||||
return ret
|
||||
@@ -107,7 +107,7 @@ def parse_layer_file_name(file_name: str) -> LayerFileName:
|
||||
except InvalidFileName:
|
||||
pass
|
||||
|
||||
raise ValueError()
|
||||
raise InvalidFileName("neither image nor delta layer")
|
||||
|
||||
|
||||
def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
|
||||
|
||||
@@ -63,6 +63,14 @@ def wait_for_upload(
|
||||
)
|
||||
|
||||
|
||||
def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
|
||||
if tenant_info["state"]["slug"] == expected_state:
|
||||
return True
|
||||
if tenant_info["state"]["slug"] == "Broken":
|
||||
raise RuntimeError(f"tenant became Broken, not {expected_state}")
|
||||
return False
|
||||
|
||||
|
||||
def wait_until_tenant_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
@@ -80,10 +88,8 @@ def wait_until_tenant_state(
|
||||
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
|
||||
else:
|
||||
log.debug(f"Tenant {tenant_id} data: {tenant}")
|
||||
if tenant["state"]["slug"] == expected_state:
|
||||
if _tenant_in_expected_state(tenant, expected_state):
|
||||
return tenant
|
||||
if tenant["state"]["slug"] == "Broken":
|
||||
raise RuntimeError(f"tenant became Broken, not {expected_state}")
|
||||
|
||||
time.sleep(period)
|
||||
|
||||
@@ -92,6 +98,34 @@ def wait_until_tenant_state(
|
||||
)
|
||||
|
||||
|
||||
def wait_until_all_tenants_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
expected_state: str,
|
||||
iterations: int,
|
||||
period: float = 1.0,
|
||||
http_error_ok: bool = True,
|
||||
):
|
||||
"""
|
||||
Like wait_until_tenant_state, but checks all tenants.
|
||||
"""
|
||||
for _ in range(iterations):
|
||||
try:
|
||||
tenants = pageserver_http.tenant_list()
|
||||
except Exception as e:
|
||||
if http_error_ok:
|
||||
log.debug(f"Failed to list tenants: {e}")
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)):
|
||||
return
|
||||
time.sleep(period)
|
||||
|
||||
raise Exception(
|
||||
f"Not all tenants became active {expected_state} within {iterations * period} seconds"
|
||||
)
|
||||
|
||||
|
||||
def wait_until_timeline_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
@@ -337,8 +371,24 @@ def tenant_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
iterations: int,
|
||||
ignore_errors: bool = False,
|
||||
):
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
if not ignore_errors:
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
else:
|
||||
interval = 0.5
|
||||
|
||||
def delete_request_sent():
|
||||
try:
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
except Exception as e:
|
||||
log.debug(e)
|
||||
|
||||
wait_until(iterations, interval=interval, func=delete_request_sent)
|
||||
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
|
||||
|
||||
|
||||
|
||||
@@ -397,3 +397,36 @@ def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
|
||||
}
|
||||
"""
|
||||
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
|
||||
|
||||
|
||||
def humantime_to_ms(humantime: str) -> float:
|
||||
"""
|
||||
Converts Rust humantime's output string to milliseconds.
|
||||
|
||||
humantime_to_ms("1h 1ms 406us") -> 3600001.406
|
||||
"""
|
||||
|
||||
unit_multiplier_map = {
|
||||
"ns": 1e-6,
|
||||
"us": 1e-3,
|
||||
"ms": 1,
|
||||
"s": 1e3,
|
||||
"m": 1e3 * 60,
|
||||
"h": 1e3 * 60 * 60,
|
||||
}
|
||||
matcher = re.compile(rf"^(\d+)({'|'.join(unit_multiplier_map.keys())})$")
|
||||
total_ms = 0.0
|
||||
|
||||
if humantime == "0":
|
||||
return total_ms
|
||||
|
||||
for item in humantime.split():
|
||||
if (match := matcher.search(item)) is not None:
|
||||
n, unit = match.groups()
|
||||
total_ms += int(n) * unit_multiplier_map[unit]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"can't parse '{item}' (from string '{humantime}'), known units are {', '.join(unit_multiplier_map.keys())}."
|
||||
)
|
||||
|
||||
return round(total_ms, 3)
|
||||
|
||||
16
test_runner/performance/pageserver/README.md
Normal file
16
test_runner/performance/pageserver/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
How to reproduce benchmark results / run these benchmarks interactively.
|
||||
|
||||
1. Get an EC2 instance with Instance Store. Use the same instance type as used for the benchmark run.
|
||||
2. Mount the Instance Store => `neon.git/scripts/ps_ec2_setup_instance_store`
|
||||
3. Use a pytest command line (see other READMEs further up in the pytest hierarchy).
|
||||
|
||||
For tests that take a long time to set up / consume a lot of storage space,
|
||||
we use the test suite's repo_dir snapshotting functionality (`from_repo_dir`).
|
||||
It supports mounting snapshots using overlayfs, which improves iteration time.
|
||||
|
||||
Here's a full command line.
|
||||
|
||||
```
|
||||
RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \
|
||||
./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
|
||||
````
|
||||
0
test_runner/performance/pageserver/__init__.py
Normal file
0
test_runner/performance/pageserver/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
Tests that aren't really tests or benchmarks.
|
||||
|
||||
They're intended for the case where we want to standardize & automate setup,
|
||||
but then debug a performance problem interactively.
|
||||
It's kind of an abuse of the test framework, but, it's our only tool right
|
||||
now to automate a complex test bench setup.
|
||||
"""
|
||||
@@ -0,0 +1,79 @@
|
||||
import os
|
||||
import pdb
|
||||
|
||||
import fixtures.pageserver.many_tenants as many_tenants
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
last_flush_lsn_upload,
|
||||
)
|
||||
|
||||
from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
|
||||
|
||||
"""
|
||||
Usage:
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
|
||||
./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("INTERACTIVE", "false") != "true",
|
||||
reason="test is for interactive use only",
|
||||
)
|
||||
def test_many_small_tenants(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
_env = setup_env(neon_env_builder, 2) # vary this to the desired number of tenants
|
||||
_pg_bin = pg_bin
|
||||
|
||||
# drop into pdb so that we can debug pageserver interactively, use pdb here
|
||||
# For example, to interactively examine pageserver startup behavior, call
|
||||
# _env.pageserver.stop(immediate=True)
|
||||
# _env.pageserver.start()
|
||||
# from the pdb shell.
|
||||
pdb.set_trace()
|
||||
|
||||
|
||||
def setup_env(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
n_tenants: int,
|
||||
) -> NeonEnv:
|
||||
def setup_template(env: NeonEnv):
|
||||
# create our template tenant
|
||||
config = {
|
||||
"gc_period": "0s",
|
||||
"checkpoint_timeout": "10 years",
|
||||
"compaction_period": "20 s",
|
||||
"compaction_threshold": 10,
|
||||
"compaction_target_size": 134217728,
|
||||
"checkpoint_distance": 268435456,
|
||||
"image_creation_threshold": 3,
|
||||
}
|
||||
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
|
||||
env.pageserver.tenant_detach(template_tenant)
|
||||
env.pageserver.allowed_errors.append(
|
||||
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
|
||||
".*Dropped remote consistent LSN updates.*",
|
||||
)
|
||||
env.pageserver.tenant_attach(template_tenant, config)
|
||||
ep = env.endpoints.create_start("main", tenant_id=template_tenant)
|
||||
ep.safe_psql("create table foo(b text)")
|
||||
for _ in range(0, 8):
|
||||
ep.safe_psql("insert into foo(b) values ('some text')")
|
||||
last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
|
||||
ep.stop_and_destroy()
|
||||
return (template_tenant, template_timeline, config)
|
||||
|
||||
def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
|
||||
|
||||
env = neon_env_builder.build_and_use_snapshot(f"many-small-tenants-{n_tenants}", doit)
|
||||
|
||||
env.start()
|
||||
ensure_pageserver_ready_for_benchmarking(env, n_tenants)
|
||||
|
||||
return env
|
||||
10
test_runner/performance/pageserver/pagebench/__init__.py
Normal file
10
test_runner/performance/pageserver/pagebench/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""
|
||||
Pagebench-based performance regression tests.
|
||||
|
||||
The defining characteristic of tests in this sub-directory is that they
|
||||
are component-level tests, i.e., they exercise pageserver directly using `pagebench`
|
||||
instead of benchmarking the full stack.
|
||||
|
||||
See https://github.com/neondatabase/neon/issues/5771
|
||||
for the context in which this was developed.
|
||||
"""
|
||||
@@ -0,0 +1,210 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
import fixtures.pageserver.many_tenants as many_tenants
|
||||
import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.utils import get_scale_for_db, humantime_to_ms
|
||||
|
||||
from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
|
||||
|
||||
|
||||
# For reference, the space usage of the snapshots:
|
||||
# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
|
||||
# 137G /instance_store/test_output/shared-snapshots
|
||||
# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
|
||||
# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
|
||||
# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
|
||||
# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
|
||||
# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
|
||||
# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
|
||||
# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
|
||||
@pytest.mark.parametrize("duration", [30])
|
||||
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
|
||||
@pytest.mark.parametrize("n_tenants", [1, 10, 100])
|
||||
@pytest.mark.timeout(
|
||||
10000
|
||||
) # TODO: this value is just "a really high number"; have this per instance type
|
||||
def test_pageserver_max_throughput_getpage_at_latest_lsn(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
pg_bin: PgBin,
|
||||
n_tenants: int,
|
||||
pgbench_scale: int,
|
||||
duration: int,
|
||||
):
|
||||
def record(metric, **kwargs):
|
||||
zenbenchmark.record(
|
||||
metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
|
||||
)
|
||||
|
||||
params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
|
||||
|
||||
# params from fixtures
|
||||
params.update(
|
||||
{
|
||||
"n_tenants": (n_tenants, {"unit": ""}),
|
||||
"pgbench_scale": (pgbench_scale, {"unit": ""}),
|
||||
"duration": (duration, {"unit": "s"}),
|
||||
}
|
||||
)
|
||||
|
||||
# configure cache sizes like in prod
|
||||
page_cache_size = 16384
|
||||
max_file_descriptors = 500000
|
||||
neon_env_builder.pageserver_config_override = (
|
||||
f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
|
||||
)
|
||||
params.update(
|
||||
{
|
||||
"pageserver_config_override.page_cache_size": (
|
||||
page_cache_size * 8192,
|
||||
{"unit": "byte"},
|
||||
),
|
||||
"pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
|
||||
}
|
||||
)
|
||||
|
||||
for param, (value, kwargs) in params.items():
|
||||
record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
|
||||
env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
|
||||
run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
|
||||
|
||||
|
||||
def run_benchmark_max_throughput_latest_lsn(
|
||||
env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
|
||||
):
|
||||
"""
|
||||
Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
|
||||
"""
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
cmd = [
|
||||
str(env.neon_binpath / "pagebench"),
|
||||
"get-page-latest-lsn",
|
||||
"--mgmt-api-endpoint",
|
||||
ps_http.base_url,
|
||||
"--page-service-connstring",
|
||||
env.pageserver.connstr(password=None),
|
||||
"--runtime",
|
||||
f"{duration_secs}s",
|
||||
# don't specify the targets explicitly, let pagebench auto-discover them
|
||||
]
|
||||
log.info(f"command: {' '.join(cmd)}")
|
||||
basepath = pg_bin.run_capture(cmd, with_command_header=False)
|
||||
results_path = Path(basepath + ".stdout")
|
||||
log.info(f"Benchmark results at: {results_path}")
|
||||
|
||||
with open(results_path, "r") as f:
|
||||
results = json.load(f)
|
||||
log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
|
||||
|
||||
total = results["total"]
|
||||
|
||||
metric = "request_count"
|
||||
record(
|
||||
metric,
|
||||
metric_value=total[metric],
|
||||
unit="",
|
||||
report=MetricReport.HIGHER_IS_BETTER,
|
||||
)
|
||||
|
||||
metric = "latency_mean"
|
||||
record(
|
||||
metric,
|
||||
metric_value=humantime_to_ms(total[metric]),
|
||||
unit="ms",
|
||||
report=MetricReport.LOWER_IS_BETTER,
|
||||
)
|
||||
|
||||
metric = "latency_percentiles"
|
||||
for k, v in total[metric].items():
|
||||
record(
|
||||
f"{metric}.{k}",
|
||||
metric_value=humantime_to_ms(v),
|
||||
unit="ms",
|
||||
report=MetricReport.LOWER_IS_BETTER,
|
||||
)
|
||||
|
||||
|
||||
def setup_pageserver_with_pgbench_tenants(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
n_tenants: int,
|
||||
scale: int,
|
||||
) -> NeonEnv:
|
||||
"""
|
||||
Utility function to set up a pageserver with a given number of identical tenants.
|
||||
Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards
|
||||
with a repeat application of (pgbench simple-update workload, checkpoint, compact).
|
||||
"""
|
||||
|
||||
def setup_template(env: NeonEnv):
|
||||
# use a config that makes production of on-disk state timing-insensitive
|
||||
# as we ingest data into the tenant.
|
||||
config = {
|
||||
"gc_period": "0s", # disable periodic gc
|
||||
"checkpoint_timeout": "10 years",
|
||||
"compaction_period": "0s", # disable periodic compaction
|
||||
"compaction_threshold": 10,
|
||||
"compaction_target_size": 134217728,
|
||||
"checkpoint_distance": 268435456,
|
||||
"image_creation_threshold": 3,
|
||||
}
|
||||
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
|
||||
env.pageserver.tenant_detach(template_tenant)
|
||||
env.pageserver.allowed_errors.append(
|
||||
# tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
|
||||
".*Dropped remote consistent LSN updates.*",
|
||||
)
|
||||
env.pageserver.tenant_attach(template_tenant, config)
|
||||
ps_http = env.pageserver.http_client()
|
||||
with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
|
||||
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
|
||||
wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
|
||||
ps_http.timeline_checkpoint(template_tenant, template_timeline)
|
||||
ps_http.timeline_compact(template_tenant, template_timeline)
|
||||
for _ in range(
|
||||
0, 17
|
||||
): # some prime number to avoid potential resonances with the "_threshold" variables from the config
|
||||
# the L0s produced by this appear to have size ~5MiB
|
||||
num_txns = 10_000
|
||||
pg_bin.run_capture(
|
||||
["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
|
||||
)
|
||||
wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
|
||||
ps_http.timeline_checkpoint(template_tenant, template_timeline)
|
||||
ps_http.timeline_compact(template_tenant, template_timeline)
|
||||
# for reference, the output at scale=6 looked like so (306M total)
|
||||
# ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
|
||||
# total 306M
|
||||
# 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
|
||||
# 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
|
||||
# 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
|
||||
# 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
|
||||
# 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
|
||||
# 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
|
||||
# 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
|
||||
# 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
|
||||
# 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
|
||||
# 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
|
||||
|
||||
return (template_tenant, template_timeline, config)
|
||||
|
||||
def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
|
||||
|
||||
env = neon_env_builder.build_and_use_snapshot(
|
||||
f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
|
||||
)
|
||||
env.start()
|
||||
ensure_pageserver_ready_for_benchmarking(env, n_tenants)
|
||||
return env
|
||||
29
test_runner/performance/pageserver/util.py
Normal file
29
test_runner/performance/pageserver/util.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""
|
||||
Utilities used by all code in this sub-directory
|
||||
"""
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.pageserver.utils import wait_until_all_tenants_state
|
||||
|
||||
|
||||
def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
|
||||
"""
|
||||
Helper function.
|
||||
"""
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
log.info("wait for all tenants to become active")
|
||||
wait_until_all_tenants_state(
|
||||
ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
|
||||
)
|
||||
|
||||
# ensure all layers are resident for predictiable performance
|
||||
tenants = [info["id"] for info in ps_http.tenant_list()]
|
||||
for tenant in tenants:
|
||||
for timeline in ps_http.tenant_status(tenant)["timelines"]:
|
||||
info = ps_http.layer_map_info(tenant, timeline)
|
||||
for layer in info.historic_layers:
|
||||
assert not layer.remote
|
||||
|
||||
log.info("ready")
|
||||
@@ -7,11 +7,13 @@ from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
import toml
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
timeline_delete_wait_completed,
|
||||
wait_for_last_record_lsn,
|
||||
@@ -269,14 +271,20 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
|
||||
timeline_id = env.initial_timeline
|
||||
pg_version = env.pg_version
|
||||
|
||||
# Delete all files from local_fs_remote_storage except initdb.tar.zst,
|
||||
try:
|
||||
pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
|
||||
except PageserverApiException as e:
|
||||
# Allow the error as we might be running the old pageserver binary
|
||||
log.info(f"Got allowed error: '{e}'")
|
||||
|
||||
# Delete all files from local_fs_remote_storage except initdb-preserved.tar.zst,
|
||||
# the file is required for `timeline_create` with `existing_initdb_timeline_id`.
|
||||
#
|
||||
# TODO: switch to Path.walk() in Python 3.12
|
||||
# for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
|
||||
for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
|
||||
for filename in filenames:
|
||||
if filename != "initdb.tar.zst":
|
||||
if filename != "initdb-preserved.tar.zst" and filename != "initdb.tar.zst":
|
||||
(Path(dirpath) / filename).unlink()
|
||||
|
||||
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
|
||||
|
||||
@@ -2,7 +2,7 @@ import enum
|
||||
import time
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Tuple
|
||||
from typing import Any, Dict, Iterable, Tuple
|
||||
|
||||
import pytest
|
||||
import toml
|
||||
@@ -121,17 +121,7 @@ class EvictionEnv:
|
||||
}
|
||||
|
||||
def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]:
|
||||
ret: Counter[TenantId] = Counter()
|
||||
|
||||
for tenant_id, timeline_id in self.timelines:
|
||||
timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
assert timeline_dir.exists()
|
||||
for file in timeline_dir.iterdir():
|
||||
if "__" not in file.name:
|
||||
continue
|
||||
ret[tenant_id] += 1
|
||||
|
||||
return dict(ret)
|
||||
return count_layers_per_tenant(pageserver, self.timelines)
|
||||
|
||||
def warm_up_tenant(self, tenant_id: TenantId):
|
||||
"""
|
||||
@@ -199,6 +189,22 @@ class EvictionEnv:
|
||||
wait_until(10, 1, statvfs_called)
|
||||
|
||||
|
||||
def count_layers_per_tenant(
|
||||
pageserver: NeonPageserver, timelines: Iterable[Tuple[TenantId, TimelineId]]
|
||||
) -> Dict[TenantId, int]:
|
||||
ret: Counter[TenantId] = Counter()
|
||||
|
||||
for tenant_id, timeline_id in timelines:
|
||||
timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
assert timeline_dir.exists()
|
||||
for file in timeline_dir.iterdir():
|
||||
if "__" not in file.name:
|
||||
continue
|
||||
ret[tenant_id] += 1
|
||||
|
||||
return dict(ret)
|
||||
|
||||
|
||||
def human_bytes(amt: float) -> str:
|
||||
suffixes = ["", "Ki", "Mi", "Gi"]
|
||||
|
||||
@@ -243,21 +249,7 @@ def _eviction_env(
|
||||
|
||||
timelines = []
|
||||
for scale in pgbench_scales:
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"checkpoint_distance": f"{layer_size}",
|
||||
"image_creation_threshold": "100",
|
||||
"compaction_target_size": f"{layer_size}",
|
||||
}
|
||||
)
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
timelines.append((tenant_id, timeline_id))
|
||||
timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin))
|
||||
|
||||
# stop the safekeepers to avoid on-demand downloads caused by
|
||||
# initial logical size calculation triggered by walreceiver connection status
|
||||
@@ -266,25 +258,13 @@ def _eviction_env(
|
||||
|
||||
# after stopping the safekeepers, we know that no new WAL will be coming in
|
||||
for tenant_id, timeline_id in timelines:
|
||||
pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
|
||||
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
|
||||
tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
|
||||
assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
|
||||
pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"])
|
||||
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
log.info(f"{layers}")
|
||||
assert (
|
||||
len(layers.historic_layers) >= 10
|
||||
), "evictions happen at layer granularity, but we often assert at byte-granularity"
|
||||
pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10)
|
||||
|
||||
eviction_env = EvictionEnv(
|
||||
timelines=timelines,
|
||||
neon_env=env,
|
||||
pageserver_http=pageserver_http,
|
||||
# this last tenant http client works for num_pageservers=1
|
||||
pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(),
|
||||
layer_size=layer_size,
|
||||
pg_bin=pg_bin,
|
||||
pgbench_init_lsns=pgbench_init_lsns,
|
||||
@@ -293,6 +273,49 @@ def _eviction_env(
|
||||
return eviction_env
|
||||
|
||||
|
||||
def pgbench_init_tenant(
|
||||
layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin
|
||||
) -> Tuple[TenantId, TimelineId]:
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"checkpoint_distance": f"{layer_size}",
|
||||
"image_creation_threshold": "100",
|
||||
"compaction_target_size": f"{layer_size}",
|
||||
}
|
||||
)
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
return (tenant_id, timeline_id)
|
||||
|
||||
|
||||
def finish_tenant_creation(
|
||||
env: NeonEnv,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
min_expected_layers: int,
|
||||
) -> Lsn:
|
||||
pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
|
||||
tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
|
||||
assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
|
||||
pgbench_init_lsn = Lsn(tl_info["last_record_lsn"])
|
||||
|
||||
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
|
||||
# log.info(f"{layers}")
|
||||
assert (
|
||||
len(layers.historic_layers) >= min_expected_layers
|
||||
), "evictions happen at layer granularity, but we often assert at byte-granularity"
|
||||
|
||||
return pgbench_init_lsn
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
|
||||
return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1)
|
||||
@@ -598,9 +621,82 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
assert abs_diff < 0.05
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[
|
||||
EvictionOrder.ABSOLUTE_ORDER,
|
||||
EvictionOrder.RELATIVE_ORDER_EQUAL,
|
||||
EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
],
|
||||
)
|
||||
def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder):
|
||||
"""
|
||||
Create in order first smaller tenants and finally a single larger tenant.
|
||||
Assert that with relative order modes, the disk usage based eviction is
|
||||
more fair towards the smaller tenants.
|
||||
"""
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
|
||||
|
||||
# initial_tenant and initial_timeline do not exist
|
||||
|
||||
# create N tenants the same fashion as EvictionEnv
|
||||
layer_size = 5 * 1024**2
|
||||
timelines = []
|
||||
for scale in [1, 1, 1, 4]:
|
||||
timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))
|
||||
|
||||
env.neon_cli.safekeeper_stop()
|
||||
|
||||
for (tenant_id, timeline_id), scale in timelines:
|
||||
min_expected_layers = 4 if scale == 1 else 10
|
||||
finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers)
|
||||
|
||||
tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
|
||||
(total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False)
|
||||
|
||||
# cut 10 percent
|
||||
response = env.pageserver.http_client().disk_usage_eviction_run(
|
||||
{"evict_bytes": total_on_disk // 10, "eviction_order": order.config()}
|
||||
)
|
||||
log.info(f"{response}")
|
||||
|
||||
after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
|
||||
|
||||
ratios = []
|
||||
for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines):
|
||||
# we expect the oldest to suffer most
|
||||
originally, after = tenant_layers[tenant_id], after_tenant_layers[tenant_id]
|
||||
log.info(f"{i + 1}th tenant went from {originally} -> {after}")
|
||||
ratio = after / originally
|
||||
ratios.append(ratio)
|
||||
|
||||
assert (
|
||||
len(ratios) == 4
|
||||
), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
|
||||
log.info(f"{ratios}")
|
||||
|
||||
if order == EvictionOrder.ABSOLUTE_ORDER:
|
||||
# first tenant loses most
|
||||
assert ratios[0] <= ratios[1], "first should lose the most"
|
||||
assert ratios[1] < ratios[2], "second should lose some"
|
||||
assert ratios[1] < 1.0
|
||||
assert ratios[2] <= ratios[3], "third might not lose"
|
||||
assert ratios[3] == 1.0, "tenant created last does not lose"
|
||||
elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
|
||||
assert all([x for x in ratios if x < 1.0]), "all tenants lose layers"
|
||||
elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
|
||||
# with different layer sizes and pg versions, there are different combinations
|
||||
assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers"
|
||||
assert ratios[3] < 1.0, "largest tenant always loses layers"
|
||||
else:
|
||||
raise RuntimeError(f"unimplemented {order}")
|
||||
|
||||
|
||||
def poor_mans_du(
|
||||
env: NeonEnv,
|
||||
timelines: list[Tuple[TenantId, TimelineId]],
|
||||
timelines: Iterable[Tuple[TenantId, TimelineId]],
|
||||
pageserver: NeonPageserver,
|
||||
verbose: bool = False,
|
||||
) -> Tuple[int, int, int]:
|
||||
|
||||
34
test_runner/regress/test_neon_superuser.py
Normal file
34
test_runner/regress/test_neon_superuser.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import time
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
|
||||
def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
|
||||
env = neon_simple_env
|
||||
env.neon_cli.create_branch("test_neon_superuser", "empty")
|
||||
endpoint = env.endpoints.create("test_neon_superuser")
|
||||
endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
|
||||
endpoint.start()
|
||||
|
||||
time.sleep(1) # Sleep to let migrations run
|
||||
|
||||
with endpoint.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
|
||||
)
|
||||
cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
|
||||
cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
|
||||
|
||||
with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
|
||||
cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
|
||||
assert cur.fetchall()[0][0]
|
||||
cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
|
||||
assert cur.fetchall()[0][0]
|
||||
|
||||
if pg_version == PgVersion.V16:
|
||||
cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'set')")
|
||||
assert cur.fetchall()[0][0]
|
||||
|
||||
cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
|
||||
cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
|
||||
@@ -5,7 +5,10 @@ from typing import Any, Dict, Optional
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
|
||||
from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
tenant_delete_wait_completed,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
@@ -135,6 +138,16 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
|
||||
pageserver.stop()
|
||||
pageserver.start()
|
||||
if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
|
||||
# /re-attach call will bump generation: track that in our state in case we do an
|
||||
# "attach in same generation" operation later
|
||||
assert last_state_ps[1] is not None # latest_attached == pageserfer.id implies this
|
||||
# The re-attach API increments generation by exactly one.
|
||||
new_generation = last_state_ps[1] + 1
|
||||
last_state[pageserver.id] = (last_state_ps[0], new_generation)
|
||||
tenants = pageserver.http_client().tenant_list()
|
||||
assert len(tenants) == 1
|
||||
assert tenants[0]["generation"] == new_generation
|
||||
|
||||
log.info("Entering postgres...")
|
||||
workload.churn_rows(rng.randint(128, 256), pageserver.id)
|
||||
workload.validate(pageserver.id)
|
||||
|
||||
@@ -411,9 +411,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
|
||||
pageserver_http.configure_failpoints((failpoint, "pause"))
|
||||
|
||||
def hit_pausable_failpoint_and_later_fail():
|
||||
with pytest.raises(
|
||||
PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn"
|
||||
):
|
||||
with pytest.raises(PageserverApiException, match="NotFound: tenant"):
|
||||
pageserver_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
@@ -443,8 +441,8 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
|
||||
try:
|
||||
wait_until(10, 1, has_hit_failpoint)
|
||||
|
||||
# it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated
|
||||
# then deletion should fail and set the tenant broken
|
||||
# it should start ok, sync up with the stuck creation, then hang waiting for the timeline
|
||||
# to shut down.
|
||||
deletion = Thread(target=start_deletion)
|
||||
deletion.start()
|
||||
|
||||
@@ -573,11 +571,16 @@ def test_tenant_delete_races_timeline_creation(
|
||||
ps_http = env.pageserver.http_client()
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
# Sometimes it ends with "InternalServerError(Cancelled", sometimes with "InternalServerError(Operation was cancelled"
|
||||
# When timeline creation is cancelled by tenant deletion, it is during Tenant::shutdown(), and
|
||||
# acting on a shutdown tenant generates a 503 response (if caller retried they would later) get
|
||||
# a 404 after the tenant is fully deleted.
|
||||
CANCELLED_ERROR = (
|
||||
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
|
||||
".*POST.*Cancelled request finished successfully status=503 Service Unavailable"
|
||||
)
|
||||
|
||||
# This can occur sometimes.
|
||||
CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
@@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation(
|
||||
".*POST.*/timeline.* request was dropped before completing",
|
||||
# Timeline creation runs into this error
|
||||
CANCELLED_ERROR,
|
||||
# Timeline deletion can run into this error during deletion
|
||||
CONFLICT_MESSAGE,
|
||||
".*tenant_delete_handler.*still waiting, taking longer than expected.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation(
|
||||
ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
|
||||
|
||||
def tenant_delete():
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
def tenant_delete_inner():
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
|
||||
wait_until(100, 0.5, tenant_delete_inner)
|
||||
|
||||
Thread(target=tenant_delete).start()
|
||||
|
||||
@@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation(
|
||||
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
try:
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
except PageserverApiException:
|
||||
pass
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
|
||||
@@ -482,7 +482,7 @@ def test_detach_while_attaching(
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# And re-attach
|
||||
pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
|
||||
pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(5000)")])
|
||||
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
|
||||
@@ -681,7 +681,7 @@ def test_detach_while_activating(
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# And re-attach, but stop attach task_mgr task from completing
|
||||
pageserver_http.configure_failpoints([("attach-before-activate", "return(600000)")])
|
||||
pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(600000)")])
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
|
||||
# The tenant is in the Activating state. This should not block us from
|
||||
@@ -695,7 +695,7 @@ def test_detach_while_activating(
|
||||
), "Only ignored tenant should be missing"
|
||||
|
||||
# Subsequently attaching it again should still work
|
||||
pageserver_http.configure_failpoints([("attach-before-activate", "off")])
|
||||
pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")])
|
||||
env.pageserver.tenant_attach(tenant_id)
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import concurrent.futures
|
||||
import os
|
||||
import time
|
||||
from contextlib import closing
|
||||
@@ -7,6 +8,7 @@ from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import (
|
||||
PAGESERVER_GLOBAL_METRICS,
|
||||
@@ -17,7 +19,9 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.utils import timeline_delete_wait_completed
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.types import Lsn, TenantId
|
||||
from fixtures.utils import wait_until
|
||||
@@ -341,3 +345,78 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
|
||||
assert (
|
||||
tenant_active_count == 1
|
||||
), f"Tenant {tenant_with_empty_timelines} should have metric as active"
|
||||
|
||||
|
||||
def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Probabilistic stress test for the pageserver's handling of tenant requests
|
||||
across a restart. This is intended to catch things like:
|
||||
- Bad response codes during shutdown (e.g. returning 500 instead of 503)
|
||||
- Issues where a tenant is still starting up while we receive a request for it
|
||||
- Issues with interrupting/resuming tenant/timeline creation in shutdown
|
||||
"""
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
tenant_id: TenantId = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Multiple creation requests which race will generate this error
|
||||
env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
|
||||
|
||||
# Tenant creation requests which arrive out of order will generate complaints about
|
||||
# generation nubmers out of order.
|
||||
env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
|
||||
|
||||
# Our multiple creation requests will advance generation quickly, and when we skip
|
||||
# a generation number we can generate these warnings
|
||||
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+")
|
||||
|
||||
# Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
|
||||
# an incomplete attach, or some other problem. In the field this should be rare,
|
||||
# so we allow it to log at WARN, even if it is occasionally a false positive.
|
||||
env.pageserver.allowed_errors.append(".*failed to freeze and flush.*")
|
||||
|
||||
# When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait
|
||||
# for it to complete (since https://github.com/neondatabase/neon/pull/6451). This means
|
||||
# that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run.
|
||||
env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... gate.*")
|
||||
|
||||
def create_bg(delay_ms):
|
||||
time.sleep(delay_ms / 1000.0)
|
||||
try:
|
||||
env.pageserver.tenant_create(tenant_id=tenant_id)
|
||||
env.pageserver.http_client().timeline_create(
|
||||
PgVersion.NOT_SET, tenant_id, new_timeline_id=timeline_id
|
||||
)
|
||||
except PageserverApiException as e:
|
||||
if e.status_code == 409:
|
||||
log.info(f"delay_ms={delay_ms} 409")
|
||||
pass
|
||||
elif e.status_code == 400:
|
||||
if "is less than existing" in e.message:
|
||||
# We send creation requests very close together in time: it is expected that these
|
||||
# race, and sometimes chigher-generation'd requests arrive first. The pageserver rightly
|
||||
# rejects any attempt to make a generation number go backwards.
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
raise
|
||||
except requests.exceptions.ConnectionError:
|
||||
# Our requests might arrive during shutdown and be cut off at the transport level
|
||||
pass
|
||||
|
||||
for _ in range(0, 10):
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futs = []
|
||||
for delay_ms in (0, 1, 10, 50, 100, 200, 500, 800):
|
||||
f = executor.submit(create_bg, delay_ms)
|
||||
futs.append(f)
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
for f in futs:
|
||||
f.result(timeout=10)
|
||||
|
||||
# The tenant should end up active
|
||||
wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)
|
||||
|
||||
@@ -868,7 +868,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
|
||||
|
||||
# Check that tenant deletion proactively wakes tenants: this is done separately to the main
|
||||
# Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
|
||||
# body of the test because it will disrupt tenant counts
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start(
|
||||
@@ -876,9 +876,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
|
||||
wait_until(10, 1, at_least_one_active)
|
||||
delete_tenant_id = list(
|
||||
|
||||
detach_tenant_id = list(
|
||||
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
|
||||
)[0][0]
|
||||
delete_tenant_id = list(
|
||||
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
|
||||
)[1][0]
|
||||
|
||||
# Detaching a stuck tenant should proceed promptly
|
||||
# (reproducer for https://github.com/neondatabase/neon/pull/6430)
|
||||
env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10)
|
||||
tenant_ids.remove(detach_tenant_id)
|
||||
# FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*attach failed, setting tenant state to Broken: Shut down while Attaching"
|
||||
)
|
||||
|
||||
# Deleting a stuck tenant should prompt it to go active
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
@@ -912,9 +925,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
|
||||
tenant_ids.remove(delete_tenant_id)
|
||||
|
||||
# Check that all the stuck tenants proceed to active (apart from the one that deletes)
|
||||
# Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
|
||||
# we detached)
|
||||
wait_until(10, 1, all_active)
|
||||
assert len(get_tenant_states()) == n_tenants - 1
|
||||
assert len(get_tenant_states()) == n_tenants - 2
|
||||
|
||||
|
||||
def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
@@ -137,6 +137,9 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
ps_client = env.pageserver.http_client()
|
||||
|
||||
# Mark the initdb archive for preservation
|
||||
ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
|
||||
|
||||
# shut down the endpoint and delete the timeline from the pageserver
|
||||
endpoint.stop()
|
||||
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 8207291128...11e970fe2b
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: c1c2272f43...731b4d1609
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: 7be4a52d72...cf302768b2
6
vendor/revisions.json
vendored
6
vendor/revisions.json
vendored
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"postgres-v16": "7be4a52d728459b79b59343c57d338c3073059c8",
|
||||
"postgres-v15": "c1c2272f436ed9231f6172f49de219fe71a9280d",
|
||||
"postgres-v14": "82072911287cabb32018cf92c8425fa1c744def4"
|
||||
"postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
|
||||
"postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b",
|
||||
"postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user