Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-03 18:50:38 +00:00)

Compare commits
19 commits: jcsp/ha-te...jcsp/paths
| Author | SHA1 | Date |
|---|---|---|
|  | 4363132305 |  |
|  | 621ea2ec44 |  |
|  | 74d09b78c7 |  |
|  | 0cf0731d8b |  |
|  | 98723844ee |  |
|  | 73a8c97ac8 |  |
|  | 17a3c9036e |  |
|  | 8c5b310090 |  |
|  | 8224580f3e |  |
|  | 2b0f3549f7 |  |
|  | b4972d07d4 |  |
|  | 26ae7b0b3e |  |
|  | f8483cc4a3 |  |
|  | cc5d6c66b3 |  |
|  | d894d2b450 |  |
|  | b09d686335 |  |
|  | 74d24582cf |  |
|  | 4834d22d2d |  |
|  | 86e8c43ddf |  |
4 .github/workflows/build_and_test.yml (vendored)
@@ -474,7 +474,7 @@ jobs:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored

# Temporary disable this step until we figure out why it's so flaky
@@ -554,7 +554,7 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones

@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use nix::unistd::Pid;
|
||||
use postgres::error::SqlState;
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
@@ -722,8 +723,12 @@ impl ComputeNode {
|
||||
// Stop it when it's ready
|
||||
info!("waiting for postgres");
|
||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||
pg.kill()?;
|
||||
info!("sent kill signal");
|
||||
// SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
|
||||
// it to avoid orphaned processes prowling around while datadir is
|
||||
// wiped.
|
||||
let pm_pid = Pid::from_raw(pg.id() as i32);
|
||||
kill(pm_pid, Signal::SIGQUIT)?;
|
||||
info!("sent SIGQUIT signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
|
||||
|
||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
@@ -805,6 +805,18 @@ $$;"#,
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
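Below is a hedged sketch, not part of the diff, of how the effect of this migration could be checked with the same `postgres` client types this file already uses; the connection string and the function name are placeholders.

```rust
use postgres::{Client, NoTls};

/// Hedged check: after the migration above has looped over pg_roles and issued
/// `ALTER ROLE ... NOREPLICATION`, the same catalog query should return no rows.
fn assert_no_replication_roles(connstr: &str) -> Result<(), postgres::Error> {
    let mut client = Client::connect(connstr, NoTls)?;
    let rows = client.query(
        "SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE",
        &[],
    )?;
    assert!(rows.is_empty(), "{} role(s) still have REPLICATION", rows.len());
    Ok(())
}
```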
|
||||
@@ -83,16 +83,10 @@ struct ServiceState {
|
||||
nodes: Arc<HashMap<NodeId, Node>>,
|
||||
|
||||
scheduler: Scheduler,
|
||||
|
||||
compute_hook: Arc<ComputeHook>,
|
||||
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
}
|
||||
|
||||
impl ServiceState {
|
||||
fn new(
|
||||
config: Config,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
nodes: HashMap<NodeId, Node>,
|
||||
tenants: BTreeMap<TenantShardId, TenantState>,
|
||||
scheduler: Scheduler,
|
||||
@@ -101,8 +95,6 @@ impl ServiceState {
|
||||
tenants,
|
||||
nodes: Arc::new(nodes),
|
||||
scheduler,
|
||||
compute_hook: Arc::new(ComputeHook::new(config)),
|
||||
result_tx,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,6 +144,8 @@ pub struct Service {
|
||||
inner: Arc<std::sync::RwLock<ServiceState>>,
|
||||
config: Config,
|
||||
persistence: Arc<Persistence>,
|
||||
compute_hook: Arc<ComputeHook>,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
|
||||
// Process shutdown will fire this token
|
||||
cancel: CancellationToken,
|
||||
@@ -481,8 +475,6 @@ impl Service {
|
||||
notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
|
||||
deadline: Instant,
|
||||
) -> HashSet<TenantShardId> {
|
||||
let compute_hook = self.inner.read().unwrap().compute_hook.clone();
|
||||
|
||||
let attempt_shards = notifications.iter().map(|i| i.0).collect::<HashSet<_>>();
|
||||
let mut success_shards = HashSet::new();
|
||||
|
||||
@@ -490,7 +482,7 @@ impl Service {
|
||||
// in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
|
||||
let mut stream = futures::stream::iter(notifications.into_iter())
|
||||
.map(|(tenant_shard_id, node_id, stripe_size)| {
|
||||
let compute_hook = compute_hook.clone();
|
||||
let compute_hook = self.compute_hook.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
async move {
|
||||
if let Err(e) = compute_hook
|
||||
@@ -730,14 +722,12 @@ impl Service {
|
||||
|
||||
let this = Arc::new(Self {
|
||||
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
|
||||
config.clone(),
|
||||
result_tx,
|
||||
nodes,
|
||||
tenants,
|
||||
scheduler,
|
||||
nodes, tenants, scheduler,
|
||||
))),
|
||||
config,
|
||||
config: config.clone(),
|
||||
persistence,
|
||||
compute_hook: Arc::new(ComputeHook::new(config)),
|
||||
result_tx,
|
||||
startup_complete: startup_complete.clone(),
|
||||
cancel: CancellationToken::new(),
|
||||
gate: Gate::default(),
|
||||
@@ -1145,8 +1135,6 @@ impl Service {
|
||||
|
||||
let (waiters, response_shards) = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let mut response_shards = Vec::new();
|
||||
@@ -1231,17 +1219,7 @@ impl Service {
|
||||
|
||||
let waiters = tenants
|
||||
.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
.filter_map(|(_shard_id, shard)| {
|
||||
shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
})
|
||||
.filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
|
||||
.collect::<Vec<_>>();
|
||||
(waiters, response_shards)
|
||||
};
|
||||
@@ -1432,8 +1410,6 @@ impl Service {
|
||||
let mut waiters = Vec::new();
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
for ShardUpdate {
|
||||
@@ -1461,15 +1437,7 @@ impl Service {
|
||||
|
||||
shard.schedule(scheduler)?;
|
||||
|
||||
let maybe_waiter = shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
);
|
||||
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
|
||||
if let Some(waiter) = maybe_waiter {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
@@ -1514,20 +1482,10 @@ impl Service {
|
||||
let waiters = {
|
||||
let mut waiters = Vec::new();
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
shard.config = config.clone();
|
||||
if let Some(waiter) = shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
) {
|
||||
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
@@ -2159,7 +2117,7 @@ impl Service {
|
||||
}
|
||||
|
||||
// Validate input, and calculate which shards we will create
|
||||
let (old_shard_count, targets, compute_hook) =
|
||||
let (old_shard_count, targets) =
|
||||
{
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
@@ -2255,7 +2213,7 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
(old_shard_count, targets, locked.compute_hook.clone())
|
||||
(old_shard_count, targets)
|
||||
};
|
||||
|
||||
// unwrap safety: we would have returned above if we didn't find at least one shard to split
|
||||
@@ -2451,7 +2409,8 @@ impl Service {
|
||||
// Send compute notifications for all the new shards
|
||||
let mut failed_notifications = Vec::new();
|
||||
for (child_id, child_ps, stripe_size) in child_locations {
|
||||
if let Err(e) = compute_hook
|
||||
if let Err(e) = self
|
||||
.compute_hook
|
||||
.notify(child_id, child_ps, stripe_size, &self.cancel)
|
||||
.await
|
||||
{
|
||||
@@ -2481,8 +2440,6 @@ impl Service {
|
||||
) -> Result<TenantShardMigrateResponse, ApiError> {
|
||||
let waiter = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let Some(node) = nodes.get(&migrate_req.node_id) else {
|
||||
@@ -2542,15 +2499,7 @@ impl Service {
|
||||
shard.sequence = shard.sequence.next();
|
||||
}
|
||||
|
||||
shard.maybe_reconcile(
|
||||
result_tx,
|
||||
nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
self.maybe_reconcile_shard(shard, nodes)
|
||||
};
|
||||
|
||||
if let Some(waiter) = waiter {
|
||||
@@ -2814,8 +2763,6 @@ impl Service {
|
||||
}
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
@@ -2867,16 +2814,8 @@ impl Service {
|
||||
tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id);
|
||||
}
|
||||
Ok(()) => {
|
||||
if tenant_state
|
||||
.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
&new_nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
if self
|
||||
.maybe_reconcile_shard(tenant_state, &new_nodes)
|
||||
.is_some()
|
||||
{
|
||||
tenants_affected += 1;
|
||||
@@ -2900,15 +2839,7 @@ impl Service {
|
||||
tenant_state.observed.locations.get_mut(&config_req.node_id)
|
||||
{
|
||||
if observed_loc.conf.is_none() {
|
||||
tenant_state.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
&new_nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
);
|
||||
self.maybe_reconcile_shard(tenant_state, &new_nodes);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2937,22 +2868,12 @@ impl Service {
|
||||
tenant_id: TenantId,
|
||||
) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
|
||||
let mut waiters = Vec::new();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
shard.schedule(scheduler)?;
|
||||
|
||||
if let Some(waiter) = shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
nodes,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
) {
|
||||
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
@@ -2987,28 +2908,34 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides
|
||||
/// all the references to parts of Self that are needed
|
||||
fn maybe_reconcile_shard(
|
||||
&self,
|
||||
shard: &mut TenantState,
|
||||
nodes: &Arc<HashMap<NodeId, Node>>,
|
||||
) -> Option<ReconcilerWaiter> {
|
||||
shard.maybe_reconcile(
|
||||
&self.result_tx,
|
||||
nodes,
|
||||
&self.compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
}
|
||||
|
||||
/// Check all tenants for pending reconciliation work, and reconcile those in need
|
||||
///
|
||||
/// Returns how many reconciliation tasks were started
|
||||
fn reconcile_all(&self) -> usize {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let pageservers = locked.nodes.clone();
|
||||
locked
|
||||
.tenants
|
||||
.iter_mut()
|
||||
.filter_map(|(_tenant_shard_id, shard)| {
|
||||
shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
&pageservers,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
&self.gate,
|
||||
&self.cancel,
|
||||
)
|
||||
})
|
||||
.filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers))
|
||||
.count()
|
||||
}
|
||||
|
||||
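The following is a hedged sketch, not itself a hunk from this diff, of what a typical call site looks like after the refactor above; the locking shape follows the hunks, and `tenant_id` and `waiters` are assumed to be in scope.

```rust
// Shared handles (result_tx, compute_hook, config, persistence, gate, cancel) now
// live on Service, so call sites no longer clone them out of the locked state.
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
    if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
        waiters.push(waiter);
    }
}
```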
|
||||
@@ -617,7 +617,7 @@ impl TenantState {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
pageservers: &Arc<HashMap<NodeId, Node>>,
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
@@ -729,6 +729,7 @@ impl TenantState {
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let result_tx = result_tx.clone();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
|
||||
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod more_process_metrics;
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
pub trait DurationResultObserver {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
||||
}
|
||||
|
||||
pub async fn observe_async_block_duration_by_result<
|
||||
T,
|
||||
E,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
O: DurationResultObserver,
|
||||
>(
|
||||
observer: &O,
|
||||
block: F,
|
||||
) -> Result<T, E> {
|
||||
let start = Instant::now();
|
||||
let result = block.await;
|
||||
let duration = start.elapsed();
|
||||
observer.observe_result(&result, duration);
|
||||
result
|
||||
}
|
||||
@@ -17,6 +17,7 @@ use remote_storage::{
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
@@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let mut stream = ctx
|
||||
let stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
let first = reader.fill_buf().await.expect("should have the first blob");
|
||||
|
||||
let len = first.len();
|
||||
tracing::info!(len, "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
first.len() < file_len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
reader.consume(len);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let next = stream.next().await.expect("stream should have more");
|
||||
let next = reader.fill_buf().await;
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
@@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
|
||||
let e = DownloadError::from(e);
|
||||
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -84,6 +84,9 @@ where
|
||||
info!("Handling request");
|
||||
}
|
||||
|
||||
// Take a copy of the path for error logging
|
||||
let path = request.uri().path().to_string();
|
||||
|
||||
// No special handling for panics here. There's a `tracing_panic_hook` from another
|
||||
// module to do that globally.
|
||||
let res = handler(request).await;
|
||||
@@ -110,7 +113,7 @@ where
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
Err(err) => Ok(api_error_handler(err)),
|
||||
Err(err) => Ok(api_error_handler(err, Some(&path))),
|
||||
}
|
||||
}
|
||||
.instrument(request_span)
|
||||
|
||||
@@ -108,7 +108,7 @@ impl HttpErrorBody {
|
||||
|
||||
pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
|
||||
match err.downcast::<ApiError>() {
|
||||
Ok(api_error) => api_error_handler(*api_error),
|
||||
Ok(api_error) => api_error_handler(*api_error, None),
|
||||
Err(other_error) => {
|
||||
// We expect all the request handlers to return an ApiError, so this should
|
||||
// not be reached. But just in case.
|
||||
@@ -121,12 +121,16 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
pub fn api_error_handler(api_error: ApiError, path: Option<&str>) -> Response<Body> {
|
||||
// Print a stack trace for Internal Server errors
|
||||
|
||||
match api_error {
|
||||
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
|
||||
warn!("Error processing HTTP request: {api_error:#}")
|
||||
warn!(
|
||||
"Error processing HTTP request: {api_error:#} {}{}",
|
||||
path.as_ref().map(|_| "at").unwrap_or(""),
|
||||
path.unwrap_or("")
|
||||
)
|
||||
}
|
||||
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
|
||||
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
(*api).process_safekeeper_feedback(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
||||
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
|
||||
@@ -83,6 +83,10 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
|
||||
@@ -932,6 +932,59 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/heatmap_upload:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
If the location is in an attached mode, upload the current state to the remote heatmap
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
If the location is in secondary mode, download latest heatmap and layers
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
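As an illustration only, a hedged sketch of driving the new endpoint from a test; `reqwest` is an assumed dependency, the host and tenant shard id are placeholders, and the URL shape follows the path definition above.

```rust
async fn upload_heatmap(pageserver_http: &str, tenant_shard_id: &str) -> reqwest::Result<()> {
    let url = format!("{pageserver_http}/v1/tenant/{tenant_shard_id}/heatmap_upload");
    let resp = reqwest::Client::new().post(url).send().await?;
    // 200 on success; 503 means "temporarily unavailable, please retry" per the spec.
    resp.error_for_status()?;
    Ok(())
}
```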
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1391,7 +1444,7 @@ components:
|
||||
trace_read_requests:
|
||||
type: boolean
|
||||
heatmap_period:
|
||||
type: integer
|
||||
type: string
|
||||
TenantConfigResponse:
|
||||
type: object
|
||||
properties:
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use enum_map::EnumMap;
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
@@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
})
|
||||
});
|
||||
|
||||
impl DurationResultObserver for BasebackupQueryTime {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
parent: &'a BasebackupQueryTime,
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
BasebackupQueryTimeOngoingRecording {
|
||||
parent: self,
|
||||
ctx,
|
||||
start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(ex_throttled) => ex_throttled,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
||||
metric.observe(duration.as_secs_f64());
|
||||
let metric = self
|
||||
.parent
|
||||
.0
|
||||
.get_metric_with_label_values(&[label_value])
|
||||
.unwrap();
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
}
|
||||
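A hedged usage sketch of the new recording API follows; it mirrors the page_service.rs call site further down in this diff, with `run_basebackup` standing in for the real request handling.

```rust
let recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
let res = run_basebackup().await; // placeholder for handle_basebackup_request(...)
recording.observe(&res);          // records wall time minus time spent throttled
res?;
```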
|
||||
@@ -1964,10 +2017,8 @@ impl TimelineMetrics {
|
||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
pub(crate) fn shutdown(&self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
@@ -2623,6 +2674,12 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||
Lazy::force(&disk_usage_based_eviction::METRICS);
|
||||
|
||||
for state_name in pageserver_api::models::TenantState::VARIANTS {
|
||||
// initialize the metric for all gauges, otherwise the time series might seemingly show
|
||||
// values from last restart.
|
||||
TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
|
||||
}
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
|
||||
@@ -1199,7 +1199,7 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1214,7 +1214,7 @@ impl PageServerHandler {
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline.wait_lsn(lsn, &ctx).await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
@@ -1236,7 +1236,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
@@ -1257,7 +1257,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
@@ -1269,7 +1269,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1449,25 +1449,25 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -1563,7 +1563,7 @@ where
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -272,9 +272,6 @@ pub enum TaskKind {
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
|
||||
@@ -1846,6 +1846,8 @@ impl Tenant {
|
||||
// Wait for any in-flight operations to complete
|
||||
self.gate.close().await;
|
||||
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3557,11 +3559,6 @@ async fn run_initdb(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Drop for Tenant {
|
||||
fn drop(&mut self) {
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
/// Dump contents of a layer file to stdout.
|
||||
pub async fn dump_layerfile_from_path(
|
||||
path: &Utf8Path,
|
||||
|
||||
@@ -354,6 +354,7 @@ pub struct TenantConf {
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||
|
||||
@@ -536,6 +536,18 @@ impl Drop for LayerInner {
|
||||
// carry this until we are finished for [`Layer::wait_drop`] support
|
||||
let _status = status;
|
||||
|
||||
let Some(timeline) = timeline.upgrade() else {
|
||||
// no need to nag that timeline is gone: under normal situation on
|
||||
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
return;
|
||||
};
|
||||
|
||||
let Ok(_guard) = timeline.gate.enter() else {
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
return;
|
||||
};
|
||||
|
||||
let removed = match std::fs::remove_file(path) {
|
||||
Ok(()) => true,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
@@ -554,32 +566,26 @@ impl Drop for LayerInner {
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(timeline) = timeline.upgrade() {
|
||||
if removed {
|
||||
timeline.metrics.resident_physical_size_sub(file_size);
|
||||
}
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
||||
if removed {
|
||||
timeline.metrics.resident_physical_size_sub(file_size);
|
||||
}
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
||||
|
||||
if let Err(e) = res {
|
||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||
// demonstrating this deadlock (without spawn_blocking): stop will drop
|
||||
// queued items, which will have ResidentLayer's, and those drops would try
|
||||
// to re-entrantly lock the RemoteTimelineClient inner state.
|
||||
if !timeline.is_active() {
|
||||
tracing::info!("scheduling deletion on drop failed: {e:#}");
|
||||
} else {
|
||||
tracing::warn!("scheduling deletion on drop failed: {e:#}");
|
||||
}
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
|
||||
if let Err(e) = res {
|
||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||
// demonstrating this deadlock (without spawn_blocking): stop will drop
|
||||
// queued items, which will have ResidentLayer's, and those drops would try
|
||||
// to re-entrantly lock the RemoteTimelineClient inner state.
|
||||
if !timeline.is_active() {
|
||||
tracing::info!("scheduling deletion on drop failed: {e:#}");
|
||||
} else {
|
||||
LAYER_IMPL_METRICS.inc_completed_deletes();
|
||||
tracing::warn!("scheduling deletion on drop failed: {e:#}");
|
||||
}
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
|
||||
} else {
|
||||
LAYER_IMPL_METRICS.inc_completed_deletes();
|
||||
}
|
||||
} else {
|
||||
// no need to nag that timeline is gone: under normal situation on
|
||||
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -880,23 +886,18 @@ impl LayerInner {
|
||||
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let task_name = format!("download layer {}", self);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
|
||||
// block tenant::mgr::remove_tenant_from_memory.
|
||||
|
||||
let this: Arc<Self> = self.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||
Some(self.desc.tenant_shard_id),
|
||||
Some(self.desc.timeline_id),
|
||||
&task_name,
|
||||
false,
|
||||
async move {
|
||||
let guard = timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| DownloadError::DownloadCancelled)?;
|
||||
|
||||
tokio::task::spawn(async move {
|
||||
|
||||
let _guard = guard;
|
||||
|
||||
let client = timeline
|
||||
.remote_client
|
||||
@@ -906,7 +907,7 @@ impl LayerInner {
|
||||
let result = client.download_layer_file(
|
||||
&this.desc.filename(),
|
||||
&this.metadata(),
|
||||
&crate::task_mgr::shutdown_token()
|
||||
&timeline.cancel
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -929,7 +930,6 @@ impl LayerInner {
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff) => {},
|
||||
_ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
|
||||
_ = timeline.cancel.cancelled() => {},
|
||||
};
|
||||
|
||||
@@ -959,11 +959,10 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
match rx.await {
|
||||
Ok((Ok(()), permit)) => {
|
||||
if let Some(reason) = self
|
||||
@@ -1102,6 +1101,10 @@ impl LayerInner {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
let Ok(_gate) = timeline.gate.enter() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
// to avoid starting a new download while we evict, keep holding on to the
|
||||
// permit.
|
||||
let _permit = {
|
||||
|
||||
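For context, a hedged and condensed sketch of the spawning pattern introduced here; the retry loop and error handling are omitted, and field and method names follow the hunks above.

```rust
use std::sync::Arc;

// Sketch only: enter the per-timeline gate before spawning, move the guard into a
// plain tokio task, and rely on the timeline's own cancellation token instead of
// the global task_mgr shutdown token.
fn spawn_download(timeline: Arc<Timeline>, this: Arc<LayerInner>) -> Result<(), DownloadError> {
    let guard = timeline
        .gate
        .enter()
        .map_err(|_| DownloadError::DownloadCancelled)?;

    tokio::task::spawn(async move {
        let _guard = guard; // holds the gate open for the task's lifetime
        let client = timeline.remote_client.as_ref().expect("remote client");
        let _result = client
            .download_layer_file(&this.desc.filename(), &this.metadata(), &timeline.cancel)
            .await;
    });
    Ok(())
}
```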
@@ -1257,6 +1257,8 @@ impl Timeline {
|
||||
|
||||
// Finally wait until any gate-holders are complete
|
||||
self.gate.close().await;
|
||||
|
||||
self.metrics.shutdown();
|
||||
}
|
||||
|
||||
pub(crate) fn set_state(&self, new_state: TimelineState) {
|
||||
|
||||
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
|
||||
struct sysinfo si;
|
||||
Size total;
|
||||
if (sysinfo(&si) < 0)
|
||||
elog(ERROR, "Failed to get amount of RAM: %n");
|
||||
elog(ERROR, "Failed to get amount of RAM: %m");
|
||||
|
||||
total = si.totalram*si.mem_unit;
|
||||
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
||||
|
||||
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -38,7 +38,6 @@ neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if (handle == NULL)
|
||||
|
||||
@@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
bool broke_from_loop = false;
|
||||
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
@@ -418,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
/* Returning from inside PG_TRY is bad, so we break/return later */
|
||||
broke_from_loop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -431,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (broke_from_loop)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
6 pgxn/neon/neon--1.1--1.0.sql (Normal file)
@@ -0,0 +1,6 @@
-- the order of operations is important here
-- because the view depends on the function

DROP VIEW IF EXISTS neon_lfc_stats CASCADE;

DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE;

1 pgxn/neon/neon--1.2--1.1.sql (Normal file)
@@ -0,0 +1 @@
DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE;

1 pgxn/neon/neon--1.3--1.2.sql (Normal file)
@@ -0,0 +1 @@
DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE;
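For context, a hedged sketch of how these downgrade scripts get exercised; `ALTER EXTENSION ... UPDATE TO` is standard PostgreSQL, the target version follows the new files above, and the connection string and function name are placeholders.

```rust
use postgres::{Client, NoTls};

fn downgrade_neon_extension(connstr: &str) -> Result<(), postgres::Error> {
    let mut client = Client::connect(connstr, NoTls)?;
    // Walks 1.3 -> 1.2 -> 1.1 -> 1.0 through the neon--X--Y.sql files added here.
    client.batch_execute("ALTER EXTENSION neon UPDATE TO '1.0';")?;
    Ok(())
}
```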
@@ -95,7 +95,6 @@ get_num_snap_files_lsn_threshold(void)
|
||||
DIR *dirdesc;
|
||||
struct dirent *de;
|
||||
char *snap_path = "pg_logical/snapshots/";
|
||||
int cnt = 0;
|
||||
int lsns_allocated = 1024;
|
||||
int lsns_num = 0;
|
||||
XLogRecPtr *lsns;
|
||||
@@ -161,9 +160,6 @@ get_num_snap_files_lsn_threshold(void)
|
||||
PGDLLEXPORT void
|
||||
LogicalSlotsMonitorMain(Datum main_arg)
|
||||
{
|
||||
TimestampTz now,
|
||||
last_checked;
|
||||
|
||||
/* Establish signal handlers. */
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
|
||||
@@ -1888,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
int nblocks, bool skipFsync)
|
||||
{
|
||||
const PGAlignedBlock buffer = {0};
|
||||
BlockNumber curblocknum = blocknum;
|
||||
int remblocks = nblocks;
|
||||
XLogRecPtr lsn = 0;
|
||||
|
||||
|
||||
@@ -1220,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin
|
||||
req->epochStartLsn = wp->propEpochStartLsn;
|
||||
req->beginLsn = beginLsn;
|
||||
req->endLsn = endLsn;
|
||||
req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
req->commitLsn = wp->commitLsn;
|
||||
req->truncateLsn = wp->truncateLsn;
|
||||
req->proposerId = wp->greetRequest.proposerId;
|
||||
}
|
||||
@@ -1405,7 +1405,7 @@ static bool
|
||||
RecvAppendResponses(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr newCommitLsn;
|
||||
bool readAnything = false;
|
||||
|
||||
while (true)
|
||||
@@ -1444,18 +1444,19 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (!readAnything)
|
||||
return sk->state == SS_ACTIVE;
|
||||
|
||||
HandleSafekeeperResponse(wp);
|
||||
|
||||
/* update commit_lsn */
|
||||
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
/*
|
||||
* Also send the new commit lsn to all the safekeepers.
|
||||
* Send the new value to all safekeepers.
|
||||
*/
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
if (minQuorumLsn > wp->lastSentCommitLsn)
|
||||
if (newCommitLsn > wp->commitLsn)
|
||||
{
|
||||
wp->commitLsn = newCommitLsn;
|
||||
BroadcastAppendRequest(wp);
|
||||
wp->lastSentCommitLsn = minQuorumLsn;
|
||||
}
|
||||
|
||||
HandleSafekeeperResponse(wp);
|
||||
|
||||
return sk->state == SS_ACTIVE;
|
||||
}
|
||||
|
||||
@@ -1632,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
|
||||
static void
|
||||
HandleSafekeeperResponse(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr candidateTruncateLsn;
|
||||
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
|
||||
wp->api.process_safekeeper_feedback(wp);
|
||||
|
||||
/*
|
||||
* Try to advance truncateLsn -- the last record flushed to all
|
||||
@@ -1649,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp)
|
||||
* can't commit entries from previous term' in Raft); 2)
|
||||
*/
|
||||
candidateTruncateLsn = CalculateMinFlushLsn(wp);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn);
|
||||
if (candidateTruncateLsn > wp->truncateLsn)
|
||||
{
|
||||
wp->truncateLsn = candidateTruncateLsn;
|
||||
|
||||
@@ -564,7 +564,7 @@ typedef struct walproposer_api
|
||||
* backpressure feedback and to confirm WAL persistence (has been commited
|
||||
* on the quorum of safekeepers).
|
||||
*/
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp);
|
||||
|
||||
/*
|
||||
* Write a log message to the internal log processor. This is used only
|
||||
@@ -646,8 +646,8 @@ typedef struct WalProposer
|
||||
/* WAL has been generated up to this point */
|
||||
XLogRecPtr availableLsn;
|
||||
|
||||
/* last commitLsn broadcasted to safekeepers */
|
||||
XLogRecPtr lastSentCommitLsn;
|
||||
/* cached GetAcknowledgedByQuorumWALPosition result */
|
||||
XLogRecPtr commitLsn;
|
||||
|
||||
ProposerGreeting greetRequest;
|
||||
|
||||
|
||||
@@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared;
|
||||
static WalProposerConfig walprop_config;
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
static const walproposer_api walprop_pg;
|
||||
static volatile sig_atomic_t got_SIGUSR2 = false;
|
||||
static bool reported_sigusr2 = false;
|
||||
|
||||
static void nwp_shmem_startup_hook(void);
|
||||
static void nwp_register_gucs(void);
|
||||
@@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
|
||||
|
||||
static void CheckGracefulShutdown(WalProposer *wp);
|
||||
|
||||
static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
|
||||
|
||||
static void
|
||||
@@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void)
|
||||
BackgroundWorkerUnblockSignals();
|
||||
}
|
||||
|
||||
/*
|
||||
* We pretend to be a walsender process, and the lifecycle of a walsender is
|
||||
* slightly different than other procesess. At shutdown, walsender processes
|
||||
* stay alive until the very end, after the checkpointer has written the
|
||||
* shutdown checkpoint. When the checkpointer exits, the postmaster sends all
|
||||
* remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send
|
||||
* the remaining WAL, and then exit. This ensures that the checkpoint record
|
||||
* reaches durable storage (in safekeepers), before the server shuts down
|
||||
* completely.
|
||||
*/
|
||||
static void
|
||||
walprop_sigusr2(SIGNAL_ARGS)
|
||||
{
|
||||
got_SIGUSR2 = true;
|
||||
|
||||
SetLatch(MyLatch);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_init_bgworker(void)
|
||||
{
|
||||
@@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void)
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGTERM, die);
|
||||
pqsignal(SIGUSR2, walprop_sigusr2);
|
||||
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
@@ -1026,7 +1049,7 @@ static void
|
||||
StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
{
|
||||
XLogRecPtr FlushPtr;
|
||||
TimeLineID currTLI;
|
||||
__attribute__((unused)) TimeLineID currTLI;
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
@@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When we first start replication the standby will be behind the primary.
|
||||
* For some applications, for example synchronous replication, it is
|
||||
* important to have a clear state for this initial catchup mode, so we
|
||||
* can trigger actions when we change streaming state later. We may stay
|
||||
* in this state for a long time, which is exactly why we want to be able
|
||||
* to monitor whether or not we are still here.
|
||||
* XXX: Move straight to STOPPING state, skipping the STREAMING state.
|
||||
*
|
||||
* This is a bit weird. Normal walsenders stay in STREAMING state, until
|
||||
* the checkpointer signals them that it is about to start writing the
|
||||
* shutdown checkpoint. The walsenders acknowledge that they have received
|
||||
* that signal by switching to STOPPING state. That tells the walsenders
|
||||
* that they must not write any new WAL.
|
||||
*
|
||||
* However, we cannot easily intercept that signal from the checkpointer.
|
||||
* It's sent by WalSndInitStopping(), using
|
||||
* SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by
|
||||
* HandleWalSndInitStopping, which sets a process-local got_STOPPING flag.
|
||||
* However, that's all private to walsender.c.
|
||||
*
|
||||
* We don't need to do anything special upon receiving the signal, the
|
||||
* walproposer doesn't write any WAL anyway, so we skip the STREAMING
|
||||
* state and go directly to STOPPING mode. That way, the checkpointer
|
||||
* won't wait for us.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
WalSndSetState(WALSNDSTATE_STOPPING);
|
||||
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that hasn't
|
||||
@@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
static void
|
||||
WalSndLoop(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr flushPtr;
|
||||
|
||||
/* Clear any already-pending wakeups */
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
@@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp)
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
XLogBroadcastWalProposer(wp);
|
||||
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll(wp);
|
||||
}
|
||||
}
|
||||
@@ -1230,7 +1264,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
TimeLineID timeline;
|
||||
XLogRecPtr startpos;
|
||||
XLogRecPtr endpos;
|
||||
uint64 download_range_mb;
|
||||
|
||||
startpos = GetLogRepRestartLSN(wp);
|
||||
if (startpos == InvalidXLogRecPtr)
|
||||
@@ -1745,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
|
||||
{
|
||||
ConditionVariableCancelSleep();
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
|
||||
*events = WL_LATCH_SET;
|
||||
return 1;
|
||||
}
|
||||
@@ -1798,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like vanilla walsender, on sigusr2 send all remaining WAL and exit.
|
||||
*
|
||||
* Note that unlike sync-safekeepers waiting here is not reliable: we
|
||||
* don't check that majority of safekeepers received and persisted
|
||||
* commit_lsn -- only that walproposer reached it (which immediately
|
||||
* broadcasts new value). Doing that without incurring redundant control
|
||||
* file syncing would need wp -> sk protocol change. OTOH unlike
|
||||
* sync-safekeepers which must bump commit_lsn or basebackup will fail,
|
||||
* this catchup is important only for tests where safekeepers/network
|
||||
* don't crash on their own.
|
||||
*/
|
||||
static void
|
||||
CheckGracefulShutdown(WalProposer *wp)
|
||||
{
|
||||
if (got_SIGUSR2)
|
||||
{
|
||||
if (!reported_sigusr2)
|
||||
{
|
||||
XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp);
|
||||
|
||||
wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr));
|
||||
reported_sigusr2 = true;
|
||||
}
|
||||
|
||||
if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp))
|
||||
{
|
||||
wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting",
|
||||
LSN_FORMAT_ARGS(wp->commitLsn));
|
||||
proc_exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Choose most advanced PageserverFeedback and set it to *rf.
|
||||
*/
|
||||
@@ -1878,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
* None of that is functional in sync-safekeepers.
|
||||
*/
|
||||
static void
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp)
|
||||
{
|
||||
HotStandbyFeedback hsFeedback;
|
||||
XLogRecPtr oldDiskConsistentLsn;
|
||||
@@ -1893,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
replication_feedback_set(&quorumFeedback.rf);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
|
||||
if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
{
|
||||
if (commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = commitLsn;
|
||||
if (wp->commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = wp->commitLsn;
|
||||
|
||||
/*
|
||||
* Advance the replication slot to commitLsn. WAL before it is
|
||||
@@ -1929,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
}
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
|
||||
@@ -182,8 +182,6 @@ test_consume_memory(PG_FUNCTION_ARGS)
|
||||
Datum
|
||||
test_release_memory(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TimestampTz start;
|
||||
|
||||
if (PG_ARGISNULL(0))
|
||||
{
|
||||
if (consume_cxt)
|
||||
|
||||
@@ -220,6 +220,9 @@ enter_seccomp_mode(void)
|
||||
}
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
|
||||
PGDLLEXPORT void
|
||||
WalRedoMain(int argc, char *argv[]);
|
||||
|
||||
/*
|
||||
* Entry point for the WAL redo process.
|
||||
*
|
||||
|
||||
@@ -73,7 +73,7 @@ pub mod errors {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
http::StatusCode::LOCKED => {
|
||||
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
|
||||
}
|
||||
@@ -91,6 +91,12 @@ pub mod errors {
|
||||
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
text,
|
||||
} if text.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::LOCKED,
|
||||
text,
|
||||
@@ -120,6 +126,11 @@ pub mod errors {
|
||||
status: http::StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => true,
|
||||
// don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
status: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref text,
|
||||
} => !text.contains("compute time quota of non-primary branches is exceeded"),
|
||||
// locked can be returned when the endpoint was in transition
|
||||
// or when quotas are exceeded. don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
|
||||
@@ -69,6 +69,12 @@ fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
{
|
||||
"quota_exceeded"
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref text,
|
||||
}) if text.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
"quota_exceeded"
|
||||
}
|
||||
WakeComputeError::ApiError(ApiError::Console {
|
||||
status: StatusCode::LOCKED,
|
||||
..
|
||||
|
||||
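A hedged illustration, not taken from the diff, of how the new 422 handling reads; `err` is assumed to be a console `ApiError` in scope, and the string literal matches the quota message checked in the hunks above.

```rust
let is_quota_exceeded = matches!(
    &err,
    ApiError::Console {
        status: http::StatusCode::UNPROCESSABLE_ENTITY,
        text,
    } if text.contains("compute time quota of non-primary branches is exceeded")
);
// Such errors are classified as ErrorKind::User, reported as "quota_exceeded",
// and excluded from retries.
```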
@@ -196,6 +196,7 @@ pub struct SimulationApi {
|
||||
safekeepers: RefCell<Vec<SafekeeperConn>>,
|
||||
disk: Arc<DiskWalProposer>,
|
||||
redo_start_lsn: Option<Lsn>,
|
||||
last_logged_commit_lsn: u64,
|
||||
shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
|
||||
config: Config,
|
||||
event_set: RefCell<Option<EventSet>>,
|
||||
@@ -228,6 +229,7 @@ impl SimulationApi {
|
||||
safekeepers: RefCell::new(sk_conns),
|
||||
disk: args.disk,
|
||||
redo_start_lsn: args.redo_start_lsn,
|
||||
last_logged_commit_lsn: 0,
|
||||
shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState {
|
||||
mutex: 0,
|
||||
feedback: PageserverFeedback {
|
||||
@@ -596,14 +598,11 @@ impl ApiImpl for SimulationApi {
|
||||
}
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(
|
||||
&self,
|
||||
wp: &mut walproposer::bindings::WalProposer,
|
||||
commit_lsn: u64,
|
||||
) {
|
||||
debug!("process_safekeeper_feedback, commit_lsn={}", commit_lsn);
|
||||
if commit_lsn > wp.lastSentCommitLsn {
|
||||
self.os.log_event(format!("commit_lsn;{}", commit_lsn));
|
||||
fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) {
|
||||
debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn);
|
||||
if wp.commitLsn > self.last_logged_commit_lsn {
|
||||
self.os.log_event(format!("commit_lsn;{}", wp.commitLsn));
|
||||
self.last_logged_commit_lsn = wp.commitLsn;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,8 @@ FLAKY_TESTS_QUERY = """
|
||||
DISTINCT parent_suite, suite, name
|
||||
FROM results
|
||||
WHERE
|
||||
started_at > CURRENT_DATE - INTERVAL '%s' day
|
||||
started_at > CURRENT_DATE - INTERVAL '10' day
|
||||
AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs`
|
||||
AND (
|
||||
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
|
||||
OR flaky
|
||||
@@ -46,11 +47,14 @@ def main(args: argparse.Namespace):
        logging.error("cannot fetch flaky tests from the DB due to an error", exc)
        rows = []

    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs),
    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring),
    # use it to parametrize test name along with build_type and pg_version
    #
    # See test_runner/fixtures/parametrize.py for details
    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
        "",
        "tokio-epoll-uring",
    ):
        pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
    else:
        pageserver_virtual_file_io_engine_parameter = ""
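
For context, a minimal sketch (not part of the diff) of how the io-engine suffix would show up in a parametrized pytest id; the exact id format and the helper name below are assumptions, not the script's actual code:

import os


def parametrized_test_id(name: str, build_type: str, pg_version: str) -> str:
    # Illustrative only: the default engine (empty or tokio-epoll-uring) keeps
    # the historical test id so accumulated flakiness statistics stay comparable.
    io_engine = os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")
    suffix = f"-{io_engine}" if io_engine not in ("", "tokio-epoll-uring") else ""
    return f"{name}[{build_type}-pg{pg_version}{suffix}]"


print(parametrized_test_id("test_example", "release", "16"))  # test_example[release-pg16]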
@@ -15,11 +15,11 @@ import threading
import time
import uuid
from contextlib import closing, contextmanager
from dataclasses import dataclass, field
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from fcntl import LOCK_EX, LOCK_UN, flock
from functools import cached_property
from functools import cached_property, partial
from itertools import chain, product
from pathlib import Path
from types import TracebackType
@@ -70,6 +70,8 @@ from fixtures.remote_storage import (
    default_remote_storage,
    remote_storage_to_toml_inline_table,
)
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.safekeeper.utils import are_walreceivers_absent
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
@@ -2547,6 +2549,20 @@ class PgBin:
        )
        return base_path

    def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
        """
        Run pg_controldata on the given datadir and extract the checkpoint LSN.
        """

        pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
        cmd = f"{pg_controldata_path} -D {pgdata}"
        result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
        checkpoint_lsn = re.findall(
            "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
        )[0]
        log.info(f"last checkpoint at {checkpoint_lsn}")
        return Lsn(checkpoint_lsn)


@pytest.fixture(scope="function")
def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
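
To illustrate the extraction step above in isolation, a small sketch run against a sample snippet of pg_controldata output (the sample text and values are illustrative, not taken from a real run):

import re

sample_output = """
pg_control version number:            1300
Latest checkpoint location:           0/16B3748
Latest checkpoint's REDO location:    0/16B3710
"""

# Same regex as in get_pg_controldata_checkpoint_lsn above.
checkpoint_lsn = re.findall(
    "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", sample_output
)[0]
print(checkpoint_lsn)  # -> 0/16B3748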
@@ -3565,220 +3581,6 @@ class Safekeeper:
        return segments


[Walreceiver, SafekeeperTimelineStatus, SafekeeperMetrics, and SafekeeperHttpClient removed from this file; they now live, with minor adjustments, in test_runner/fixtures/safekeeper/http.py, added below.]


class S3Scrubber:
    def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
        self.env = env
@@ -4088,24 +3890,21 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:


# pg is the existing and running compute node that we want to compare with a basebackup
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)

    # Get the timeline ID. We need it for the 'basebackup' command
    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

    # many tests already checkpoint, but do it just in case
    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("CHECKPOINT")

    # wait for pageserver to catch up
    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
    # stop postgres to ensure that files won't change
    endpoint.stop()

    # Read the shutdown checkpoint's LSN
    checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path())

    # Take a basebackup from pageserver
    restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
    restored_dir_path.mkdir(exist_ok=True)

    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
    psql_path = os.path.join(pg_bin.pg_bin_path, "psql")

    pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -4113,7 +3912,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
        {psql_path} \
            --no-psqlrc \
            postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \
            -c 'basebackup {endpoint.tenant_id} {timeline_id}' \
            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \
        | tar -x -C {restored_dir_path}
    """
@@ -4262,6 +4061,49 @@ def wait_for_last_flush_lsn(
    return min(results)


def flush_ep_to_pageserver(
    env: NeonEnv,
    ep: Endpoint,
    tenant: TenantId,
    timeline: TimelineId,
    pageserver_id: Optional[int] = None,
) -> Lsn:
    """
    Stop the endpoint and wait until all committed WAL reaches the pageserver
    (last_record_lsn). This is for use by tests which want everything written so
    far to reach the pageserver *and* expect that no more data will arrive until
    the endpoint starts again; unlike wait_for_last_flush_lsn, it polls
    safekeepers instead of the compute to learn the LSN.

    Returns the catch-up LSN.
    """
    ep.stop()

    commit_lsn: Lsn = Lsn(0)
    # In principle, in the absence of failures, polling a single sk would be enough.
    for sk in env.safekeepers:
        cli = sk.http_client()
        # wait until compute connections are gone
        wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline))
        commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn)

    # Note: depending on the WAL filtering implementation, most shards probably
    # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this
    # is broken in the sharded case.
    shards = tenant_get_shards(env, tenant, pageserver_id)
    for tenant_shard_id, pageserver in shards:
        log.info(
            f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})"
        )
        waited = wait_for_last_record_lsn(
            pageserver.http_client(), tenant_shard_id, timeline, commit_lsn
        )

        assert waited >= commit_lsn

    return commit_lsn


def wait_for_wal_insert_lsn(
    env: NeonEnv,
    endpoint: Endpoint,
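
A minimal usage sketch of the new helper, modeled on the test changes further down in this diff; the fixture names are the usual neon test fixtures, and `env.pageserver.http_client()` is assumed as the single-pageserver shortcut:

from fixtures.neon_fixtures import flush_ep_to_pageserver
from fixtures.pageserver.utils import wait_for_upload


def flush_and_upload(env, endpoint, tenant_id, timeline_id):
    # Stops the endpoint and returns the safekeepers' commit LSN once the
    # pageserver has caught up to it.
    last_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)

    client = env.pageserver.http_client()
    client.timeline_checkpoint(tenant_id, timeline_id)
    wait_for_upload(client, tenant_id, timeline_id, last_lsn)
    return last_lsn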
@@ -46,9 +46,12 @@ def pytest_generate_tests(metafunc: Metafunc):

    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))

    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
    # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs`
    # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics
    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
        "",
        "tokio-epoll-uring",
    ):
        metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])

    # For performance tests, parametrize also by platform
0
test_runner/fixtures/safekeeper/__init__.py
Normal file

227
test_runner/fixtures/safekeeper/http.py
Normal file
@@ -0,0 +1,227 @@
import json
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, Union

import pytest
import requests

from fixtures.log_helper import log
from fixtures.types import Lsn, TenantId, TimelineId


# Walreceiver as returned by sk's timeline status endpoint.
@dataclass
class Walreceiver:
    conn_id: int
    state: str


@dataclass
class SafekeeperTimelineStatus:
    acceptor_epoch: int
    pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
    flush_lsn: Lsn
    commit_lsn: Lsn
    timeline_start_lsn: Lsn
    backup_lsn: Lsn
    peer_horizon_lsn: Lsn
    remote_consistent_lsn: Lsn
    walreceivers: List[Walreceiver]


@dataclass
class SafekeeperMetrics:
    # These are metrics from Prometheus which uses float64 internally.
    # As a consequence, values may differ from real original int64s.
    flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)
    commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict)


class SafekeeperHttpClient(requests.Session):
    HTTPError = requests.HTTPError

    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
        super().__init__()
        self.port = port
        self.auth_token = auth_token
        self.is_testing_enabled = is_testing_enabled

        if auth_token is not None:
            self.headers["Authorization"] = f"Bearer {auth_token}"

    def check_status(self):
        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()

    def is_testing_enabled_or_skip(self):
        if not self.is_testing_enabled:
            pytest.skip("safekeeper was built without 'testing' feature")

    def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]):
        self.is_testing_enabled_or_skip()

        if isinstance(config_strings, tuple):
            pairs = [config_strings]
        else:
            pairs = config_strings

        log.info(f"Requesting config failpoints: {repr(pairs)}")

        res = self.put(
            f"http://localhost:{self.port}/v1/failpoints",
            json=[{"name": name, "actions": actions} for name, actions in pairs],
        )
        log.info(f"Got failpoints request response code {res.status_code}")
        res.raise_for_status()
        res_json = res.json()
        assert res_json is None
        return res_json

    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
        params = params or {}
        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
        res.raise_for_status()
        res_json = json.loads(res.text)
        assert isinstance(res_json, dict)
        return res_json

    def patch_control_file(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        patch: Dict[str, Any],
    ) -> Dict[str, Any]:
        res = self.patch(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
            json={
                "updates": patch,
                "apply_fields": list(patch.keys()),
            },
        )
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
            json=body,
        )
        res.raise_for_status()

    def timeline_digest(
        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
    ) -> Dict[str, Any]:
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
            params={
                "from_lsn": str(from_lsn),
                "until_lsn": str(until_lsn),
            },
        )
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def timeline_create(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
        commit_lsn: Lsn,
    ):
        body = {
            "tenant_id": str(tenant_id),
            "timeline_id": str(timeline_id),
            "pg_version": pg_version,
            "commit_lsn": str(commit_lsn),
        }
        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
        res.raise_for_status()

    def timeline_status(
        self, tenant_id: TenantId, timeline_id: TimelineId
    ) -> SafekeeperTimelineStatus:
        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
        res.raise_for_status()
        resj = res.json()
        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
        return SafekeeperTimelineStatus(
            acceptor_epoch=resj["acceptor_state"]["epoch"],
            pg_version=resj["pg_info"]["pg_version"],
            flush_lsn=Lsn(resj["flush_lsn"]),
            commit_lsn=Lsn(resj["commit_lsn"]),
            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
            backup_lsn=Lsn(resj["backup_lsn"]),
            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
            walreceivers=walreceivers,
        )

    def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
        return self.timeline_status(tenant_id, timeline_id).commit_lsn

    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
        res = self.post(
            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
            json=body,
        )
        res.raise_for_status()

    # only_local doesn't remove segments in the remote storage.
    def timeline_delete(
        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
    ) -> Dict[Any, Any]:
        res = self.delete(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            params={
                "only_local": str(only_local).lower(),
            },
        )
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        res.raise_for_status()
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def get_metrics_str(self) -> str:
        request_result = self.get(f"http://localhost:{self.port}/metrics")
        request_result.raise_for_status()
        return request_result.text

    def get_metrics(self) -> SafekeeperMetrics:
        all_metrics_text = self.get_metrics_str()

        metrics = SafekeeperMetrics()
        for match in re.finditer(
            r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
            all_metrics_text,
            re.MULTILINE,
        ):
            metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int(
                match.group(3)
            )
        for match in re.finditer(
            r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$',
            all_metrics_text,
            re.MULTILINE,
        ):
            metrics.commit_lsn_inexact[
                (TenantId(match.group(1)), TimelineId(match.group(2)))
            ] = int(match.group(3))
        return metrics
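
A brief, hedged usage sketch of the new client module; the port argument and calling context are assumptions, and in tests the client normally comes from Safekeeper.http_client() rather than being constructed directly:

from fixtures.safekeeper.http import SafekeeperHttpClient


def log_safekeeper_commit_lsn(sk_port: int, tenant_id, timeline_id):
    client = SafekeeperHttpClient(port=sk_port)
    client.check_status()
    # commit_lsn is what flush_ep_to_pageserver (earlier in this diff) polls for.
    return client.get_commit_lsn(tenant_id, timeline_id)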
11
test_runner/fixtures/safekeeper/utils.py
Normal file
@@ -0,0 +1,11 @@
from fixtures.log_helper import log
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.types import TenantId, TimelineId


def are_walreceivers_absent(
    sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
    return len(status.walreceivers) == 0
@@ -4,12 +4,11 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    flush_ep_to_pageserver,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.pageserver.utils import wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn
from fixtures.utils import query_scalar


# Creates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
@@ -46,14 +45,15 @@ def test_basic_eviction(
        FROM generate_series(1, 5000000) g
        """
    )
    current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
    # stops the endpoint
    current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)

    client.timeline_checkpoint(tenant_id, timeline_id)
    wait_for_upload(client, tenant_id, timeline_id, current_lsn)

    # disable compute & sks to avoid on-demand downloads by walreceiver / getpage
    endpoint.stop()
    # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint
    # has already been stopped by flush_ep_to_pageserver
    for sk in env.safekeepers:
        sk.stop()
@@ -1,7 +1,7 @@
import time

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
from fixtures.pageserver.types import (
    DeltaLayerFileName,
    ImageLayerFileName,
@@ -115,8 +115,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
        )
        == 0
    )

    endpoint.stop()
    last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)

    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
@@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv):

    endpoint.wait_for_migrations()

    num_migrations = 8
    num_migrations = 9

    with endpoint.cursor() as cur:
        cur.execute("SELECT id FROM neon_migration.migration_id")
@@ -29,3 +29,34 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
    log.info(res)
    assert len(res) == 1
    assert len(res[0]) == 5


# Verify that the neon extension can be upgraded/downgraded.
def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("test_neon_extension_compatibility")

    endpoint_main = env.endpoints.create("test_neon_extension_compatibility")
    # don't skip pg_catalog updates - it runs CREATE EXTENSION neon
    endpoint_main.respec(skip_pg_catalog_updates=False)
    endpoint_main.start()

    with closing(endpoint_main.connect()) as conn:
        with conn.cursor() as cur:
            all_versions = ["1.3", "1.2", "1.1", "1.0"]
            current_version = "1.3"
            for idx, begin_version in enumerate(all_versions):
                for target_version in all_versions[idx + 1 :]:
                    if current_version != begin_version:
                        cur.execute(
                            f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}"
                        )
                        current_version = begin_version
                    # downgrade
                    cur.execute(
                        f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}"
                    )
                    # upgrade
                    cur.execute(
                        f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}"
                    )
@@ -8,6 +8,7 @@ from typing import Any, DefaultDict, Dict, Tuple
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    flush_ep_to_pageserver,
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
)
@@ -517,7 +518,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:

    with endpoint.cursor() as cur:
        cur.execute("update a set id = -id")
    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
    flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)

    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
@@ -190,6 +190,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
            # So by ignoring these instead of waiting for empty upload queue
            # we execute more distinct code paths.
            '.*stopping left-over name="remote upload".*',
            # an on-demand download is cancelled by shutdown
            ".*initial size calculation failed: downloading failed, possibly for shutdown",
        ]
    )


@@ -213,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            # This happens when timeline remains are cleaned up during loading
            ".*Timeline dir entry become invalid.*",
            # In one of the branches we poll for tenant to become active. Polls can generate this log message:
            f".*Tenant {env.initial_tenant} is not active*",
            f".*Tenant {env.initial_tenant} is not active.*",
            # an on-demand download is cancelled by shutdown
            ".*initial size calculation failed: downloading failed, possibly for shutdown",
        ]
    )
@@ -28,7 +28,6 @@ from fixtures.neon_fixtures import (
    PgBin,
    PgProtocol,
    Safekeeper,
    SafekeeperHttpClient,
    SafekeeperPort,
    last_flush_lsn_upload,
)
@@ -46,6 +45,8 @@ from fixtures.remote_storage import (
    default_remote_storage,
    s3_storage,
)
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.safekeeper.utils import are_walreceivers_absent
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background
@@ -1097,12 +1098,6 @@ def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
    return all([flush_lsns[0] == flsn for flsn in flush_lsns])


def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId):
    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
    return len(status.walreceivers) == 0


# Assert by xxd that WAL on given safekeepers is identical. No compute must be
# running for this to be reliable.
def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
@@ -1347,6 +1342,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")


# Test that when compute is terminated in fast (or smart) mode, walproposer is
# allowed to run and self-terminate after the shutdown checkpoint is written, so it
# commits it to safekeepers before exiting. This is not required for correctness,
# but is needed for tests using check_restored_datadir_content.
def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    neon_env_builder.num_safekeepers = 1
    env = neon_env_builder.init_start()

    tenant_id = env.initial_tenant
    timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown")
    ep = env.endpoints.create_start("test_wp_graceful_shutdown")
    ep.safe_psql("create table t(key int, value text)")
    ep.stop()

    # figure out the checkpoint lsn
    ckpt_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(ep.pg_data_dir_path())

    sk_http_cli = env.safekeepers[0].http_client()
    commit_lsn = sk_http_cli.timeline_status(tenant_id, timeline_id).commit_lsn
    # Note: this is an in-memory value. Graceful shutdown of walproposer currently
    # doesn't guarantee a persisted value, which is ok as we need it only for
    # tests. Persisting it without risking too many cf flushes needs a wp -> sk
    # protocol change. (though in reality shutdown sync-safekeepers does flush
    # the cf, so most of the time the persisted value wouldn't lag)
    log.info(f"sk commit_lsn {commit_lsn}")
    # note that ckpt_lsn is the *beginning* of the checkpoint record, so commit_lsn
    # must actually be higher
    assert commit_lsn > ckpt_lsn, "safekeeper must have checkpoint record"


class SafekeeperEnv:
    def __init__(
        self,
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: f49a962b9b...b980d6f090

2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: e8b9a28006...56f32c0e73

2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: 072697b225...9007894722

7
vendor/revisions.json
vendored
@@ -1,6 +1,5 @@
{
    "postgres-v16": "072697b2250da3251af75887b577104554b9cd44",
    "postgres-v15": "e8b9a28006a550d7ca7cbb9bd0238eb9cd57bbd8",
    "postgres-v14": "f49a962b9b3715d6f47017d1dcf905c36f93ae5e"
    "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b",
    "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133",
    "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0"
}
@@ -142,6 +142,51 @@ files:
      query: |
        select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;

  - metric_name: pg_stats_userdb
    type: gauge
    help: 'Stats for the oldest non-system db'
    key_labels:
      - datname
    value_label: kind
    values:
      - db_size
      - deadlocks
      # Rows
      - inserted
      - updated
      - deleted
    # We export stats for only one non-system database. Without this limit
    # it is too easy to abuse the system by creating lots of databases.
    # We can try lifting this limit in the future after we understand the needs better.
    query: |
      select pg_database_size(datname) as db_size, deadlocks,
             tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
             datname
      from pg_stat_database
      where datname IN (
        select datname
        from pg_database
        where datname <> 'postgres' and not datistemplate
        order by oid
        limit 1
      );

  - metric_name: max_cluster_size
    type: gauge
    help: 'neon.max_cluster_size setting'
    key_labels:
    values: [max_cluster_size]
    query: |
      select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';

  - metric_name: db_total_size
    type: gauge
    help: 'Size of all databases'
    key_labels:
    values: [total]
    query: |
      select sum(pg_database_size(datname)) as total from pg_database;

build: |
  # Build cgroup-tools
  #
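
An optional, hedged sanity check of the pg_stats_userdb query above; it assumes a locally reachable Postgres and the psycopg2 driver, neither of which is part of the exporter config:

import psycopg2

QUERY = """
select pg_database_size(datname) as db_size, deadlocks,
       tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
       datname
from pg_stat_database
where datname IN (
  select datname
  from pg_database
  where datname <> 'postgres' and not datistemplate
  order by oid
  limit 1
);
"""

with psycopg2.connect("dbname=postgres") as conn:
    with conn.cursor() as cur:
        cur.execute(QUERY)
        rows = cur.fetchall()
        # At most one row is expected: stats for the oldest non-system database.
        assert len(rows) <= 1
        print(rows)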