mirror of https://github.com/neondatabase/neon.git
synced 2026-05-17 13:10:38 +00:00

Compare commits: problame/p... → jcsp/issue...

13 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 51f87d34ca |  |
|  | dbb21d6592 |  |
|  | ddceb9e6cd |  |
|  | 0fc3708de2 |  |
|  | e0c8ad48d4 |  |
|  | 39e144696f |  |
|  | 653044f754 |  |
|  | 80dcdfa8bf |  |
|  | 685add2009 |  |
|  | d4dc86f8e3 |  |
|  | 5158de70f3 |  |
|  | aec9188d36 |  |
|  | acefee9a32 |  |
@@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -4,7 +4,7 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::sync::Arc;
+use std::{borrow::Cow, sync::Arc};

 use anyhow::Context;
 use aws_config::{
@@ -556,6 +556,20 @@ impl RemoteStorage for S3Bucket {
                 .deleted_objects_total
                 .inc_by(chunk.len() as u64);
+            if let Some(errors) = resp.errors {
+                // Log a bounded number of the errors within the response:
+                // these requests can carry 1000 keys so logging each one
+                // would be too verbose, especially as errors may lead us
+                // to retry repeatedly.
+                const LOG_UP_TO_N_ERRORS: usize = 10;
+                for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                    tracing::warn!(
+                        "DeleteObjects key {} failed: {}: {}",
+                        e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                    );
+                }
+
+                return Err(anyhow::format_err!(
+                    "Failed to delete {} objects",
+                    errors.len()
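The bounded-logging pattern above is worth seeing in isolation: when a response can carry up to 1000 per-key errors, log only the first few and summarize the rest. The sketch below is illustrative only — `KeyError` and the log shape are invented stand-ins, not the AWS SDK's generated types.

```rust
/// A minimal sketch of bounded error logging, assuming a simple
/// `KeyError` shape rather than the AWS SDK's generated types.
struct KeyError {
    key: String,
    message: String,
}

fn log_errors_bounded(errors: &[KeyError]) {
    const LOG_UP_TO_N_ERRORS: usize = 10;
    for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
        eprintln!("delete of key {} failed: {}", e.key, e.message);
    }
    if errors.len() > LOG_UP_TO_N_ERRORS {
        // Summarize the remainder instead of flooding the log.
        eprintln!("... and {} more errors", errors.len() - LOG_UP_TO_N_ERRORS);
    }
}

fn main() {
    let errors: Vec<KeyError> = (0..25)
        .map(|i| KeyError {
            key: format!("tenant/{i}/layer"),
            message: "AccessDenied".to_string(),
        })
        .collect();
    log_errors_bounded(&errors);
}
```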
@@ -153,7 +153,7 @@ impl FlushOp {

 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
-    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
     executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,

     lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
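The type change from `Sender` to `UnboundedSender` is what the rest of this series builds on: `UnboundedSender::send` is a plain synchronous call that never awaits, so deletions can be queued from non-async code. A minimal demonstration of the difference (the message type is invented for illustration):

```rust
use tokio::sync::mpsc;

#[derive(Debug)]
struct QueueMessage(u64);

// A plain (non-async) function can push onto an unbounded channel,
// because UnboundedSender::send never blocks or awaits.
fn submit_sync(tx: &mpsc::UnboundedSender<QueueMessage>, n: u64) {
    // Fails only if the receiver has been dropped.
    tx.send(QueueMessage(n)).expect("receiver dropped");
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel();
    submit_sync(&tx, 1);
    submit_sync(&tx, 2);
    drop(tx); // close the channel so the loop below terminates

    while let Some(msg) = rx.recv().await {
        println!("got {msg:?}");
    }
}
```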
@@ -416,7 +416,7 @@ pub enum DeletionQueueError {
 impl DeletionQueueClient {
     pub(crate) fn broken() -> Self {
         // Channels whose receivers are immediately dropped.
-        let (tx, _rx) = tokio::sync::mpsc::channel(1);
+        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
         let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
         Self {
             tx,
@@ -428,12 +428,12 @@ impl DeletionQueueClient {
     /// This is cancel-safe. If you drop the future before it completes, the message
     /// is not pushed, although in the context of the deletion queue it doesn't matter: once
     /// we decide to do a deletion the decision is always final.
-    async fn do_push<T>(
+    fn do_push<T>(
         &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
         msg: T,
     ) -> Result<(), DeletionQueueError> {
-        match queue.send(msg).await {
+        match queue.send(msg) {
             Ok(_) => Ok(()),
             Err(e) => {
                 // This shouldn't happen, we should shut down all tenants before
@@ -445,7 +445,7 @@ impl DeletionQueueClient {
         }
     }

-    pub(crate) async fn recover(
+    pub(crate) fn recover(
         &self,
         attached_tenants: HashMap<TenantId, Generation>,
     ) -> Result<(), DeletionQueueError> {
@@ -453,7 +453,6 @@ impl DeletionQueueClient {
             &self.tx,
             ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
         )
-        .await
     }

     /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
@@ -526,6 +525,21 @@ impl DeletionQueueClient {
             return self.flush_immediate().await;
         }

+        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+    }
+
+    /// When a Tenant has a generation, push_layers is always synchronous because
+    /// the ListValidator channel is an unbounded channel.
+    ///
+    /// This can be merged into push_layers when we remove the Generation-less mode
+    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
+    pub(crate) fn push_layers_sync(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> Result<(), DeletionQueueError> {
         metrics::DELETION_QUEUE
             .keys_submitted
             .inc_by(layers.len() as u64);
@@ -539,17 +553,16 @@ impl DeletionQueueClient {
                 objects: Vec::new(),
             }),
         )
-        .await
     }

     /// This is cancel-safe. If you drop the future the flush may still happen in the background.
     async fn do_flush<T>(
         &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
         msg: T,
         rx: tokio::sync::oneshot::Receiver<()>,
     ) -> Result<(), DeletionQueueError> {
-        self.do_push(queue, msg).await?;
+        self.do_push(queue, msg)?;
         if rx.await.is_err() {
             // This shouldn't happen if tenants are shut down before deletion queue. If we
             // encounter a bug like this, then a flusher will incorrectly believe it has flushed
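`do_flush` pairs a queued message with a `tokio::sync::oneshot` receiver so the caller can wait for the worker to drain everything queued ahead of it. The handshake in miniature, with an invented message enum standing in for `ListWriterQueueMessage`:

```rust
use tokio::sync::{mpsc, oneshot};

enum Message {
    Work(u64),
    // Carries the sender half; the worker fires it once all
    // previously queued work has been processed.
    Flush(oneshot::Sender<()>),
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel();

    tokio::spawn(async move {
        while let Some(msg) = rx.recv().await {
            match msg {
                Message::Work(n) => println!("processed {n}"),
                Message::Flush(done) => {
                    // Everything queued before this Flush has been handled.
                    let _ = done.send(());
                }
            }
        }
    });

    tx.send(Message::Work(1)).unwrap();
    let (done_tx, done_rx) = oneshot::channel();
    tx.send(Message::Flush(done_tx)).unwrap();

    // An Err here would mean the worker dropped the oneshot without
    // acking, e.g. because it shut down early.
    done_rx.await.expect("worker exited before flushing");
    println!("flushed");
}
```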
@@ -570,6 +583,18 @@ impl DeletionQueueClient {
         .await
     }

+    /// Issue a flush without waiting for it to complete. This is useful on advisory flushes where
+    /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
+    /// detach where flushing is nice but not necessary.
+    ///
+    /// This function provides no guarantees of work being done.
+    pub fn flush_advisory(&self) {
+        let (flush_op, _) = FlushOp::new();
+
+        // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
+        drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
+    }
+
     // Wait until all previous deletions are executed
     pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
         debug!("flush_execute: flushing to deletion lists...");
@@ -586,9 +611,7 @@ impl DeletionQueueClient {
         // Flush any immediate-mode deletions (the above backend flush will only flush
         // the executor if deletions had flowed through the backend)
         debug!("flush_execute: flushing execution...");
-        let (flush_op, rx) = FlushOp::new();
-        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
-            .await?;
+        self.flush_immediate().await?;
         debug!("flush_execute: finished flushing execution...");
         Ok(())
     }
@@ -643,8 +666,10 @@ impl DeletionQueue {
     where
         C: ControlPlaneGenerationsApi + Send + Sync,
     {
-        // Deep channel: it consumes deletions from all timelines and we do not want to block them
-        let (tx, rx) = tokio::sync::mpsc::channel(16384);
+        // Unbounded channel: enables non-async functions to submit deletions. The actual length is
+        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
+        // enough to avoid this taking pathologically large amount of memory.
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();

         // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
         let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
@@ -957,7 +982,7 @@ mod test {
         // Basic test that the deletion queue processes the deletions we pass into it
         let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
         let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

         let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
         let tenant_id = ctx.harness.tenant_id;
@@ -1025,7 +1050,7 @@ mod test {
     async fn deletion_queue_validation() -> anyhow::Result<()> {
         let ctx = setup("deletion_queue_validation").expect("Failed test setup");
         let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

         // Generation that the control plane thinks is current
         let latest_generation = Generation::new(0xdeadbeef);
@@ -1082,7 +1107,7 @@ mod test {
         // Basic test that the deletion queue processes the deletions we pass into it
         let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
         let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

         let tenant_id = ctx.harness.tenant_id;

@@ -1145,9 +1170,7 @@ mod test {
         drop(client);
         ctx.restart().await;
         let client = ctx.deletion_queue.new_client();
-        client
-            .recover(HashMap::from([(tenant_id, now_generation)]))
-            .await?;
+        client.recover(HashMap::from([(tenant_id, now_generation)]))?;

         info!("Flush-executing");
         client.flush_execute().await?;
@@ -1173,7 +1196,7 @@ pub(crate) mod mock {
     };

     pub struct ConsumerState {
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
         executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
     }

@@ -1250,7 +1273,7 @@ pub(crate) mod mock {
     }

     pub struct MockDeletionQueue {
-        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
         executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
         executed: Arc<AtomicUsize>,
         remote_storage: Option<GenericRemoteStorage>,
@@ -1260,7 +1283,7 @@ pub(crate) mod mock {

     impl MockDeletionQueue {
         pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
-            let (tx, rx) = tokio::sync::mpsc::channel(16384);
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
             let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

             let executed = Arc::new(AtomicUsize::new(0));

@@ -13,6 +13,7 @@ use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
+use utils::backoff;

 use crate::metrics;

@@ -63,7 +64,19 @@ impl Deleter {
             Err(anyhow::anyhow!("failpoint hit"))
         });

-        self.remote_storage.delete_objects(&self.accumulator).await
+        // A backoff::retry is used here for two reasons:
+        // - To provide a backoff rather than busy-polling the API on errors
+        // - To absorb transient 429/503 conditions without hitting our error
+        //   logging path for issues deleting objects.
+        backoff::retry(
+            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            |_| false,
+            3,
+            10,
+            "executing deletion batch",
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
+        )
+        .await
     }

     /// Block until everything in accumulator has been executed
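`utils::backoff` is a Neon-internal helper, so its exact signature isn't reproduced here; the technique it implements — retry an async operation with exponential backoff and an attempt cap, absorbing transient 429/503-style failures — can be sketched with plain tokio. Everything below is an illustrative stand-in under that assumption, not the helper's real API:

```rust
use std::time::Duration;

/// Retry `op` up to `max_attempts` times with exponential backoff,
/// starting at `base_delay` and doubling after each failure.
async fn retry_with_backoff<T, E, F, Fut>(
    mut op: F,
    max_attempts: u32,
    base_delay: Duration,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut delay = base_delay;
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) => {
                attempt += 1;
                if attempt >= max_attempts {
                    return Err(e);
                }
                // Sleep rather than busy-polling; transient errors get
                // absorbed here instead of surfacing to the caller.
                tokio::time::sleep(delay).await;
                delay *= 2;
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let mut calls = 0;
    let result = retry_with_backoff(
        || {
            calls += 1;
            let attempt_no = calls;
            async move {
                if attempt_no >= 3 {
                    Ok(attempt_no) // succeed on the third attempt
                } else {
                    Err("transient error")
                }
            }
        },
        5,
        Duration::from_millis(10),
    )
    .await;
    println!("{result:?}");
}
```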
@@ -88,7 +101,10 @@ impl Deleter {
                     self.accumulator.clear();
                 }
                 Err(e) => {
-                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    if self.cancel.is_cancelled() {
+                        return Err(DeletionQueueError::ShuttingDown);
+                    }
+                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
                     metrics::DELETION_QUEUE
                         .remote_errors
                         .with_label_values(&["execute"])
@@ -85,7 +85,7 @@ pub(super) struct ListWriter {
     conf: &'static PageServerConf,

     // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+    rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

     // Outbound requests to the backend to execute deletion lists we have composed.
     tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
@@ -111,7 +111,7 @@ impl ListWriter {

     pub(super) fn new(
         conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
         tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
         cancel: CancellationToken,
     ) -> Self {

@@ -77,7 +77,7 @@ impl State {
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
         deletion_queue_client: DeletionQueueClient,
     ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
+        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
             .iter()
             .map(|v| v.parse().unwrap())
             .collect::<Vec<_>>();
@@ -164,9 +164,6 @@ impl From<TenantStateError> for ApiError {
     fn from(tse: TenantStateError) -> ApiError {
         match tse {
             TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
-            TenantStateError::NotActive(_) => {
-                ApiError::ResourceUnavailable("Tenant not yet active".into())
-            }
             TenantStateError::IsStopping(_) => {
                 ApiError::ResourceUnavailable("Tenant is stopping".into())
             }
@@ -575,9 +572,14 @@ async fn tenant_detach_handler(

     let state = get_state(&request);
     let conf = state.conf;
-    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
-        .instrument(info_span!("tenant_detach", %tenant_id))
-        .await?;
+    mgr::detach_tenant(
+        conf,
+        tenant_id,
+        detach_ignored.unwrap_or(false),
+        &state.deletion_queue_client,
+    )
+    .instrument(info_span!("tenant_detach", %tenant_id))
+    .await?;

     json_response(StatusCode::OK, ())
 }
@@ -1034,7 +1036,7 @@ async fn put_tenant_location_config_handler(
     // The `Detached` state is special, it doesn't upsert a tenant, it removes
     // its local disk content and drops it from memory.
     if let LocationConfigMode::Detached = request_data.config.mode {
-        mgr::detach_tenant(conf, tenant_id, true)
+        mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
             .instrument(info_span!("tenant_detach", %tenant_id))
             .await?;
         return json_response(StatusCode::OK, ());
@@ -29,11 +29,11 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
 use std::fs::File;
 use std::io;
 use std::ops::Bound::Included;
 use std::process::Command;
 use std::process::Stdio;
@@ -45,10 +45,10 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
@@ -208,7 +208,7 @@ pub struct Tenant {

     /// The remote storage generation, used to protect S3 objects from split-brain.
     /// Does not change over the lifetime of the [`Tenant`] object.
     ///
+    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
+    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
     generation: Generation,
@@ -372,6 +372,13 @@ struct RemoteStartupData {
     remote_metadata: TimelineMetadata,
 }

+struct TimelinePreload {
+    timeline_id: TimelineId,
+    remote_client: Option<RemoteTimelineClient>,
+    index_part: Option<IndexPart>,
+    metadata: TimelineMetadata,
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum WaitToBecomeActiveError {
     WillNotBecomeActive {
@@ -412,11 +419,6 @@ pub enum CreateTimelineError {
     Other(#[from] anyhow::Error),
 }

-struct TenantDirectoryScan {
-    sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
-    timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
-}
-
 enum CreateTimelineCause {
     Load,
     Delete,
@@ -660,41 +662,14 @@ impl Tenant {
         Ok(tenant)
     }

-    ///
-    /// Background task that downloads all data for a tenant and brings it to Active state.
-    ///
-    /// No background tasks are started as part of this routine.
-    ///
-    async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
-
-        let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
-        if !tokio::fs::try_exists(&marker_file)
-            .await
-            .context("check for existence of marker file")?
-        {
-            anyhow::bail!(
-                "implementation error: marker file should exist at beginning of this function"
-            );
-        }
-
-        // Get list of remote timelines
-        // download index files for every tenant timeline
-        info!("listing remote timelines");
-
-        let remote_storage = self
-            .remote_storage
-            .as_ref()
-            .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
-
-        let remote_timeline_ids =
-            remote_timeline_client::list_remote_timelines(remote_storage, self.tenant_id).await?;
-
-        info!("found {} timelines", remote_timeline_ids.len());
-
     // Download & parse index parts
+    fn download_indices(
+        &self,
+        timeline_ids: HashSet<TimelineId>,
+        remote_storage: &GenericRemoteStorage,
+    ) -> JoinSet<Result<(TimelineId, RemoteTimelineClient, MaybeDeletedIndexPart), anyhow::Error>>
+    {
         let mut part_downloads = JoinSet::new();
-        for timeline_id in remote_timeline_ids {
+        for timeline_id in timeline_ids {
             let client = RemoteTimelineClient::new(
                 remote_storage.clone(),
                 self.deletion_queue_client.clone(),
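`download_indices` returns a `tokio::task::JoinSet`, which the caller drains with `join_next`. The `JoinSet` mechanics on their own, with a toy workload in place of the real index-part downloads:

```rust
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut set: JoinSet<(u64, u64)> = JoinSet::new();

    // Spawn one download-like task per id; they run concurrently.
    for id in 0..4u64 {
        set.spawn(async move { (id, id * 100) });
    }

    // join_next yields results in completion order, not spawn order.
    while let Some(result) = set.join_next().await {
        let (id, value) = result.expect("task panicked");
        println!("timeline {id} -> {value}");
    }
}
```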
@@ -723,11 +698,56 @@ impl Tenant {
             );
         }

         part_downloads
     }

+    /// Special variant of preload_timelines that does not rely on remote storage
+    async fn preload_timelines_local(
+        self: &Arc<Self>,
+        timeline_ids: &HashSet<TimelineId>,
+    ) -> anyhow::Result<Vec<TimelinePreload>> {
+        let mut preload_map = HashMap::new();
+        for timeline_id in timeline_ids {
+            let metadata = load_metadata(self.conf, &self.tenant_id, timeline_id)?;
+            preload_map.insert(
+                *timeline_id,
+                TimelinePreload {
+                    timeline_id: *timeline_id,
+                    remote_client: None,
+                    // TODO: synthesize an index_part and make it non-optional
+                    index_part: None,
+                    metadata,
+                },
+            );
+        }
+
+        // Sort by ancestry
+        Ok(
+            tree_sort_timelines(preload_map, |p| p.metadata.ancestor_timeline())?
+                .into_iter()
+                .map(|i| i.1)
+                .collect(),
+        )
+    }
+
+    /// Do the remote I/O and sorting required to prepare a list of timelines
+    /// with their IndexParts, ready for hydrating into `Timeline`
+    async fn preload_timelines(
+        self: &Arc<Self>,
+        timeline_ids: HashSet<TimelineId>,
+        remote_storage: &GenericRemoteStorage,
+    ) -> anyhow::Result<Vec<TimelinePreload>> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        let mut part_downloads = self.download_indices(timeline_ids, remote_storage);
+
+        let mut timelines_to_resume_deletions = vec![];
+
+        // We construct a map all timeline's preload state, prior to sorting
+        // it by ancestry at the end of the function
+        let mut preload_map: HashMap<TimelineId, TimelinePreload> = HashMap::new();
+
         // Wait for all the download tasks to complete & collect results.
         let mut remote_index_and_client = HashMap::new();
         let mut timeline_ancestors = HashMap::new();
         while let Some(result) = part_downloads.join_next().await {
             // NB: we already added timeline_id as context to the error
             let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
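Both preload paths end by handing a map to `tree_sort_timelines`, which orders timelines so every ancestor precedes its descendants. A self-contained sketch of that parent-before-child sort — `u64` ids stand in for `TimelineId`, and the helper name here is invented, not the internal function:

```rust
use std::collections::HashMap;

/// Sort items so that every parent appears before all of its children.
/// Returns an error if a parent is missing or a cycle exists.
fn tree_sort(
    mut pending: HashMap<u64, Option<u64>>, // id -> ancestor id
) -> Result<Vec<u64>, String> {
    let mut sorted = Vec::with_capacity(pending.len());
    while !pending.is_empty() {
        let before = pending.len();
        // Emit every item whose ancestor is absent or already emitted.
        pending.retain(|id, ancestor| {
            let ready = match ancestor {
                None => true,
                Some(a) => sorted.contains(a),
            };
            if ready {
                sorted.push(*id);
            }
            !ready
        });
        if pending.len() == before {
            return Err("cycle or missing ancestor".to_string());
        }
    }
    Ok(sorted)
}

fn main() {
    let mut map = HashMap::new();
    map.insert(3, Some(2)); // 3's parent is 2
    map.insert(2, Some(1)); // 2's parent is 1
    map.insert(1, None);    // root
    println!("{:?}", tree_sort(map)); // Ok([1, 2, 3])
}
```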
@@ -735,8 +755,16 @@ impl Tenant {
             debug!("successfully downloaded index part for timeline {timeline_id}");
             match index_part {
                 MaybeDeletedIndexPart::IndexPart(index_part) => {
-                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
-                    remote_index_and_client.insert(timeline_id, (index_part, client));
+                    let metadata = index_part.metadata.clone();
+                    preload_map.insert(
+                        timeline_id,
+                        TimelinePreload {
+                            timeline_id,
+                            remote_client: Some(client),
+                            index_part: Some(index_part),
+                            metadata,
+                        },
+                    );
                 }
                 MaybeDeletedIndexPart::Deleted(index_part) => {
                     info!(
@@ -748,35 +776,6 @@ impl Tenant {
             }
         }

-        // For every timeline, download the metadata file, scan the local directory,
-        // and build a layer map that contains an entry for each remote and local
-        // layer file.
-        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
-        for (timeline_id, remote_metadata) in sorted_timelines {
-            let (index_part, remote_client) = remote_index_and_client
-                .remove(&timeline_id)
-                .expect("just put it in above");
-
-            // TODO again handle early failure
-            self.load_remote_timeline(
-                timeline_id,
-                index_part,
-                remote_metadata,
-                TimelineResources {
-                    remote_client: Some(remote_client),
-                    deletion_queue_client: self.deletion_queue_client.clone(),
-                },
-                ctx,
-            )
-            .await
-            .with_context(|| {
-                format!(
-                    "failed to load remote timeline {} for tenant {}",
-                    timeline_id, self.tenant_id
-                )
-            })?;
-        }
-
         // Walk through deleted timelines, resume deletion
         for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
             remote_timeline_client
@@ -797,6 +796,81 @@ impl Tenant {
                 .map_err(LoadLocalTimelineError::ResumeDeletion)?;
         }

+        // Sort by ancestry
+        Ok(
+            tree_sort_timelines(preload_map, |p| p.metadata.ancestor_timeline())?
+                .into_iter()
+                .map(|i| i.1)
+                .collect(),
+        )
+    }
+
+    ///
+    /// Background task that downloads all data for a tenant and brings it to Active state.
+    ///
+    /// No background tasks are started as part of this routine.
+    ///
+    async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
+        if !tokio::fs::try_exists(&marker_file)
+            .await
+            .context("check for existence of marker file")?
+        {
+            anyhow::bail!(
+                "implementation error: marker file should exist at beginning of this function"
+            );
+        }
+
+        // Get list of remote timelines
+        info!("listing remote timelines");
+
+        let remote_storage = self
+            .remote_storage
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
+
+        let remote_timeline_ids =
+            remote_timeline_client::list_remote_timelines(remote_storage, self.tenant_id).await?;
+
+        info!("found {} timelines", remote_timeline_ids.len());
+
+        // Download & parse index parts
+        let sorted_timelines = self
+            .preload_timelines(remote_timeline_ids, remote_storage)
+            .await?;
+
+        // For every timeline, download the metadata file, scan the local directory,
+        // and build a layer map that contains an entry for each remote and local
+        // layer file.
+        for timeline_preload in sorted_timelines {
+            let TimelinePreload {
+                timeline_id,
+                remote_client,
+                index_part,
+                metadata: _,
+            } = timeline_preload;
+
+            // TODO again handle early failure
+            self.load_remote_timeline(
+                timeline_id,
+                index_part.unwrap(),
+                TimelineResources {
+                    remote_client,
+                    deletion_queue_client: self.deletion_queue_client.clone(),
+                },
+                ctx,
+            )
+            .await
+            .with_context(|| {
+                format!(
+                    "failed to load remote timeline {} for tenant {}",
+                    timeline_id, self.tenant_id
+                )
+            })?;
+        }
+
         std::fs::remove_file(&marker_file)
             .with_context(|| format!("unlink attach marker file {marker_file}"))?;
         crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
@@ -829,7 +903,6 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         index_part: IndexPart,
-        remote_metadata: TimelineMetadata,
         resources: TimelineResources,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
@@ -840,7 +913,7 @@ impl Tenant {
             .await
             .context("Failed to create new timeline directory")?;

-        let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
+        let ancestor = if let Some(ancestor_id) = index_part.metadata.ancestor_timeline() {
             let timelines = self.timelines.lock().unwrap();
             Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
                 || {
@@ -858,6 +931,7 @@ impl Tenant {
         // cannot be older than the local one
         let local_metadata = None;

+        let remote_metadata = index_part.metadata.clone();
         self.timeline_init_and_sync(
             timeline_id,
             resources,
@@ -1031,12 +1105,9 @@ impl Tenant {
         tenant
     }

-    fn scan_and_sort_timelines_dir(self: Arc<Tenant>) -> anyhow::Result<TenantDirectoryScan> {
-        let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        // Note timelines_to_resume_deletion needs to be separate because it can be not sortable
-        // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion
-        // completed in non topological order (for example because parent has smaller number of layer files in it)
-        let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
+    async fn scan_timelines_dir(self: &Arc<Tenant>) -> anyhow::Result<HashSet<TimelineId>> {
+        let mut timelines_to_load: HashSet<TimelineId> = HashSet::new();
+        let mut timelines_to_resume_deletion: HashSet<TimelineId> = HashSet::new();

         let timelines_dir = self.conf.timelines_path(&self.tenant_id);

@@ -1085,38 +1156,7 @@ impl Tenant {
                 })?;

                 info!("Found deletion mark for timeline {}", timeline_id);

-                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
-                    Ok(metadata) => {
-                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
-                    }
-                    Err(e) => match &e {
-                        LoadMetadataError::Read(r) => {
-                            if r.kind() != io::ErrorKind::NotFound {
-                                return Err(anyhow::anyhow!(e)).with_context(|| {
-                                    format!("Failed to load metadata for timeline_id {timeline_id}")
-                                });
-                            }
-
-                            // If metadata doesnt exist it means that we've crashed without
-                            // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
-                            // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
-                            // We cant do it here because the method is async so we'd need block_on
-                            // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations
-                            // so that basically results in a cycle:
-                            // spawn_blocking
-                            // - block_on
-                            //   - spawn_blocking
-                            // which can lead to running out of threads in blocing pool.
-                            timelines_to_resume_deletion.push((timeline_id, None));
-                        }
-                        _ => {
-                            return Err(anyhow::anyhow!(e)).with_context(|| {
-                                format!("Failed to load metadata for timeline_id {timeline_id}")
-                            })
-                        }
-                    },
-                }
+                timelines_to_resume_deletion.insert(timeline_id);
             } else {
                 if !timeline_dir.exists() {
                     warn!("Timeline dir entry become invalid: {timeline_dir}");
@@ -1154,9 +1194,7 @@ impl Tenant {

             let file_name = entry.file_name();
             if let Ok(timeline_id) = file_name.parse::<TimelineId>() {
-                let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id)
-                    .context("failed to load metadata")?;
-                timelines_to_load.insert(timeline_id, metadata);
+                timelines_to_load.insert(timeline_id);
             } else {
                 // A file or directory that doesn't look like a timeline ID
                 warn!("unexpected file or directory in timelines directory: {file_name}");
@@ -1164,14 +1202,18 @@ impl Tenant {
             }
         }

-        // Sort the array of timeline IDs into tree-order, so that parent comes before
-        // all its children.
-        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
-            TenantDirectoryScan {
-                sorted_timelines_to_load: sorted_timelines,
-                timelines_to_resume_deletion,
-            }
-        })
+        for timeline_id in timelines_to_resume_deletion {
+            if let Err(e) =
+                DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id).await
+            {
+                warn!(
+                    "cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
+                    timeline_id, e
+                );
+            }
+        }
+
+        Ok(timelines_to_load)
     }

     ///
@@ -1194,24 +1236,34 @@ impl Tenant {
         //
         // Scan the directory, peek into the metadata file of each timeline, and
         // collect a list of timelines and their ancestors.
-        let span = info_span!("blocking");
         let cloned = Arc::clone(self);

-        let scan = tokio::task::spawn_blocking(move || {
-            let _g = span.entered();
-            cloned.scan_and_sort_timelines_dir()
-        })
-        .await
-        .context("load spawn_blocking")
-        .and_then(|res| res)?;
+        let local_timelines = tokio::task::spawn(async move { cloned.scan_timelines_dir().await })
+            .await
+            .context("load spawn_blocking")
+            .and_then(|res| res)?;

         // FIXME original collect_timeline_files contained one more check:
         //    1. "Timeline has no ancestor and no layer files"

+        let sorted_timelines = match &self.remote_storage {
+            Some(remote_storage) => {
+                self.preload_timelines(local_timelines, remote_storage)
+                    .await?
+            }
+            None => {
+                // Deprecated mode, only used in dev.
+                self.preload_timelines_local(&local_timelines).await?
+            }
+        };
+
+        for timeline_preload in sorted_timelines {
+            let TimelinePreload {
+                timeline_id,
+                remote_client: _,
+                index_part: _,
+                metadata,
+            } = timeline_preload;
+
         // Process loadable timelines first
-        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
             if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false)
+                .load_local_timeline(timeline_id, metadata, init_order, ctx, false)
                 .await
             {
                 match e {
@@ -1228,43 +1280,6 @@ impl Tenant {
                 }
             }
         }

-        // Resume deletion ones with deleted_mark
-        for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
-            match maybe_local_metadata {
-                None => {
-                    // See comment in `scan_and_sort_timelines_dir`.
-                    if let Err(e) =
-                        DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
-                            .await
-                    {
-                        warn!(
-                            "cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
-                            timeline_id, e
-                        );
-                    }
-                }
-                Some(local_metadata) => {
-                    if let Err(e) = self
-                        .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
-                        .await
-                    {
-                        match e {
-                            LoadLocalTimelineError::Load(source) => {
-                                // We tried to load deleted timeline, this is a bug.
-                                return Err(anyhow::anyhow!(source).context(
-                                    "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
-                                ));
-                            }
-                            LoadLocalTimelineError::ResumeDeletion(source) => {
-                                // Make sure resumed deletion wont fail loading for entire tenant.
-                                error!("Failed to resume timeline deletion: {source:#}")
-                            }
-                        }
-                    }
-                }
-            }
-        }

         trace!("Done");

         Ok(())
@@ -2076,6 +2091,15 @@ impl Tenant {
             }
         }
     }
+
+    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
+        self.tenant_conf
+            .read()
+            .unwrap()
+            .location
+            .attach_mode
+            .clone()
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2745,6 +2769,11 @@ impl Tenant {
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
         let src_id = src_timeline.timeline_id;

+        // First acquire the GC lock so that another task cannot advance the GC
+        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
+        // creating the branch.
+        let _gc_cs = self.gc_cs.lock().await;
+
         // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
         let start_lsn = start_lsn.unwrap_or_else(|| {
             let lsn = src_timeline.get_last_record_lsn();
@@ -2752,11 +2781,6 @@ impl Tenant {
             lsn
         });

-        // First acquire the GC lock so that another task cannot advance the GC
-        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
-        // creating the branch.
-        let _gc_cs = self.gc_cs.lock().await;
-
         // Create a placeholder for the new branch. This will error
         // out if the new timeline ID is already in use.
         let timeline_uninit_mark = {
@@ -31,7 +31,7 @@ use super::{
 const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

 #[derive(Debug, thiserror::Error)]
-pub enum DeleteTenantError {
+pub(crate) enum DeleteTenantError {
     #[error("GetTenant {0}")]
     Get(#[from] GetTenantError),

@@ -376,7 +376,7 @@ impl DeleteTenantFlow {
         Ok(())
     }

-    pub async fn should_resume_deletion(
+    pub(crate) async fn should_resume_deletion(
         conf: &'static PageServerConf,
         remote_storage: Option<&GenericRemoteStorage>,
         tenant: &Tenant,
@@ -24,7 +24,7 @@ use crate::control_plane_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{
     create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
@@ -50,7 +50,7 @@ use super::TenantSharedResources;
 /// its lifetime, and we can preserve some important safety invariants like `Tenant` always
 /// having a properly acquired generation (Secondary doesn't need a generation)
 #[derive(Clone)]
-pub enum TenantSlot {
+pub(crate) enum TenantSlot {
     Attached(Arc<Tenant>),
     Secondary,
 }
@@ -206,8 +206,7 @@ async fn init_load_generations(
     if resources.remote_storage.is_some() {
         resources
             .deletion_queue_client
-            .recover(generations.clone())
-            .await?;
+            .recover(generations.clone())?;
     }

     Ok(Some(generations))
@@ -482,7 +481,7 @@ pub(crate) fn schedule_local_tenant_processing(
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
 #[instrument(skip_all)]
-pub async fn shutdown_all_tenants() {
+pub(crate) async fn shutdown_all_tenants() {
     shutdown_all_tenants0(&TENANTS).await
 }

@@ -594,7 +593,7 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
     // caller will log how long we took
 }

-pub async fn create_tenant(
+pub(crate) async fn create_tenant(
     conf: &'static PageServerConf,
     tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
@@ -629,14 +628,14 @@ pub async fn create_tenant(
 }

 #[derive(Debug, thiserror::Error)]
-pub enum SetNewTenantConfigError {
+pub(crate) enum SetNewTenantConfigError {
     #[error(transparent)]
     GetTenant(#[from] GetTenantError),
     #[error(transparent)]
     Persist(anyhow::Error),
 }

-pub async fn set_new_tenant_config(
+pub(crate) async fn set_new_tenant_config(
     conf: &'static PageServerConf,
     new_tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
@@ -695,6 +694,18 @@ pub(crate) async fn upsert_location(

     if let Some(tenant) = shutdown_tenant {
         let (_guard, progress) = utils::completion::channel();
+
+        match tenant.get_attach_mode() {
+            AttachmentMode::Single | AttachmentMode::Multi => {
+                // Before we leave our state as the presumed holder of the latest generation,
+                // flush any outstanding deletions to reduce the risk of leaking objects.
+                deletion_queue_client.flush_advisory()
+            }
+            AttachmentMode::Stale => {
+                // If we're stale there's not point trying to flush deletions
+            }
+        };
+
         info!("Shutting down attached tenant");
         match tenant.shutdown(progress, false).await {
             Ok(()) => {}
@@ -765,7 +776,7 @@ pub(crate) async fn upsert_location(
 }

 #[derive(Debug, thiserror::Error)]
-pub enum GetTenantError {
+pub(crate) enum GetTenantError {
     #[error("Tenant {0} not found")]
     NotFound(TenantId),
     #[error("Tenant {0} is not active")]
@@ -781,7 +792,7 @@ pub enum GetTenantError {
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
 ///
 /// This method is cancel-safe.
-pub async fn get_tenant(
+pub(crate) async fn get_tenant(
     tenant_id: TenantId,
     active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
@@ -806,7 +817,7 @@ pub async fn get_tenant(
     }
 }

-pub async fn delete_tenant(
+pub(crate) async fn delete_tenant(
     conf: &'static PageServerConf,
     remote_storage: Option<GenericRemoteStorage>,
     tenant_id: TenantId,
@@ -815,7 +826,7 @@ pub async fn delete_tenant(
 }

 #[derive(Debug, thiserror::Error)]
-pub enum DeleteTimelineError {
+pub(crate) enum DeleteTimelineError {
     #[error("Tenant {0}")]
     Tenant(#[from] GetTenantError),

@@ -823,7 +834,7 @@ pub enum DeleteTimelineError {
     Timeline(#[from] crate::tenant::DeleteTimelineError),
 }

-pub async fn delete_timeline(
+pub(crate) async fn delete_timeline(
     tenant_id: TenantId,
     timeline_id: TimelineId,
     _ctx: &RequestContext,
@@ -834,23 +845,29 @@ pub async fn delete_timeline(
 }

 #[derive(Debug, thiserror::Error)]
-pub enum TenantStateError {
+pub(crate) enum TenantStateError {
     #[error("Tenant {0} not found")]
     NotFound(TenantId),
     #[error("Tenant {0} is stopping")]
     IsStopping(TenantId),
-    #[error("Tenant {0} is not active")]
-    NotActive(TenantId),
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }

-pub async fn detach_tenant(
+pub(crate) async fn detach_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
     detach_ignored: bool,
+    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
+    let tmp_path = detach_tenant0(
+        conf,
+        &TENANTS,
+        tenant_id,
+        detach_ignored,
+        deletion_queue_client,
+    )
+    .await?;
     // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
     // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
     let task_tenant_id = None;
@@ -875,6 +892,7 @@ async fn detach_tenant0(
     tenants: &tokio::sync::RwLock<TenantsMap>,
     tenant_id: TenantId,
     detach_ignored: bool,
+    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
     let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
         let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -886,6 +904,10 @@ async fn detach_tenant0(
     let removal_result =
         remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;

+    // Flush pending deletions, so that they have a good chance of passing validation
+    // before this tenant is potentially re-attached elsewhere.
+    deletion_queue_client.flush_advisory();
+
     // Ignored tenants are not present in memory and will bail the removal from memory operation.
     // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
     if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
@@ -902,7 +924,7 @@ async fn detach_tenant0(
     removal_result
 }

-pub async fn load_tenant(
+pub(crate) async fn load_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
     generation: Generation,
@@ -939,7 +961,7 @@ pub async fn load_tenant(
     Ok(())
 }

-pub async fn ignore_tenant(
+pub(crate) async fn ignore_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
@@ -967,7 +989,7 @@ async fn ignore_tenant0(
 }

 #[derive(Debug, thiserror::Error)]
-pub enum TenantMapListError {
+pub(crate) enum TenantMapListError {
     #[error("tenant map is still initiailizing")]
     Initializing,
 }
@@ -975,7 +997,7 @@ pub enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
+pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
     let tenants = TENANTS.read().await;
     let m = match &*tenants {
         TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -993,7 +1015,7 @@ pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapLis
 ///
 /// Downloading all the tenant data is performed in the background, this merely
 /// spawns the background task and returns quickly.
-pub async fn attach_tenant(
+pub(crate) async fn attach_tenant(
     conf: &'static PageServerConf,
     tenant_id: TenantId,
     generation: Generation,
@@ -1030,7 +1052,7 @@ pub async fn attach_tenant(
 }

 #[derive(Debug, thiserror::Error)]
-pub enum TenantMapInsertError {
+pub(crate) enum TenantMapInsertError {
     #[error("tenant map is still initializing")]
     StillInitializing,
     #[error("tenant map is shutting down")]
@@ -1193,7 +1215,7 @@ use {
     utils::http::error::ApiError,
 };

-pub async fn immediate_gc(
+pub(crate) async fn immediate_gc(
     tenant_id: TenantId,
     timeline_id: TimelineId,
     gc_req: TimelineGcRequest,
@@ -1,5 +1,6 @@
 use futures::future::Either;
 use proxy::auth;
+use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
 use proxy::metrics;
@@ -79,6 +80,9 @@ struct ProxyCliArgs {
     /// Allow self-signed certificates for compute nodes (for testing)
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     allow_self_signed_compute: bool,
+    /// timeout for http connections
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    sql_over_http_timeout: tokio::time::Duration,
 }

 #[tokio::main]
@@ -220,12 +224,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             auth::BackendType::Link(Cow::Owned(url))
         }
     };

+    let http_config = HttpConfig {
+        sql_over_http_timeout: args.sql_over_http_timeout,
+    };
     let config = Box::leak(Box::new(ProxyConfig {
         tls_config,
         auth_backend,
         metric_collection,
         allow_self_signed_compute: args.allow_self_signed_compute,
+        http_config,
     }));

     Ok(config)

@@ -13,6 +13,7 @@ pub struct ProxyConfig {
     pub auth_backend: auth::BackendType<'static, ()>,
     pub metric_collection: Option<MetricCollectionConfig>,
     pub allow_self_signed_compute: bool,
+    pub http_config: HttpConfig,
 }

 #[derive(Debug)]
@@ -26,6 +27,10 @@ pub struct TlsConfig {
     pub common_names: Option<HashSet<String>>,
 }

+pub struct HttpConfig {
+    pub sql_over_http_timeout: tokio::time::Duration,
+}
+
 impl TlsConfig {
     pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
         self.config.clone()

@@ -20,6 +20,7 @@ use tokio_postgres::AsyncMessage;
 use crate::{
     auth, console,
     metrics::{Ids, MetricCounter, USAGE_METRICS},
+    proxy::{NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
 };
 use crate::{compute, config};

@@ -418,36 +419,42 @@ async fn connect_to_compute_once(
     };

     tokio::spawn(
-        poll_fn(move |cx| {
-            if matches!(rx.has_changed(), Ok(true)) {
-                session = *rx.borrow_and_update();
-                info!(%session, "changed session");
+        async move {
+            NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc();
+            scopeguard::defer! {
+                NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
             }
+            poll_fn(move |cx| {
+                if matches!(rx.has_changed(), Ok(true)) {
+                    session = *rx.borrow_and_update();
+                    info!(%session, "changed session");
+                }

-            loop {
-                let message = ready!(connection.poll_message(cx));
+                loop {
+                    let message = ready!(connection.poll_message(cx));

-                match message {
-                    Some(Ok(AsyncMessage::Notice(notice))) => {
-                        info!(%session, "notice: {}", notice);
-                    }
-                    Some(Ok(AsyncMessage::Notification(notif))) => {
-                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    }
-                    Some(Ok(_)) => {
-                        warn!(%session, "unknown message");
-                    }
-                    Some(Err(e)) => {
-                        error!(%session, "connection error: {}", e);
-                        return Poll::Ready(())
-                    }
-                    None => {
-                        info!("connection closed");
-                        return Poll::Ready(())
+                    match message {
+                        Some(Ok(AsyncMessage::Notice(notice))) => {
+                            info!(%session, "notice: {}", notice);
+                        }
+                        Some(Ok(AsyncMessage::Notification(notif))) => {
+                            warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                        }
+                        Some(Ok(_)) => {
+                            warn!(%session, "unknown message");
+                        }
+                        Some(Err(e)) => {
+                            error!(%session, "connection error: {}", e);
+                            return Poll::Ready(())
+                        }
+                        None => {
+                            info!("connection closed");
+                            return Poll::Ready(())
+                        }
                     }
                 }
-            }
-        })
+            }).await
+        }
         .instrument(span)
     );

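The `poll_fn` body above tracks session changes through a `tokio::sync::watch` channel via `has_changed`/`borrow_and_update`. That API in isolation, with illustrative values:

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(0u32);

    tx.send(42).unwrap();

    // has_changed() reports whether a value was published since we
    // last called borrow_and_update().
    if matches!(rx.has_changed(), Ok(true)) {
        let session = *rx.borrow_and_update();
        println!("changed session: {session}");
    }

    // No new value since the last borrow_and_update: stays quiet.
    assert!(matches!(rx.has_changed(), Ok(false)));
}
```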
@@ -24,6 +24,9 @@ use url::Url;
 use utils::http::error::ApiError;
 use utils::http::json::json_response;

+use crate::config::HttpConfig;
+use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER};
+
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;

|
||||
@@ -188,28 +191,46 @@ pub async fn handle(
|
||||
sni_hostname: Option<String>,
|
||||
conn_pool: Arc<GlobalConnPool>,
|
||||
session_id: uuid::Uuid,
|
||||
config: &'static HttpConfig,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
|
||||
|
||||
let result = tokio::time::timeout(
|
||||
config.sql_over_http_timeout,
|
||||
handle_inner(request, sni_hostname, conn_pool, session_id),
|
||||
)
|
||||
.await;
|
||||
let mut response = match result {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
let code = match e.downcast_ref::<tokio_postgres::Error>() {
|
||||
Some(e) => match e.code() {
|
||||
Some(e) => serde_json::to_value(e.code()).unwrap(),
|
||||
Ok(r) => match r {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
let code = e.downcast_ref::<tokio_postgres::Error>().and_then(|e| {
|
||||
e.code()
|
||||
.map(|s| serde_json::to_value(s.code()).unwrap_or_default())
|
||||
});
|
||||
let code = match code {
|
||||
Some(c) => c,
|
||||
None => Value::Null,
|
||||
},
|
||||
None => Value::Null,
|
||||
};
|
||||
error!(
|
||||
?code,
|
||||
"sql-over-http per-client task finished with an error: {e:#}"
|
||||
};
|
||||
error!(
|
||||
?code,
|
||||
"sql-over-http per-client task finished with an error: {e:#}"
|
||||
);
|
||||
// TODO: this shouldn't always be bad request.
|
||||
json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
json!({ "message": message, "code": code }),
|
||||
)?
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
let message = format!(
|
||||
"HTTP-Connection timed out, execution time exeeded {} seconds",
|
||||
config.sql_over_http_timeout.as_secs()
|
||||
);
|
||||
// TODO: this shouldn't always be bad request.
|
||||
error!(message);
|
||||
json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
json!({ "message": message, "code": code }),
|
||||
StatusCode::GATEWAY_TIMEOUT,
|
||||
json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
|
||||
)?
|
||||
}
|
||||
};
|
||||
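`tokio::time::timeout` wraps the inner future and yields `Result<T, Elapsed>`, which is why the handler now matches two layers of `Result`: the outer for the timeout, the inner for the handler's own error. A minimal illustration:

```rust
use std::time::Duration;
use tokio::time::timeout;

async fn slow_query() -> Result<&'static str, &'static str> {
    tokio::time::sleep(Duration::from_millis(50)).await;
    Ok("rows")
}

#[tokio::main]
async fn main() {
    // Outer Err(Elapsed) = timed out; inner Result is the handler's own.
    match timeout(Duration::from_millis(10), slow_query()).await {
        Ok(Ok(rows)) => println!("ok: {rows}"),
        Ok(Err(e)) => println!("query error: {e}"),
        Err(_elapsed) => println!("504: query timed out"),
    }
}
```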
@@ -227,6 +248,13 @@ async fn handle_inner(
     conn_pool: Arc<GlobalConnPool>,
     session_id: uuid::Uuid,
 ) -> anyhow::Result<Response<Body>> {
+    NUM_CONNECTIONS_ACCEPTED_COUNTER
+        .with_label_values(&["http"])
+        .inc();
+    scopeguard::defer! {
+        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
+    }
+
     //
     // Determine the destination and connection params
     //

@@ -3,7 +3,10 @@ use crate::{
     config::ProxyConfig,
     error::io_error,
     protocol2::{ProxyProtocolAccept, WithClientIp},
-    proxy::{handle_client, ClientMode},
+    proxy::{
+        handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
+        NUM_CLIENT_CONNECTION_OPENED_COUNTER,
+    },
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
@@ -202,7 +205,14 @@ async fn ws_handler(
     // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
     // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
+        sql_over_http::handle(
+            request,
+            sni_hostname,
+            conn_pool,
+            session_id,
+            &config.http_config,
+        )
+        .await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
         Response::builder()
             .header("Allow", "OPTIONS, POST")
@@ -275,23 +285,25 @@ pub async fn task_main(
             let conn_pool = conn_pool.clone();

             async move {
-                Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request<Body>| {
-                    let sni_name = sni_name.clone();
-                    let conn_pool = conn_pool.clone();
+                Ok::<_, Infallible>(MetricService::new(hyper::service::service_fn(
+                    move |req: Request<Body>| {
+                        let sni_name = sni_name.clone();
+                        let conn_pool = conn_pool.clone();

-                    async move {
-                        let cancel_map = Arc::new(CancelMap::default());
-                        let session_id = uuid::Uuid::new_v4();
+                        async move {
+                            let cancel_map = Arc::new(CancelMap::default());
+                            let session_id = uuid::Uuid::new_v4();

-                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
-                            .instrument(info_span!(
-                                "ws-client",
-                                session = %session_id,
-                                %peer_addr,
-                            ))
-                            .await
-                    }
-                }))
+                            ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
+                                .instrument(info_span!(
+                                    "ws-client",
+                                    session = %session_id,
+                                    %peer_addr,
+                                ))
+                                .await
+                        }
+                    },
+                )))
             }
         },
     );
@@ -303,3 +315,41 @@ pub async fn task_main(

     Ok(())
 }
+
+struct MetricService<S> {
+    inner: S,
+}
+
+impl<S> MetricService<S> {
+    fn new(inner: S) -> MetricService<S> {
+        NUM_CLIENT_CONNECTION_OPENED_COUNTER
+            .with_label_values(&["http"])
+            .inc();
+        MetricService { inner }
+    }
+}
+
+impl<S> Drop for MetricService<S> {
+    fn drop(&mut self) {
+        NUM_CLIENT_CONNECTION_CLOSED_COUNTER
+            .with_label_values(&["http"])
+            .inc();
+    }
+}
+
+impl<S, ReqBody> hyper::service::Service<Request<ReqBody>> for MetricService<S>
+where
+    S: hyper::service::Service<Request<ReqBody>>,
+{
+    type Response = S::Response;
+    type Error = S::Error;
+    type Future = S::Future;
+
+    fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
+        self.inner.poll_ready(cx)
+    }
+
+    fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
+        self.inner.call(req)
+    }
+}
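`MetricService` bumps the opened counter in its constructor and the closed counter in `Drop`, so the pair stays balanced on every exit path. The RAII idea in miniature, using plain atomics instead of prometheus counters to keep the sketch self-contained:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

static OPENED: AtomicUsize = AtomicUsize::new(0);
static CLOSED: AtomicUsize = AtomicUsize::new(0);

struct ConnGuard;

impl ConnGuard {
    fn new() -> Self {
        OPENED.fetch_add(1, Ordering::Relaxed);
        ConnGuard
    }
}

impl Drop for ConnGuard {
    // Runs on every exit path: normal return, early return, or panic
    // unwind, so opened and closed counts cannot drift apart.
    fn drop(&mut self) {
        CLOSED.fetch_add(1, Ordering::Relaxed);
    }
}

fn handle_connection() {
    let _guard = ConnGuard::new();
    // ... serve the connection ...
}

fn main() {
    handle_connection();
    handle_connection();
    println!(
        "opened={} closed={}",
        OPENED.load(Ordering::Relaxed),
        CLOSED.load(Ordering::Relaxed)
    );
}
```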
@@ -7,6 +7,7 @@ use crate::{
     compute::{self, PostgresConnection},
     config::{ProxyConfig, TlsConfig},
     console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+    http::StatusCode,
     metrics::{Ids, USAGE_METRICS},
     protocol2::WithClientIp,
     stream::{PqStream, Stream},
@@ -38,19 +39,55 @@ const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";

static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_accepted_connections_total",
        "Number of TCP client connections accepted.",
        "proxy_opened_db_connections_total",
        "Number of opened connections to a database.",
        &["protocol"],
    )
    .unwrap()
});

static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_closed_db_connections_total",
        "Number of closed connections to a database.",
        &["protocol"],
    )
    .unwrap()
});

pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_opened_client_connections_total",
        "Number of opened connections from a client.",
        &["protocol"],
    )
    .unwrap()
});

pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_closed_client_connections_total",
        "Number of closed connections from a client.",
        &["protocol"],
    )
    .unwrap()
});

pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_accepted_connections_total",
        "Number of client connections accepted.",
        &["protocol"],
    )
    .unwrap()
});

pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_closed_connections_total",
        "Number of TCP client connections closed.",
        "Number of client connections closed.",
        &["protocol"],
    )
    .unwrap()
@@ -75,6 +112,15 @@ static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
});

static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_connection_failures_breakdown",
        "Number of wake-up failures (per kind).",
        &["retry", "kind"],
    )
    .unwrap()
});

static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_io_bytes_per_client",
@@ -208,12 +254,16 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        "handling interactive connection from client"
    );

    // The `closed` counter will increase when this future is destroyed.
    let proto = mode.protocol_label();
    NUM_CLIENT_CONNECTION_OPENED_COUNTER
        .with_label_values(&[proto])
        .inc();
    NUM_CONNECTIONS_ACCEPTED_COUNTER
        .with_label_values(&[mode.protocol_label()])
        .with_label_values(&[proto])
        .inc();
    scopeguard::defer! {
        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
        NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc();
        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
    }

    let tls = config.tls_config.as_ref();
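The scopeguard::defer! block above is what keeps the opened/closed pairs honest: the deferred increment runs when the enclosing scope exits, including on early `?` returns and during unwinding, so a connection is never counted as opened without eventually being counted as closed. A minimal sketch of that behavior, assuming the scopeguard crate; the function and messages are illustrative.

use scopeguard::defer;

fn handle(fail: bool) -> Result<(), &'static str> {
    println!("opened");
    defer! {
        // Runs on every exit path below, Ok and Err alike.
        println!("closed");
    }
    if fail {
        return Err("an early return still triggers the deferred block");
    }
    Ok(())
}

fn main() {
    let _ = handle(false); // prints "opened" then "closed"
    let _ = handle(true);  // also prints "opened" then "closed"
}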
@@ -248,7 +298,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        mode.allow_self_signed_compute(config),
    );
    cancel_map
        .with_session(|session| client.connect_to_db(session, mode.allow_cleartext()))
        .with_session(|session| client.connect_to_db(session, mode))
        .await
}
@@ -397,6 +447,46 @@ impl ConnectMechanism for TcpMechanism<'_> {
    }
}

const fn bool_to_str(x: bool) -> &'static str {
    if x {
        "true"
    } else {
        "false"
    }
}

fn report_error(e: &WakeComputeError, retry: bool) {
    use crate::console::errors::ApiError;
    let retry = bool_to_str(retry);
    let kind = match e {
        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
        WakeComputeError::ApiError(ApiError::Console {
            status: StatusCode::LOCKED,
            ref text,
        }) if text.contains("written data quota exceeded")
            || text.contains("the limit for current plan reached") =>
        {
            "quota_exceeded"
        }
        WakeComputeError::ApiError(ApiError::Console {
            status: StatusCode::LOCKED,
            ..
        }) => "api_console_locked",
        WakeComputeError::ApiError(ApiError::Console {
            status: StatusCode::BAD_REQUEST,
            ..
        }) => "api_console_bad_request",
        WakeComputeError::ApiError(ApiError::Console { status, .. })
            if status.is_server_error() =>
        {
            "api_console_other_server_error"
        }
        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
    };
    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}

/// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
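report_error deliberately collapses the rich WakeComputeError type into a small fixed set of `kind` label values; putting free-form error text into a label would explode the metric's cardinality. A standalone sketch of the same classification idea, using a made-up error enum rather than the proxy's real types:

// Hypothetical error type standing in for WakeComputeError.
#[derive(Debug)]
enum WakeError {
    BadAddress(String),
    Transport(String),
    Console { status: u16, text: String },
}

// Map every variant onto one of a few &'static str labels.
fn kind(e: &WakeError) -> &'static str {
    match e {
        WakeError::BadAddress(_) => "bad_compute_address",
        WakeError::Transport(_) => "api_transport_error",
        WakeError::Console { status: 423, text } if text.contains("quota") => "quota_exceeded",
        WakeError::Console { status: 423, .. } => "api_console_locked",
        WakeError::Console { status, .. } if *status >= 500 => "api_console_other_server_error",
        WakeError::Console { .. } => "api_console_other_error",
    }
}

fn main() {
    let e = WakeError::Console {
        status: 423,
        text: "written data quota exceeded".into(),
    };
    assert_eq!(kind(&e), "quota_exceeded");
}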
@@ -440,10 +530,12 @@ where
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
                report_error(&e, false);
                return Err(e.into());
            }
            // failed to wake up but we can continue to retry
            Ok(ControlFlow::Continue(e)) => {
                report_error(&e, true);
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
            }
            // successfully woke up a compute node and can break the wakeup loop
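The surrounding loop drives retries through std::ops::ControlFlow: `Err` aborts, `Ok(Continue(e))` records a retriable failure and loops again, and `Ok(Break(..))` exits with a woken compute node. A compilable sketch of that shape, with handle_try_wake stubbed out; names and behavior are illustrative, not the proxy's real API.

use std::ops::ControlFlow;

// Stub: the first attempt fails retriably, the second succeeds.
fn handle_try_wake(attempt: u32) -> Result<ControlFlow<&'static str, &'static str>, &'static str> {
    match attempt {
        0 => Ok(ControlFlow::Continue("transient console error")),
        _ => Ok(ControlFlow::Break("compute is up")),
    }
}

fn main() {
    for num_retries in 0u32.. {
        match handle_try_wake(num_retries) {
            Err(e) => {
                eprintln!("couldn't wake compute node: {e} (giving up)");
                break;
            }
            Ok(ControlFlow::Continue(e)) => {
                eprintln!("couldn't wake compute node: {e} (will retry)");
            }
            Ok(ControlFlow::Break(node)) => {
                println!("{node} after {num_retries} retries");
                break;
            }
        }
    }
}

Reporting retry = true on the Continue path and retry = false on the Err path is what lets the new NUM_WAKEUP_FAILURES metric distinguish transient noise from hard failures.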
@@ -682,7 +774,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    async fn connect_to_db(
        self,
        session: cancellation::Session<'_>,
        allow_cleartext: bool,
        mode: ClientMode,
    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
@@ -698,7 +790,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        };

        let auth_result = match creds
            .authenticate(&extra, &mut stream, allow_cleartext)
            .authenticate(&extra, &mut stream, mode.allow_cleartext())
            .await
        {
            Ok(auth_result) => auth_result,
@@ -724,6 +816,14 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            .or_else(|e| stream.throw_error(e))
            .await?;

        let proto = mode.protocol_label();
        NUM_DB_CONNECTIONS_OPENED_COUNTER
            .with_label_values(&[proto])
            .inc();
        scopeguard::defer! {
            NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
        }

        prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
        // Before proxy passing, forward to compute whatever data is left in the
        // PqStream input buffer. Normally there is none, but our serverless npm
@@ -374,8 +374,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
    if conf.http_auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            #[allow(clippy::mutable_key_type)]
            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
                Lazy::new(|| ["/v1/status"].iter().map(|v| v.parse().unwrap()).collect());
            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> = Lazy::new(|| {
                ["/v1/status", "/metrics"]
                    .iter()
                    .map(|v| v.parse().unwrap())
                    .collect()
            });
            if ALLOWLIST_ROUTES.contains(request.uri()) {
                None
            } else {
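The allowlist change above lets /metrics be scraped without a token while every other safekeeper route still goes through the auth middleware. A minimal sketch of the lookup itself, assuming the http and once_cell crates; requires_auth is a hypothetical helper, not the safekeeper's real API.

use http::Uri;
use once_cell::sync::Lazy;
use std::collections::HashSet;

#[allow(clippy::mutable_key_type)]
static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> = Lazy::new(|| {
    ["/v1/status", "/metrics"]
        .iter()
        .map(|v| v.parse().unwrap())
        .collect()
});

fn requires_auth(uri: &Uri) -> bool {
    // Parse once at startup, then an O(1) membership check per request.
    !ALLOWLIST_ROUTES.contains(uri)
}

fn main() {
    assert!(!requires_auth(&"/metrics".parse().unwrap()));
    assert!(requires_auth(&"/v1/tenant".parse().unwrap()));
}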
@@ -65,7 +65,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it

    def start_single_table_workload(table_id: int):
        for _ in range(num_iters):
            with env.pg.connect().cursor() as cur:
            with env.pg.connect(options="-cstatement_timeout=300s").cursor() as cur:
                cur.execute(
                    f"INSERT INTO t{table_id} SELECT FROM generate_series(1,{new_rows_each_update})"
                )
@@ -6,6 +6,7 @@ from pathlib import Path
from typing import List, Optional

import asyncpg
import pytest
import toml
from fixtures.log_helper import getLogger
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
@@ -597,7 +598,10 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
    assert res == expected_sum


# do inserts while restarting postgres and messing with safekeeper addresses
# Do inserts while restarting postgres and messing with safekeeper addresses.
# The test takes more than the default 5 minutes on Postgres 16,
# see https://github.com/neondatabase/neon/issues/5305
@pytest.mark.timeout(600)
def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()