Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-21 20:32:56 +00:00)

Compare commits: release-41 ... jcsp/disk- (9 commits)
| Author | SHA1 | Date |
|---|---|---|
| | ed3e3b6f61 | |
| | 098ef0956b | |
| | 127837abb0 | |
| | b2c96047d0 | |
| | 44202eeb3b | |
| | 4bef977c56 | |
| | a0b862a8bd | |
| | 767ef29390 | |
| | a8a800af51 | |
.github/workflows/build_and_test.yml (vendored, 12 changed lines)
@@ -338,6 +338,16 @@ jobs:
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
           ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
+          export REMOTE_STORAGE_AZURE_REGION=eastus2
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
       - name: Install rust binaries
         run: |
           # Install target binaries
@@ -837,7 +847,7 @@ jobs:
     run:
       shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.18.1
+      VM_BUILDER_VERSION: v0.18.2

     steps:
       - name: Checkout
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custom Ord and PartialOrd implementations below with
-// deriving them.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them. This will require changes in walredoproc.c.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
 pub struct RelTag {
     pub forknum: u8,
     pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
     fn cmp(&self, other: &Self) -> Ordering {
-        let mut cmp = self.spcnode.cmp(&other.spcnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.dbnode.cmp(&other.dbnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.relnode.cmp(&other.relnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.forknum.cmp(&other.forknum);
-
-        cmp
+        // Custom ordering where we put forknum to the end of the list
+        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
     }
 }

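The new implementation collapses the hand-rolled field-by-field comparison into a single tuple comparison, which is equivalent because tuples compare lexicographically. A minimal standalone sketch of the same technique (the struct and field names here only mirror the diff, this is an illustration, not the pageserver's code):

```rust
use std::cmp::Ordering;

// Illustrative stand-in for RelTag: forknum is declared first but compared last.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
struct Tag {
    forknum: u8,
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
}

impl Ord for Tag {
    fn cmp(&self, other: &Self) -> Ordering {
        // Same ordering as the old spcnode -> dbnode -> relnode -> forknum chain.
        (self.spcnode, self.dbnode, self.relnode, self.forknum)
            .cmp(&(other.spcnode, other.dbnode, other.relnode, other.forknum))
    }
}

impl PartialOrd for Tag {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let a = Tag { forknum: 1, spcnode: 1, dbnode: 1, relnode: 1 };
    let b = Tag { forknum: 0, spcnode: 1, dbnode: 1, relnode: 2 };
    // relnode decides the ordering before forknum is ever consulted.
    assert!(a < b);
}
```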
@@ -1479,6 +1479,8 @@ threshold = "20m"
         Some(DiskUsageEvictionTaskConfig {
             max_usage_pct: Percent::new(80).unwrap(),
             min_avail_bytes: 0,
+            target_avail_bytes: None,
+            target_usage_pct: None,
             period: Duration::from_secs(10),
             #[cfg(feature = "testing")]
             mock_statvfs: None,
@@ -67,16 +67,40 @@ use crate::{
 pub struct DiskUsageEvictionTaskConfig {
     pub max_usage_pct: Percent,
     pub min_avail_bytes: u64,
+
+    // Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
+    // we will keep evicting layers until we reach the target. The resulting disk usage should look
+    // like a sawtooth bouncing between the upper max/min line and the lower target line.
+    #[serde(default)]
+    pub target_usage_pct: Option<Percent>,
+    #[serde(default)]
+    pub target_avail_bytes: Option<u64>,
+
     #[serde(with = "humantime_serde")]
     pub period: Duration,
     #[cfg(feature = "testing")]
     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
 }

+#[derive(Default)]
+enum Status {
+    /// We are within disk limits, and not currently doing any eviction
+    #[default]
+    Idle,
+    /// Disk limits have been exceeded: we will evict soon
+    UnderPressure,
+    /// We are currently doing an eviction pass.
+    Evicting,
+}
+
 #[derive(Default)]
 pub struct State {
     /// Exclude http requests and background task from running at the same time.
     mutex: tokio::sync::Mutex<()>,
+
+    /// Publish the current status of eviction work, for visibility to other subsystems
+    /// that modify their behavior if disk pressure is high or if eviction is going on.
+    status: std::sync::RwLock<Status>,
 }

 pub fn launch_disk_usage_global_eviction_task(
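Both new fields are `#[serde(default)]`, so existing configurations that omit them keep deserializing and the task then behaves as before (evicting only back down to the max/min threshold). A hedged sketch of how such a struct deserializes, using a standalone stand-in rather than the pageserver's actual config loader; it assumes the serde, serde_json and humantime_serde crates and uses a plain `u8` in place of the `Percent` newtype:

```rust
use std::time::Duration;

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct EvictionConfigSketch {
    max_usage_pct: u8, // stand-in for the Percent newtype
    min_avail_bytes: u64,
    #[serde(default)]
    target_usage_pct: Option<u8>,
    #[serde(default)]
    target_avail_bytes: Option<u64>,
    #[serde(with = "humantime_serde")]
    period: Duration,
}

fn main() {
    // Omitting the target fields is valid: they default to None.
    let cfg: EvictionConfigSketch = serde_json::from_str(
        r#"{ "max_usage_pct": 80, "min_avail_bytes": 0, "period": "10s" }"#,
    )
    .unwrap();
    assert!(cfg.target_usage_pct.is_none() && cfg.target_avail_bytes.is_none());
    println!("{cfg:?}");
}
```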
@@ -176,7 +200,9 @@ async fn disk_usage_eviction_task(
 }

 pub trait Usage: Clone + Copy + std::fmt::Debug {
-    fn has_pressure(&self) -> bool;
+    fn pressure(&self) -> f64;
+    fn over_pressure(&self) -> bool;
+    fn no_pressure(&self) -> bool;
     fn add_available_bytes(&mut self, bytes: u64);
 }

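The single `has_pressure` predicate becomes a small contract: `pressure()` returns a scalar, `over_pressure()` is the start condition (pressure at or above 1.0) and `no_pressure()` is the stop condition (pressure at or below 0.0). A toy model of that contract, showing how an eviction loop can drive a usage value from above the upper line all the way down to the target rather than stopping just under the limit (everything except the method names is invented for illustration):

```rust
// Toy model of the Usage contract: not the pageserver's filesystem-backed impl.
#[derive(Clone, Copy, Debug)]
struct ToyUsage {
    used: u64,
    max: u64,    // eviction starts when used reaches max
    target: u64, // eviction stops once used is back at target
}

impl ToyUsage {
    fn pressure(&self) -> f64 {
        if self.used <= self.target {
            0.0
        } else {
            (self.used - self.target) as f64 / (self.max - self.target) as f64
        }
    }
    fn over_pressure(&self) -> bool {
        self.pressure() >= 1.0
    }
    fn no_pressure(&self) -> bool {
        self.pressure() <= 0.0
    }
    fn add_available_bytes(&mut self, bytes: u64) {
        self.used = self.used.saturating_sub(bytes);
    }
}

fn main() {
    let mut u = ToyUsage { used: 95, max: 90, target: 80 };
    assert!(u.over_pressure()); // above the upper line: start evicting
    while !u.no_pressure() {
        u.add_available_bytes(1); // pretend each evicted layer frees one unit
    }
    assert_eq!(u.used, 80); // we kept going down to the target, not just below max
}
```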
@@ -189,13 +215,19 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
     let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
         .context("get filesystem-level disk usage before evictions")?;

+    if usage_pre.over_pressure() {
+        *state.status.write().unwrap() = Status::Evicting;
+    }
+
     let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
     match res {
         Ok(outcome) => {
             debug!(?outcome, "disk_usage_eviction_iteration finished");
-            match outcome {
+            let new_status = match outcome {
                 IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
                     // nothing to do, select statement below will handle things
+                    Status::Idle
                 }
                 IterationOutcome::Finished(outcome) => {
                     // Verify with statvfs whether we made any real progress
@@ -205,21 +237,30 @@ async fn disk_usage_eviction_task_iteration(

                     debug!(?after, "disk usage");

-                    if after.has_pressure() {
+                    if after.over_pressure() {
                         // Don't bother doing an out-of-order iteration here now.
                         // In practice, the task period is set to a value in the tens-of-seconds range,
                         // which will cause another iteration to happen soon enough.
                         // TODO: deltas between the three different usages would be helpful,
                         // consider MiB, GiB, TiB
                         warn!(?outcome, ?after, "disk usage still high");
+                        Status::UnderPressure
+                    } else {
+                        info!(?outcome, ?after, "disk usage pressure relieved");
+                        Status::Idle
                     }
                 }
-            }
+            };

+            *state.status.write().unwrap() = new_status;
         }
         Err(e) => {
             error!("disk_usage_eviction_iteration failed: {:#}", e);
+            *state.status.write().unwrap() = if usage_pre.over_pressure() {
+                Status::UnderPressure
+            } else {
+                Status::Idle
+            };
         }
     }

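The published `Status` gives other subsystems a cheap way to see whether eviction is idle, imminent, or running, without touching the eviction mutex; reading it is just an `RwLock` read. A small sketch with a stand-in enum, since the real `Status` is private to the task module:

```rust
use std::sync::RwLock;

#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
enum Status {
    #[default]
    Idle,
    UnderPressure,
    Evicting,
}

// Hypothetical consumer: back off on non-essential disk work while eviction is active.
fn should_defer_background_work(status: &RwLock<Status>) -> bool {
    !matches!(*status.read().unwrap(), Status::Idle)
}

fn main() {
    let status = RwLock::new(Status::default());
    assert!(!should_defer_background_work(&status));
    *status.write().unwrap() = Status::Evicting;
    assert!(should_defer_background_work(&status));
}
```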
@@ -285,8 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

     debug!(?usage_pre, "disk usage");

-    if !usage_pre.has_pressure() {
+    if !usage_pre.over_pressure() {
         return Ok(IterationOutcome::NoPressure);
+    } else {
+        *state.status.write().unwrap() = Status::Evicting;
     }

     warn!(
@@ -334,7 +377,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     let mut warned = None;
     let mut usage_planned = usage_pre;
     for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
-        if !usage_planned.has_pressure() {
+        if usage_planned.no_pressure() {
             debug!(
                 no_candidates_evicted = i,
                 "took enough candidates for pressure to be relieved"
@@ -644,22 +687,57 @@ mod filesystem_level_usage {
     }

     impl super::Usage for Usage<'_> {
-        fn has_pressure(&self) -> bool {
-            let usage_pct =
-                (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
+        /// Does the pressure exceed 1.0, i.e. has the disk usage exceeded upper bounds?
+        ///
+        /// This is the condition for starting eviction.
+        fn over_pressure(&self) -> bool {
+            self.pressure() >= 1.0
+        }

-            let pressures = [
-                (
-                    "min_avail_bytes",
-                    self.avail_bytes < self.config.min_avail_bytes,
-                ),
-                (
-                    "max_usage_pct",
-                    usage_pct >= self.config.max_usage_pct.get() as u64,
-                ),
-            ];
+        /// Is the pressure <0, i.e. has disk usage gone below the target bound?
+        ///
+        /// This is the condition for dropping out of eviction.
+        fn no_pressure(&self) -> bool {
+            self.pressure() <= 0.0
+        }

-            pressures.into_iter().any(|(_, has_pressure)| has_pressure)
+        fn pressure(&self) -> f64 {
+            let max_usage = std::cmp::min(
+                self.total_bytes - self.config.min_avail_bytes,
+                (self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
+            );
+
+            let mut target_usage = max_usage;
+            if let Some(target_avail_bytes) = self.config.target_avail_bytes {
+                target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
+            }
+            if let Some(target_usage_pct) = self.config.target_usage_pct {
+                target_usage = std::cmp::min(
+                    target_usage,
+                    (self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
+                );
+            };
+
+            let usage = self.total_bytes - self.avail_bytes;
+            eprintln!(
+                "pressure: {} {}, current {}",
+                target_usage, max_usage, usage
+            );
+            if target_usage == max_usage {
+                // We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
+                if usage >= max_usage {
+                    1.0
+                } else {
+                    0.0
+                }
+            } else if usage <= target_usage {
+                // No pressure.
+                0.0
+            } else {
+                // We are above target: pressure is the ratio of how much we exceed target to the size of the gap
+                let range_size = (max_usage - target_usage) as f64;
+                (usage - target_usage) as f64 / range_size
+            }
        }

         fn add_available_bytes(&mut self, bytes: u64) {
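The new `pressure()` maps current usage onto the band between `target_usage` and `max_usage`: 0.0 at or below the target, 1.0 at or above the max, and linear in between. A worked example with round numbers (the figures below are illustrative, not defaults):

```rust
// Same arithmetic as the diff's pressure(), with concrete numbers:
// 100 GiB disk, max_usage_pct = 85, target_usage_pct = 75, no byte-based limits.
fn pressure(total: u64, avail: u64, max_usage: u64, target_usage: u64) -> f64 {
    let usage = total - avail;
    if target_usage == max_usage {
        return if usage >= max_usage { 1.0 } else { 0.0 };
    }
    if usage <= target_usage {
        0.0
    } else {
        (usage - target_usage) as f64 / (max_usage - target_usage) as f64
    }
}

fn main() {
    const GIB: u64 = 1 << 30;
    let total = 100 * GIB;
    let max_usage = 85 * GIB; // 85% of total
    let target_usage = 75 * GIB; // 75% of total

    // At 90 GiB used: (90 - 75) / (85 - 75) = 1.5 -> over_pressure, start evicting.
    assert_eq!(pressure(total, 10 * GIB, max_usage, target_usage), 1.5);
    // At 80 GiB used: (80 - 75) / (85 - 75) = 0.5 -> keep evicting, not done yet.
    assert_eq!(pressure(total, 20 * GIB, max_usage, target_usage), 0.5);
    // At 75 GiB used: pressure 0.0 -> no_pressure, the eviction pass stops here.
    assert_eq!(pressure(total, 25 * GIB, max_usage, target_usage), 0.0);
}
```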
@@ -713,6 +791,8 @@ mod filesystem_level_usage {
             config: &DiskUsageEvictionTaskConfig {
                 max_usage_pct: Percent::new(85).unwrap(),
                 min_avail_bytes: 0,
+                target_avail_bytes: None,
+                target_usage_pct: None,
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
@@ -721,24 +801,24 @@ mod filesystem_level_usage {
             avail_bytes: 0,
         };

-        assert!(usage.has_pressure(), "expected pressure at 100%");
+        assert!(usage.over_pressure(), "expected pressure at 100%");

         usage.add_available_bytes(14_000);
-        assert!(usage.has_pressure(), "expected pressure at 86%");
+        assert!(usage.over_pressure(), "expected pressure at 86%");

         usage.add_available_bytes(999);
-        assert!(usage.has_pressure(), "expected pressure at 85.001%");
+        assert!(usage.over_pressure(), "expected pressure at 85.001%");

         usage.add_available_bytes(1);
-        assert!(usage.has_pressure(), "expected pressure at precisely 85%");
+        assert!(usage.over_pressure(), "expected pressure at precisely 85%");

         usage.add_available_bytes(1);
-        assert!(!usage.has_pressure(), "no pressure at 84.999%");
+        assert!(!usage.over_pressure(), "no pressure at 84.999%");

         usage.add_available_bytes(999);
-        assert!(!usage.has_pressure(), "no pressure at 84%");
+        assert!(!usage.over_pressure(), "no pressure at 84%");

         usage.add_available_bytes(16_000);
-        assert!(!usage.has_pressure());
+        assert!(!usage.over_pressure());
     }
 }

@@ -1452,10 +1452,22 @@ async fn disk_usage_eviction_run(
 }

 impl crate::disk_usage_eviction_task::Usage for Usage {
-    fn has_pressure(&self) -> bool {
+    fn over_pressure(&self) -> bool {
         self.config.evict_bytes > self.freed_bytes
     }

+    fn no_pressure(&self) -> bool {
+        !self.over_pressure()
+    }
+
+    fn pressure(&self) -> f64 {
+        if self.over_pressure() {
+            1.0
+        } else {
+            0.0
+        }
+    }
+
     fn add_available_bytes(&mut self, bytes: u64) {
         self.freed_bytes += bytes;
     }

@@ -675,9 +675,8 @@ impl Timeline {

         result.add_key(CONTROLFILE_KEY);
         result.add_key(CHECKPOINT_KEY);
-        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
-            result.add_key(AUX_FILES_KEY);
-        }
+        result.add_key(AUX_FILES_KEY);

         Ok(result.to_keyspace())
     }

@@ -60,6 +60,8 @@ pub(super) async fn upload_timeline_layer<'a>(
         bail!("failpoint before-upload-layer")
     });

+    pausable_failpoint!("before-upload-layer-pausable");
+
     let storage_path = remote_path(conf, source_path, generation)?;
     let source_file_res = fs::File::open(&source_path).await;
     let source_file = match source_file_res {

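`pausable_failpoint!` is a neon-internal macro, but the underlying mechanism is the `fail` crate's `pause` action: a thread that reaches the failpoint blocks until the failpoint is reconfigured, which is exactly what the test below leans on. A rough standalone sketch, assuming the `fail` crate with its `failpoints` feature enabled; the failpoint name is reused from the diff purely for illustration:

```rust
use std::{thread, time::Duration};

fn upload_layer() {
    // With the "pause" action configured, execution blocks here until the
    // failpoint is switched off.
    fail::fail_point!("before-upload-layer-pausable");
    println!("upload proceeding");
}

fn main() {
    fail::cfg("before-upload-layer-pausable", "pause").unwrap();

    let handle = thread::spawn(upload_layer);

    // The "upload" is now parked on the failpoint; do other work, then release it.
    thread::sleep(Duration::from_millis(100));
    fail::cfg("before-upload-layer-pausable", "off").unwrap();

    handle.join().unwrap();
}
```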
@@ -2793,10 +2793,13 @@ impl Timeline {
             )
         };

+        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
+        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
+
         // The new on-disk layers are now in the layer map. We can remove the
         // in-memory layer from the map now. The flushed layer is stored in
         // the mapping in `create_delta_layer`.
-        {
+        let metadata = {
             let mut guard = self.layers.write().await;

             if let Some(ref l) = delta_layer_to_add {
@@ -2812,8 +2815,17 @@ impl Timeline {
             }

             guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
+            if disk_consistent_lsn != old_disk_consistent_lsn {
+                assert!(disk_consistent_lsn > old_disk_consistent_lsn);
+                self.disk_consistent_lsn.store(disk_consistent_lsn);
+
+                // Schedule remote uploads that will reflect our new disk_consistent_lsn
+                Some(self.schedule_uploads(disk_consistent_lsn, layer_paths_to_upload)?)
+            } else {
+                None
+            }
             // release lock on 'layers'
-        }
+        };

         // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
         // a compaction can delete the file and then it won't be available for uploads any more.
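The shape of this change is: compute the new metadata while still holding the `layers` write lock, return it out of that block, and only then do the slow `save_metadata` write with the lock released. A minimal tokio-based sketch of that pattern, with made-up types standing in for the layer map and metadata:

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Debug, Clone)]
struct Metadata {
    disk_consistent_lsn: u64,
}

async fn persist(metadata: &Metadata) {
    // Stand-in for save_metadata(): slow I/O that must not run under the lock.
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    println!("persisted {:?}", metadata);
}

async fn flush(layers: Arc<RwLock<Vec<String>>>, new_lsn: u64, old_lsn: u64) {
    // Compute the value under the lock, but only *return* it from the block...
    let metadata = {
        let mut guard = layers.write().await;
        guard.push(format!("layer@{new_lsn}"));
        if new_lsn != old_lsn {
            Some(Metadata { disk_consistent_lsn: new_lsn })
        } else {
            None
        }
        // write lock released here
    };

    // ...and do the expensive persistence outside of it.
    if let Some(metadata) = metadata {
        persist(&metadata).await;
    }
}

#[tokio::main]
async fn main() {
    let layers = Arc::new(RwLock::new(Vec::new()));
    flush(layers, 42, 0).await;
}
```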
@@ -2829,28 +2841,22 @@ impl Timeline {
         //
         // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
         // *all* the layers, to avoid fsyncing the file multiple times.
-        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
-        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();

-        // If we were able to advance 'disk_consistent_lsn', save it the metadata file.
-        // After crash, we will restart WAL streaming and processing from that point.
-        if disk_consistent_lsn != old_disk_consistent_lsn {
-            assert!(disk_consistent_lsn > old_disk_consistent_lsn);
-            self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
+        // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
+        if let Some(metadata) = metadata {
+            save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
                 .await
-                .context("update_metadata_file")?;
-            // Also update the in-memory copy
-            self.disk_consistent_lsn.store(disk_consistent_lsn);
+                .context("save_metadata")?;
         }
         Ok(())
     }

     /// Update metadata file
-    async fn update_metadata_file(
+    fn schedule_uploads(
         &self,
         disk_consistent_lsn: Lsn,
         layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<TimelineMetadata> {
         // We can only save a valid 'prev_record_lsn' value on disk if we
         // flushed *all* in-memory changes to disk. We only track
         // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -2887,10 +2893,6 @@ impl Timeline {
             x.unwrap()
         ));

-        save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
-            .await
-            .context("save_metadata")?;
-
         if let Some(remote_client) = &self.remote_client {
             for (path, layer_metadata) in layer_paths_to_upload {
                 remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -2898,6 +2900,20 @@ impl Timeline {
                 remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
             }

+        Ok(metadata)
+    }
+
+    async fn update_metadata_file(
+        &self,
+        disk_consistent_lsn: Lsn,
+        layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
+    ) -> anyhow::Result<()> {
+        let metadata = self.schedule_uploads(disk_consistent_lsn, layer_paths_to_upload)?;
+
+        save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
+            .await
+            .context("save_metadata")?;
+
         Ok(())
     }

@@ -3,7 +3,9 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
+use tokio_postgres::config::AuthKeys;

+use crate::proxy::{handle_try_wake, retry_after};
 use crate::{
     auth::{self, ClientCredentials},
     config::AuthenticationConfig,
@@ -16,8 +18,9 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
+use std::ops::ControlFlow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{error, info, warn};

 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -117,22 +120,27 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
     }
 }

+pub enum ComputeCredentials {
+    Password(Vec<u8>),
+    AuthKeys(AuthKeys),
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks(
+async fn auth_quirks_creds(
     api: &impl console::Api,
     extra: &ConsoleReqExtra<'_>,
     creds: &mut ClientCredentials<'_>,
     client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
     if creds.project.is_none() {
         // Password will be checked by the compute node later.
-        return hacks::password_hack(api, extra, creds, client).await;
+        return hacks::password_hack(creds, client).await;
     }

     // Password hack should set the project name.
@@ -143,13 +151,55 @@ async fn auth_quirks(
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
         // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(api, extra, creds, client).await;
+        return hacks::cleartext_hack(client).await;
     }

     // Finally, proceed with the main auth flow (SCRAM-based).
     classic::authenticate(api, extra, creds, client, config).await
 }

+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
+
+    let mut num_retries = 0;
+    let mut node = loop {
+        let wake_res = api.wake_compute(extra, creds).await;
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            Ok(ControlFlow::Break(n)) => break n,
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+        tokio::time::sleep(wait_duration).await;
+    };
+
+    match auth_stuff.value {
+        ComputeCredentials::Password(password) => node.config.password(password),
+        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+    };
+
+    Ok(AuthSuccess {
+        reported_auth_ok: auth_stuff.reported_auth_ok,
+        value: node,
+    })
+}
+
 impl BackendType<'_, ClientCredentials<'_>> {
     /// Get compute endpoint name from the credentials.
     pub fn get_endpoint(&self) -> Option<String> {
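`handle_try_wake` and `retry_after` are proxy-internal helpers whose bodies are not shown here, but the loop shape is a generic retry-with-backoff driven by `ControlFlow`. A hedged standalone sketch of the same pattern, with an invented fallible operation and an invented backoff schedule:

```rust
use std::ops::ControlFlow;
use std::time::Duration;

// Invented transient operation: succeeds on the third attempt.
fn wake(attempt: u32) -> Result<&'static str, &'static str> {
    if attempt < 2 { Err("node still starting") } else { Ok("compute node handle") }
}

// Plays the role of handle_try_wake: decide between aborting, retrying, and done.
fn classify(
    res: Result<&'static str, &'static str>,
    attempt: u32,
    max_retries: u32,
) -> Result<ControlFlow<&'static str, &'static str>, &'static str> {
    match res {
        Ok(node) => Ok(ControlFlow::Break(node)),
        Err(_) if attempt >= max_retries => Err("giving up"), // non-retriable path
        Err(e) => Ok(ControlFlow::Continue(e)),               // retriable path
    }
}

// Plays the role of retry_after: simple exponential backoff.
fn retry_after(attempt: u32) -> Duration {
    Duration::from_millis(100u64.saturating_mul(1 << attempt.min(6)))
}

fn main() -> Result<(), &'static str> {
    let mut num_retries = 0;
    let node = loop {
        match classify(wake(num_retries), num_retries, 5)? {
            ControlFlow::Break(node) => break node,
            ControlFlow::Continue(err) => eprintln!("retrying after error: {err}"),
        }
        std::thread::sleep(retry_after(num_retries));
        num_retries += 1;
    };
    println!("woken: {node}");
    Ok(())
}
```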
@@ -1,17 +1,14 @@
-use std::ops::ControlFlow;
-
-use super::AuthSuccess;
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
     compute,
     config::AuthenticationConfig,
-    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::{handle_try_wake, retry_after},
+    console::{self, AuthInfo, ConsoleReqExtra},
     sasl, scram,
     stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, warn};
+use tracing::{info, warn};

 pub(super) async fn authenticate(
     api: &impl console::Api,
@@ -19,7 +16,7 @@ pub(super) async fn authenticate(
     creds: &ClientCredentials<'_>,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
     config: &'static AuthenticationConfig,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     info!("fetching user's authentication info");
     let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
@@ -66,38 +63,17 @@ pub(super) async fn authenticate(
            }
        };

-            Some(compute::ScramKeys {
+            compute::ScramKeys {
                 client_key: client_key.as_bytes(),
                 server_key: secret.server_key.as_bytes(),
-            })
+            }
         }
     };

-    let mut num_retries = 0;
-    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            Ok(ControlFlow::Break(n)) => break n,
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-        tokio::time::sleep(wait_duration).await;
-    };
-    if let Some(keys) = scram_keys {
-        use tokio_postgres::config::AuthKeys;
-        node.config.auth_keys(AuthKeys::ScramSha256(keys));
-    }

     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+            scram_keys,
+        )),
     })
 }
@@ -1,10 +1,6 @@
-use super::AuthSuccess;
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
-    console::{
-        self,
-        provider::{CachedNodeInfo, ConsoleReqExtra},
-    },
     stream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -15,11 +11,8 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn cleartext_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
     client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     warn!("cleartext auth flow override is enabled, proceeding");
     let password = AuthFlow::new(client)
         .begin(auth::CleartextPassword)
@@ -27,24 +20,19 @@ pub async fn cleartext_hack(
         .authenticate()
         .await?;

-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(password);

     // Report tentative success; compute node will check the password anyway.
     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::Password(password),
     })
 }

 /// Workaround for clients which don't provide an endpoint (project) name.
 /// Very similar to [`cleartext_hack`], but there's a specific password format.
 pub async fn password_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
     client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
     warn!("project not specified, resorting to the password hack auth flow");
     let payload = AuthFlow::new(client)
         .begin(auth::PasswordHack)
@@ -55,12 +43,9 @@ pub async fn password_hack(
     info!(project = &payload.endpoint, "received missing parameter");
     creds.project = Some(payload.endpoint);

-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(payload.password);
-
     // Report tentative success; compute node will check the password anyway.
     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::Password(payload.password),
     })
 }
@@ -90,7 +90,11 @@ pub mod errors {
                 status: http::StatusCode::LOCKED,
                 ref text,
             } => {
-                !text.contains("written data quota exceeded")
+                // written data quota exceeded
+                // data transfer quota exceeded
+                // compute time quota exceeded
+                // logical size quota exceeded
+                !text.contains("quota exceeded")
                     && !text.contains("the limit for current plan reached")
             }
             // retry server errors
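The broadened check treats any "... quota exceeded" message as non-retriable instead of matching only the written-data variant. A small sketch of the predicate against the messages the new comments enumerate (the function name and the final "retriable" message are ours, not the proxy's):

```rust
// LOCKED (423) responses are retriable unless they indicate an exhausted quota or plan limit.
fn locked_response_could_retry(text: &str) -> bool {
    !text.contains("quota exceeded") && !text.contains("the limit for current plan reached")
}

fn main() {
    for msg in [
        "written data quota exceeded",
        "data transfer quota exceeded",
        "compute time quota exceeded",
        "logical size quota exceeded",
        "the limit for current plan reached",
    ] {
        assert!(!locked_response_could_retry(msg));
    }
    // Some other LOCKED message (made up here) stays retriable.
    assert!(locked_response_could_retry("project is temporarily locked"));
}
```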
@@ -486,16 +486,20 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):


 def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    initial_local_layers = sorted(
-        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
-    )
     client = env.pageserver.http_client()
-    for layer in initial_local_layers:
-        if "ephemeral" in layer.name or "temp" in layer.name:

+    layer_map = client.layer_map_info(tenant_id, timeline_id)
+
+    for layer in layer_map.historic_layers:
+        if layer.remote:
+            log.info(
+                f"Skipping trying to evict remote layer {tenant_id}/{timeline_id} {layer.layer_file_name}"
+            )
+            continue
-        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
-        client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
+        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.layer_file_name}")
+        client.evict_layer(
+            tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.layer_file_name
+        )


 def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
@@ -757,12 +757,14 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
         create_thread.join()


-# Regression test for a race condition where L0 layers are compacted before the upload,
-# resulting in the uploading complaining about the file not being found
-# https://github.com/neondatabase/neon/issues/4526
-def test_compaction_delete_before_upload(
+def test_compaction_waits_for_upload(
     neon_env_builder: NeonEnvBuilder,
 ):
+    """
+    Compaction waits for outstanding uploads to complete, so that it avoids deleting layers
+    files that have not yet been uploaded. This test forces a race between upload and
+    compaction.
+    """
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

     env = neon_env_builder.init_start(
@@ -792,50 +794,81 @@ def test_compaction_delete_before_upload(
     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

     # Now make the flushing hang and update one small piece of data
-    client.configure_failpoints(("flush-frozen-pausable", "pause"))
+    client.configure_failpoints(("before-upload-layer-pausable", "pause"))

     endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")

     wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

-    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
-    barrier = threading.Barrier(2)
+    checkpoint_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    compact_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    compact_barrier = threading.Barrier(2)

     def checkpoint_in_background():
-        barrier.wait()
         try:
+            log.info("Checkpoint starting")
             client.timeline_checkpoint(tenant_id, timeline_id)
-            q.put(None)
+            log.info("Checkpoint complete")
+            checkpoint_result.put(None)
         except PageserverApiException as e:
-            q.put(e)
+            log.info("Checkpoint errored: {e}")
+            checkpoint_result.put(e)

-    create_thread = threading.Thread(target=checkpoint_in_background)
-    create_thread.start()
+    def compact_in_background():
+        compact_barrier.wait()
+        try:
+            log.info("Compaction starting")
+            client.timeline_compact(tenant_id, timeline_id)
+            log.info("Compaction complete")
+            compact_result.put(None)
+        except PageserverApiException as e:
+            log.info("Compaction errored: {e}")
+            compact_result.put(e)
+
+    checkpoint_thread = threading.Thread(target=checkpoint_in_background)
+    checkpoint_thread.start()
+
+    compact_thread = threading.Thread(target=compact_in_background)
+    compact_thread.start()

     try:
-        barrier.wait()
+        # Start the checkpoint, see that it blocks
+        log.info("Waiting to see checkpoint hang...")
+        time.sleep(5)
+        assert checkpoint_result.empty()

-        time.sleep(4)
-        client.timeline_compact(tenant_id, timeline_id)
+        # Start the compaction, see that it finds work to do but blocks
+        compact_barrier.wait()
+        log.info("Waiting to see compaction hang...")
+        time.sleep(5)
+        assert compact_result.empty()

-        client.configure_failpoints(("flush-frozen-pausable", "off"))
+        # This is logged once compaction is started, but before we wait for operations to complete
+        assert env.pageserver.log_contains("compact_level0_phase1 stats available.")

-        conflict = q.get()
+        # Once we unblock uploads the compaction should complete successfully
+        log.info("Disabling failpoint")
+        client.configure_failpoints(("before-upload-layer-pausable", "off"))
+        log.info("Awaiting compaction result")
+        assert compact_result.get(timeout=10) is None
+        log.info("Awaiting checkpoint result")
+        assert checkpoint_result.get(timeout=10) is None

-        assert conflict is None
+    except Exception:
+        # Log the actual failure's backtrace here, before we proceed to join threads
+        log.exception("Failure, cleaning up...")
+        raise
     finally:
-        create_thread.join()
+        compact_barrier.abort()

-        # Add a delay for the uploads to run into either the file not found or the
-        time.sleep(4)
+        checkpoint_thread.join()
+        compact_thread.join()

     # Ensure that this actually terminates
     wait_upload_queue_empty(client, tenant_id, timeline_id)

-    # For now we are hitting this message.
-    # Maybe in the future the underlying race condition will be fixed,
-    # but until then, ensure that this message is hit instead.
-    assert env.pageserver.log_contains(
+    # We should not have hit the error handling path in uploads where the remote file is gone
+    assert not env.pageserver.log_contains(
         "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
     )