Improve the scaling, add Play/Stop buttons

WIP: Collect and draw layer trace
Add test from PR #3673
2026-03-19 08:10:37 +00:00 · 2023-03-22 19:40:54 +02:00 · 2023-03-22 19:40:54 +02:00 · 2023-03-22 19:40:54 +02:00 · 2023-03-22 17:42:31 +02:00 · 2023-03-22 16:26:27 +02:00
51 changed files with 2116 additions and 985 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -552,7 +552,7 @@ jobs:
  neon-image-depot:
    # For testing this will run side-by-side for a few merges.
    # This action is not really optimized yet, but gets the job done
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    permissions:
--- a/README.md
+++ b/README.md
@@ -46,11 +46,14 @@ postgresql-libs cmake postgresql protobuf
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```

-#### Installing dependencies on OSX (12.3.1)
+#### Installing dependencies on macOS (12.3.1)
 1. Install XCode and dependencies
 ```
 xcode-select --install
 brew install protobuf openssl flex bison
+
+# add openssl to PATH, required for ed25519 keys generation in neon_local
+echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -133,6 +133,7 @@ fn main() -> Result<()> {
        .settings
        .find("neon.pageserver_connstring")
        .expect("pageserver connstr should be provided");
+    let storage_auth_token = spec.storage_auth_token.clone();
    let tenant = spec
        .cluster
        .settings
@@ -153,6 +154,7 @@ fn main() -> Result<()> {
        tenant,
        timeline,
        pageserver_connstr,
+        storage_auth_token,
        metrics: ComputeMetrics::default(),
        state: RwLock::new(ComputeState::new()),
    };
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -18,6 +18,7 @@ use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
+use std::str::FromStr;
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::RwLock;

@@ -44,6 +45,7 @@ pub struct ComputeNode {
    pub tenant: String,
    pub timeline: String,
    pub pageserver_connstr: String,
+    pub storage_auth_token: Option<String>,
    pub metrics: ComputeMetrics,
    /// Volatile part of the `ComputeNode` so should be used under `RwLock`
    /// to allow HTTP API server to serve status requests, while configuration
@@ -126,7 +128,18 @@ impl ComputeNode {
    fn get_basebackup(&self, lsn: &str) -> Result<()> {
        let start_time = Utc::now();

-        let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
+        let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
+
+        // Use the storage auth token from the config file, if given.
+        // Note: this overrides any password set in the connection string.
+        if let Some(storage_auth_token) = &self.storage_auth_token {
+            info!("Got storage auth token from spec file");
+            config.password(storage_auth_token);
+        } else {
+            info!("Storage auth token not set");
+        }
+
+        let mut client = config.connect(NoTls)?;
        let basebackup_cmd = match lsn {
            "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
            _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
@@ -163,6 +176,11 @@ impl ComputeNode {
        let sync_handle = Command::new(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
+            .envs(if let Some(storage_auth_token) = &self.storage_auth_token {
+                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
+            } else {
+                vec![]
+            })
            .stdout(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
@@ -240,6 +258,11 @@ impl ComputeNode {
        // Run postgres as a child process.
        let mut pg = Command::new(&self.pgbin)
            .args(["-D", &self.pgdata])
+            .envs(if let Some(storage_auth_token) = &self.storage_auth_token {
+                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
+            } else {
+                vec![]
+            })
            .spawn()
            .expect("cannot start postgres process");

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -24,6 +24,8 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

+    pub storage_auth_token: Option<String>,
+
    pub startup_tracing_context: Option<HashMap<String, String>>,
 }

--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -11,7 +11,6 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
-use postgres_backend::AuthType;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -97,7 +96,7 @@ impl ComputeControlPlane {
        });

        node.create_pgdata()?;
-        node.setup_pg_conf(self.env.pageserver.pg_auth_type)?;
+        node.setup_pg_conf()?;

        self.nodes
            .insert((tenant_id, node.name.clone()), Arc::clone(&node));
@@ -278,7 +277,7 @@ impl PostgresNode {

    // Write postgresql.conf with default configuration
    // and PG_VERSION file to the data directory of a new node.
-    fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
+    fn setup_pg_conf(&self) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        conf.append("wal_log_hints", "off");
@@ -302,29 +301,12 @@ impl PostgresNode {
            let config = &self.pageserver.pg_connection_config;
            let (host, port) = (config.host(), config.port());

-            // Set up authentication
-            //
-            // $NEON_AUTH_TOKEN will be replaced with value from environment
-            // variable during compute pg startup. It is done this way because
-            // otherwise user will be able to retrieve the value using SHOW
-            // command or pg_settings
-            let password = if let AuthType::NeonJWT = auth_type {
-                "$NEON_AUTH_TOKEN"
-            } else {
-                ""
-            };
-            // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
-            // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
-            // We parse this string and build it back with token from env var, and for simplicity rebuild
-            // uses only needed variables namely host, port, user, password.
-            format!("postgresql://no_user:{password}@{host}:{port}")
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
        };
        conf.append("shared_preload_libraries", "neon");
        conf.append_line("");
        conf.append("neon.pageserver_connstring", &pageserver_connstr);
-        if let AuthType::NeonJWT = auth_type {
-            conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
-        }
        conf.append("neon.tenant_id", &self.tenant_id.to_string());
        conf.append("neon.timeline_id", &self.timeline_id.to_string());
        if let Some(lsn) = self.lsn {
@@ -447,6 +429,8 @@ impl PostgresNode {
            "DYLD_LIBRARY_PATH",
            self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
        );
+
+        // Pass authentication token used for the connections to pageserver and safekeepers
        if let Some(token) = auth_token {
            cmd.env("NEON_AUTH_TOKEN", token);
        }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -18,7 +18,7 @@ use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use utils::{
-    auth::{encode_from_key_file, Claims, Scope},
+    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
 };

@@ -118,9 +118,6 @@ pub struct PageServerConf {
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
-
-    // jwt auth token used for communication with pageserver
-    pub auth_token: String,
 }

 impl Default for PageServerConf {
@@ -131,7 +128,6 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            auth_token: String::new(),
        }
    }
 }
@@ -404,48 +400,33 @@ impl LocalEnv {

        fs::create_dir(base_path)?;

-        // generate keys for jwt
-        // openssl genrsa -out private_key.pem 2048
-        let private_key_path;
+        // Generate keypair for JWT.
+        //
+        // The keypair is only needed if authentication is enabled in any of the
+        // components. For convenience, we generate the keypair even if authentication
+        // is not enabled, so that you can easily enable it after the initialization
+        // step. However, if the key generation fails, we treat it as non-fatal if
+        // authentication was not enabled.
        if self.private_key_path == PathBuf::new() {
-            private_key_path = base_path.join("auth_private_key.pem");
-            let keygen_output = Command::new("openssl")
-                .arg("genrsa")
-                .args(["-out", private_key_path.to_str().unwrap()])
-                .arg("2048")
-                .stdout(Stdio::null())
-                .output()
-                .context("failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
-            }
-            self.private_key_path = PathBuf::from("auth_private_key.pem");
-
-            let public_key_path = base_path.join("auth_public_key.pem");
-            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
-            let keygen_output = Command::new("openssl")
-                .arg("rsa")
-                .args(["-in", private_key_path.to_str().unwrap()])
-                .arg("-pubout")
-                .args(["-outform", "PEM"])
-                .args(["-out", public_key_path.to_str().unwrap()])
-                .stdout(Stdio::null())
-                .output()
-                .context("failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
+            match generate_auth_keys(
+                base_path.join("auth_private_key.pem").as_path(),
+                base_path.join("auth_public_key.pem").as_path(),
+            ) {
+                Ok(()) => {
+                    self.private_key_path = PathBuf::from("auth_private_key.pem");
+                }
+                Err(e) => {
+                    if !self.auth_keys_needed() {
+                        eprintln!("Could not generate keypair for JWT authentication: {e}");
+                        eprintln!("Continuing anyway because authentication was not enabled");
+                        self.private_key_path = PathBuf::from("auth_private_key.pem");
+                    } else {
+                        return Err(e);
+                    }
+                }
            }
        }

-        self.pageserver.auth_token =
-            self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-
        fs::create_dir_all(self.pg_data_dirs_path())?;

        for safekeeper in &self.safekeepers {
@@ -454,6 +435,12 @@ impl LocalEnv {

        self.persist_config(base_path)
    }
+
+    fn auth_keys_needed(&self) -> bool {
+        self.pageserver.pg_auth_type == AuthType::NeonJWT
+            || self.pageserver.http_auth_type == AuthType::NeonJWT
+            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+    }
 }

 fn base_path() -> PathBuf {
@@ -463,6 +450,43 @@ fn base_path() -> PathBuf {
    }
 }

+/// Generate a public/private key pair for JWT authentication
+fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> {
+    // Generate the key pair
+    //
+    // openssl genpkey -algorithm ed25519 -out auth_private_key.pem
+    let keygen_output = Command::new("openssl")
+        .arg("genpkey")
+        .args(["-algorithm", "ed25519"])
+        .args(["-out", private_key_path.to_str().unwrap()])
+        .stdout(Stdio::null())
+        .output()
+        .context("failed to generate auth private key")?;
+    if !keygen_output.status.success() {
+        bail!(
+            "openssl failed: '{}'",
+            String::from_utf8_lossy(&keygen_output.stderr)
+        );
+    }
+    // Extract the public key from the private key file
+    //
+    // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
+    let keygen_output = Command::new("openssl")
+        .arg("pkey")
+        .args(["-in", private_key_path.to_str().unwrap()])
+        .arg("-pubout")
+        .args(["-out", public_key_path.to_str().unwrap()])
+        .output()
+        .context("failed to extract public key from private key")?;
+    if !keygen_output.status.success() {
+        bail!(
+            "openssl failed: '{}'",
+            String::from_utf8_lossy(&keygen_output.stderr)
+        );
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -82,15 +82,8 @@ impl PageServerNode {
        let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
            .expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);
-        let password = if env.pageserver.pg_auth_type == AuthType::NeonJWT {
-            Some(env.pageserver.auth_token.clone())
-        } else {
-            None
-        };
-
        Self {
-            pg_connection_config: PgConnectionConfig::new_host_port(host, port)
-                .set_password(password),
+            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            env: env.clone(),
            http_client: Client::new(),
            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
@@ -280,27 +273,30 @@ impl PageServerNode {
        background_process::stop_process(immediate, "pageserver", &self.pid_file())
    }

-    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
-
-        println!("Pageserver query: '{sql}'");
-        client.simple_query(sql).unwrap()
+    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
+        let mut config = self.pg_connection_config.clone();
+        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            config = config.set_password(Some(token));
+        }
+        Ok(config.connect_no_tls()?)
    }

-    pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
-        self.pg_connection_config.connect_no_tls()
-    }
-
-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
        let mut builder = self.http_client.request(method, url);
        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
-            builder = builder.bearer_auth(&self.env.pageserver.auth_token)
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            builder = builder.bearer_auth(token)
        }
-        builder
+        Ok(builder)
    }

    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/status", self.http_base_url))
+        self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
            .send()?
            .error_from_body()?;
        Ok(())
@@ -308,7 +304,7 @@ impl PageServerNode {

    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
        Ok(self
-            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))
+            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
            .send()?
            .error_from_body()?
            .json()?)
@@ -362,11 +358,16 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'trace_read_requests' as bool")?,
+            eviction_policy: settings
+                .get("eviction_policy")
+                .map(|x| serde_json::from_str(x))
+                .transpose()
+                .context("Failed to parse 'eviction_policy' json")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
        }
-        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
+        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
            .json(&request)
            .send()?
            .error_from_body()?
@@ -383,7 +384,7 @@ impl PageServerNode {
    }

    pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
-        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
+        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
            .json(&TenantConfigRequest {
                tenant_id,
                checkpoint_distance: settings
@@ -446,7 +447,7 @@ impl PageServerNode {
            .http_request(
                Method::GET,
                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-            )
+            )?
            .send()?
            .error_from_body()?
            .json()?;
@@ -465,7 +466,7 @@ impl PageServerNode {
        self.http_request(
            Method::POST,
            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-        )
+        )?
        .json(&TimelineCreateRequest {
            new_timeline_id,
            ancestor_start_lsn,
@@ -502,7 +503,7 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
+        let mut client = self.page_server_psql_client()?;

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,7 +1,6 @@
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
-use std::sync::Arc;
 use std::{io, result};

 use anyhow::Context;
@@ -11,7 +10,6 @@ use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{http::error::HttpErrorBody, id::NodeId};

-use crate::pageserver::PageServerNode;
 use crate::{
    background_process,
    local_env::{LocalEnv, SafekeeperConf},
@@ -65,14 +63,10 @@ pub struct SafekeeperNode {
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
-
-    pub pageserver: Arc<PageServerNode>,
 }

 impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
-        let pageserver = Arc::new(PageServerNode::from_env(env));
-
        SafekeeperNode {
            id: conf.id,
            conf: conf.clone(),
@@ -80,7 +74,6 @@ impl SafekeeperNode {
            env: env.clone(),
            http_client: Client::new(),
            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
-            pageserver,
        }
    }

--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -160,6 +160,7 @@ services:
    build:
      context: ./compute_wrapper/
      args:
+        - REPOSITORY=${REPOSITORY:-neondatabase}
        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -29,15 +29,22 @@ These components should not have access to the private key and may only get toke
 The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
 There is currently no way to rotate the key without bringing down all components.

+### Best practices
+
+See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725)
+
+
 ### Token format

-The JWT tokens in Neon use RSA as the algorithm. Example:
+The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)).
+
+Example:

 Header:

 ```
 {
-  "alg": "RS512",     # RS256, RS384, or RS512
+  "alg": "EdDSA",
  "typ": "JWT"
 }
 ```
@@ -68,8 +75,8 @@ Currently also used for connection from any pageserver to any safekeeper.
 CLI generates a key pair during call to `neon_local init` with the following commands:

 ```bash
-openssl genrsa -out auth_private_key.pem 2048
-openssl rsa -in auth_private_key.pem -pubout -outform PEM -out auth_public_key.pem
+openssl genpkey -algorithm ed25519 -out auth_private_key.pem
+openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
 ```

 Configuration files for all components point to `public_key.pem` for JWT validation.
@@ -99,20 +106,22 @@ Their authentication is just plain PostgreSQL authentication and out of scope fo
 There is no administrative API except those provided by PostgreSQL.

 #### Outgoing connections
-Compute connects to Pageserver for getting pages.
-The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
-The environment variable inside the connection string is substituted with
-the JWT token.
+Compute connects to Pageserver for getting pages. The connection string is
+configured by the `neon.pageserver_connstring` PostgreSQL GUC,
+e.g. `postgresql://no_user@localhost:15028`. If the `$NEON_AUTH_TOKEN`
+environment variable is set, it is used as the password for the connection. (The
+pageserver uses JWT tokens for authentication, so the password is really a
+token.)

-Compute connects to Safekeepers to write and commit data.
-The token is the same for all safekeepers.
-It's stored in an environment variable, whose name is configured
-by the `neon.safekeeper_token_env` PostgreSQL GUC.
-If the GUC is unset, no token is passed.
+Compute connects to Safekeepers to write and commit data. The list of safekeeper
+addresses is given in the `neon.safekeepers` GUC. The connections to the
+safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
+variable, if set.

-Note that both tokens can be (and typically are) the same;
-the scope is the tenant and the token is usually passed through the
-`$NEON_AUTH_TOKEN` environment variable.
+The `compute_ctl` binary that runs before the PostgreSQL server, and launches
+PostgreSQL, also makes a connection to the pageserver. It uses it to fetch the
+initial "base backup" dump, to initialize the PostgreSQL data directory. It also
+uses `$NEON_AUTH_TOKEN` as the password for the connection.

 ### Pageserver
 #### Overview
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -37,9 +37,9 @@ You can specify version of neon cluster using following environment values.
 - PG_VERSION: postgres version for compute (default is 14)
 - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
 ```
-$ cd docker-compose/docker-compose.yml
+$ cd docker-compose/
 $ docker-compose down   # remove the conainers if exists
-$ PG_VERSION=15 TAG=2221 docker-compose up --build -d  # You can specify the postgres and image version
+$ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
 (...omit...)
--- a/docs/rfcs/022-pageserver-delete-from-s3.md
+++ b/docs/rfcs/022-pageserver-delete-from-s3.md
@@ -0,0 +1,269 @@
+# Deleting pageserver part of tenants data from s3
+
+Created on 08.03.23
+
+## Motivation
+
+Currently we don't delete the pageserver part of the data from s3 when a project is deleted. (The same is true for safekeepers, but this is outside the scope of this RFC.)
+
+This RFC aims to start a discussion that leads to a robust deletion solution that won't back us into a corner for features like postponed deletion (where we keep the data so that a user can restore a project that was deleted by accident).
+
+## Summary
+
+TLDR; There are two options, one based on control plane issuing actual delete requests to s3 and the other one that keeps s3 stuff bound to pageserver. Each one has its pros and cons.
+
+The decision is to stick with pageserver centric approach. For motivation see [Decision](#decision).
+
+## Components
+
+pageserver, control-plane
+
+## Requirements
+
+Deletion should successfully finish (eventually) without leaving dangling files in the presence of:
+
+- component restarts
+- component outage
+- pageserver loss
+
+## Proposed implementation
+
+Before the options are discussed, note that deletion can be quite a long process. For deletion from s3 the obvious choice is the [DeleteObjects](https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html) API call. It allows batching the deletion of up to 1k objects in one API call. So the duration of the deletion operation depends linearly on the number of layer files.
+
+Another design limitation is that there is no cheap `mv` operation available for s3. `mv` from `aws s3 mv` uses `copy(src, dst) + delete(src)`. So `mv`-like operation is not feasible as a building block because it actually amplifies the problem with both duration and resulting cost of the operation.
+
+The case when there are multiple pageservers handling the same tenants is largely out of scope of this RFC. We still consider the case of migration from one PS to another, but do not consider the case when a tenant exists on multiple pageservers for an extended period of time. The case with multiple pageservers can be reduced to the case with one pageserver by calling detach on all pageservers except the last one, on which the actual delete needs to be called.
+
+For simplicity let's look into deleting tenants. Differences in the deletion process between tenants and timelines are mentioned in the paragraph ["Differences between tenants and timelines"](#differences-between-tenants-and-timelines).
+
+### 1. Pageserver owns deletion machinery
+
+#### The sequence
+
+TLDR; With this approach control plane needs to call delete on a tenant and poll for progress. As much as possible is handled on pageserver. Let's see the sequence.
+
+Happy path:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS as Pageserver
+    participant S3
+
+    CP->>PS: Delete tenant
+    PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+    PS->>PS: Create deleted mark file locally
+    PS->>CP: Accepted
+    PS->>PS: delete local files other than deleted mark
+    loop Delete layers for each timeline
+        PS->>S3: delete(..)
+        CP->>PS: Finished?
+        PS->>CP: False
+    end
+    PS->>S3: Delete mark file
+    PS->>PS: Delete local mark file
+
+    loop Poll for status
+        CP->>PS: Finished?
+        PS->>CP: True or False
+    end
+```
+
+Why two mark files?
+Remote one is needed for cases when pageserver is lost during deletion so other pageserver can learn the deletion from s3 during attach.
+
+Why local mark file is needed?
+
+If we don't have one, we have two choices: delete local data before deleting the remote part, or do that after.
+
+If we delete local data before remote, then during restart the pageserver won't pick up the remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants).
+
+If we delete local data after remote, then a pageserver restart at the end of the sequence, once the remote mark file has been deleted, leaves the same state as when a pageserver is simply missing data on remote, without knowing that this data is intended to be deleted. In this case the current behavior is to upload everything local-only to remote.
+
+Thus we need local record of tenant being deleted as well.
+
+##### Handle pageserver crashes
+
+Let's explore sequences with various crash points.
+
+Pageserver crashes before `deleted` mark file is persisted in s3:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS as Pageserver
+    participant S3
+
+    CP->>PS: Delete tenant
+    note over PS: Crash point 1.
+    CP->>PS: Retry delete request
+
+    PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+    PS->>PS: Create deleted mark file locally
+
+    PS->>CP: Accepted
+
+    PS->>PS: delete local files other than deleted mark
+
+    loop Delete layers for each timeline
+        PS->>S3: delete(..)
+        CP->>PS: Finished?
+        PS->>CP: False
+    end
+    PS->>S3: Delete mark file
+    PS->>PS: Delete local mark file
+
+    CP->>PS: Finished?
+    PS->>CP: True
+```
+
+Pageserver crashed when deleted mark was about to be persisted in s3, before Control Plane gets a response:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS as Pageserver
+    participant S3
+
+    CP->>PS: Delete tenant
+    PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+
+    note over PS: Crash point 2.
+    note over PS: During startup we reconcile <br> with remote and see <br> whether the remote mark exists
+    alt Remote mark exists
+        PS->>PS: create local mark if its missing
+        PS->>PS: delete local files other than deleted mark
+        loop Delete layers for each timeline
+            PS->>S3: delete(..)
+        end
+
+        note over CP: Eventually console should <br> retry delete request
+
+        CP->>PS: Retry delete tenant
+        PS->>CP: Not modified
+    else Mark is missing
+        note over PS: Continue to operate the tenant as if deletion didnt happen
+
+        note over CP: Eventually console should <br> retry delete request
+
+        CP->>PS: Retry delete tenant
+        PS->>S3: Create deleted mark file at <br> /tenant/meta/deleted
+        PS->>CP: Delete tenant
+    end
+
+    PS->>PS: Continue with layer file deletions
+    loop Delete layers for each timeline
+        PS->>S3: delete(..)
+        CP->>PS: Finished?
+        PS->>CP: False
+    end
+
+    PS->>S3: Delete mark file
+    PS->>PS: Delete local mark file
+
+    CP->>PS: Finished?
+    PS->>CP: True
+```
+
+A similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response.
+
+If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success.
+
+The same applies if pageserver crashes in the end, when remote mark is deleted but before local one gets deleted. In this case on restart pageserver moves forward with deletion of local mark and Control Plane will receive 404.
+
+##### Differences between tenants and timelines
+
+For timeline the sequence is the same with the following differences:
+
+- remote delete mark file can be replaced with a boolean "deleted" flag in index_part.json
+- local deletion mark is not needed, because whole tenant is kept locally so situation described in motivation for local mark is impossible
+
+##### Handle pageserver loss
+
+If pageserver is lost then the deleted tenant should be attached to a different pageserver and the delete request needs to be retried against the new pageserver. The attach logic is shared with the one described for pageserver restarts (the local deletion mark won't be available, so it needs to be created).
+
+##### Restrictions for tenant that is in progress of being deleted
+
+I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status.
+
+#### Summary
+
+Pros:
+
+- Storage is not dependent on control plane. Storage can be restarted even if control plane is not working.
+- Allows for easier dogfooding, console can use Neon backed database as primary operational data store. If storage depends on control plane and control plane depends on storage we're stuck.
+- No need to share inner s3 workings with control plane. Pageserver presents api contract and S3 paths are not part of this contract.
+- No need to pass list of alive timelines to attach call. This will be solved by pageserver observing deleted flag. See
+
+Cons:
+
+- Logic is tricky, needs good testing
+- Anything else?
+
+### 2. Control plane owns deletion machinery
+
+In this case the only action performed on pageserver is removal of local files.
+
+Everything else is done by control plane. The steps are as follows:
+
+1. Control plane marks tenant as "delete pending" in its database
+2. It lists the s3 for all the files and repeatedly calls delete until nothing is left behind
+3. When no files are left marks deletion as completed
+
+In case of restart it selects all tenants marked as "delete pending" and continues the deletion.
+
+For tenants it is simple. For timelines there are caveats.
+
+Assume that the same workflow is used for timelines.
+
+If a tenant gets relocated during timeline deletion the attach call with its current logic will pick up deleted timeline in its half deleted state.
+
+Available options:
+
+- require list of alive timelines to be passed to attach call
+- use the same schema with flag in index_part.json (again part of the caveats around pageserver restart applies). In this case nothing stops pageserver from implementing deletion inside if we already have these deletion marks.
+
+With first option the following problem becomes apparent:
+
+Who is the source of truth regarding timeline liveness?
+
+Imagine:
+PS1 fails.
+PS2 gets assigned the tenant.
+New branch gets created
+PS1 starts up (is it possible or we just recycle it?)
+PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane.
+
+So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question, storage needs to ask control plane.
+
+### Summary
+
+Cons:
+
+- Potential thundering herd-like problem during storage restart (requests to control plane)
+- Potential increase in storage startup time (additional request to control plane)
+- Storage startup starts to depend on console
+- Erroneous attach call can attach tenant in half deleted state
+
+Pros:
+
+- Easier to reason about if you don't have to account for pageserver restarts
+
+### Extra notes
+
+There was a concern that having deletion code in pageserver is a little bit scary, but we need to have this code somewhere. So to me it is equally scary to have that in whatever place it ends up at.
+
+Delayed deletion can be done with both approaches. As discussed with Anna (@stepashka) this is only relevant for tenants (projects) not for timelines. For first approach detach can be called immediately and deletion can be done later with attach + delete. With second approach control plane needs to start the deletion whenever necessary.
+
+## Decision
+
+After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete.
+
+To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise, poking around integrating an s3 library into control plane and configuring shared knowledge about paths in s3 are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in a different repository, so it is quite hard to test pageserver related changes there. The e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes.
+
+With pageserver owning everything we still give the retry logic to control plane, but it's easier to duplicate if needed compared to sharing inner s3 workings. We will have the needed tests for retry logic in the neon repo.
+
+So the decision is to proceed with pageserver centric approach.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -115,6 +115,11 @@ pub struct TenantCreateRequest {
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub trace_read_requests: Option<bool>,
+    // We defer the parsing of the eviction_policy field to the request handler.
+    // Otherwise we'd have to move the types for eviction policy into this package.
+    // We might do that once the eviction feature has stabilized.
+    // For now, this field is not even documented in the openapi_spec.yml.
+    pub eviction_policy: Option<serde_json::Value>,
 }

 #[serde_as]
@@ -341,7 +346,7 @@ pub enum InMemoryLayerInfo {
 pub enum HistoricLayerInfo {
    Delta {
        layer_file_name: String,
-        layer_file_size: Option<u64>,
+        layer_file_size: u64,

        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
@@ -352,7 +357,7 @@ pub enum HistoricLayerInfo {
    },
    Image {
        layer_file_name: String,
-        layer_file_size: Option<u64>,
+        layer_file_size: u64,

        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -767,7 +767,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

        let err_to_send_and_errcode = match &end {
            ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
-            Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)),
+            Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
            // Note: CopyFail in duplex copy is somewhat unexpected (at least to
            // PG walsender; evidently and per my docs reading client should
            // finish it with CopyDone). It is not a problem to recover from it
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,4 @@
 // For details about authentication see docs/authentication.md
-//
-// TODO: use ed25519 keys
-// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162

 use serde;
 use std::fs;
@@ -9,26 +6,15 @@ use std::path::Path;

 use anyhow::Result;
 use jsonwebtoken::{
-    decode, encode, Algorithm, Algorithm::*, DecodingKey, EncodingKey, Header, TokenData,
-    Validation,
+    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};

 use crate::id::TenantId;

-/// Algorithms accepted during validation.
-///
-/// Accept all RSA-based algorithms. We pass this list to jsonwebtoken::decode,
-/// which checks that the algorithm in the token is one of these.
-///
-/// XXX: It also fails the validation if there are any algorithms in this list that belong
-/// to different family than the token's algorithm. In other words, we can *not* list any
-/// non-RSA algorithms here, or the validation always fails with InvalidAlgorithm error.
-const ACCEPTED_ALGORITHMS: &[Algorithm] = &[RS256, RS384, RS512];
-
-/// Algorithm to use when generating a new token in [`encode_from_key_file`]
-const ENCODE_ALGORITHM: Algorithm = Algorithm::RS256;
+/// Algorithm to use. We require EdDSA.
+const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
@@ -69,7 +55,7 @@ pub struct JwtAuth {
 impl JwtAuth {
    pub fn new(decoding_key: DecodingKey) -> Self {
        let mut validation = Validation::default();
-        validation.algorithms = ACCEPTED_ALGORITHMS.into();
+        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
@@ -81,7 +67,7 @@ impl JwtAuth {

    pub fn from_key_path(key_path: &Path) -> Result<Self> {
        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?))
+        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }

    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
@@ -99,8 +85,8 @@ impl std::fmt::Debug for JwtAuth {

 // this function is used only for testing purposes in CLI e g generate tokens during init
 pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
-    let key = EncodingKey::from_rsa_pem(key_data)?;
-    Ok(encode(&Header::new(ENCODE_ALGORITHM), claims, &key)?)
+    let key = EncodingKey::from_ed_pem(key_data)?;
+    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
 }

 #[cfg(test)]
@@ -108,49 +94,19 @@ mod tests {
    use super::*;
    use std::str::FromStr;

-    // generated with:
+    // Generated with:
    //
-    // openssl genpkey -algorithm rsa -out storage-auth-priv.pem
-    // openssl pkey -in storage-auth-priv.pem -pubout -out storage-auth-pub.pem
-    const TEST_PUB_KEY_RSA: &[u8] = br#"
+    // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
+    // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
+    const TEST_PUB_KEY_ED25519: &[u8] = br#"
 -----BEGIN PUBLIC KEY-----
-MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy6OZ+/kQXcueVJA/KTzO
-v4ljxylc/Kcb0sXWuXg1GB8k3nDA1gK66LFYToH0aTnqrnqG32Vu6wrhwuvqsZA7
-jQvP0ZePAbWhpEqho7EpNunDPcxZ/XDy5TQlB1P58F9I3lkJXDC+DsHYLuuzwhAv
-vo2MtWRdYlVHblCVLyZtANHhUMp2HUhgjHnJh5UrLIKOl4doCBxkM3rK0wjKsNCt
-M92PCR6S9rvYzldfeAYFNppBkEQrXt2CgUqZ4KaS4LXtjTRUJxljijA4HWffhxsr
-euRu3ufq8kVqie7fum0rdZZSkONmce0V0LesQ4aE2jB+2Sn48h6jb4dLXGWdq8TV
-wQIDAQAB
+MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
 -----END PUBLIC KEY-----
 "#;
-    const TEST_PRIV_KEY_RSA: &[u8] = br#"
+
+    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
 -----BEGIN PRIVATE KEY-----
-MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDLo5n7+RBdy55U
-kD8pPM6/iWPHKVz8pxvSxda5eDUYHyTecMDWArrosVhOgfRpOequeobfZW7rCuHC
-6+qxkDuNC8/Rl48BtaGkSqGjsSk26cM9zFn9cPLlNCUHU/nwX0jeWQlcML4Owdgu
-67PCEC++jYy1ZF1iVUduUJUvJm0A0eFQynYdSGCMecmHlSssgo6Xh2gIHGQzesrT
-CMqw0K0z3Y8JHpL2u9jOV194BgU2mkGQRCte3YKBSpngppLgte2NNFQnGWOKMDgd
-Z9+HGyt65G7e5+ryRWqJ7t+6bSt1llKQ42Zx7RXQt6xDhoTaMH7ZKfjyHqNvh0tc
-ZZ2rxNXBAgMBAAECggEAVz3u4Wlx3o02dsoZlSQs+xf0PEX3RXKeU+1YMbtTG9Nz
-6yxpIQaoZrpbt76rJE2gwkFR+PEu1NmjoOuLb6j4KlQuI4AHz1auOoGSwFtM6e66
-K4aZ4x95oEJ3vqz2fkmEIWYJwYpMUmwvnuJx76kZm0xvROMLsu4QHS2+zCVtO5Tr
-hvS05IMVuZ2TdQBZw0+JaFdwXbgDjQnQGY5n9MoTWSx1a4s/FF4Eby65BbDutcpn
-Vt3jQAOmO1X2kbPeWSGuPJRzyUs7Kg8qfeglBIR3ppGP3vPYAdWX+ho00bmsVkSp
-Q8vjul6C3WiM+kjwDxotHSDgbl/xldAl7OqPh0bfAQKBgQDnycXuq14Vg8nZvyn9
-rTnvucO8RBz5P6G+FZ+44cAS2x79+85onARmMnm+9MKYLSMo8fOvsK034NDI68XM
-04QQ/vlfouvFklMTGJIurgEImTZbGCmlMYCvFyIxaEWixon8OpeI4rFe4Hmbiijh
-PxhxWg221AwvBS2sco8J/ylEkQKBgQDg6Rh2QYb/j0Wou1rJPbuy3NhHofd5Rq35
-4YV3f2lfVYcPrgRhwe3T9SVII7Dx8LfwzsX5TAlf48ESlI3Dzv40uOCDM+xdtBRI
-r96SfSm+jup6gsXU3AsdNkrRK3HoOG9Z/TkrUp213QAIlVnvIx65l4ckFMlpnPJ0
-lo1LDXZWMQKBgFArzjZ7N5OhfdO+9zszC3MLgdRAivT7OWqR+CjujIz5FYMr8Xzl
-WfAvTUTrS9Nu6VZkObFvHrrRG+YjBsuN7YQjbQXTSFGSBwH34bgbn2fl9pMTjHQC
-50uoaL9GHa/rlBaV/YvvPQJgCi/uXa1rMX0jdNLkDULGO8IF7cu7Yf7BAoGBAIUU
-J29BkpmAst0GDs/ogTlyR18LTR0rXyHt+UUd1MGeH859TwZw80JpWWf4BmkB4DTS
-hH3gKePdJY7S65ci0XNsuRupC4DeXuorde0DtkGU2tUmr9wlX0Ynq9lcdYfMbMa4
-eK1TsxG69JwfkxlWlIWITWRiEFM3lJa7xlrUWmLhAoGAFpKWF/hn4zYg3seU9gai
-EYHKSbhxA4mRb+F0/9IlCBPMCqFrL5yftUsYIh2XFKn8+QhO97Nmk8wJSK6TzQ5t
-ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp
-8ismApXVGHpOCstzikV9W7k=
+MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 -----END PRIVATE KEY-----
 "#;

@@ -161,8 +117,7 @@ ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp
            scope: Scope::Tenant,
        };

-        // Here are tokens containing the following payload, signed using TEST_PRIV_KEY_RSA
-        // using RS512, RS384 and RS256 algorithms:
+        // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
        //
        // ```
        // {
@@ -174,21 +129,13 @@ ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp
        // }
        // ```
        //
-        // These were encoded with the online debugger at https://jwt.io
-        //
-        let encoded_rs512 = "eyJhbGciOiJSUzUxMiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.QmqfteDQmDGoxQ5EFkasbt35Lx0W0Nh63muQnYZvFq93DSh4ZbOG9Mc4yaiXZoiS5HgeKtFKv3mbWkDqjz3En06aY17hWwguBtAsGASX48lYeCPADYGlGAuaWnOnVRwe3iiOC7tvPFvwX_45S84X73sNUXyUiXv6nLdcDqVXudtNrGST_DnZDnjuUJX11w7sebtKqQQ8l9-iGHiXOl5yevpMCoB1OcTWcT6DfDtffoNuMHDC3fyhmEGG5oKAt1qBybqAIiyC9-UBAowRZXhdfxrzUl-I9jzKWvk85c5ulhVRwbPeP6TTTlPKwFzBNHg1i2U-1GONew5osQ3aoptwsA";
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

-        let encoded_rs384 = "eyJhbGciOiJSUzM4NCIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.qqk4nkxKzOJP38c_g57_w_SfdQVmCsDT_bsLmdFj_N6LIB22gr6U6_P_5mvk3pIAsp0VCTDwPrCU908TxqjibEkwvQoJwbogHamSGHpD7eJBxGblSnA-Nr3MlEMxpFtec8QokSm6C5mH7DoBYjB2xzeOlxAmpR2GAzInKiMkU4kZ_OcqqrmVcMXY_6VnbxZWMekuw56zE1-PP_qNF1HvYOH-P08ONP8qdo5UPtBG7QBEFlCqZXJZCFihQaI4Vzil9rDuZGCm3I7xQJ8-yh1PX3BTbGo8EzqLdRyBeTpr08UTuRbp_MJDWevHpP3afvJetAItqZXIoZQrbJjcByHqKw";
+        // Check it can be validated with the public key
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        assert_eq!(claims_from_token, expected_claims);

-        let encoded_rs256 = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.dF2N9KXG8ftFKHYbd5jQtXMQqv0Ej8FISGp1b_dmqOCotXj5S1y2AWjwyB_EXHM77JXfbEoJPAPrFFBNfd8cWtkCSTvpxWoHaecGzegDFGv5ZSc5AECFV1Daahc3PI3jii9wEiGkFOiwiBNfZ5INomOAsV--XXxlqIwKbTcgSYI7lrOTfecXAbAHiMKQlQYiIBSGnytRCgafhRkyGzPAL8ismthFJ9RHfeejyskht-9GbVHURw02bUyijuHEulpf9eEY3ZiB28de6jnCdU7ftIYaUMaYWt0nZQGkzxKPSfSLZNy14DTOYLDS04DVstWQPqnCUW_ojg0wJETOOfo9Zw";
-
-        // Check that RS512, RS384 and RS256 tokens can all be validated
-        let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?);
-
-        for encoded in [encoded_rs512, encoded_rs384, encoded_rs256] {
-            let claims_from_token = auth.decode(encoded)?.claims;
-            assert_eq!(claims_from_token, expected_claims);
-        }
        Ok(())
    }

@@ -199,10 +146,10 @@ ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_RSA)?;
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?);
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
        let decoded = auth.decode(&encoded)?;

        assert_eq!(decoded.claims, claims);
--- a/pageserver/src/bin/draw_layer-trace.rs
+++ b/pageserver/src/bin/draw_layer-trace.rs
@@ -0,0 +1,541 @@
+use anyhow::Result;
+use pageserver::repository::Key;
+use serde::{Deserialize, Serialize};
+use std::cmp::Ordering;
+use std::io::{self, BufRead};
+use std::{
+    collections::{BTreeMap, BTreeSet, HashMap},
+    fmt::Write,
+    ops::Range,
+};
+use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);
+
+/// Maps values to their compressed coordinate: the index the value would
+/// have in a sorted and deduplicated list of all values, multiplied by
+/// `stretch`.
+struct CoordinateMap<T: Ord + Copy> {
+    /// Value -> rank among the distinct values passed to [`CoordinateMap::new`].
+    map: BTreeMap<T, usize>,
+    /// Factor applied to the rank to obtain the output coordinate.
+    stretch: f32,
+}
+
+impl<T: Ord + Copy> CoordinateMap<T> {
+    /// Builds the mapping from every value that will later be looked up.
+    /// Duplicates in `coords` are collapsed.
+    fn new(coords: Vec<T>, stretch: f32) -> Self {
+        let distinct: BTreeSet<T> = coords.into_iter().collect();
+        // A BTreeSet iterates in ascending order, so the enumeration index
+        // is exactly the rank of the value.
+        let map = distinct
+            .into_iter()
+            .enumerate()
+            .map(|(rank, value)| (value, rank))
+            .collect();
+
+        Self { map, stretch }
+    }
+
+    /// Returns the stretched coordinate of `val`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `val` was not among the values passed to `new`.
+    fn map(&self, val: T) -> f32 {
+        let rank = *self
+            .map
+            .get(&val)
+            .expect("value was not registered in the CoordinateMap");
+        rank as f32 * self.stretch
+    }
+
+    /// Returns the number of distinct values times `stretch`, i.e. one
+    /// stretched step past the largest coordinate `map` can return.
+    fn max(&self) -> f32 {
+        self.map.len() as f32 * self.stretch
+    }
+}
+
+/// Parses a layer file name into its key range and LSN range.
+///
+/// The accepted shapes are `<key_start>-<key_end>__<lsn_start>-<lsn_end>`
+/// and `<key_start>-<key_end>__<lsn>`. In the second form the returned LSN
+/// range has `start == end`.
+///
+/// # Panics
+///
+/// Panics if the name does not have one of these shapes, or if a component
+/// is rejected by `Key::from_hex` / `Lsn::from_hex`.
+fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
+    let mut parts = name.split("__");
+    let key_part = parts.next().expect("split yields at least one item");
+    let lsn_part = parts
+        .next()
+        .expect("layer file name has no `__` separator");
+
+    let mut keys = key_part.split('-');
+    let key_start = keys.next().expect("split yields at least one item");
+    let key_end = keys
+        .next()
+        .expect("layer file name has no end of key range");
+
+    let mut lsns = lsn_part.split('-');
+    let lsn_start = lsns.next().expect("split yields at least one item");
+    // A single LSN means start and end coincide.
+    let lsn_end = lsns.next().unwrap_or(lsn_start);
+
+    let keys = Key::from_hex(key_start).expect("invalid start key in layer file name")
+        ..Key::from_hex(key_end).expect("invalid end key in layer file name");
+    let lsns = Lsn::from_hex(lsn_start).expect("invalid start LSN in layer file name")
+        ..Lsn::from_hex(lsn_end).expect("invalid end LSN in layer file name");
+    (keys, lsns)
+}
+
+/// Operation recorded on one line of the layer trace.
+///
+/// Each variant is (de)serialized as the snake_case form of its name,
+/// e.g. `CompactCreate` <-> `"compact_create"`.
+#[derive(Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "snake_case")]
+enum LayerTraceOp {
+    Evict,
+    Flush,
+    CompactCreate,
+    CompactDelete,
+    ImageCreate,
+    GcDelete,
+    GcStart,
+}
+
+/// Writes the operation as its lowercase, underscore-separated name
+/// (`"evict"`, `"compact_create"`, ...).
+impl std::fmt::Display for LayerTraceOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Evict => "evict",
+            Self::Flush => "flush",
+            Self::CompactCreate => "compact_create",
+            Self::CompactDelete => "compact_delete",
+            Self::ImageCreate => "image_create",
+            Self::GcDelete => "gc_delete",
+            Self::GcStart => "gc_start",
+        })
+    }
+}
+
+// One line of the trace input, deserialized from a JSON object.
+#[serde_with::serde_as]
+#[derive(Serialize, Deserialize)]
+struct LayerTraceLine {
+    // Timestamp of the event. `main` only uses it relative to the first
+    // line's timestamp; the unit is not visible here.
+    time: u64,
+    op: LayerTraceOp,
+    // Layer file name; the empty string when the JSON object has no
+    // `filename` field.
+    #[serde(default)]
+    filename: String,
+    // Parsed from its string form. `main` requires it on `gc_start` lines.
+    // NOTE(review): there is no explicit `#[serde(default)]` here - confirm
+    // that a missing `cutoff` field deserializes to `None` with the
+    // serde_with version in use.
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    cutoff: Option<Lsn>,
+}
+
+// A layer file mentioned in the trace, with the ranges parsed from its name.
+struct LayerTraceFile {
+    // The file name exactly as it appeared in the trace.
+    filename: String,
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
+}
+
+impl LayerTraceFile {
+    // A file whose LSN range has equal start and end is treated as an image
+    // layer; `parse_filename` produces such a range for names carrying a
+    // single LSN.
+    fn is_image(&self) -> bool {
+        self.lsn_range.start == self.lsn_range.end
+    }
+}
+
+// A non-GC trace event, kept in input order for the generated page's
+// `layer_events` array.
+struct LayerTraceEvent {
+    // Timestamp minus the timestamp of the first trace line.
+    time_rel: u64,
+    op: LayerTraceOp,
+    // Name of the layer file the operation applies to.
+    filename: String,
+}
+
+// A `gc_start` trace event, kept in input order for the generated page's
+// `gc_events` array.
+struct GcEvent {
+    // Timestamp minus the timestamp of the first trace line.
+    time_rel: u64,
+    // The `cutoff` value carried by the `gc_start` line.
+    cutoff: Lsn,
+}
+
+/// Reads a layer trace from stdin (one JSON object per line, see
+/// `LayerTraceLine`) and prints a self-contained HTML page to stdout.
+///
+/// The page contains an SVG with one element per layer file (rectangles for
+/// files with a real LSN range, thick lines for files with a single LSN),
+/// one red line per `gc_start` event, and a script with a time slider and
+/// Play/Stop buttons that shows and hides the elements as events are
+/// replayed. The number of image and delta layers is reported on stderr.
+///
+/// # Errors
+///
+/// Fails if stdin cannot be read, a line is not valid JSON for
+/// `LayerTraceLine`, a `gc_start` line has no `cutoff`, or the trace
+/// contains no layer events at all.
+fn main() -> Result<()> {
+    // Parse trace lines from stdin
+    let stdin = io::stdin();
+
+    let mut files: HashMap<String, LayerTraceFile> = HashMap::new();
+    let mut layer_events: Vec<LayerTraceEvent> = Vec::new();
+    let mut gc_events: Vec<GcEvent> = Vec::new();
+    let mut first_time: Option<u64> = None;
+    for line in stdin.lock().lines() {
+        let line = line?;
+        let parsed_line: LayerTraceLine = serde_json::from_str(&line)?;
+
+        // All times are relative to the first line of the trace.
+        let time_rel = parsed_line.time - *first_time.get_or_insert(parsed_line.time);
+
+        if parsed_line.op == LayerTraceOp::GcStart {
+            let cutoff = parsed_line
+                .cutoff
+                .ok_or_else(|| anyhow::anyhow!("gc_start event is missing its cutoff LSN"))?;
+            gc_events.push(GcEvent { time_rel, cutoff });
+        } else {
+            if !files.contains_key(&parsed_line.filename) {
+                let (key_range, lsn_range) = parse_filename(&parsed_line.filename);
+                files.insert(
+                    parsed_line.filename.clone(),
+                    LayerTraceFile {
+                        filename: parsed_line.filename.clone(),
+                        key_range,
+                        lsn_range,
+                    },
+                );
+            }
+
+            layer_events.push(LayerTraceEvent {
+                time_rel,
+                filename: parsed_line.filename,
+                op: parsed_line.op,
+            });
+        }
+    }
+
+    // The slider must be able to reach the very last event, whichever kind
+    // it is, so take the later of the last layer event and the last GC event.
+    let last_layer_time_rel = layer_events
+        .last()
+        .ok_or_else(|| anyhow::anyhow!("no layer events found in the trace on stdin"))?
+        .time_rel;
+    let last_time_rel = gc_events
+        .last()
+        .map_or(last_layer_time_rel, |gc| gc.time_rel.max(last_layer_time_rel));
+
+    // Collect all coordinates
+    let mut keys: Vec<Key> = vec![];
+    let mut lsns: Vec<Lsn> = vec![];
+    for f in files.values() {
+        keys.push(f.key_range.start);
+        keys.push(f.key_range.end);
+        lsns.push(f.lsn_range.start);
+        lsns.push(f.lsn_range.end);
+    }
+    // GC cutoffs are drawn on the LSN axis too, so they need a coordinate.
+    lsns.extend(gc_events.iter().map(|gc| gc.cutoff));
+
+    // Analyze
+    let key_map = CoordinateMap::new(keys, 2.0);
+    // Stretch out vertically for better visibility
+    let lsn_map = CoordinateMap::new(lsns, 3.0);
+
+    // Initialize stats
+    let mut num_deltas = 0;
+    let mut num_images = 0;
+
+    let mut svg = String::new();
+
+    // Draw
+    writeln!(
+        svg,
+        "{}",
+        BeginSvg {
+            w: key_map.max(),
+            h: lsn_map.max(),
+        }
+    )?;
+    let lsn_max = lsn_map.max();
+
+    // Sort the files by LSN, but so that image layers go after all delta layers
+    // The SVG is painted in the order the elements appear, and we want to draw
+    // image layers on top of the delta layers if they overlap
+    let mut files_sorted: Vec<LayerTraceFile> = files.into_values().collect();
+    files_sorted.sort_by(|a, b| {
+        a.is_image()
+            .cmp(&b.is_image())
+            .then_with(|| a.lsn_range.end.cmp(&b.lsn_range.end))
+    });
+
+    for f in &files_sorted {
+        let key_start = key_map.map(f.key_range.start);
+        let key_end = key_map.map(f.key_range.end);
+        let key_diff = key_end - key_start;
+
+        if key_start >= key_end {
+            panic!("Invalid key range {}-{}", key_start, key_end);
+        }
+
+        let lsn_start = lsn_map.map(f.lsn_range.start);
+        let lsn_end = lsn_map.map(f.lsn_range.end);
+
+        // Default look, used for delta layers. Image layers override it below
+        // with a thicker, coloured stroke so that they can be seen.
+        let mut style = Style::default();
+        style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
+        style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
+
+        // SVG y grows downwards; flip so that higher LSNs are drawn higher up.
+        let y_start = lsn_max - lsn_start;
+        let y_end = lsn_max - lsn_end;
+
+        let x_margin = 0.25;
+        let y_margin = 0.5;
+
+        match f.lsn_range.start.cmp(&f.lsn_range.end) {
+            Ordering::Less => {
+                num_deltas += 1;
+                write!(
+                    svg,
+                    r#"    <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end + y_margin,
+                    key_diff - x_margin * 2.0,
+                    y_start - y_end - y_margin * 2.0,
+                    1.0, // border_radius,
+                    style,
+                )?;
+                write!(svg, "<title>{}<br>{} - {}</title>", f.filename, lsn_end, y_end)?;
+                writeln!(svg, "</rect>")?;
+            }
+            Ordering::Equal => {
+                num_images += 1;
+                style.fill = Fill::Color(rgb(0x80, 0, 0x80));
+                style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
+                write!(
+                    svg,
+                    r#"    <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
+                    f.filename,
+                    key_start + x_margin,
+                    y_end,
+                    key_end - x_margin,
+                    y_end,
+                    style,
+                )?;
+                write!(svg, "<title>{}<br>{} - {}</title>", f.filename, lsn_end, y_end)?;
+                writeln!(svg, "</line>")?;
+            }
+            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
+        }
+    }
+
+    // One horizontal line across the whole key space per GC cutoff.
+    for (idx, gc) in gc_events.iter().enumerate() {
+        let cutoff_lsn = lsn_map.map(gc.cutoff);
+
+        let mut style = Style::default();
+        style.fill = Fill::None;
+        style.stroke = Stroke::Color(rgb(0xff, 0, 0), 0.5);
+
+        let y = lsn_max - cutoff_lsn;
+        writeln!(
+            svg,
+            r#"    <line id="gc_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}" />"#,
+            idx,
+            0,
+            y,
+            key_map.max(),
+            y,
+            style,
+        )?;
+    }
+
+    writeln!(svg, "{}", EndSvg)?;
+
+    // Render the events as the bodies of two JavaScript array literals:
+    // entries separated by ",\n" and terminated by a newline.
+    let mut layer_events_str = layer_events
+        .iter()
+        .map(|e| {
+            format!(
+                r#"  {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
+                e.time_rel, e.filename, e.op
+            )
+        })
+        .collect::<Vec<_>>()
+        .join(",\n");
+    layer_events_str.push('\n');
+
+    let mut gc_events_str = gc_events
+        .iter()
+        .map(|e| {
+            format!(
+                r#"  {{"time_rel": {}, "cutoff_lsn": "{}"}}"#,
+                e.time_rel, e.cutoff
+            )
+        })
+        .collect::<Vec<_>>()
+        .join(",\n");
+    gc_events_str.push('\n');
+
+    println!(r#"<!DOCTYPE html>
+<html>
+<head>
+<style>
+/* Keep the slider pinned at top */
+.topbar {{
+  display: block;
+  overflow: hidden;
+  background-color: lightgrey;
+  position: fixed;
+  top: 0;
+  width: 100%;
+/*  width: 500px; */
+}}
+.slidercontainer {{
+  float: left;
+  width: 50%;
+  margin-right: 200px;
+}}
+.slider {{
+  float: left;
+  width: 100%;
+}}
+.legend {{
+  width: 200px;
+  float: right;
+}}
+
+/* Main content */
+.main {{
+  margin-top: 50px; /* Add a top margin to avoid content overlay */
+}}
+</style>
+</head>
+
+  <body onload="init()">
+    <script type="text/javascript">
+
+      var layer_events = [{layer_events_str}]
+      var gc_events = [{gc_events_str}]
+
+      let ticker;
+
+      function init() {{
+          moveSlider({last_time_rel})
+          moveSlider(0)
+          moveSlider(last_slider_pos)
+      }}
+
+      function startAnimation() {{
+          ticker = setInterval(animateStep, 100);
+      }}
+      function stopAnimation() {{
+          clearInterval(ticker);
+      }}
+
+      function animateStep() {{
+          if (last_layer_event < layer_events.length - 1) {{
+              var slider = document.getElementById("time-slider");
+              let prevPos = slider.value
+              let nextEvent = last_layer_event
+              while (nextEvent < layer_events.length - 1) {{
+                  if (layer_events[nextEvent].time_rel > prevPos) {{
+                      break;
+                  }}
+                  nextEvent += 1;
+              }}
+              let nextPos = layer_events[nextEvent].time_rel
+              slider.value = nextPos
+              moveSlider(nextPos)
+          }}
+      }}
+
+      function redoLayerEvent(n, dir) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "evict":
+                  break;
+              case "flush":
+                  layer.style.visibility = "visible";
+                  break;
+              case "compact_create":
+                  layer.style.visibility = "visible";
+                  break;
+              case "image_create":
+                  layer.style.visibility = "visible";
+                  break;
+              case "compact_delete":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "gc_delete":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "gc_start":
+                  layer.style.visibility = "hidden";
+                  break;
+          }}
+      }}
+      function undoLayerEvent(n) {{
+          var layer = document.getElementById("layer_" + layer_events[n].filename);
+          switch (layer_events[n].op) {{
+              case "evict":
+                  break;
+              case "flush":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "compact_create":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "image_create":
+                  layer.style.visibility = "hidden";
+                  break;
+              case "compact_delete":
+                  layer.style.visibility = "visible";
+                  break;
+              case "gc_delete":
+                  layer.style.visibility = "visible";
+                  break;
+          }}
+      }}
+
+      function redoGcEvent(n) {{
+          var prev_gc_bar = document.getElementById("gc_" + (n - 1));
+          var new_gc_bar = document.getElementById("gc_" + n);
+
+          prev_gc_bar.style.visibility = "hidden"
+          new_gc_bar.style.visibility = "visible"
+      }}
+      function undoGcEvent(n) {{
+          var prev_gc_bar = document.getElementById("gc_" + n);
+          var new_gc_bar = document.getElementById("gc_" + (n - 1));
+
+          prev_gc_bar.style.visibility = "hidden"
+          new_gc_bar.style.visibility = "visible"
+      }}
+
+      var last_slider_pos = 0
+      var last_layer_event = 0
+      var last_gc_event = 0
+
+      var moveSlider = function(new_pos) {{
+          // The range input reports its value as a string. Convert it so that
+          // the comparisons with last_slider_pos below are numeric rather
+          // than lexicographic.
+          new_pos = Number(new_pos);
+          if (new_pos > last_slider_pos) {{
+              while (last_layer_event < layer_events.length - 1) {{
+                  if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
+                      break;
+                  }}
+                  last_layer_event += 1;
+                  redoLayerEvent(last_layer_event)
+              }}
+
+              while (last_gc_event < gc_events.length - 1) {{
+                  if (gc_events[last_gc_event + 1].time_rel > new_pos) {{
+                      break;
+                  }}
+                  last_gc_event += 1;
+                  redoGcEvent(last_gc_event)
+              }}
+
+          }}
+          if (new_pos < last_slider_pos) {{
+              while (last_layer_event > 0) {{
+                  if (layer_events[last_layer_event - 1].time_rel < new_pos) {{
+                      break;
+                  }}
+                  undoLayerEvent(last_layer_event)
+                  last_layer_event -= 1;
+              }}
+              while (last_gc_event > 0) {{
+                  if (gc_events[last_gc_event - 1].time_rel < new_pos) {{
+                      break;
+                  }}
+                  undoGcEvent(last_gc_event)
+                  last_gc_event -= 1;
+              }}
+          }}
+          last_slider_pos = new_pos;
+          document.getElementById("debug_pos").textContent=new_pos;
+          document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
+          // A trace may contain no GC events at all.
+          if (gc_events.length > 0) {{
+              document.getElementById("debug_gc_event").textContent=last_gc_event + " " + gc_events[last_gc_event].time_rel;
+          }}
+      }}
+    </script>
+
+    <div class="topbar">
+      <div class="slidercontainer">
+        <label for="time-slider">TIME</label>:
+        <input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
+
+        pos: <span id="debug_pos"></span><br>
+        event: <span id="debug_layer_event"></span><br>
+        gc: <span id="debug_gc_event"></span><br>
+      </div>
+
+      <button onclick="startAnimation()">Play</button> 
+      <button onclick="stopAnimation()">Stop</button> 
+
+      <svg class="legend">
+        <rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
+        <line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
+      </svg>
+    </div>
+
+    <div class="main">
+{svg}
+    </div>
+  </body>
+</html>
+"#);
+
+    eprintln!("num_images: {}", num_images);
+    eprintln!("num_deltas: {}", num_deltas);
+
+    Ok(())
+}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -61,6 +61,7 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";

    ///
    /// Default built-in configuration file.
@@ -89,6 +90,8 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

+#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -170,6 +173,9 @@ pub struct PageServerConf {
    pub metric_collection_endpoint: Option<Url>,
    pub synthetic_size_calculation_interval: Duration,

+    // See the corresponding metric's help string.
+    pub evictions_low_residence_duration_metric_threshold: Duration,
+
    pub test_remote_failures: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -240,6 +246,8 @@ struct PageServerConfigBuilder {
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,

+    evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,
+
    test_remote_failures: BuilderValue<u64>,

    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
@@ -293,6 +301,11 @@ impl Default for PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),

+            evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),
+
            test_remote_failures: Set(0),

            ondemand_download_behavior_treat_error_as_warn: Set(false),
@@ -408,6 +421,10 @@ impl PageServerConfigBuilder {
        self.test_remote_failures = BuilderValue::Set(fail_first);
    }

+    pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) {
+        self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
+    }
+
    pub fn ondemand_download_behavior_treat_error_as_warn(
        &mut self,
        ondemand_download_behavior_treat_error_as_warn: bool,
@@ -481,6 +498,11 @@ impl PageServerConfigBuilder {
            synthetic_size_calculation_interval: self
                .synthetic_size_calculation_interval
                .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
+            evictions_low_residence_duration_metric_threshold: self
+                .evictions_low_residence_duration_metric_threshold
+                .ok_or(anyhow!(
+                    "missing evictions_low_residence_duration_metric_threshold"
+                ))?,
            test_remote_failures: self
                .test_remote_failures
                .ok_or(anyhow!("missing test_remote_failuers"))?,
@@ -670,6 +692,7 @@ impl PageServerConf {
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
+                "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
@@ -810,6 +833,10 @@ impl PageServerConf {
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            synthetic_size_calculation_interval: Duration::from_secs(60),
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .unwrap(),
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
        }
@@ -951,6 +978,9 @@ metric_collection_interval = '222 s'
 cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'
+
+evictions_low_residence_duration_metric_threshold = '444 s'
+
 log_format = 'json'

 "#;
@@ -1005,6 +1035,9 @@ log_format = 'json'
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
+                evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                    defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
+                )?,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
            },
@@ -1056,6 +1089,7 @@ log_format = 'json'
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                synthetic_size_calculation_interval: Duration::from_secs(333),
+                evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
            },
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -351,6 +351,13 @@ paths:
        schema:
          type: string
          format: hex
+      - name: detach_ignored
+        in: query
+        required: false
+        schema:
+          type: boolean
+        description: |
+          When true, allow detaching a tenant whose state is ignored.
    post:
      description: |
        Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -185,7 +185,7 @@ fn build_timeline_info_common(
            None
        }
    };
-    let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
+    let current_physical_size = Some(timeline.layer_size_sum());
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

@@ -384,10 +384,11 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
 async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
+    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(conf, tenant_id)
+    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
        .instrument(info_span!("tenant_detach", tenant = %tenant_id))
        .await?;

@@ -451,7 +452,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum().approximate_is_ok();
+            current_physical_size += timeline.layer_size_sum();
        }

        let state = tenant.current_state();
@@ -738,6 +739,14 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        );
    }

+    if let Some(eviction_policy) = request_data.eviction_policy {
+        tenant_conf.eviction_policy = Some(
+            serde_json::from_value(eviction_policy)
+                .context("parse field `eviction_policy`")
+                .map_err(ApiError::BadRequest)?,
+        );
+    }
+
    let target_tenant_id = request_data
        .new_tenant_id
        .map(TenantId::from)
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -1,11 +1,12 @@
 use crate::repository::{key_range_size, singleton_range, Key};
 use postgres_ffi::BLCKSZ;
 use std::ops::Range;
+use tracing::debug;

 ///
 /// Represents a set of Keys, in a compact form.
 ///
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct KeySpace {
    /// Contiguous ranges of keys that belong to the key space. In key order,
    /// and with no overlap.
@@ -61,6 +62,60 @@ impl KeySpace {

        KeyPartitioning { parts }
    }
+
+    /// Add range to keyspace.
+    ///
+    /// Unlike KeySpaceAccum, it accepts key ranges in any order and overlapping ranges.
+    pub fn add_range(&mut self, range: Range<Key>) {
+        let start = range.start;
+        let mut end = range.end;
+        // Last stored range starting at or before `end`; nothing after it can overlap.
+        let mut prev_index = match self.ranges.binary_search_by_key(&end, |r| r.start) {
+            Ok(index) => index,
+            Err(0) => {
+                self.ranges.insert(0, range);
+                return;
+            }
+            Err(index) => index - 1,
+        };
+        loop {
+            let prev = &mut self.ranges[prev_index];
+            if prev.end < start {
+                // `prev` ends before the new range starts: insert right after it.
+                prev_index += 1;
+                break;
+            }
+            // two ranges overlap (or touch)
+            if prev.start <= start {
+                // combine with prev range
+                if prev.end < end {
+                    prev.end = end;
+                    debug!("Extend wanted image {}..{}", prev.start, end);
+                }
+                return;
+            }
+            // `prev` starts inside the new range: absorb it and keep scanning backwards
+            end = end.max(prev.end);
+            self.ranges.remove(prev_index);
+            if prev_index == 0 {
+                break;
+            }
+            prev_index -= 1;
+        }
+        debug!("Wanted image {}..{}", start, end);
+        self.ranges.insert(prev_index, start..end);
+    }
+
+    /// Check whether any range stored in this key space overlaps `range`.
+    ///
+    /// Ranges are half-open, so a stored range that merely starts at `range.end` does
+    /// not overlap by itself, but the one just before it still may.
+    pub fn overlaps(&self, range: &Range<Key>) -> bool {
+        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
+            Ok(0) | Err(0) => false,
+            Ok(index) | Err(index) => self.ranges[index - 1].end > range.start,
+        }
+    }
 }

 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,22 +9,18 @@ use once_cell::sync::Lazy;
 use pageserver_api::models::state;
 use utils::id::{TenantId, TimelineId};

-/// Prometheus histogram buckets (in seconds) that capture the majority of
-/// latencies in the microsecond range but also extend far enough up to distinguish
-/// "bad" from "really bad".
-fn get_buckets_for_critical_operations() -> Vec<f64> {
-    let buckets_per_digit = 5;
-    let min_exponent = -6;
-    let max_exponent = 2;
-
-    let mut buckets = vec![];
-    // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
-    // because it's more numerically stable and doesn't result in numbers like 9.999999
-    for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
-        buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
-    }
-    buckets
-}
+/// Prometheus histogram buckets (in seconds) for operations in the critical
+/// path. In other words, operations that directly affect the latency of user
+/// queries.
+///
+/// The buckets capture the majority of latencies in the microsecond and
+/// millisecond range but also extend far enough up to distinguish "bad" from
+/// "really bad".
+const CRITICAL_OP_BUCKETS: &[f64] = &[
+    0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
+    0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
+    1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
+];

 // Metrics collected on operations on the storage repository.
 const STORAGE_TIME_OPERATIONS: &[&str] = &[
@@ -55,12 +51,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+// Buckets for background operations like compaction, GC, size calculation
+const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
+
 pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
        &["operation"],
-        get_buckets_for_critical_operations(),
+        STORAGE_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -71,7 +70,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value",
        &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -90,7 +89,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
        &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -195,15 +194,101 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_evictions",
+        "Number of layers evicted from the pageserver",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_evictions_with_low_residence_duration",
+        "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
+         Residence duration is determined using the `residence_duration_data_source`.",
+        &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
+    )
+    .expect("failed to define a metric")
+});
+
+/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+#[derive(Debug)]
+pub struct EvictionsWithLowResidenceDuration {
+    data_source: &'static str,
+    threshold: Duration,
+    counter: Option<IntCounter>,
+}
+
+pub struct EvictionsWithLowResidenceDurationBuilder {
+    data_source: &'static str,
+    threshold: Duration,
+}
+
+impl EvictionsWithLowResidenceDurationBuilder {
+    pub fn new(data_source: &'static str, threshold: Duration) -> Self {
+        Self {
+            data_source,
+            threshold,
+        }
+    }
+
+    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .get_metric_with_label_values(&[
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
+            ])
+            .unwrap();
+        EvictionsWithLowResidenceDuration {
+            data_source: self.data_source,
+            threshold: self.threshold,
+            counter: Some(counter),
+        }
+    }
+}
+
+impl EvictionsWithLowResidenceDuration {
+    fn threshold_label_value(threshold: Duration) -> String {
+        threshold.as_secs().to_string()
+    }
+
+    /// Count an eviction of a layer that was resident for less than the threshold.
+    pub fn observe(&self, observed_value: Duration) {
+        if observed_value < self.threshold {
+            self.counter
+                .as_ref()
+                .expect("nobody calls this function after `remove_from_vec`")
+                .inc();
+        }
+    }
+
+    /// Unregisters the counter. Not a `Drop` impl because we need `tenant_id` and `timeline_id`.
+    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+        if self.counter.take().is_some() {
+            EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+                .remove_label_values(&[
+                    tenant_id,
+                    timeline_id,
+                    self.data_source,
+                    &Self::threshold_label_value(self.threshold),
+                ])
+                .expect("we own the metric, no-one else should remove it");
+        }
+    }
+}
+
 // Metrics collected on disk IO operations
+//
+// Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.000001, // 1 usec
-    0.00001,  // 10 usec
-    0.0001,   // 100 usec
-    0.001,    // 1 msec
-    0.01,     // 10 msec
-    0.1,      // 100 msec
-    1.0,      // 1 sec
+    0.000030, // 30 usec
+    0.001000, // 1000 usec
+    0.030,    // 30 ms
+    1.000,    // 1000 ms
 ];

 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -238,20 +323,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
    "get_db_size",
 ];

-const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
-    0.00001, // 1/100000 s
-    0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
-    0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
-    0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
-    0.1,  // 1/10 s
-];
-
 pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling",
        &["smgr_query_type", "tenant_id", "timeline_id"],
-        SMGR_QUERY_TIME_BUCKETS.into()
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -520,10 +597,16 @@ pub struct TimelineMetrics {
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
    pub persistent_bytes_written: IntCounter,
+    pub evictions: IntCounter,
+    pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration,
 }

 impl TimelineMetrics {
-    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+    pub fn new(
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
+    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
        let reconstruct_time_histo = RECONSTRUCT_TIME
@@ -560,6 +643,11 @@ impl TimelineMetrics {
        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let evictions = EVICTIONS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let evictions_with_low_residence_duration =
+            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
@@ -579,6 +667,8 @@ impl TimelineMetrics {
            current_logical_size_gauge,
            num_persistent_files_created,
            persistent_bytes_written,
+            evictions,
+            evictions_with_low_residence_duration,
        }
    }
 }
@@ -595,7 +685,9 @@ impl Drop for TimelineMetrics {
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
-
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        self.evictions_with_low_residence_duration
+            .remove(tenant_id, timeline_id);
        for op in STORAGE_TIME_OPERATIONS {
            let _ =
                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -630,7 +722,7 @@ use std::collections::HashMap;
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
-use std::time::Instant;
+use std::time::{Duration, Instant};

 pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -478,7 +478,7 @@ impl Tenant {

            let dummy_timeline = self.create_timeline_data(
                timeline_id,
-                up_to_date_metadata.clone(),
+                up_to_date_metadata,
                ancestor.clone(),
                remote_client,
            )?;
@@ -503,7 +503,7 @@ impl Tenant {
                    let broken_timeline = self
                        .create_timeline_data(
                            timeline_id,
-                            up_to_date_metadata.clone(),
+                            up_to_date_metadata,
                            ancestor.clone(),
                            None,
                        )
@@ -1142,7 +1142,7 @@ impl Tenant {
        );
        self.prepare_timeline(
            new_timeline_id,
-            new_metadata,
+            &new_metadata,
            timeline_uninit_mark,
            true,
            None,
@@ -1700,7 +1700,7 @@ impl Tenant {
    fn create_timeline_data(
        &self,
        new_timeline_id: TimelineId,
-        new_metadata: TimelineMetadata,
+        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        remote_client: Option<RemoteTimelineClient>,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -2160,13 +2160,25 @@ impl Tenant {
        let new_timeline = self
            .prepare_timeline(
                dst_id,
-                metadata,
+                &metadata,
                timeline_uninit_mark,
                false,
                Some(Arc::clone(src_timeline)),
            )?
            .initialize_with_lock(&mut timelines, true, true)?;
        drop(timelines);
+
+        // Root timeline gets its layers during creation and uploads them along with the metadata.
+        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
+        // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
+        // could get incorrect information and remove more layers than needed.
+        // See also https://github.com/neondatabase/neon/issues/3865
+        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
+            remote_client
+                .schedule_index_upload_for_metadata_update(&metadata)
+                .context("branch initial metadata upload")?;
+        }
+
        info!("branched timeline {dst_id} from {src_id} at {start_lsn}");

        Ok(new_timeline)
@@ -2229,7 +2241,7 @@ impl Tenant {
            pg_version,
        );
        let raw_timeline =
-            self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?;
+            self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?;

        let tenant_id = raw_timeline.owning_tenant.tenant_id;
        let unfinished_timeline = raw_timeline.raw_timeline()?;
@@ -2283,7 +2295,7 @@ impl Tenant {
    fn prepare_timeline(
        &self,
        new_timeline_id: TimelineId,
-        new_metadata: TimelineMetadata,
+        new_metadata: &TimelineMetadata,
        uninit_mark: TimelineUninitMark,
        init_layers: bool,
        ancestor: Option<Arc<Timeline>>,
@@ -2297,7 +2309,7 @@ impl Tenant {
                tenant_id,
                new_timeline_id,
            );
-            remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
+            remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
            Some(remote_client)
        } else {
            None
@@ -2336,17 +2348,12 @@ impl Tenant {
        &self,
        timeline_path: &Path,
        new_timeline_id: TimelineId,
-        new_metadata: TimelineMetadata,
+        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        remote_client: Option<RemoteTimelineClient>,
    ) -> anyhow::Result<Arc<Timeline>> {
        let timeline_data = self
-            .create_timeline_data(
-                new_timeline_id,
-                new_metadata.clone(),
-                ancestor,
-                remote_client,
-            )
+            .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client)
            .context("Failed to create timeline data structure")?;
        crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;

@@ -2358,7 +2365,7 @@ impl Tenant {
            self.conf,
            new_timeline_id,
            self.tenant_id,
-            &new_metadata,
+            new_metadata,
            true,
        )
        .context("Failed to create timeline metadata")?;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -315,10 +315,6 @@ pub async fn get_tenant(
        .get(&tenant_id)
        .ok_or(TenantStateError::NotFound(tenant_id))?;
    if active_only && !tenant.is_active() {
-        tracing::warn!(
-            "Tenant {tenant_id} is not active. Current state: {:?}",
-            tenant.current_state()
-        );
        Err(TenantStateError::NotActive(tenant_id))
    } else {
        Ok(Arc::clone(tenant))
@@ -350,17 +346,35 @@ pub enum TenantStateError {
 pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
+    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
-    remove_tenant_from_memory(tenant_id, async {
-        let local_tenant_directory = conf.tenant_path(&tenant_id);
+    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
+        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
        fs::remove_dir_all(&local_tenant_directory)
            .await
            .with_context(|| {
-                format!("Failed to remove local tenant directory {local_tenant_directory:?}")
+                format!("local tenant directory {local_tenant_directory:?} removal")
            })?;
        Ok(())
-    })
-    .await
+    };
+
+    let removal_result =
+        remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
+
+    // Ignored tenants are not present in memory, so the removal-from-memory operation bails for them.
+    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+    if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
+        if tenant_ignore_mark.exists() {
+            info!("Detaching an ignored tenant");
+            local_files_cleanup_operation(tenant_id)
+                .await
+                .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
+            return Ok(());
+        }
+    }
+
+    removal_result
 }

 pub async fn load_tenant(
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,7 +210,6 @@ pub use download::{is_temp_download_file, list_remote_timelines};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

-use anyhow::ensure;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
@@ -347,7 +346,7 @@ impl RemoteTimelineClient {
                .layer_metadata
                .values()
                // If we don't have the file size for the layer, don't account for it in the metric.
-                .map(|ilmd| ilmd.file_size.unwrap_or(0))
+                .map(|ilmd| ilmd.file_size)
                .sum()
        } else {
            0
@@ -420,34 +419,6 @@ impl RemoteTimelineClient {
            .await?
        };

-        // Update the metadata for given layer file. The remote index file
-        // might be missing some information for the file; this allows us
-        // to fill in the missing details.
-        if layer_metadata.file_size().is_none() {
-            let new_metadata = LayerFileMetadata::new(downloaded_size);
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-            if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
-                if upgraded.merge(&new_metadata) {
-                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                }
-                // If we don't do an index file upload inbetween here and restart,
-                // the value will go back down after pageserver restart, since we will
-                // have lost this data point.
-                // But, we upload index part fairly frequently, and restart pageserver rarely.
-                // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
-                self.metrics
-                    .remote_physical_size_gauge()
-                    .add(downloaded_size);
-            } else {
-                // The file should exist, since we just downloaded it.
-                warn!(
-                    "downloaded file {:?} not found in local copy of the index file",
-                    layer_file_name
-                );
-            }
-        }
-
        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);

@@ -550,13 +521,6 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        // The file size can be missing for files that were created before we tracked that
-        // in the metadata, but it should be present for any new files we create.
-        ensure!(
-            layer_metadata.file_size().is_some(),
-            "file size not initialized in metadata"
-        );
-
        upload_queue
            .latest_files
            .insert(layer_file_name.clone(), layer_metadata.clone());
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -21,7 +21,7 @@ use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

-use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
+use super::index::{IndexPart, LayerFileMetadata};
 use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};

 async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
@@ -113,16 +113,11 @@ pub async fn download_layer_file<'a>(
        })
        .map_err(DownloadError::Other)?;

-    match layer_metadata.file_size() {
-        Some(expected) if expected != bytes_amount => {
-            return Err(DownloadError::Other(anyhow!(
-                "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
-                temp_file_path.display()
-            )));
-        }
-        Some(_) | None => {
-            // matches, or upgrading from an earlier IndexPart version
-        }
+    let expected = layer_metadata.file_size();
+    if expected != bytes_amount {
+        return Err(DownloadError::Other(anyhow!(
+            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
+        )));
    }

    // not using sync_data because it can lose file size update
@@ -261,14 +256,12 @@ pub(super) async fn download_index_part(
    )
    .await?;

-    let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
+    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
        .with_context(|| {
            format!("Failed to deserialize index part file into file {index_part_path:?}")
        })
        .map_err(DownloadError::Other)?;

-    let index_part = index_part.remove_unclean_layer_file_names();
-
    Ok(index_part)
 }

--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -6,7 +6,6 @@ use std::collections::{HashMap, HashSet};

 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use tracing::warn;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
@@ -20,7 +19,7 @@ use utils::lsn::Lsn;
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 #[cfg_attr(test, derive(Default))]
 pub struct LayerFileMetadata {
-    file_size: Option<u64>,
+    file_size: u64,
 }

 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
@@ -33,36 +32,16 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {

 impl LayerFileMetadata {
    pub fn new(file_size: u64) -> Self {
-        LayerFileMetadata {
-            file_size: Some(file_size),
-        }
+        LayerFileMetadata { file_size }
    }

-    /// This is used to initialize the metadata for remote layers, for which
-    /// the metadata was missing from the index part file.
-    pub const MISSING: Self = LayerFileMetadata { file_size: None };
-
-    pub fn file_size(&self) -> Option<u64> {
+    pub fn file_size(&self) -> u64 {
        self.file_size
    }
-
-    /// Metadata has holes due to version upgrades. This method is called to upgrade self with the
-    /// other value.
-    ///
-    /// This is called on the possibly outdated version. Returns true if any changes
-    /// were made.
-    pub fn merge(&mut self, other: &Self) -> bool {
-        let mut changed = false;
-
-        if self.file_size != other.file_size {
-            self.file_size = other.file_size.or(self.file_size);
-            changed = true;
-        }
-
-        changed
-    }
 }

+// TODO seems like another part of the remote storage file format
+// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -71,10 +50,7 @@ impl LayerFileMetadata {
 /// remember to add a test case for the changed version.
 #[serde_as]
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
-pub struct IndexPartImpl<L>
-where
-    L: std::hash::Hash + PartialEq + Eq,
-{
+pub struct IndexPart {
    /// Debugging aid describing the version of this type.
    #[serde(default)]
    version: usize,
@@ -82,14 +58,13 @@ where
    /// Layer names, which are stored on the remote storage.
    ///
    /// Additional metadata can might exist in `layer_metadata`.
-    pub timeline_layers: HashSet<L>,
+    pub timeline_layers: HashSet<LayerFileName>,

    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
    /// that latest version stores.
-    #[serde(default = "HashMap::default")]
-    pub layer_metadata: HashMap<L, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated here for convenience.
@@ -98,101 +73,6 @@ where
    metadata_bytes: Vec<u8>,
 }

-// TODO seems like another part of the remote storage file format
-// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
-pub type IndexPart = IndexPartImpl<LayerFileName>;
-
-pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
-
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
-pub enum UncleanLayerFileName {
-    Clean(LayerFileName),
-    BackupFile(String),
-}
-
-impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        deserializer.deserialize_string(UncleanLayerFileNameVisitor)
-    }
-}
-
-struct UncleanLayerFileNameVisitor;
-
-impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
-    type Value = UncleanLayerFileName;
-
-    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(
-            formatter,
-            "a string that is a valid LayerFileName or '.old' backup file name"
-        )
-    }
-
-    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-    where
-        E: serde::de::Error,
-    {
-        let maybe_clean: Result<LayerFileName, _> = v.parse();
-        match maybe_clean {
-            Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
-            Err(e) => {
-                if v.ends_with(".old") || v == "metadata_backup" {
-                    Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
-                } else {
-                    Err(E::custom(e))
-                }
-            }
-        }
-    }
-}
-
-impl UncleanLayerFileName {
-    fn into_clean(self) -> Option<LayerFileName> {
-        match self {
-            UncleanLayerFileName::Clean(clean) => Some(clean),
-            UncleanLayerFileName::BackupFile(_) => None,
-        }
-    }
-}
-
-impl IndexPartUnclean {
-    pub fn remove_unclean_layer_file_names(self) -> IndexPart {
-        let IndexPartUnclean {
-            version,
-            timeline_layers,
-            layer_metadata,
-            disk_consistent_lsn,
-            metadata_bytes,
-        } = self;
-
-        IndexPart {
-            version,
-            timeline_layers: timeline_layers
-                .into_iter()
-                .filter_map(|unclean_file_name| match unclean_file_name {
-                    UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
-                    UncleanLayerFileName::BackupFile(backup_file_name) => {
-                        // For details see https://github.com/neondatabase/neon/issues/3024
-                        warn!(
-                            "got backup file on the remote storage, ignoring it {backup_file_name}"
-                        );
-                        None
-                    }
-                })
-                .collect(),
-            layer_metadata: layer_metadata
-                .into_iter()
-                .filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
-                .collect(),
-            disk_consistent_lsn,
-            metadata_bytes,
-        }
-    }
-}
-
 impl IndexPart {
    /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
    /// used to understand later versions.
@@ -232,7 +112,7 @@ impl IndexPart {
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
-    pub(super) file_size: Option<u64>,
+    pub(super) file_size: u64,
 }

 impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
@@ -247,27 +127,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
 mod tests {
    use super::*;

-    #[test]
-    fn v0_indexpart_is_parsed() {
-        let example = r#"{
-            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-        }"#;
-
-        let expected = IndexPart {
-            version: 0,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            layer_metadata: HashMap::default(),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-        };
-
-        let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
-        let part = part.remove_unclean_layer_file_names();
-        assert_eq!(part, expected);
-    }
-
    #[test]
    fn v1_indexpart_is_parsed() {
        let example = r#"{
@@ -287,21 +146,19 @@ mod tests {
            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: Some(25600000),
+                    file_size: 25600000,
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
-                    file_size: Some(9007199254741001),
+                    file_size: 9007199254741001,
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
        };

-        let part = serde_json::from_str::<IndexPartUnclean>(example)
-            .unwrap()
-            .remove_unclean_layer_file_names();
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
        assert_eq!(part, expected);
    }

@@ -325,20 +182,64 @@ mod tests {
            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: Some(25600000),
+                    file_size: 25600000,
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
-                    file_size: Some(9007199254741001),
+                    file_size: 9007199254741001,
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
        };

-        let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
-        let part = part.remove_unclean_layer_file_names();
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
        assert_eq!(part, expected);
    }
+
+    #[test]
+    fn empty_layers_are_parsed() {
+        let empty_layers_json = r#"{
+            "version":1,
+            "timeline_layers":[],
+            "layer_metadata":{},
+            "disk_consistent_lsn":"0/2532648",
+            "metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+        }"#;
+
+        let expected = IndexPart {
+            version: 1,
+            timeline_layers: HashSet::new(),
+            layer_metadata: HashMap::new(),
+            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
+            metadata_bytes: [
+                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
+                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
+                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
+                240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                0, 0,
+            ]
+            .to_vec(),
+        };
+
+        let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
+
+        assert_eq!(empty_layers_parsed, expected);
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -64,13 +64,9 @@ pub(super) async fn upload_timeline_layer<'a>(
        })?
        .len();

-    // FIXME: this looks bad
-    if let Some(metadata_size) = known_metadata.file_size() {
-        if metadata_size != fs_size {
-            bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
-        }
-    } else {
-        // this is a silly state we would like to avoid
+    let metadata_size = known_metadata.file_size();
+    if metadata_size != fs_size {
+        bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
    }

    let fs_size = usize::try_from(fs_size).with_context(|| {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -378,7 +378,7 @@ pub trait PersistentLayer: Layer {
    ///
    /// Should not change over the lifetime of the layer object because
    /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> Option<u64>;
+    fn file_size(&self) -> u64;

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -444,8 +444,8 @@ impl PersistentLayer for DeltaLayer {
        Ok(())
    }

-    fn file_size(&self) -> Option<u64> {
-        Some(self.file_size)
+    fn file_size(&self) -> u64 {
+        self.file_size
    }

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
@@ -456,7 +456,7 @@ impl PersistentLayer for DeltaLayer {

        HistoricLayerInfo::Delta {
            layer_file_name,
-            layer_file_size: Some(self.file_size),
+            layer_file_size: self.file_size,
            lsn_start: lsn_range.start,
            lsn_end: lsn_range.end,
            remote: false,
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -258,6 +258,15 @@ impl serde::Serialize for LayerFileName {
    }
 }

+impl<'de> serde::Deserialize<'de> for LayerFileName {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        deserializer.deserialize_string(LayerFileNameVisitor)
+    }
+}
+
 struct LayerFileNameVisitor;

 impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -258,8 +258,8 @@ impl PersistentLayer for ImageLayer {
        Ok(())
    }

-    fn file_size(&self) -> Option<u64> {
-        Some(self.file_size)
+    fn file_size(&self) -> u64 {
+        self.file_size
    }

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
@@ -268,7 +268,7 @@ impl PersistentLayer for ImageLayer {

        HistoricLayerInfo::Image {
            layer_file_name,
-            layer_file_size: Some(self.file_size),
+            layer_file_size: self.file_size,
            lsn_start: lsn_range.start,
            remote: false,
            access_stats: self.access_stats.as_api_model(reset),
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -167,7 +167,7 @@ impl PersistentLayer for RemoteLayer {
        true
    }

-    fn file_size(&self) -> Option<u64> {
+    fn file_size(&self) -> u64 {
        self.layer_metadata.file_size()
    }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2,6 +2,7 @@

 mod eviction_task;
 mod walreceiver;
+mod layer_trace;

 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::Bytes;
@@ -19,8 +20,7 @@ use tracing::*;
 use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
-use std::collections::BinaryHeap;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap};
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
@@ -115,6 +115,17 @@ pub struct Timeline {

    pub(super) layers: RwLock<LayerMap<dyn PersistentLayer>>,

+    /// Set of key ranges which should be covered by image layers to allow GC to
+    /// remove old layers. This set is created by GC, and its cutoff LSN is stored with it.
+    /// It is used by the compaction task when it checks whether a new image layer
+    /// should be created. A newly created image layer doesn't help to remove the
+    /// delta layer until it falls off the PITR horizon, so on the next GC cycle
+    /// gc_timeline may still want the new image layer to be created. To avoid
+    /// creating redundant image layers, we should check whether an image layer
+    /// already exists but is beyond the PITR horizon. This is why we need to
+    /// remember the GC cutoff LSN.
+    wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
+
    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,
@@ -216,6 +227,8 @@ pub struct Timeline {
    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

    state: watch::Sender<TimelineState>,
+
+    layer_trace_file: Mutex<Option<std::fs::File>>,
 }

 /// Internal structure to hold all data needed for logical size calculation.
@@ -312,7 +325,7 @@ impl LogicalSize {
        //                  we change the type.
        match self.initial_logical_size.get() {
            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
+                initial_size.checked_add(size_increment.try_into().unwrap())
                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
                    .map(CurrentLogicalSize::Exact)
            }
@@ -334,25 +347,6 @@ impl LogicalSize {
    }
 }

-/// Returned by [`Timeline::layer_size_sum`]
-pub enum LayerSizeSum {
-    /// The result is accurate.
-    Accurate(u64),
-    // We don't know the layer file size of one or more layers.
-    // They contribute to the sum with a value of 0.
-    // Hence, the sum is a lower bound for the actualy layer file size sum.
-    ApproximateLowerBound(u64),
-}
-
-impl LayerSizeSum {
-    pub fn approximate_is_ok(self) -> u64 {
-        match self {
-            LayerSizeSum::Accurate(v) => v,
-            LayerSizeSum::ApproximateLowerBound(v) => v,
-        }
-    }
-}
-
 pub struct WalReceiverInfo {
    pub wal_source_connconf: PgConnectionConfig,
    pub last_received_msg_lsn: Lsn,
@@ -550,20 +544,13 @@ impl Timeline {
    /// The sum of the file size of all historic layers in the layer map.
    /// This method makes no distinction between local and remote layers.
    /// Hence, the result **does not represent local filesystem usage**.
-    pub fn layer_size_sum(&self) -> LayerSizeSum {
+    pub fn layer_size_sum(&self) -> u64 {
        let layer_map = self.layers.read().unwrap();
        let mut size = 0;
-        let mut no_size_cnt = 0;
        for l in layer_map.iter_historic_layers() {
-            let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1));
-            size += l_size;
-            no_size_cnt += l_no_size;
-        }
-        if no_size_cnt == 0 {
-            LayerSizeSum::Accurate(size)
-        } else {
-            LayerSizeSum::ApproximateLowerBound(size)
+            size += l.file_size();
        }
+        size
    }

    pub fn get_resident_physical_size(&self) -> u64 {
@@ -883,6 +870,7 @@ impl Timeline {
    }

    pub fn activate(self: &Arc<Self>) {
+        self.start_layer_tracing();
        self.set_state(TimelineState::Active);
        self.launch_wal_receiver();
        self.launch_eviction_task();
@@ -1047,9 +1035,23 @@ impl Timeline {
            return Ok(false);
        }

-        let layer_file_size = local_layer
-            .file_size()
-            .expect("Local layer should have a file size");
+        let layer_file_size = local_layer.file_size();
+
+        let local_layer_mtime = local_layer
+            .local_path()
+            .expect("local layer should have a local path")
+            .metadata()
+            .context("get local layer file stat")?
+            .modified()
+            .context("get mtime of layer file")?;
+        let local_layer_residence_duration =
+            match SystemTime::now().duration_since(local_layer_mtime) {
+                Err(e) => {
+                    warn!("layer mtime is in the future: {}", e);
+                    None
+                }
+                Ok(delta) => Some(delta),
+            };

        let layer_metadata = LayerFileMetadata::new(layer_file_size);

@@ -1092,6 +1094,15 @@ impl Timeline {
                self.metrics
                    .resident_physical_size_gauge
                    .sub(layer_file_size);
+                self.trace_layer_evict(&local_layer.filename());
+
+                self.metrics.evictions.inc();
+
+                if let Some(delta) = local_layer_residence_duration {
+                    self.metrics
+                        .evictions_with_low_residence_duration
+                        .observe(delta);
+                }

                true
            }
@@ -1167,7 +1178,7 @@ impl Timeline {
    pub(super) fn new(
        conf: &'static PageServerConf,
        tenant_conf: Arc<RwLock<TenantConfOpt>>,
-        metadata: TimelineMetadata,
+        metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
        tenant_id: TenantId,
@@ -1190,6 +1201,7 @@ impl Timeline {
                tenant_id,
                pg_version,
                layers: RwLock::new(LayerMap::default()),
+                wanted_image_layers: Mutex::new(None),

                walredo_mgr,

@@ -1208,7 +1220,14 @@ impl Timeline {
                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

-                metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
+                metrics: TimelineMetrics::new(
+                    &tenant_id,
+                    &timeline_id,
+                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                        "mtime",
+                        conf.evictions_low_residence_duration_metric_threshold,
+                    ),
+                ),

                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

@@ -1245,6 +1264,8 @@ impl Timeline {
                download_all_remote_layers_task_info: RwLock::new(None),

                state,
+
+                layer_trace_file: Mutex::new(None),
            };
            result.repartition_threshold = result.get_checkpoint_distance() / 10;
            result
@@ -1483,7 +1504,12 @@ impl Timeline {
                .layer_metadata
                .get(remote_layer_name)
                .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
+                .with_context(|| {
+                    format!(
+                        "No remote layer metadata found for layer {}",
+                        remote_layer_name.file_name()
+                    )
+                })?;

            // Is the local layer's size different from the size stored in the
            // remote index file?
@@ -1499,34 +1525,27 @@ impl Timeline {
                    local_layer_path.display()
                );

-                if let Some(remote_size) = remote_layer_metadata.file_size() {
-                    let metadata = local_layer_path.metadata().with_context(|| {
-                        format!(
-                            "get file size of local layer {}",
-                            local_layer_path.display()
-                        )
-                    })?;
-                    let local_size = metadata.len();
-                    if local_size != remote_size {
-                        warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-                        if let Err(err) = rename_to_backup(&local_layer_path) {
-                            assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
-                            anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
-                        } else {
-                            self.metrics.resident_physical_size_gauge.sub(local_size);
-                            updates.remove_historic(local_layer);
-                            // fall-through to adding the remote layer
-                        }
+                let remote_size = remote_layer_metadata.file_size();
+                let metadata = local_layer_path.metadata().with_context(|| {
+                    format!(
+                        "get file size of local layer {}",
+                        local_layer_path.display()
+                    )
+                })?;
+                let local_size = metadata.len();
+                if local_size != remote_size {
+                    warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+                    if let Err(err) = rename_to_backup(&local_layer_path) {
+                        assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
+                        anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                    } else {
-                        debug!(
-                            "layer is present locally and file size matches remote, using it: {}",
-                            local_layer_path.display()
-                        );
-                        continue;
+                        self.metrics.resident_physical_size_gauge.sub(local_size);
+                        updates.remove_historic(local_layer);
+                        // fall-through to adding the remote layer
                    }
                } else {
                    debug!(
-                        "layer is present locally and remote does not have file size, using it: {}",
+                        "layer is present locally and file size matches remote, using it: {}",
                        local_layer_path.display()
                    );
                    continue;
@@ -1628,6 +1647,8 @@ impl Timeline {
            .map(|l| (l.filename(), l))
            .collect::<HashMap<_, _>>();

+        // If no writes happen, new branches do not have any layers, only the metadata file.
+        let has_local_layers = !local_layers.is_empty();
        let local_only_layers = match index_part {
            Some(index_part) => {
                info!(
@@ -1645,21 +1666,40 @@ impl Timeline {
            }
        };

-        // Are there local files that don't exist remotely? Schedule uploads for them
-        for (layer_name, layer) in &local_only_layers {
-            // XXX solve this in the type system
-            let layer_path = layer
-                .local_path()
-                .expect("local_only_layers only contains local layers");
-            let layer_size = layer_path
-                .metadata()
-                .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
-                .len();
-            info!("scheduling {layer_path:?} for upload");
-            remote_client
-                .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
+        if has_local_layers {
+            // Are there local files that don't exist remotely? Schedule uploads for them.
+            // Local timeline metadata will get uploaded to remote along with the layers.
+            for (layer_name, layer) in &local_only_layers {
+                // XXX solve this in the type system
+                let layer_path = layer
+                    .local_path()
+                    .expect("local_only_layers only contains local layers");
+                let layer_size = layer_path
+                    .metadata()
+                    .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
+                    .len();
+                info!("scheduling {layer_path:?} for upload");
+                remote_client
+                    .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
+            }
+            remote_client.schedule_index_upload_for_file_changes()?;
+        } else if index_part.is_none() {
+            // No data on the remote storage, no local layers, local metadata file.
+            //
+            // TODO https://github.com/neondatabase/neon/issues/3865
+            // Currently, console does not wait for the timeline data upload to the remote storage
+            // and considers the timeline created, expecting other pageserver nodes to work with it.
+            // Branch metadata upload could get interrupted (e.g pageserver got killed),
+            // hence any locally existing branch metadata with no remote counterpart should be uploaded,
+            // otherwise any other pageserver won't see the branch on `attach`.
+            //
+            // After the issue gets implemented, pageserver should rather remove the branch,
+            // since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
+            // no need to keep the old files.
+            remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
+        } else {
+            // Local timeline has a metadata file, remote one too, both have no layers to sync.
        }
-        remote_client.schedule_index_upload_for_file_changes()?;

        info!("Done");

@@ -1725,7 +1765,14 @@ impl Timeline {
                    .size_added_after_initial
                    .load(AtomicOrdering::Relaxed);

-                let sum = calculated_size.saturating_add_signed(added);
+                // `added` is signed and may be negative: apply it with saturation in
+                // both directions rather than unwrapping a conversion that fails (and
+                // would panic) for negative values.
+                let sum = if added >= 0 {
+                    calculated_size.saturating_add(added.unsigned_abs())
+                } else {
+                    calculated_size.saturating_sub(added.unsigned_abs())
+                };

                // set the gauge value before it can be set in `update_current_logical_size`.
                self_clone.metrics.current_logical_size_gauge.set(sum);
@@ -1953,9 +1993,7 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
-            let layer_file_size = layer
-                .file_size()
-                .expect("Local layer should have a file size");
+            let layer_file_size = layer.file_size();
            self.metrics
                .resident_physical_size_gauge
                .sub(layer_file_size);
@@ -2608,6 +2646,8 @@ impl Timeline {
            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
        ])?;

+        self.trace_layer_flush(&new_delta.filename());
+
        // Add it to the layer map
        self.layers
            .write()
@@ -2663,6 +2703,30 @@ impl Timeline {
        let layers = self.layers.read().unwrap();

        let mut max_deltas = 0;
+        let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
+        if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
+            let img_range =
+                partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
+            if wanted.overlaps(&img_range) {
+                //
+                // gc_timeline only pays attention to image layers that are older than the GC cutoff,
+                // but create_image_layers creates image layers at last-record-lsn.
+                // So it's possible that gc_timeline decides that it wants new image layer to be created for a key range,
+                // and on next compaction create_image_layers creates the image layer.
+                // But on next GC cycle, gc_timeline still wants the new image layer to be created,
+                // because the newly created image layer doesn't help to remove the delta layer,
+                // until the newly created image layer falls off the PITR horizon.
+                //
+                // So we should check if image layer beyond cutoff LSN already exists.
+                if !layers.image_layer_exists(&img_range, &(*cutoff_lsn..lsn))? {
+                    debug!(
+                        "Force generation of layer {}-{} wanted by GC)",
+                        img_range.start, img_range.end
+                    );
+                    return Ok(true);
+                }
+            }
+        }

        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn)?;
@@ -2782,6 +2846,11 @@ impl Timeline {
                image_layers.push(image_layer);
            }
        }
+        // All wanted layers are taken into account by time_for_new_image_layer.
+        // The wanted_image_layers could get updated out of turn and we could
+        // clear something which hasn't been looked at all. This is fine, because
+        // next gc round any wanted would get added back in.
+        *self.wanted_image_layers.lock().unwrap() = None;

        // Sync the new layer to disk before adding it to the layer map, to make sure
        // we don't garbage collect something based on the new layer, before it has
@@ -2818,6 +2887,7 @@ impl Timeline {
            self.metrics
                .resident_physical_size_gauge
                .add(metadata.len());
+            self.trace_layer_image_create(&l.filename());
            updates.insert_historic(Arc::new(l));
        }
        updates.flush();
@@ -3248,6 +3318,7 @@ impl Timeline {
            self.metrics
                .resident_physical_size_gauge
                .add(metadata.len());
+            self.trace_layer_compact_create(&l.filename());

            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
            let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
@@ -3258,6 +3329,7 @@ impl Timeline {
        // delete the old ones
        let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
        for l in deltas_to_compact {
+            self.trace_layer_compact_delete(&l.filename());
            layer_names_to_delete.push(l.filename());
            self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
        }
@@ -3454,6 +3526,8 @@ impl Timeline {

        info!("GC starting");

+        self.trace_gc_start(new_gc_cutoff);
+
        debug!("retain_lsns: {:?}", retain_lsns);

        // Before deleting any layers, we need to wait for their upload ops to finish.
@@ -3468,6 +3542,7 @@ impl Timeline {
        }

        let mut layers_to_remove = Vec::new();
+        let mut wanted_image_layers = KeySpace::default();

        // Scan all layers in the timeline (remote or on-disk).
        //
@@ -3551,6 +3626,15 @@ impl Timeline {
                    "keeping {} because it is the latest layer",
                    l.filename().file_name()
                );
+                // Collect delta key ranges that need image layers to allow garbage
+                // collecting the layers.
+                // It is not so obvious whether we need to propagate information only about
+                // delta layers. Image layers can form "stairs" preventing old image from being deleted.
+                // But image layers are in any case less sparse than delta layers. Also we need some
+                // protection from replacing recent image layers with new one after each GC iteration.
+                if l.is_incremental() && !LayerMap::is_l0(&*l) {
+                    wanted_image_layers.add_range(l.get_key_range());
+                }
                result.layers_not_updated += 1;
                continue 'outer;
            }
@@ -3563,6 +3647,10 @@ impl Timeline {
            );
            layers_to_remove.push(Arc::clone(&l));
        }
+        self.wanted_image_layers
+            .lock()
+            .unwrap()
+            .replace((new_gc_cutoff, wanted_image_layers));

        let mut updates = layers.batch_update();
        if !layers_to_remove.is_empty() {
@@ -3577,6 +3665,7 @@ impl Timeline {
            {
                for doomed_layer in layers_to_remove {
                    layer_names_to_delete.push(doomed_layer.filename());
+                    self.trace_layer_gc_delete(&doomed_layer.filename());
                    self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
                    result.layers_removed += 1;
                }
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -1,5 +1,18 @@
-//! The per-timeline layer eviction task.
-
+//! The per-timeline layer eviction task, which evicts data which has not been accessed for more
+//! than a given threshold.
+//!
+//! Data includes all kinds of caches, namely:
+//! - (in-memory layers)
+//! - on-demand downloaded layer files on disk
+//! - (cached layer file pages)
+//! - derived data from layer file contents, namely:
+//!     - initial logical size
+//!     - partitioning
+//!     - (other currently missing unknowns)
+//!
+//! Items with parentheses are not (yet) touched by this task.
+//!
+//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
 use std::{
    ops::ControlFlow,
    sync::Arc,
@@ -12,6 +25,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn};

 use crate::{
+    context::{DownloadBehavior, RequestContext},
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -54,9 +68,10 @@ impl Timeline {
            }
        }

+        let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
        loop {
            let policy = self.get_eviction_policy();
-            let cf = self.eviction_iteration(&policy, cancel.clone()).await;
+            let cf = self.eviction_iteration(&policy, &cancel, &ctx).await;

            match cf {
                ControlFlow::Break(()) => break,
@@ -77,7 +92,8 @@ impl Timeline {
    async fn eviction_iteration(
        self: &Arc<Self>,
        policy: &EvictionPolicy,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
        match policy {
@@ -87,7 +103,7 @@ impl Timeline {
            }
            EvictionPolicy::LayerAccessThreshold(p) => {
                let start = Instant::now();
-                match self.eviction_iteration_threshold(p, cancel).await {
+                match self.eviction_iteration_threshold(p, cancel, ctx).await {
                    ControlFlow::Break(()) => return ControlFlow::Break(()),
                    ControlFlow::Continue(()) => (),
                }
@@ -101,7 +117,8 @@ impl Timeline {
    async fn eviction_iteration_threshold(
        self: &Arc<Self>,
        p: &EvictionPolicyLayerAccessThreshold,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

@@ -114,6 +131,20 @@ impl Timeline {
            not_evictable: usize,
            skipped_for_shutdown: usize,
        }
+
+        // what we want is to invalidate any caches which haven't been accessed for `p.threshold`,
+        // but we cannot actually do it for current limitations except by restarting pageserver. we
+        // just recompute the values which would be recomputed on startup.
+        //
+        // for active tenants this will likely materialized page cache or in-memory layers. for
+        // inactive tenants it will refresh the last_access timestamps so that we will not evict
+        // and re-download on restart these layers.
+        self.refresh_layers_required_in_restart(cancel, ctx).await;
+
+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+
        let mut stats = EvictionStats::default();
        // Gather layers for eviction.
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
@@ -174,7 +205,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel)
+            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
            .await
        {
            Err(pre_err) => {
@@ -216,4 +247,40 @@ impl Timeline {
        }
        ControlFlow::Continue(())
    }
+
+    /// Recompute the values which would cause on-demand downloads during restart.
+    async fn refresh_layers_required_in_restart(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) {
+        let lsn = self.get_last_record_lsn();
+
+        // imitate on-restart initial logical size
+        let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;
+
+        match &size {
+            Ok(_size) => {
+                // good, don't log it to avoid confusion
+            }
+            Err(_) => {
+                // we have known issues for which we already log this on consumption metrics,
+                // gc, and compaction. leave logging out for now.
+                //
+                // https://github.com/neondatabase/neon/issues/2539
+            }
+        }
+
+        // imitate repartitioning on first compaction
+        if let Err(e) = self.collect_keyspace(lsn, ctx).await {
+            // if this failed, we probably failed logical size because these use the same keys
+            if size.is_err() {
+                // ignore, see above comment
+            } else {
+                warn!(
+                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
+                );
+            }
+        }
+    }
 }
--- a/pageserver/src/tenant/timeline/layer_trace.rs
+++ b/pageserver/src/tenant/timeline/layer_trace.rs
@@ -0,0 +1,102 @@
+//! Layer tracing: an append-only log of layer lifecycle events for one timeline.
+//!
+//! Every event is appended as a single-line JSON object to the `layer_trace` file
+//! in the timeline directory. Tracing is best-effort: nothing is recorded until
+//! `start_layer_tracing` has opened the file, and write errors are ignored.
+
+use crate::tenant::timeline::LayerFileName;
+use crate::tenant::Timeline;
+use std::fs::File;
+use std::io::Write;
+use std::time::UNIX_EPOCH;
+use tracing::*;
+use utils::lsn::Lsn;
+
+impl Timeline {
+    /// Opens the `layer_trace` file in the timeline directory (creating it if
+    /// needed, appending otherwise) and enables tracing for this timeline.
+    ///
+    /// If the file cannot be opened, a warning is logged and tracing stays disabled.
+    pub(super) fn start_layer_tracing(&self) {
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+
+        let path = timeline_path.join("layer_trace");
+
+        match File::options().create(true).append(true).open(&path) {
+            Ok(file) => {
+                info!("enabled layer tracing");
+                self.layer_trace_file.lock().unwrap().replace(file);
+            }
+            Err(e) => {
+                warn!(
+                    "could not open layer tracing file \"{}\": {}",
+                    path.display(),
+                    e
+                );
+            }
+        }
+    }
+
+    /// Appends one record to the trace file, if tracing is enabled.
+    ///
+    /// `fields` is the pre-formatted remainder of the JSON object that follows the
+    /// `"time"` member (milliseconds since the Unix epoch). No escaping is done, so
+    /// callers must only pass values that are safe inside a JSON string.
+    fn write_trace_record(&self, fields: std::fmt::Arguments<'_>) {
+        let trace_file = self.layer_trace_file.lock().unwrap();
+        if let Some(mut out) = trace_file.as_ref() {
+            if let Ok(elapsed) = UNIX_EPOCH.elapsed() {
+                let time = elapsed.as_millis();
+                // Tracing is best-effort: a failed write must not fail the operation.
+                let _ = writeln!(out, "{{ \"time\": {time}, {fields}}}");
+            } else {
+                warn!("could not get current timestamp");
+            }
+        }
+    }
+
+    /// Records operation `op` performed on the layer file named `filename`.
+    fn trace_op(&self, op: &str, filename: &str) {
+        self.write_trace_record(format_args!(
+            "\"op\": \"{op}\", \"filename\": \"{filename}\""
+        ))
+    }
+
+    /// Records an `evict` event for the given layer file.
+    pub(super) fn trace_layer_evict(&self, filename: &LayerFileName) {
+        self.trace_op("evict", &filename.file_name())
+    }
+
+    /// Records a `flush` event for the given layer file.
+    pub(super) fn trace_layer_flush(&self, filename: &LayerFileName) {
+        self.trace_op("flush", &filename.file_name())
+    }
+
+    /// Records a `compact_create` event for the given layer file.
+    pub(super) fn trace_layer_compact_create(&self, filename: &LayerFileName) {
+        self.trace_op("compact_create", &filename.file_name())
+    }
+
+    /// Records a `compact_delete` event for the given layer file.
+    pub(super) fn trace_layer_compact_delete(&self, filename: &LayerFileName) {
+        self.trace_op("compact_delete", &filename.file_name())
+    }
+
+    /// Records an `image_create` event for the given layer file.
+    pub(super) fn trace_layer_image_create(&self, filename: &LayerFileName) {
+        self.trace_op("image_create", &filename.file_name())
+    }
+
+    /// Records a `gc_delete` event for the given layer file.
+    pub(super) fn trace_layer_gc_delete(&self, filename: &LayerFileName) {
+        self.trace_op("gc_delete", &filename.file_name())
+    }
+
+    // TODO: also report 'retain_lsns'
+    /// Records a `gc_start` event carrying the GC cutoff LSN.
+    pub(super) fn trace_gc_start(&self, cutoff_lsn: Lsn) {
+        self.write_trace_record(format_args!(
+            "\"op\": \"gc_start\", \"cutoff\": \"{cutoff_lsn}\""
+        ))
+    }
+}
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -127,12 +127,21 @@ impl UploadQueue {

        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
        for layer_name in &index_part.timeline_layers {
-            let layer_metadata = index_part
+            match index_part
                .layer_metadata
                .get(layer_name)
                .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
-            files.insert(layer_name.to_owned(), layer_metadata);
+            {
+                Some(layer_metadata) => {
+                    files.insert(layer_name.to_owned(), layer_metadata);
+                }
+                None => {
+                    anyhow::bail!(
+                        "No remote layer metadata found for layer {}",
+                        layer_name.file_name()
+                    );
+                }
+            }
        }

        let index_part_metadata = index_part.parse_metadata()?;
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -46,8 +46,12 @@ PGconn	   *pageserver_conn = NULL;
 */
 WaitEventSet *pageserver_conn_wes = NULL;

-char	   *page_server_connstring_raw;
-char	   *safekeeper_token_env;
+/* GUCs */
+char	   *neon_timeline;
+char	   *neon_tenant;
+int32		max_cluster_size;
+char	   *page_server_connstring;
+char	   *neon_auth_token;

 int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
@@ -60,10 +64,37 @@ pageserver_connect(int elevel)
 {
 	char	   *query;
 	int			ret;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;

 	Assert(!connected);

-	pageserver_conn = PQconnectdb(page_server_connstring);
+	/*
+	 * Connect using the connection string we got from the
+	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+	 * variable was set, use that as the password.
+	 *
+	 * The connection options are parsed in the order they're given, so
+	 * when we set the password before the connection string, the
+	 * connection string can override the password from the env variable.
+	 * Seems useful, although we don't currently use that capability
+	 * anywhere.
+	 */
+	n = 0;
+	if (neon_auth_token)
+	{
+		keywords[n] = "password";
+		values[n] = neon_auth_token;
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = page_server_connstring;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	pageserver_conn = PQconnectdbParams(keywords, values, 1);

 	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
@@ -125,7 +156,7 @@ pageserver_connect(int elevel)
 		}
 	}

-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
+	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);

 	connected = true;
 	return true;
@@ -354,105 +385,6 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

-static char *
-substitute_pageserver_password(const char *page_server_connstring_raw)
-{
-	char	   *host = NULL;
-	char	   *port = NULL;
-	char	   *user = NULL;
-	char	   *auth_token = NULL;
-	char	   *err = NULL;
-	char	   *page_server_connstring = NULL;
-	PQconninfoOption *conn_options;
-	PQconninfoOption *conn_option;
-	MemoryContext oldcontext;
-
-	/*
-	 * Here we substitute password in connection string with an environment
-	 * variable. To simplify things we construct a connection string back with
-	 * only known options. In particular: host port user and password. We do
-	 * not currently use other options and constructing full connstring in an
-	 * URI shape is quite messy.
-	 */
-
-	if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0')
-		return NULL;
-
-	/* extract the auth token from the connection string */
-	conn_options = PQconninfoParse(page_server_connstring_raw, &err);
-	if (conn_options == NULL)
-	{
-		/* The error string is malloc'd, so we must free it explicitly */
-		char	   *errcopy = err ? pstrdup(err) : "out of memory";
-
-		PQfreemem(err);
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid connection string syntax: %s", errcopy)));
-	}
-
-	/*
-	 * Trying to populate pageserver connection string with auth token from
-	 * environment. We are looking for password in with placeholder value like
-	 * $ENV_VAR_NAME, so if password field is present and starts with $ we try
-	 * to fetch environment variable value and fail loudly if it is not set.
-	 */
-	for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
-	{
-		if (strcmp(conn_option->keyword, "host") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-				host = conn_option->val;
-		}
-		else if (strcmp(conn_option->keyword, "port") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-				port = conn_option->val;
-		}
-		else if (strcmp(conn_option->keyword, "user") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-				user = conn_option->val;
-		}
-		else if (strcmp(conn_option->keyword, "password") == 0)
-		{
-			if (conn_option->val != NULL && conn_option->val[0] != '\0')
-			{
-				/* ensure that this is a template */
-				if (strncmp(conn_option->val, "$", 1) != 0)
-					ereport(ERROR,
-							(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							 errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1])));
-
-				neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
-				auth_token = getenv(&conn_option->val[1]);
-				if (!auth_token)
-				{
-					ereport(ERROR,
-							(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							 errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
-				}
-				else
-				{
-					neon_log(LOG, "using auth token from environment passed via env");
-				}
-			}
-		}
-	}
-
-	/*
-	 * allocate connection string in TopMemoryContext to make sure it is not
-	 * freed
-	 */
-	oldcontext = CurrentMemoryContext;
-	MemoryContextSwitchTo(TopMemoryContext);
-	page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
-	MemoryContextSwitchTo(oldcontext);
-
-	PQconninfoFree(conn_options);
-	return page_server_connstring;
-}
-
 /*
 * Module initialization function
 */
@@ -462,21 +394,12 @@ pg_init_libpagestore(void)
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
-							   &page_server_connstring_raw,
+							   &page_server_connstring,
 							   "",
 							   PGC_POSTMASTER,
 							   0,	/* no flags required */
 							   NULL, NULL, NULL);

-    DefineCustomStringVariable("neon.safekeeper_token_env",
-                               "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
-                               NULL,
-                               &safekeeper_token_env,
-                               NULL,
-                               PGC_POSTMASTER,
-                               0,	/* no flags required */
-                               NULL, NULL, NULL);
-
 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
 							   NULL,
@@ -533,30 +456,10 @@ pg_init_libpagestore(void)
 	neon_log(PageStoreTrace, "libpagestore already loaded");
 	page_server = &api;

-	/* substitute password in pageserver_connstring */
-	page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
-
-	/* Is there more correct way to pass CustomGUC to postgres code? */
-	neon_timeline_walproposer = neon_timeline;
-	neon_tenant_walproposer = neon_tenant;
-
-	/* retrieve the token for Safekeeper, if present */
-	if (safekeeper_token_env != NULL) {
-		if (safekeeper_token_env[0] != '$') {
-			ereport(ERROR,
-					(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							errmsg("expected safekeeper auth token environment variable's name starting with $ but found: %s",
-								   safekeeper_token_env)));
-		}
-		neon_safekeeper_token_walproposer = getenv(&safekeeper_token_env[1]);
-		if (!neon_safekeeper_token_walproposer) {
-			ereport(ERROR,
-					(errcode(ERRCODE_CONNECTION_EXCEPTION),
-							errmsg("cannot get safekeeper auth token, environment variable %s is not set",
-								   &safekeeper_token_env[1])));
-		}
-		neon_log(LOG, "using safekeeper auth token from environment variable");
-	}
+	/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
+	neon_auth_token = getenv("NEON_AUTH_TOKEN");
+	if (neon_auth_token)
+		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

 	if (page_server_connstring && page_server_connstring[0])
 	{
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -51,12 +51,39 @@ walprop_status(WalProposerConn *conn)
 }

 WalProposerConn *
-walprop_connect_start(char *conninfo)
+walprop_connect_start(char *conninfo, char *password)
 {
 	WalProposerConn *conn;
 	PGconn	   *pg_conn;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;

-	pg_conn = PQconnectStart(conninfo);
+	/*
+	 * Connect using the given connection string. If the
+	 * NEON_AUTH_TOKEN environment variable was set, use that as
+	 * the password.
+	 *
+	 * The connection options are parsed in the order they're given, so
+	 * when we set the password before the connection string, the
+	 * connection string can override the password from the env variable.
+	 * Seems useful, although we don't currently use that capability
+	 * anywhere.
+	 */
+	n = 0;
+	if (password)
+	{
+		keywords[n] = "password";
+		values[n] = password;	/* the caller's argument, not the global */
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = conninfo;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	pg_conn = PQconnectStartParams(keywords, values, 1);

 	/*
 	 * Allocation of a PQconn can fail, and will return NULL. We want to fully
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -12,6 +12,11 @@
 #ifndef NEON_H
 #define NEON_H

+/* GUCs */
+extern char *neon_auth_token;
+extern char *neon_timeline;
+extern char *neon_tenant;
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -92,14 +92,6 @@ const int	SmgrTrace = DEBUG5;

 page_server_api *page_server;

-/* GUCs */
-char	   *page_server_connstring;
-
-/*with substituted password*/
-char	   *neon_timeline;
-char	   *neon_tenant;
-int32		max_cluster_size;
-
 /* unlogged relation build states */
 typedef enum
 {
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -78,10 +78,6 @@ int			wal_acceptor_reconnect_timeout;
 int			wal_acceptor_connection_timeout;
 bool		am_wal_proposer;

-char	   *neon_timeline_walproposer = NULL;
-char	   *neon_tenant_walproposer = NULL;
-char	   *neon_safekeeper_token_walproposer = NULL;
-
 #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"

 static int	n_safekeepers = 0;
@@ -514,17 +510,9 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
 			Safekeeper *sk = &safekeeper[n_safekeepers];
 			int written = 0;

-			if (neon_safekeeper_token_walproposer != NULL) {
-				written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
-								   "host=%s port=%s password=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
-								   sk->host, sk->port, neon_safekeeper_token_walproposer, neon_timeline_walproposer,
-								   neon_tenant_walproposer);
-			} else {
-				written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
-								   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
-								   sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer);
-			}
-
+			written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
+							   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
+							   sk->host, sk->port, neon_timeline, neon_tenant);
 			if (written > MAXCONNINFO || written < 0)
 				elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 		}
@@ -550,16 +538,16 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
 	greetRequest.pgVersion = PG_VERSION_NUM;
 	pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
 	greetRequest.systemId = systemId;
-	if (!neon_timeline_walproposer)
+	if (!neon_timeline)
 		elog(FATAL, "neon.timeline_id is not provided");
-	if (*neon_timeline_walproposer != '\0' &&
-		!HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16))
-		elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer);
-	if (!neon_tenant_walproposer)
+	if (*neon_timeline != '\0' &&
+		!HexDecodeString(greetRequest.timeline_id, neon_timeline, 16))
+		elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline);
+	if (!neon_tenant)
 		elog(FATAL, "neon.tenant_id is not provided");
-	if (*neon_tenant_walproposer != '\0' &&
-		!HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16))
-		elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer);
+	if (*neon_tenant != '\0' &&
+		!HexDecodeString(greetRequest.tenant_id, neon_tenant, 16))
+		elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant);

 #if PG_VERSION_NUM >= 150000
 	/* FIXME don't use hardcoded timeline id */
@@ -700,7 +688,7 @@ ResetConnection(Safekeeper *sk)
 	/*
 	 * Try to establish new connection
 	 */
-	sk->conn = walprop_connect_start((char *) &sk->conninfo);
+	sk->conn = walprop_connect_start((char *) &sk->conninfo, neon_auth_token);

 	/*
 	 * "If the result is null, then libpq has been unable to allocate a new
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -39,10 +39,6 @@ typedef struct WalProposerConn WalProposerConn;
 struct WalMessage;
 typedef struct WalMessage WalMessage;

-extern char *neon_timeline_walproposer;
-extern char *neon_tenant_walproposer;
-extern char *neon_safekeeper_token_walproposer;
-
 /* Possible return values from ReadPGAsync */
 typedef enum
 {
@@ -458,7 +454,7 @@ extern char *walprop_error_message(WalProposerConn *conn);
 extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);

 /* Re-exported PQconnectStart */
-extern WalProposerConn * walprop_connect_start(char *conninfo);
+extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);

 /* Re-exported PQconectPoll */
 extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -111,7 +111,7 @@ struct Args {
    /// WAL backup horizon.
    #[arg(long)]
    disable_wal_backup: bool,
-    /// Path to an RSA .pem public key which is used to check JWT tokens.
+    /// Path to a .pem public key which is used to check JWT tokens.
    #[arg(long)]
    auth_validation_public_key_path: Option<PathBuf>,
    /// Format for logging, either 'plain' or 'json'.
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -78,5 +78,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_created_persistent_files_total",
    "pageserver_written_persistent_bytes_total",
    "pageserver_tenant_states_count",
+    "pageserver_evictions_total",
+    "pageserver_evictions_with_low_residence_duration_total",
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
 )
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -431,7 +431,7 @@ class AuthKeys:
    priv: str

    def generate_token(self, *, scope: str, **token_data: str) -> str:
-        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="RS256")
+        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
        # cast(Any, self.priv)

        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
@@ -643,6 +643,7 @@ class NeonEnvBuilder:
            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
        )
        initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        env.initial_timeline = initial_timeline
        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")

        return env
@@ -904,6 +905,7 @@ class NeonEnv:
        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
        self.initial_tenant = config.initial_tenant
+        self.initial_timeline: Optional[TimelineId] = None

        # Create a config file corresponding to the options
        toml = textwrap.dedent(
@@ -1117,7 +1119,9 @@ def neon_env_builder(


 class PageserverApiException(Exception):
-    pass
+    def __init__(self, message, status_code: int):
+        super().__init__(message)
+        self.status_code = status_code


 class PageserverHttpClient(requests.Session):
@@ -1138,7 +1142,7 @@ class PageserverHttpClient(requests.Session):
                msg = res.json()["msg"]
            except:  # noqa: E722
                msg = ""
-            raise PageserverApiException(msg) from e
+            raise PageserverApiException(msg, res.status_code) from e

    def check_status(self):
        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
@@ -1188,8 +1192,12 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
        self.verbose_error(res)

-    def tenant_detach(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach")
+    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
+        params = {}
+        if detach_ignored:
+            params["detach_ignored"] = "true"
+
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

    def tenant_load(self, tenant_id: TenantId):
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -365,11 +365,9 @@ def check_neon_works(
    tenant_id = snapshot_config["default_tenant_id"]
    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
    pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1]
-    auth_token = snapshot_config["pageserver"]["auth_token"]
    pageserver_http = PageserverHttpClient(
        port=pageserver_port,
        is_testing_enabled_or_skip=lambda: True,  # TODO: check if testing really enabled
-        auth_token=auth_token,
    )

    shutil.rmtree(repo_dir / "local_fs_remote_storage")
--- a/test_runner/regress/test_gc_old_layers.py
+++ b/test_runner/regress/test_gc_old_layers.py
@@ -0,0 +1,67 @@
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.timeout(10000)
+def test_gc_old_layers(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that GC is able to collect all old layers even if they form
+    "stairs" and there are not three delta layers since the last image layer.
+
+    Information about the image layers needed to collect old layers should
+    be propagated by GC to the compaction task, which should take it into account
+    when deciding which new image layers need to be created.
+    """
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    tenant_id, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable default GC and compaction
+            "gc_period": "1000 m",
+            "compaction_period": "0 s",
+            "gc_horizon": f"{1024 ** 2}",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "compaction_target_size": f"{1024 ** 2}",
+            # set PITR interval to be small, so we can do GC
+            "pitr_interval": "1 s",
+            # "compaction_threshold": "3",
+            # "image_creation_threshold": "2",
+        }
+    )
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    n_steps = 10
+    n_update_iters = 100
+    step_size = 10000
+    with pg.cursor() as cur:
+        cur.execute("SET statement_timeout='1000s'")
+        cur.execute(
+            "CREATE TABLE t(pk bigint primary key, count bigint default 0, payload text default repeat(' ', 100))  with (fillfactor=50)"
+        )
+        for step in range(n_steps):
+            # Each step inserts a fresh range of step_size keys ...
+            cur.execute(
+                f"INSERT INTO t (pk) values (generate_series({step*step_size+1},{(step+1)*step_size}))"
+            )
+            # ... and then repeatedly updates a step_size-wide window of keys that starts at
+            # the previous step's range and slides forward with i, vacuuming after every update.
+            for i in range(n_update_iters):
+                cur.execute(
+                    f"UPDATE t set count=count+1 where pk BETWEEN {(step-1)*step_size+1+i*step_size//n_update_iters} AND {step*step_size+i*step_size//n_update_iters}"
+                )
+                cur.execute("vacuum t")
+
+            # cur.execute("select pg_table_size('t')")
+            # logical_size = cur.fetchone()[0]
+            logical_size = client.timeline_detail(tenant_id, timeline_id)["current_logical_size"]
+            log.info(f"Logical storage size  {logical_size}")
+            physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
+            log.info(f"Physical storage size {physical_size}")
+
+            client.timeline_checkpoint(tenant_id, timeline_id)
+
+            # Run GC first, then compaction
+            client.timeline_gc(tenant_id, timeline_id, 0)
+            client.timeline_compact(tenant_id, timeline_id)
+
+            # The sizes are only logged for inspection; nothing is asserted on them.
+            physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
+            log.info(f"Physical after GC     {physical_size}")
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -11,8 +11,10 @@ from typing import Dict, List, Tuple
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    LocalFsStorage,
    NeonEnvBuilder,
    PageserverApiException,
+    PageserverHttpClient,
    RemoteStorageKind,
    available_remote_storages,
    wait_for_last_flush_lsn,
@@ -421,23 +423,6 @@ def test_remote_timeline_client_calls_started_metric(
        )
        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)

-    def get_queued_count(file_kind, op_kind):
-        val = client.get_remote_timeline_client_metric(
-            "pageserver_remote_timeline_client_calls_unfinished",
-            tenant_id,
-            timeline_id,
-            file_kind,
-            op_kind,
-        )
-        if val is None:
-            return val
-        return int(val)
-
-    def wait_upload_queue_empty():
-        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
-        wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
-        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
-
    calls_started: Dict[Tuple[str, str], List[int]] = {
        ("layer", "upload"): [0],
        ("index", "upload"): [0],
@@ -478,7 +463,7 @@ def test_remote_timeline_client_calls_started_metric(
    # create some layers & wait for uploads to finish
    churn("a", "b")

-    wait_upload_queue_empty()
+    wait_upload_queue_empty(client, tenant_id, timeline_id)

    # ensure that we updated the calls_started metric
    fetch_calls_started()
@@ -637,4 +622,147 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
    time.sleep(10)


+# Creates a branch from the root branch without writing anything to it, so the new branch has a metadata file only.
+# Verifies that such a branch is still persisted on the remote storage and is restored when the tenant is (re)attached.
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_empty_branch_remote_storage_upload(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_empty_branch_remote_storage_upload",
+    )
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    tenant_id = env.initial_tenant
+
+    branch_name = "new_branch"
+    branch_timeline_id = env.neon_cli.create_branch(branch_name, "main", tenant_id)
+
+    with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
+        wait_for_last_flush_lsn(env, pg, tenant_id, branch_timeline_id)
+    wait_upload_queue_empty(pageserver_http, tenant_id, branch_timeline_id)
+
+    def list_timeline_ids():
+        # Timeline ids currently reported by the pageserver for the tenant, as a set.
+        return {TimelineId(t["timeline_id"]) for t in pageserver_http.timeline_list(tenant_id)}
+
+    timelines_before_detach = list_timeline_ids()
+    expected_timelines = {env.initial_timeline, branch_timeline_id}
+    assert (
+        timelines_before_detach == expected_timelines
+    ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}"
+
+    # Detach and attach again: the timeline set must survive the round trip.
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
+
+    timelines_after_detach = list_timeline_ids()
+    assert (
+        timelines_before_detach == timelines_after_detach
+    ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}"
+
+
+# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only.
+# While the pageserver is stopped, removes the branch's directory from the remote storage, then starts the
+# pageserver again — the branch should be uploaded after the restart.
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_empty_branch_remote_storage_upload_on_restart(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_empty_branch_remote_storage_upload_on_restart",
+    )
+
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    new_branch_name = "new_branch"
+    new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
+
+    with env.postgres.create_start(new_branch_name, tenant_id=env.initial_tenant) as pg:
+        wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_branch_timeline_id)
+    # The branch directory is expected to exist remotely once this returns (asserted below).
+    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+
+    # Stop the pageserver before modifying the remote storage contents.
+    env.pageserver.stop()
+
+    # Remove new branch from the remote storage
+    # (the isinstance assert guards the `.root` attribute access that follows)
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    new_branch_on_remote_storage = (
+        env.remote_storage.root
+        / "tenants"
+        / str(env.initial_tenant)
+        / "timelines"
+        / str(new_branch_timeline_id)
+    )
+    assert (
+        new_branch_on_remote_storage.is_dir()
+    ), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage"
+    shutil.rmtree(new_branch_on_remote_storage)
+
+    # Start the pageserver again; the HTTP client created above is reused for the checks below.
+    env.pageserver.start()
+
+    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert (
+        new_branch_on_remote_storage.is_dir()
+    ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'"
+
+
+def wait_upload_queue_empty(
+    client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
+):
+    # Checks, through wait_until(2, 1, ...), that the unfinished-call count is 0 for
+    # layer uploads, then index uploads, then layer deletions of the given timeline.
+    # NOTE(review): the checked callable returns a bool instead of raising; confirm
+    # that wait_until retries on a falsy result and not only on exceptions.
+    queue_kinds = (
+        ("layer", "upload"),
+        ("index", "upload"),
+        ("layer", "delete"),
+    )
+    for file_kind, op_kind in queue_kinds:
+        wait_until(
+            2,
+            1,
+            lambda fk=file_kind, ok=op_kind: get_queued_count(
+                client, tenant_id, timeline_id, file_kind=fk, op_kind=ok
+            )
+            == 0,
+        )
+
+
+def get_queued_count(
+    client: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    file_kind: str,
+    op_kind: str,
+):
+    # Reads the "calls_unfinished" remote timeline client metric for the given
+    # file kind / operation kind and converts it to int. When the client returns
+    # None for the metric, None is passed through unchanged.
+    metric_name = "pageserver_remote_timeline_client_calls_unfinished"
+    raw = client.get_remote_timeline_client_metric(
+        metric_name, tenant_id, timeline_id, file_kind, op_kind
+    )
+    return None if raw is None else int(raw)
+
+
 # TODO Test that we correctly handle GC of files that are stuck in upload queue.
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -264,9 +264,11 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    with pytest.raises(
        expected_exception=PageserverApiException,
        match=f"NotFound: tenant {tenant_id}",
-    ):
+    ) as excinfo:
        pageserver_http.tenant_detach(tenant_id)

+    assert excinfo.value.status_code == 404
+
    # the error will be printed to the log too
    env.pageserver.allowed_errors.append(".*NotFound: tenant *")

@@ -325,7 +327,91 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        pageserver_http.timeline_gc(tenant_id, timeline_id, 0)


-#
+# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
+# then with the parameter that forces detach of an ignored tenant (should not fail).
+def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    # assert tenant exists on disk
+    assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    # we rely upon autocommit after each statement
+    pg.safe_psql_many(
+        queries=[
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        ]
+    )
+
+    # ignore tenant
+    client.tenant_ignore(tenant_id)
+    # the expected failure below is also printed to the pageserver log
+    env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
+    # ensure tenant couldn't be detached without the special flag for ignored tenant
+    log.info("detaching ignored tenant WITHOUT required flag")
+    with pytest.raises(
+        expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
+    ) as excinfo:
+        client.tenant_detach(tenant_id)
+
+    # same status check as test_tenant_detach_smoke does for this error
+    assert excinfo.value.status_code == 404
+
+    log.info("tenant detached failed as expected")
+
+    # ensure tenant is detached with ignore state
+    log.info("detaching ignored tenant with required flag")
+    client.tenant_detach(tenant_id, detach_ignored=True)
+    log.info("ignored tenant detached without error")
+
+    # check that nothing is left on disk for deleted tenant
+    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    # assert the tenant does not exist in the pageserver.
+    # The ids are compared in their string form: tenant_id is a TenantId object while the
+    # listing entries come from the HTTP response, so a direct membership test could pass
+    # vacuously if the two types never compare equal.
+    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
+    assert str(tenant_id) not in tenants_after_detach, (
+        f"Ignored and then detached tenant {tenant_id} "
+        "should not be present in pageserver's memory"
+    )
+
+
+# Creates a tenant, and detaches it with the extra parameter that forces ignored tenant detach.
+# Tenant should be detached without issues.
+def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    client = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, _ = env.neon_cli.create_tenant()
+
+    # assert tenant exists on disk
+    assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    # we rely upon autocommit after each statement
+    pg.safe_psql_many(
+        queries=[
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        ]
+    )
+
+    # the tenant was never ignored; the flag must not get in the way of a regular detach
+    log.info("detaching regular tenant with detach ignored flag")
+    client.tenant_detach(tenant_id, detach_ignored=True)
+    log.info("regular tenant detached without error")
+
+    # check that nothing is left on disk for deleted tenant
+    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
+
+    # assert the tenant does not exist in the pageserver.
+    # The ids are compared in their string form: tenant_id is a TenantId object while the
+    # listing entries come from the HTTP response, so a direct membership test could pass
+    # vacuously if the two types never compare equal.
+    tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
+    assert str(tenant_id) not in tenants_after_detach, (
+        f"Detached tenant {tenant_id} "
+        "should not be present in pageserver's memory"
+    )
+
+
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_detach_while_attaching(
    neon_env_builder: NeonEnvBuilder,
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -9,7 +9,6 @@
 import asyncio
 import json
 import os
-import shutil
 from pathlib import Path
 from typing import List, Tuple

@@ -217,208 +216,9 @@ def test_tenants_attached_after_download(
    assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")


-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_tenant_upgrades_index_json_from_v0(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    # the "image" for the v0 index_part.json. the fields themselves are
-    # replaced with values read from the later version because of #2592 (initdb
-    # lsn not reproducible).
-    v0_skeleton = json.loads(
-        """{
-        "timeline_layers":[
-            "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"
-        ],
-        "missing_layers":["This should not fail as its not used anymore"],
-        "disk_consistent_lsn":"0/16960E8",
-        "metadata_bytes":[]
-    }"""
-    )
-
-    # getting a too eager compaction happening for this test would not play
-    # well with the strict assertions.
-    neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
-
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind, "test_tenant_upgrades_index_json_from_v0"
-    )
-
-    # launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
-    # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
-    env = neon_env_builder.init_start()
-
-    pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
-
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
-
-    with pg.cursor() as cur:
-        cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    # flush, wait until in remote storage
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # remove all local data for the tenant to force redownloading and subsequent upgrade
-    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
-
-    # downgrade the remote file
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r+") as timeline_file:
-        # keep the deserialized for later inspection
-        orig_index_part = json.load(timeline_file)
-
-        v0_index_part = {
-            key: orig_index_part[key]
-            for key in v0_skeleton.keys() - ["missing_layers"]  # pgserver doesn't have it anymore
-        }
-
-        timeline_file.seek(0)
-        json.dump(v0_index_part, timeline_file)
-        timeline_file.truncate(timeline_file.tell())
-
-    env.pageserver.start()
-    pageserver_http = env.pageserver.http_client()
-    pageserver_http.tenant_attach(tenant_id)
-
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
-    )
-
-    pg = env.postgres.create_start("main")
-
-    with pg.cursor() as cur:
-        cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
-    # not needed anymore
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # make sure the file has been upgraded back to how it started
-    index_part = local_fs_index_part(env, tenant_id, timeline_id)
-    assert index_part["version"] == orig_index_part["version"]
-    assert "missing_layers" not in index_part.keys()
-
-    # expect one more layer because of the forced checkpoint
-    assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1
-
-    # all of the same layer files are there, but they might be shuffled around
-    orig_layers = set(orig_index_part["timeline_layers"])
-    later_layers = set(index_part["timeline_layers"])
-    assert later_layers.issuperset(orig_layers)
-
-    added_layers = later_layers - orig_layers
-    assert len(added_layers) == 1
-
-    # all of metadata has been regenerated (currently just layer file size)
-    all_metadata_keys = set()
-    for layer in orig_layers:
-        orig_metadata = orig_index_part["layer_metadata"][layer]
-        new_metadata = index_part["layer_metadata"][layer]
-        assert (
-            orig_metadata == new_metadata
-        ), f"metadata for layer {layer} should not have changed {orig_metadata} vs. {new_metadata}"
-        all_metadata_keys |= set(orig_metadata.keys())
-
-    one_new_layer = next(iter(added_layers))
-    assert one_new_layer in index_part["layer_metadata"], "new layer should have metadata"
-
-    only_new_metadata = index_part["layer_metadata"][one_new_layer]
-
-    assert (
-        set(only_new_metadata.keys()).symmetric_difference(all_metadata_keys) == set()
-    ), "new layer metadata has same metadata as others"
-
-
 # FIXME: test index_part.json getting downgraded from imaginary new version


-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_tenant_ignores_backup_file(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    # getting a too eager compaction happening for this test would not play
-    # well with the strict assertions.
-    neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'"
-
-    neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file")
-
-    # launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
-    # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
-    env = neon_env_builder.init_start()
-
-    env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*")
-
-    pageserver_http = env.pageserver.http_client()
-    pg = env.postgres.create_start("main")
-
-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
-
-    with pg.cursor() as cur:
-        cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    # flush, wait until in remote storage
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # change the remote file to have entry with .0.old suffix
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r+") as timeline_file:
-        # keep the deserialized for later inspection
-        orig_index_part = json.load(timeline_file)
-        backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old"
-        orig_index_part["timeline_layers"].append(backup_layer_name)
-
-        timeline_file.seek(0)
-        json.dump(orig_index_part, timeline_file)
-
-    env.pageserver.start()
-    pageserver_http = env.pageserver.http_client()
-
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
-    )
-
-    pg = env.postgres.create_start("main")
-
-    with pg.cursor() as cur:
-        cur.execute("INSERT INTO t0 VALUES (234, 'test data');")
-        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
-    # not needed anymore
-    env.postgres.stop_all()
-    env.pageserver.stop()
-
-    # the .old file is gone from newly serialized index_part
-    new_index_part = local_fs_index_part(env, tenant_id, timeline_id)
-    backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"])
-    assert len(list(backup_layers)) == 0
-
-
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_tenant_redownloads_truncated_file_on_startup(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
Author	SHA1	Message	Date
Heikki Linnakangas	7eefad691c	Improve the scaling, add Play/Stop buttons	2023-03-22 19:40:54 +02:00
Heikki Linnakangas	fe59a063ea	WIP: Collect and draw layer trace	2023-03-22 19:40:54 +02:00
Heikki Linnakangas	ae8e5b3a8e	Add test from PR #3673	2023-03-22 19:40:54 +02:00
Kirill Bulatov	8bd565e09e	Ensure branches with no layers have their remote storage counterpart created eventually (#3857 ) Discovered during writing a test for https://github.com/neondatabase/neon/pull/3843	2023-03-22 17:42:31 +02:00
Joonas Koivunen	6033dfdf4a	Re-access layers before threshold eviction (#3867 ) To avoid re-downloading evicted files on restart, re-compute logical size and partitioning before each threshold based eviction run. Cc: #3802 Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-03-22 16:26:27 +02:00
mikecaat	14a40c9ca6	Fix minor things for the docker-compose file (#3862 ) * Add the REPOSITORY env to build args to avoid the following error when executing without the credentials for the repository. ``` ERROR: Service 'compute' failed to build: Head "https://369495373322.dkr.ecr.eu-central-1.amazonaws.com/v2/compute-node-v15/manifests/2221": no basic auth credentials ``` * update the tag version in the documentation to support storage broker	2023-03-22 08:10:53 +00:00
Shany Pozin	0f7de84785	Allow calling detach on ignored tenant (#3834 ) ## Describe your changes Added a query param to detach API Allow removing the local state of a tenant even if it's not in memory (following ignore API) ## Issue ticket number and link #3828 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --------- Co-authored-by: Kirill Bulatov <kirill@neon.tech>	2023-03-22 07:17:00 +00:00
Kirill Bulatov	dd22c87100	Remove older layer metadata format support code (#3854 ) The PR enforces current newest `index_part.json` format in the type system (version `1`), not allowing any previous forms of it, that were used in the past. Similarly, the code to mitigate the https://github.com/neondatabase/neon/issues/3024 issue is now also removed. Current code does not produce old formats and extra files in the index_part.json, in the future we will be able to use https://github.com/neondatabase/aversion or other approach to make version transitions more explicit. See https://neondb.slack.com/archives/C033RQ5SPDH/p1679134185248119 for the justification on the breaking changes.	2023-03-21 23:33:28 +02:00
Heikki Linnakangas	6fdd9c10d1	Read storage auth token from spec file. We read the pageserver connection string from the spec file, so let's read the auth token from the same place. We've been talking about pre-launching compute nodes that are not associated with any particular tenant at startup, so that the spec file is delivered to the compute node later. We cannot change the env variables after the process has been launched. We still pass the token to 'postgres' binary in the NEON_AUTH_TOKEN env variable, but compute_ctl is now responsible for setting it.	2023-03-21 20:12:09 +02:00
Dmitry Rodionov	4158e24e60	rfc: delete pageserver data from s3 (#3792 ) [Rendered](https://github.com/neondatabase/neon/blob/main/docs/rfcs/022-pageserver-delete-from-s3.md) --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-03-21 20:03:27 +02:00
Shany Pozin	809acb5fa9	Move neon-image-depot to a larger runner (#3860 ) ## Describe your changes https://neondb.slack.com/archives/C039YKBRZB4/p1679413279637059 ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.	2023-03-21 19:32:36 +02:00
Heikki Linnakangas	299db9d028	Simplify and clean up the $NEON_AUTH_TOKEN stuff in compute - Remove the neon.safekeeper_token_env GUC. It was used to set the name of an environment variable, which was then used in pageserver and safekeeper connection strings in place of the password. Instead, always look up the environment variable called NEON_AUTH_TOKEN. That's what neon.safekeeper_token_env was always set to in practice, and I don't see the need for the extra level of indirection or configurability. - Instead of substituting $NEON_AUTH_TOKEN in the connection strings, pass $NEON_AUTH_TOKEN "out-of-band" as the password, when we connect to the pageserver or safekeepers. That's simpler. - Also use the password from $NEON_AUTH_TOKEN in compute_ctl, when it connects to the pageserver to get the "base backup".	2023-03-21 00:15:04 +02:00
Heikki Linnakangas	5a786fab4f	Remove duplicated global variables in neon extension. Walproposer used to live in the backend, while pagestore_smgr was an extension. But now that both are part of the neon extension, walproposer can access the same 'neon_tenant' and 'neon_timeline' variables as the pageserver_smgr code.	2023-03-21 00:15:04 +02:00
Arseny Sher	699f200811	Send error context chain to the client when Copy stream errors.	2023-03-21 01:22:02 +04:00
Christian Schwarz	881356c417	add metrics to detect eviction-induced thrashing (#3837 ) This patch adds two metrics that will enable us to detect thrashing of layers, i.e., repetitions of `eviction, on-demand-download, eviction, ... ` for a given layer. The first metric counts all layer evictions per timeline. It requires no further explanation. The second metric counts the layer evictions where the layer was resident for less than a given threshold. We can alert on increments to the second metric. The first metric will serve as a baseline, and further, it's generally interesting, outside of thrashing. The second metric's threshold is configurable in PageServerConf and defaults to 24h. The threshold value is reproduced as a label in the metric because the counter's value is semantically tied to that threshold. Since changes to the config and hence the label value are infrequent, this will have low storage overhead in the metrics storage. The data source to determine the time that the layer was resident is the file's `mtime`. Using `mtime` is more of a crutch. It would be better if Pageserver did its own persistent bookkeeping of residence change events instead of relying on the filesystem. We had some discussion about this: https://github.com/neondatabase/neon/pull/3809#issuecomment-1470448900 My position is that `mtime` is good enough for now. It can theoretically jump forward if someone copies files without resetting `mtime`. But that shouldn't happen in practice. Note that moving files back and forth doesn't change `mtime`, nor does `chown` or `chmod`. Lastly, `rsync -a`, which is typically used for filesystem-level backup / restore, correctly syncs `mtime`. I've added a label that identifies the data source to keep options open for a future, better data source than `mtime`. Since this value will stay the same for the time being, it's not a problem for metrics storage. refs https://github.com/neondatabase/neon/issues/3728	2023-03-20 16:11:36 +01:00
Heikki Linnakangas	fea4b5f551	Switch to EdDSA algorithm for the storage JWT authentication tokens. The control plane currently only supports EdDSA. We need to either teach the storage to use EdDSA, or the control plane to use RSA. EdDSA is more modern, so let's use that. We could support both, but it would require a little more code and tests, and we don't really need the flexibility since we control both sides.	2023-03-20 16:28:01 +02:00
Heikki Linnakangas	77107607f3	Allow JWT key generation to fail if authentication is not enabled. This allows you to run without the 'openssl' binary as long as you don't enable authentication. This becomes more important with the next commit, which switches the JWT algorithm to EdDSA. LibreSSL does not support EdDSA, and LibreSSL comes with macOS, so the next commit makes it much more likely for the key generation to fail for macOS users. To allow running without a keypair, don't generate the authentication token in the 'neon_local init' step. Instead, generate a new token on every request that needs one, using the private key.	2023-03-20 16:28:01 +02:00
Heikki Linnakangas	1da963b2f9	Remove some unused code in control plane.	2023-03-20 16:28:01 +02:00
Heikki Linnakangas	1ddb9249aa	Reduce the # of histogram buckets in metrics. (#3850 ) Shrinks the total number of metrics collected for each timeline by about 50%. See https://github.com/neondatabase/neon/issues/2848. This doesn't fully solve the problem, we still collect a lot of metrics even with this, but this gives us a lot of headroom.	2023-03-20 15:49:16 +02:00
Joonas Koivunen	0c1228c37a	feat: store initial timeline in env fixture (#3839 ) minor change, but will allow more use in future for the default tenants. Co-authored-by: Alexander Bayandin <alexander@neon.tech>	2023-03-20 11:57:27 +02:00
Christian Schwarz	3c15874c48	allow specifying eviction_policy in TenantCreateRequest This was an oversight from `175a577ad4`. Nothing uses this AFAIK, but, let's fix it anyways. Noticed while working on https://github.com/neondatabase/neon/issues/3728	2023-03-20 10:43:53 +01:00