Compare commits

..

13 Commits

Author SHA1 Message Date
Heikki Linnakangas
8dc1c48605 Fix race condition in XLogWaitForReplayOf()
ConditionVariablePrepareToSleep has this comment on it:

> * Caution: "before entering the loop" means you *must* test the exit
> * condition between calling ConditionVariablePrepareToSleep and calling
> * ConditionVariableSleep.  If that is inconvenient, omit calling
> * ConditionVariablePrepareToSleep.

We were not obeying that: we did not test the exit condition correctly
between the ConditionVariablePrepareToSleep and ConditionVariableSleep
calls, because the test that we had in between them only checked the
local 'replayRecPtr' variable, without updating it from shared memory.

That wasn't too serious, because the loop includes a 10-second
timeout, and would retry and succeed if the original update was
missed. Still, better to fix it.

To fix, just remove the ConditionVariablePrepareToSleep() call. As the
comment says, that's also correct, and even more efficient if we
assume that sleeping is rare.
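
For illustration, a minimal sketch of the corrected pattern in Rust terms, using `std::sync::Condvar` as a stand-in for PostgreSQL's ConditionVariable (this is not the actual `XLogWaitForReplayOf()` code): the exit condition is re-tested against shared state after registering to sleep, so the wakeup cannot be missed.

```rust
use std::sync::{Condvar, Mutex};

// Sketch only: wait until the shared "replayed up to" value reaches `target`.
// The predicate is re-checked against the *shared* value on every iteration;
// checking a stale local copy between prepare-to-sleep and sleep is exactly
// the race the commit describes.
fn wait_for_replay(replayed_lsn: &Mutex<u64>, cv: &Condvar, target: u64) {
    let mut replayed = replayed_lsn.lock().unwrap();
    while *replayed < target {
        replayed = cv.wait(replayed).unwrap();
    }
}
```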
2024-05-19 21:07:29 +03:00
Alex Chi Z
e1a9669d05 feat(pagebench): add aux file bench (#7746)
part of https://github.com/neondatabase/neon/issues/7462

## Summary of changes

This pull request adds two APIs to the pageserver management API:
list_aux_files and ingest_aux_files. The aux file pagebench is intended
to be used on an empty timeline because the data do not go through the
safekeeper. LSNs are advanced by 8 for each ingestion, to avoid
invariant checks inside the pageserver.

For now, I only care about space amplification / read amplification, so
the bench is designed in a very simple way: ingest 10,000 files, then
manually dump the layer map for analysis.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-17 20:04:02 +00:00
Alex Chi Z
aaf60819fa feat(pageserver): persist aux file policy in index part (#7668)
Part of https://github.com/neondatabase/neon/issues/7462

## Summary of changes

Tenant config is not persisted unless the tenant is attached through the storage
controller. In this pull request, we persist the aux file policy flag in
`index_part.json`.

Admins can set `switch_aux_file_policy` in the storage controller or
via the pageserver API. When the first aux file is written, the
write path compares the target aux file policy with the current
policy. If the switch is allowed, it is performed; otherwise, the
original policy is kept. The test cases show what admins can and
cannot do.

The `last_aux_file_policy` is stored in `IndexPart`. Updates to the
persisted policy are done via
`schedule_index_upload_for_aux_file_policy_update`. On the write path,
the writer will update the field.
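
A condensed sketch of that write-path decision, using the names that appear in the diffs later on this page (`is_valid_migration_path`, `default_tenant_config`); simplified for illustration, not the exact pageserver code:

```rust
use pageserver_api::models::AuxFilePolicy;

// If the wanted switch is a valid migration from the current persisted policy,
// adopt it (and schedule an index_part.json upload); otherwise keep what we have.
fn effective_policy(current: Option<AuxFilePolicy>, wanted: AuxFilePolicy) -> AuxFilePolicy {
    if AuxFilePolicy::is_valid_migration_path(current, wanted) {
        wanted
    } else {
        current.unwrap_or(AuxFilePolicy::default_tenant_config())
    }
}
```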

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2024-05-17 19:22:49 +00:00
John Spray
c84656a53e pageserver: implement auto-splitting (#7681)
## Problem

Currently tenants are only split into multiple shards if a human being
calls the API to do it.

Issue: #7388 

## Summary of changes

- Add a pageserver API for returning the top tenants by size
- Add a step to the controller's background loop so that, when there is no
reconciliation or optimization to be done, it looks for tenants to split
(see the sketch below).
- Add a test that runs pgbench on many tenants concurrently, and checks
that splitting happens as expected as tenants grow, without interrupting
client I/O.

This PR is quite basic: there is a tasklist in
https://github.com/neondatabase/neon/issues/7388 for further work. This
PR is meant to be safe (off by default), and sufficient to enable our
staging environment to run lots of sharded tenants without a human
having to set them up.
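
A rough sketch of the split-selection step described above; the struct and the threshold/field names are made up for illustration and are not the controller's actual API:

```rust
// Hypothetical shapes, for illustration only.
struct TopTenant {
    tenant_id: String,
    resident_size: u64,
    shard_count: u8,
}

// When the controller has no reconciliation or optimization work left, pick the
// largest single-sharded tenant above a size threshold as a split candidate.
fn pick_split_candidate(
    idle: bool,
    top_tenants_by_size: &[TopTenant],
    split_threshold: u64,
) -> Option<&TopTenant> {
    if !idle {
        return None;
    }
    top_tenants_by_size
        .iter()
        .filter(|t| t.shard_count == 1 && t.resident_size > split_threshold)
        .max_by_key(|t| t.resident_size)
}
```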
2024-05-17 16:01:24 +00:00
John Spray
af99c959ef storage controller: use SERIALIZABLE isolation level (#7792)
## Problem

The storage controller generally assumes that things like updating
generation numbers are atomic: it should use a strict isolation level.

## Summary of changes

- Wrap all database operations in a SERIALIZABLE transaction.
- Retry serialization failures, as these do not indicate problems and
are normal when plenty of concurrent work is happening.

Using this isolation level for all reads is overkill, but much simpler
than reasoning about it on a per-operation basis, and does not hurt
performance.

Tested this with a modified version of the storage_controller_many_tenants
test, with 128k shards, to check that our performance is still fine: it
is.
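
A minimal sketch of the retry pattern, with a hypothetical error type (the real controller code presumably goes through its database layer rather than a standalone helper like this):

```rust
// SQLSTATE 40001 is Postgres' serialization_failure; under SERIALIZABLE with
// concurrent work these are expected, so the whole transaction is simply retried.
#[derive(Debug)]
struct DbError {
    sqlstate: String,
}

impl DbError {
    fn is_serialization_failure(&self) -> bool {
        self.sqlstate == "40001"
    }
}

fn with_serializable_retries<T>(
    mut run_txn: impl FnMut() -> Result<T, DbError>,
) -> Result<T, DbError> {
    loop {
        match run_txn() {
            Err(e) if e.is_serialization_failure() => continue,
            other => return other,
        }
    }
}
```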
2024-05-17 16:44:33 +01:00
John Spray
a8e6d259cb pageserver: fixes for layer path changes (#7786)
## Problem

- When a layer with the legacy local path format was evicted and then
re-downloaded, a panic occurred because the path that remote
storage downloaded to didn't match the path stored in `Layer`.
- While investigating, I also realized that secondary locations would
have a similar issue with evictions.

Closes: #7783 

## Summary of changes

- Make the remote timeline client take local paths as an input: it should
not have its own ideas about local paths; instead it just uses the layer
path that the `Layer` has.
- Make the secondary state store an explicit local path, populated on a scan
of the local disk at startup. This provides the same behavior as for `Layer`:
our local_layer_path is a _default_, but the actual layer path can
be anything (e.g. an old-style one).
- Add tests for both cases.
2024-05-17 13:24:03 +01:00
Joonas Koivunen
c1390bfc3b chore: update defaults for timeline_detach_ancestor (#7779)
By having 100 copy operations in flight we climb up to 2500 requests
per minute, or about 41/s. This is still probably less than what is allowed,
but fast enough for our purposes.
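
As a back-of-the-envelope check (an inference, not something measured in the PR): 2500 requests per minute is 2500 / 60 ≈ 41.7 requests per second, and with 100 copy operations kept in flight that implies, by Little's law, an average of roughly 100 / 41.7 ≈ 2.4 s per copy operation.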
2024-05-17 12:25:01 +02:00
Christian Schwarz
6d951e69d6 test_suite: patch, don't replace, the tenant_config field, where appropriate (#7771)
Before this PR, the changed tests would overwrite the entire
`tenant_config` because `pageserver_config_override` is merged
non-recursively into the `ps_cfg`.

This meant they would override the
`PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM`, impacting our
matrix build for `compaction_algorithm=Tiered|Legacy` in
https://github.com/neondatabase/neon/pull/7748.

I found the tests fixed in this PR using the
`NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM` env var that
I added in #7748. Therefore, I think this is an exhaustive fix. This is
better than just searching the code base for `tenant_config`, which is
what I had sketched in #7747.

refs #7749
2024-05-17 12:24:02 +02:00
Arpad Müller
4b8809b280 Tiered compaction: improvements to the windows (#7787)
Tiered compaction employs two sliding windows over the keyspace:
`KeyspaceWindow` for the image layer generation and `Window` for the
delta layer generation. Do some fixes to both windows:

* The distinction between the two windows is not very clear. Do the
absolute minimum and mention where each is used in the rustdoc
description of its struct. Maybe we should rename them (say
`WindowForImage` and `WindowForDelta`) or merge them into one window
implementation.
* Require the keys to strictly increase. `accum_key_values` already
combines repeated keys, so no logic is needed in `Window::feed` for the
same key repeating. This is a follow-up to address the request in
https://github.com/neondatabase/neon/pull/7671#pullrequestreview-2051995541
* In `choose_next_delta`, the comment claimed we use 1.25 as the stretch
factor, but the code used `* 5 / 3` (≈1.66). Fix the discrepancy by changing
the computation to `* 5 / 4` (= 1.25).
2024-05-16 22:25:19 +02:00
Arpad Müller
4c5afb7b10 Remove SSO_ACCOUNT_ID from scrubber docs and BucketConfig (#7774)
As of #6202 we support `AWS_PROFILE` as well, which is more convenient.
Change the docs to use it instead of `SSO_ACCOUNT_ID`. Also, remove
`SSO_ACCOUNT_ID` from `BucketConfig`, as it is confusing to the code's
readers: it's no longer the "main" way of setting up authentication for the
scrubber.

This is a breaking change for the on-disk format, as we persist `sso_account_id` to disk,
but it was quite inconsistent with the other methods, which are not persisted. Also,
I don't think we want to support the case where one version writes the JSON and
another version reads it.

Related: #7667
2024-05-16 19:35:13 +02:00
Arpad Müller
ec069dc45e tiered compaction: introduce PAGE_SZ constant and use it (#7785)
Pointed out by @problame: we used the literal 8192 instead of a properly
defined constant. Replace the literal with a `PAGE_SZ` constant.
2024-05-16 16:48:49 +02:00
Conrad Ludgate
790c05d675 proxy: swap tungstenite for a simpler impl (#7353)
## Problem

I wanted to do a deep dive into the tungstenite codebase.
tokio-tungstenite is incredibly convoluted... In my searching I found
[fastwebsockets by deno](https://github.com/denoland/fastwebsockets),
but it wasn't quite sufficient.

This also removes the default 16MB/64MB frame/message size limitation.
framed-websockets solves this by inserting continuation frames for
partially received messages, so the whole message does not need to be
entirely read into memory.

## Summary of changes

I took the fastwebsockets code as a starting point and rewrote it to
be simpler, server-only, and poll-based to support our Read/Write
wrappers.

I have replaced our tungstenite code with my framed-websockets fork.

<https://github.com/neondatabase/framed-websockets>
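
As background on why continuation frames avoid buffering whole messages (generic RFC 6455 behaviour, not framed-websockets' actual API), a small sketch of how a large payload maps onto an initial data frame plus continuation frames, with FIN set only on the last one:

```rust
// Sketch: plan frames for a payload of `total_len` bytes, at most `max_frame`
// bytes each. Only the first frame carries the data opcode (0x2 = binary); the
// rest are continuation frames (0x0), and only the final frame has FIN set, so
// a peer can process the message chunk by chunk without holding all of it.
fn frame_plan(total_len: usize, max_frame: usize) -> Vec<(u8, bool)> {
    assert!(max_frame > 0);
    let n = ((total_len + max_frame - 1) / max_frame).max(1);
    (0..n)
        .map(|i| {
            let opcode = if i == 0 { 0x2 } else { 0x0 };
            let fin = i == n - 1;
            (opcode, fin)
        })
        .collect()
}
```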
2024-05-16 13:05:50 +02:00
Andrew Rudenko
923cf91aa4 compute_ctl: catalog API endpoints (#7575)
## Problem

There are two cloud features that require extra compute endpoints.

1. We are running pg_dump to get DB schemas. Currently, we use a
special service for this, but it would be great to execute pg_dump in an
isolated environment. We already have such an environment: our
compute! And pg_dump most likely already exists there too! (see
https://github.com/neondatabase/cloud/issues/11644#issuecomment-2084617832)
2. We need to have a way to get databases and roles from compute after
time travel (see https://github.com/neondatabase/cloud/issues/12109)

## Summary of changes

This PR adds two API endpoints to the compute_ctl HTTP API, targeting both of
the aforementioned cases.
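
A hedged usage sketch against the two new endpoints (the paths and the `database` query parameter match the routes and OpenAPI additions shown in the diffs below; the base URL is an assumption):

```rust
// Fetch the catalog objects and a schema dump from compute_ctl's HTTP API.
async fn fetch_catalog(base: &str) -> reqwest::Result<()> {
    let dbs_and_roles = reqwest::get(format!("{base}/dbs_and_roles")).await?.text().await?;
    let schema = reqwest::get(format!("{base}/database_schema?database=postgres"))
        .await?
        .text()
        .await?;
    println!("{dbs_and_roles}\n{schema}");
    Ok(())
}
```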

---------

Co-authored-by: Tristan Partin <tristan@neon.tech>
2024-05-16 12:04:16 +02:00
60 changed files with 1731 additions and 463 deletions

Cargo.lock generated
View File

@@ -708,7 +708,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite 0.20.0",
"tokio-tungstenite",
"tower",
"tower-layer",
"tower-service",
@@ -979,6 +979,12 @@ version = "3.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
[[package]]
name = "byteorder"
version = "1.4.3"
@@ -1233,8 +1239,10 @@ dependencies = [
"serde_json",
"signal-hook",
"tar",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"toml_edit",
"tracing",
@@ -1596,7 +1604,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
dependencies = [
"cfg-if",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
@@ -1997,6 +2005,27 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "framed-websockets"
version = "0.1.0"
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
dependencies = [
"base64 0.21.1",
"bytemuck",
"bytes",
"futures-core",
"futures-sink",
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project",
"rand 0.8.5",
"sha1",
"thiserror",
"tokio",
"tokio-util",
]
[[package]]
name = "fs2"
version = "0.4.3"
@@ -2275,9 +2304,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.14.0"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash",
"allocator-api2",
@@ -2285,11 +2314,11 @@ dependencies = [
[[package]]
name = "hashlink"
version = "0.8.4"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
dependencies = [
"hashbrown 0.14.0",
"hashbrown 0.14.5",
]
[[package]]
@@ -2598,21 +2627,6 @@ dependencies = [
"tokio-native-tls",
]
[[package]]
name = "hyper-tungstenite"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
dependencies = [
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project-lite",
"tokio",
"tokio-tungstenite 0.21.0",
"tungstenite 0.21.0",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
@@ -2690,7 +2704,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
dependencies = [
"equivalent",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
]
[[package]]
@@ -2952,7 +2966,7 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
dependencies = [
"hashbrown 0.14.0",
"hashbrown 0.14.5",
]
[[package]]
@@ -3005,7 +3019,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
dependencies = [
"bytes",
"crossbeam-utils",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
"itoa",
"lasso",
"measured-derive",
@@ -3567,7 +3581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
dependencies = [
"dlv-list",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
]
[[package]]
@@ -3894,7 +3908,7 @@ dependencies = [
"ahash",
"bytes",
"chrono",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
"num",
"num-bigint",
"paste",
@@ -4378,9 +4392,10 @@ dependencies = [
"dashmap",
"env_logger",
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.13.2",
"hashbrown 0.14.5",
"hashlink",
"hex",
"hmac",
@@ -4390,7 +4405,6 @@ dependencies = [
"humantime",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"indexmap 2.0.1",
"ipnet",
@@ -4435,7 +4449,6 @@ dependencies = [
"smol_str",
"socket2 0.5.5",
"subtle",
"sync_wrapper",
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
@@ -4444,6 +4457,7 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-tungstenite",
"tokio-util",
"tower-service",
"tracing",
@@ -6380,19 +6394,7 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.20.1",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
"tungstenite",
]
[[package]]
@@ -6406,7 +6408,7 @@ dependencies = [
"futures-io",
"futures-sink",
"futures-util",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
"pin-project-lite",
"tokio",
"tracing",
@@ -6688,25 +6690,6 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -7502,7 +7485,7 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.0",
"hashbrown 0.14.5",
"hex",
"hmac",
"hyper 0.14.26",

View File

@@ -81,13 +81,14 @@ enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.13"
hashlink = "0.8.4"
hashbrown = "0.14"
hashlink = "0.9.1"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
@@ -98,7 +99,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
tokio-tungstenite = "0.20.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"

View File

@@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tokio-stream.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
compute_api.workspace = true

View File

@@ -0,0 +1,116 @@
use compute_api::{
responses::CatalogObjects,
spec::{Database, Role},
};
use futures::Stream;
use postgres::{Client, NoTls};
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
use tokio::{
io::{AsyncBufReadExt, BufReader},
process::Command,
task,
};
use tokio_stream::{self as stream, StreamExt};
use tokio_util::codec::{BytesCodec, FramedRead};
use tracing::warn;
use crate::{
compute::ComputeNode,
pg_helpers::{get_existing_dbs, get_existing_roles},
};
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
let connstr = compute.connstr.clone();
task::spawn_blocking(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
let roles: Vec<Role>;
{
let mut xact = client.transaction()?;
roles = get_existing_roles(&mut xact)?;
}
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
Ok(CatalogObjects { roles, databases })
})
.await?
}
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
#[error("Database does not exist.")]
DatabaseDoesNotExist,
#[error("Failed to execute pg_dump.")]
IO(#[from] std::io::Error),
}
// Uses the pg_dump utility to dump the schema of the specified database.
// The output is streamed back to the caller and is intended to be streamed onward over HTTP.
//
// Before returning the result with the output, it checks that pg_dump produced any output.
// If not, it tries to parse the stderr output to determine whether the database does not
// exist, in which case a special error is returned.
//
// To make sure that the process is killed when the caller drops the stream, we use tokio's kill_on_drop feature.
pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
let pgbin = &compute.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
let mut connstr = compute.connstr.clone();
connstr.set_path(dbname);
let mut cmd = Command::new(pgdump)
.arg("--schema-only")
.arg(connstr.as_str())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.kill_on_drop(true)
.spawn()?;
let stdout = cmd.stdout.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
})?;
let stderr = cmd.stderr.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
})?;
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
let stderr_reader = BufReader::new(stderr);
let first_chunk = match stdout_reader.next().await {
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
Some(Err(e)) => {
return Err(SchemaDumpError::IO(e));
}
_ => {
let mut lines = stderr_reader.lines();
if let Some(line) = lines.next_line().await? {
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
return Err(SchemaDumpError::DatabaseDoesNotExist);
}
warn!("pg_dump stderr: {}", line)
}
tokio::spawn(async move {
while let Ok(Some(line)) = lines.next_line().await {
warn!("pg_dump stderr: {}", line)
}
});
return Err(SchemaDumpError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
"failed to start pg_dump",
)));
}
};
let initial_stream = stream::once(Ok(first_chunk.freeze()));
// Consume stderr and log warnings
tokio::spawn(async move {
let mut lines = stderr_reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
warn!("pg_dump stderr: {}", line)
}
});
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
}

View File

@@ -5,17 +5,21 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
};
Response::builder()
.status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
fn render_json(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "application/json")
.body(body)
.unwrap()
}
fn render_plain(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "text/plain")
.body(body)
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();

View File

@@ -68,6 +68,51 @@ paths:
schema:
$ref: "#/components/schemas/Info"
/dbs_and_roles:
get:
tags:
- Info
summary: Get databases and roles in the catalog.
description: ""
operationId: getDbsAndRoles
responses:
200:
description: Compute schema objects
content:
application/json:
schema:
$ref: "#/components/schemas/DbsAndRoles"
/database_schema:
get:
tags:
- Info
summary: Get schema dump
parameters:
- name: database
in: query
description: Database name to dump.
required: true
schema:
type: string
example: "postgres"
description: Get schema dump in SQL format.
operationId: getDatabaseSchema
responses:
200:
description: Schema dump
content:
text/plain:
schema:
type: string
description: Schema dump in SQL format.
404:
description: Non existing database.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/check_writability:
post:
tags:
@@ -229,6 +274,73 @@ components:
num_cpus:
type: integer
DbsAndRoles:
type: object
description: Databases and Roles
required:
- roles
- databases
properties:
roles:
type: array
items:
$ref: "#/components/schemas/Role"
databases:
type: array
items:
$ref: "#/components/schemas/Database"
Database:
type: object
description: Database
required:
- name
- owner
- restrict_conn
- invalid
properties:
name:
type: string
owner:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
restrict_conn:
type: boolean
invalid:
type: boolean
Role:
type: object
description: Role
required:
- name
properties:
name:
type: string
encrypted_password:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
GenericOption:
type: object
description: Schema Generic option
required:
- name
- vartype
properties:
name:
type: string
value:
type: string
vartype:
type: string
ComputeState:
type: object
required:

View File

@@ -8,6 +8,7 @@ pub mod configurator;
pub mod http;
#[macro_use]
pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
pub mod monitor;

View File

@@ -3,7 +3,7 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize, Serializer};
use crate::spec::ComputeSpec;
use crate::spec::{ComputeSpec, Database, Role};
#[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError {
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
pub total_ext_download_size: u64,
}
#[derive(Clone, Debug, Default, Serialize)]
pub struct CatalogObjects {
pub roles: Vec<Role>,
pub databases: Vec<Database>,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving
/// to a different place.

View File

@@ -10,6 +10,7 @@ use std::{
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -308,13 +309,88 @@ pub struct TenantConfig {
pub switch_aux_file_policy: Option<AuxFilePolicy>,
}
/// The policy for the aux file storage. It can be switched through the `switch_aux_file_policy`
/// tenant config. When the first aux file is written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// Unset -> V1
/// -> V2
/// -> CrossValidation -> V2
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
/// V1 aux file policy: store everything in AUX_FILE_KEY
V1,
/// V2 aux file policy: store in the AUX_FILE keyspace
V2,
/// Cross validation runs both formats on the write path and does validation
/// on the read path.
CrossValidation,
}
impl AuxFilePolicy {
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
matches!(
(from, to),
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
)
}
/// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V1
}
}
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
pub struct AtomicAuxFilePolicy(AtomicUsize);
impl AtomicAuxFilePolicy {
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
Self(AtomicUsize::new(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
))
}
pub fn load(&self) -> Option<AuxFilePolicy> {
match self.0.load(std::sync::atomic::Ordering::Acquire) {
0 => None,
other => Some(AuxFilePolicy::from_usize(other)),
}
}
pub fn store(&self, policy: Option<AuxFilePolicy>) {
self.0.store(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
std::sync::atomic::Ordering::Release,
);
}
}
impl AuxFilePolicy {
pub fn to_usize(self) -> usize {
match self {
Self::V1 => 1,
Self::CrossValidation => 2,
Self::V2 => 3,
}
}
pub fn try_from_usize(this: usize) -> Option<Self> {
match this {
1 => Some(Self::V1),
2 => Some(Self::CrossValidation),
3 => Some(Self::V2),
_ => None,
}
}
pub fn from_usize(this: usize) -> Self {
Self::try_from_usize(this).unwrap()
}
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
@@ -604,6 +680,9 @@ pub struct TimelineInfo {
pub state: TimelineState,
pub walreceiver_status: String,
/// The last aux file policy being used on this timeline
pub last_aux_file_policy: Option<AuxFilePolicy>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -762,6 +841,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
pub max_concurrent_downloads: NonZeroUsize,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
pub aux_files: HashMap<String, String>,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
pub lsn: Lsn,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo {
pub task_id: String,
@@ -1505,4 +1594,59 @@ mod tests {
assert_eq!(actual, expected, "example on {line}");
}
}
#[test]
fn test_aux_file_migration_path() {
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V1
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V2
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::CrossValidation
));
// Self-migration is not a valid migration path, and the caller should handle it by itself.
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::CrossValidation
));
// Migrations not allowed
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::CrossValidation
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::CrossValidation
));
// Migrations allowed
assert!(AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V2
));
}
}

View File

@@ -1,8 +1,12 @@
use std::collections::HashMap;
use bytes::Bytes;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};
pub mod util;
@@ -561,4 +565,57 @@ impl Client {
}),
}
}
pub async fn ingest_aux_files(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
aux_files: HashMap<String, String>,
) -> Result<bool> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
);
let resp = self
.request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
.await?;
match resp.status() {
StatusCode::OK => Ok(true),
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
pub async fn list_aux_files(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<HashMap<String, Bytes>> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
);
let resp = self
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
.await?;
match resp.status() {
StatusCode::OK => {
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
})?;
Ok(resp)
}
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
}

View File

@@ -1,4 +1,5 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::helpers::PAGE_SZ;
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",

View File

@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
};
use crate::interface::*;
use utils::lsn::Lsn;
@@ -379,7 +379,7 @@ where
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
&self.shard_identity,
) * 8192;
) * PAGE_SZ;
let wal_size = job
.input_layers
@@ -441,7 +441,7 @@ where
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / 8192,
self.target_file_size / PAGE_SZ,
);
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
new_jobs.push(CompactionJob::<E> {
@@ -663,8 +663,8 @@ where
}
}
// Sliding window through keyspace and values
// This is used by over_with_images to decide on good split points
/// Sliding window through keyspace and values for image layer
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
@@ -804,9 +804,9 @@ struct WindowElement<K> {
accum_size: u64,
}
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
/// Sliding window through keyspace and values for delta layer tiling
///
/// This is used to decide which delta layer to write next.
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
@@ -830,11 +830,13 @@ where
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
// We require the keys to be strictly increasing for the window.
// Keys should already have been deduplicated by `accum_key_values`
assert!(
last.last_key < key,
"last_key(={}) >= key(={key})",
last.last_key
);
last_size = last.accum_size;
} else {
last_size = 0;
@@ -922,7 +924,7 @@ where
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 3 {
if !has_more && self.remain_size() < target_size * 5 / 4 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);

View File

@@ -16,6 +16,8 @@ use std::pin::Pin;
use std::task::{ready, Poll};
use utils::lsn::Lsn;
pub const PAGE_SZ: u64 = 8192;
pub fn keyspace_total_size<K>(
keyspace: &CompactionKeySpace<K>,
shard_identity: &ShardIdentity,

View File

@@ -14,6 +14,7 @@ use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::PAGE_SZ;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
@@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * 8192,
file_size: accum_size * PAGE_SZ,
deleted: Mutex::new(false),
});
info!(

View File

@@ -219,6 +219,7 @@ fn handle_metadata(
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
// TODO: simplify this part
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
meta = TimelineMetadata::new(
*disk_consistent_lsn,

View File

@@ -0,0 +1,98 @@
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
use pageserver_api::shard::TenantShardId;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long)]
pageserver_jwt: Option<String>,
targets: Option<Vec<TenantTimelineId>>,
}
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let main_task = rt.spawn(main_impl(args));
rt.block_on(main_task).unwrap()
}
async fn main_impl(args: Args) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: None,
targets: {
if let Some(targets) = &args.targets {
if targets.len() != 1 {
anyhow::bail!("must specify exactly one target");
}
Some(targets.clone())
} else {
None
}
},
},
)
.await?;
let timeline = timelines[0];
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
let timeline_id = timeline.timeline_id;
println!("operating on timeline {}", timeline);
mgmt_api_client
.tenant_config(&TenantConfigRequest {
tenant_id: timeline.tenant_id,
config: TenantConfig {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
})
.await?;
for batch in 0..100 {
let items = (0..100)
.map(|id| {
(
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
format!("{:08}", id),
)
})
.collect::<HashMap<_, _>>();
let file_cnt = items.len();
mgmt_api_client
.ingest_aux_files(tenant_shard_id, timeline_id, items)
.await?;
println!("ingested {file_cnt} files");
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
anyhow::Ok(())
}

View File

@@ -14,6 +14,7 @@ mod util {
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
pub(super) mod aux_files;
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod ondemand_download_churn;
@@ -27,6 +28,7 @@ enum Args {
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
AuxFiles(cmd::aux_files::Args),
}
fn main() {
@@ -46,6 +48,7 @@ fn main() {
cmd::trigger_initial_size_calculation::main(args)
}
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
Args::AuxFiles(args) => cmd::aux_files::main(args),
}
.unwrap()
}

View File

@@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
}
EvictionLayer::Secondary(layer) => {
let file_size = layer.metadata.file_size();
let tenant_manager = tenant_manager.clone();
js.spawn(async move {
layer
.secondary_tenant
.evict_layer(
tenant_manager.get_conf(),
layer.timeline_id,
layer.name,
layer.metadata,
)
.evict_layer(layer.timeline_id, layer.name)
.await;
Ok(file_size)
});

View File

@@ -16,6 +16,8 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
@@ -439,6 +441,8 @@ async fn build_timeline_info_common(
state,
walreceiver_status,
last_aux_file_policy: timeline.last_aux_file_policy.load(),
};
Ok(info)
}
@@ -2329,6 +2333,71 @@ async fn get_utilization(
.map_err(ApiError::InternalServerError)
}
async fn list_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: ListAuxFilesRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let process = || async move {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
Ok::<_, anyhow::Error>(files)
};
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => json_response(
StatusCode::INTERNAL_SERVER_ERROR,
ApiError::InternalServerError(err).to_string(),
),
}
}
async fn ingest_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: IngestAuxFilesRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let process = || async move {
let mut modification = timeline.begin_modification(Lsn(
timeline.get_last_record_lsn().0 + 8
) /* advance LSN by 8 */);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await?;
}
modification.commit(&ctx).await?;
Ok::<_, anyhow::Error>(())
};
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => Err(ApiError::InternalServerError(err)),
}
}
/// Report on the largest tenants on this pageserver, for the storage controller to identify
/// candidates for splitting
async fn post_top_tenants(
@@ -2706,6 +2775,14 @@ pub fn make_router(
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.get("/v1/utilization", |r| api_handler(r, get_utilization))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
|r| testing_api_handler("list_aux_files", r, list_aux_files),
)
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
.any(handler_404))
}

View File

@@ -35,7 +35,7 @@ use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -718,10 +718,11 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
match self.get_switch_aux_file_policy() {
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
AuxFilePolicy::CrossValidation => {
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
match (v1_result, v2_result) {
@@ -1469,7 +1470,27 @@ impl<'a> DatadirModification<'a> {
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let policy = self.tline.get_switch_aux_file_policy();
let switch_policy = self.tline.get_switch_aux_file_policy();
let policy = {
let current_policy = self.tline.last_aux_file_policy.load();
// Allowed switch path:
// * no aux files -> v1/v2/cross-validation
// * cross-validation->v2
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
self.tline.last_aux_file_policy.store(Some(switch_policy));
self.tline
.remote_client
.schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
switch_policy
} else {
// This branch handles invalid migration paths and the case where switch_policy == current_policy.
// Because the migration path always allows unspecified -> *, this unwrap_or will in practice never be hit.
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
}
};
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine

View File

@@ -20,6 +20,7 @@ use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::TimelineState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::WalRedoManagerStatus;
@@ -529,6 +530,7 @@ impl Tenant {
index_part: Option<IndexPart>,
metadata: TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
last_aux_file_policy: Option<AuxFilePolicy>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let tenant_id = self.tenant_shard_id;
@@ -539,6 +541,10 @@ impl Tenant {
ancestor.clone(),
resources,
CreateTimelineCause::Load,
// This could be derived from the ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
// there will potentially be other callers of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
// Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
last_aux_file_policy,
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -553,6 +559,10 @@ impl Tenant {
if let Some(index_part) = index_part.as_ref() {
timeline.remote_client.init_upload_queue(index_part)?;
timeline
.last_aux_file_policy
.store(index_part.last_aux_file_policy());
} else {
// No data on the remote storage, but we have local metadata file. We can end up
// here with timeline_create being interrupted before finishing index part upload.
@@ -1173,12 +1183,15 @@ impl Tenant {
None
};
let last_aux_file_policy = index_part.last_aux_file_policy();
self.timeline_init_and_sync(
timeline_id,
resources,
Some(index_part),
remote_metadata,
ancestor,
last_aux_file_policy,
ctx,
)
.await
@@ -1358,6 +1371,7 @@ impl Tenant {
create_guard,
initdb_lsn,
None,
None,
)
.await
}
@@ -2441,6 +2455,7 @@ impl Tenant {
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
cause: CreateTimelineCause,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<Arc<Timeline>> {
let state = match cause {
CreateTimelineCause::Load => {
@@ -2469,6 +2484,7 @@ impl Tenant {
resources,
pg_version,
state,
last_aux_file_policy,
self.cancel.child_token(),
);
@@ -3119,6 +3135,7 @@ impl Tenant {
timeline_create_guard,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
src_timeline.last_aux_file_policy.load(),
)
.await?;
@@ -3312,6 +3329,7 @@ impl Tenant {
timeline_create_guard,
pgdata_lsn,
None,
None,
)
.await?;
@@ -3383,6 +3401,7 @@ impl Tenant {
create_guard: TimelineCreateGuard<'a>,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<UninitializedTimeline> {
let tenant_shard_id = self.tenant_shard_id;
@@ -3398,6 +3417,7 @@ impl Tenant {
ancestor,
resources,
CreateTimelineCause::Load,
last_aux_file_policy,
)
.context("Failed to create timeline data structure")?;
@@ -5621,4 +5641,280 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_branch_copies_dirty_aux_file_flag() {
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
// the default aux file policy to switch is v1 if not set by the admins
assert_eq!(
harness.tenant_conf.switch_aux_file_policy,
AuxFilePolicy::V1
);
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
// no aux file is written at this point, so the persistent flag should be unset
assert_eq!(tline.last_aux_file_policy.load(), None);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
// there is no tenant manager to pass the configuration through, so let's mimic it
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
assert_eq!(
tline.get_switch_aux_file_policy(),
AuxFilePolicy::V2,
"wanted state has been updated"
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
);
// we can read everything from the storage
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"second", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"keep v1 storage format when new files are written"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"second"))
);
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
// child copies the last flag even if that is not on remote storage yet
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
assert_eq!(files.get("pg_logical/mappings/test2"), None);
// even if we crash here without flushing the parent timeline with its new
// last_aux_file_policy, we are safe, because the child was never meant to access the ancestor's
// files. The ancestor can even safely switch back to V1 because of a migration.
}
#[tokio::test]
async fn aux_file_policy_switch() {
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
assert_eq!(
tline.last_aux_file_policy.load(),
None,
"no aux file is written so it should be unset"
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
// there is no tenant manager to pass the configuration through, so let's mimic it
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
assert_eq!(
tline.get_switch_aux_file_policy(),
AuxFilePolicy::V2,
"wanted state has been updated"
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::CrossValidation),
"dirty index_part.json reflected state is yet to be updated"
);
// we can still read the auxfile v1 before we ingest anything new
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"second", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"ingesting a file should apply the wanted switch state when applicable"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first")),
"cross validation writes to both v1 and v2 so this should be available in v2"
);
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"second"))
);
// mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V1),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"third", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.get_switch_aux_file_policy(),
AuxFilePolicy::V1,
"wanted state has been updated again, even if invalid request"
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"ingesting a file should apply the wanted switch state when applicable"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"third"))
);
// mimic again by trying to flip it from V1 to V2 (not switched to while ingesting a file)
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test3", b"last", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"third"))
);
assert_eq!(
files.get("pg_logical/mappings/test3"),
Some(&bytes::Bytes::from_static(b"last"))
);
}
}

View File

@@ -373,6 +373,8 @@ pub struct TenantConf {
/// Switch to a new aux file policy. Switching this flag requires that the user has not written any aux file into
/// the storage before, and this flag cannot be switched back; otherwise there will be data corruption.
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
/// file is written.
pub switch_aux_file_policy: AuxFilePolicy,
}
@@ -574,7 +576,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::V1,
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
}
}
}

View File

@@ -189,6 +189,7 @@ use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
@@ -518,6 +519,7 @@ impl RemoteTimelineClient {
&self,
layer_file_name: &LayerName,
layer_metadata: &LayerFileMetadata,
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
@@ -536,6 +538,7 @@ impl RemoteTimelineClient {
self.timeline_id,
layer_file_name,
layer_metadata,
local_path,
cancel,
ctx,
)
@@ -609,6 +612,17 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
self: &Arc<Self>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.last_aux_file_policy = last_aux_file_policy;
self.schedule_index_upload(upload_queue);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -1849,6 +1863,7 @@ impl RemoteTimelineClient {
dangling_files: HashMap::default(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: initialized.last_aux_file_policy,
};
let upload_queue = std::mem::replace(

View File

@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>(
timeline_id: TimelineId,
layer_file_name: &'a LayerName,
layer_metadata: &'a LayerFileMetadata,
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = local_layer_path(
conf,
&tenant_shard_id,
&timeline_id,
layer_file_name,
&layer_metadata.generation,
);
let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,
@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
let bytes_amount = download_retry(
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },

View File

@@ -5,6 +5,7 @@
use std::collections::HashMap;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
@@ -88,6 +89,16 @@ pub struct IndexPart {
#[serde(default)]
pub(crate) lineage: Lineage,
/// Describes the kind of aux files stored in the timeline.
///
/// The value is updated during file ingestion: the latest wanted value communicated via the tenant config is applied if it is acceptable.
/// Requesting V1 after V2 files have been committed is not accepted.
///
/// `None` means no aux files had been written to the storage before the point
/// when this flag was introduced.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
}
impl IndexPart {
@@ -101,10 +112,11 @@ impl IndexPart {
/// is always generated from the keys of `layer_metadata`)
/// - 4: timeline_layers is fully removed.
/// - 5: lineage was added
const LATEST_VERSION: usize = 5;
/// - 6: last_aux_file_policy is added.
const LATEST_VERSION: usize = 6;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
pub const FILE_NAME: &'static str = "index_part.json";
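A small standalone sketch of the backward compatibility that the `#[serde(skip_serializing_if = "Option::is_none", default)]` attribute gives the new field: an `index_part.json` written before version 6 simply deserializes with `last_aux_file_policy` as `None`. The struct below is illustrative, not the real `IndexPart`, and assumes the `serde` and `serde_json` crates.

```rust
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct IndexPartSketch {
    version: usize,
    // Absent from index parts written before version 6; defaults to None when parsing.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    last_aux_file_policy: Option<String>,
}

fn main() {
    // A pre-v6 index part: the field is missing and parses as None.
    let old: IndexPartSketch = serde_json::from_str(r#"{"version":5}"#).unwrap();
    assert_eq!(old.last_aux_file_policy, None);

    // A v6 index part carries the persisted policy.
    let new: IndexPartSketch =
        serde_json::from_str(r#"{"version":6,"last_aux_file_policy":"V2"}"#).unwrap();
    assert_eq!(new.last_aux_file_policy.as_deref(), Some("V2"));
}
```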
@@ -113,6 +125,7 @@ impl IndexPart {
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
lineage: Lineage,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> Self {
let layer_metadata = layers_and_metadata
.iter()
@@ -126,6 +139,7 @@ impl IndexPart {
metadata,
deleted_at: None,
lineage,
last_aux_file_policy,
}
}
@@ -155,8 +169,13 @@ impl IndexPart {
example_metadata.disk_consistent_lsn(),
example_metadata,
Default::default(),
Some(AuxFilePolicy::V1),
)
}
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
self.last_aux_file_policy
}
}
impl From<&UploadQueueInitialized> for IndexPart {
@@ -165,7 +184,13 @@ impl From<&UploadQueueInitialized> for IndexPart {
let metadata = uq.latest_metadata.clone();
let lineage = uq.latest_lineage.clone();
Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
Self::new(
&uq.latest_files,
disk_consistent_lsn,
metadata,
lineage,
uq.last_aux_file_policy,
)
}
}
@@ -299,6 +324,7 @@ mod tests {
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -340,6 +366,7 @@ mod tests {
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -383,6 +410,7 @@ mod tests {
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -428,6 +456,7 @@ mod tests {
.unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -468,6 +497,7 @@ mod tests {
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -511,6 +541,57 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v6_indexpart_is_parsed() {
let example = r#"{
"version":6,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123",
"lineage":{
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
},
"last_aux_file_policy": "V2"
}"#;
let expected = IndexPart {
version: 6,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this, but other tools (jq, for
// example) might treat it as a double.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();

View File

@@ -6,11 +6,9 @@ mod scheduler;
use std::{sync::Arc, time::SystemTime};
use crate::{
config::PageServerConf,
context::RequestContext,
disk_usage_eviction_task::DiskUsageEvictionInfo,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
virtual_file::MaybeFatalIo,
};
use self::{
@@ -21,9 +19,8 @@ use self::{
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerName},
storage_layer::LayerName,
};
use pageserver_api::{
@@ -178,13 +175,7 @@ impl SecondaryTenant {
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
self: &Arc<Self>,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerName,
metadata: LayerFileMetadata,
) {
pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
debug_assert_current_span_has_tenant_id();
let guard = match self.gate.enter() {
@@ -197,41 +188,11 @@ impl SecondaryTenant {
let now = SystemTime::now();
let local_path = local_layer_path(
conf,
&self.tenant_shard_id,
&timeline_id,
&name,
&metadata.generation,
);
let this = self.clone();
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(local_path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
@@ -250,8 +211,15 @@ impl SecondaryTenant {
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
let removed = timeline_detail.on_disk_layers.remove(&name);
// We might race with removal of the same layer during downloads, if it was removed
// from the heatmap. If we see that the OnDiskState is gone, then no need to
// do a physical deletion or store in evicted_at.
if let Some(removed) = removed {
removed.remove_blocking();
timeline_detail.evicted_at.insert(name, now);
}
}
})
.await

View File

@@ -111,6 +111,7 @@ struct SecondaryDownloader {
pub(super) struct OnDiskState {
metadata: LayerFileMetadata,
access_time: SystemTime,
local_path: Utf8PathBuf,
}
impl OnDiskState {
@@ -121,12 +122,26 @@ impl OnDiskState {
_ame: LayerName,
metadata: LayerFileMetadata,
access_time: SystemTime,
local_path: Utf8PathBuf,
) -> Self {
Self {
metadata,
access_time,
local_path,
}
}
// This is infallible, because all errors are either acceptable (ENOENT), or totally
// unexpected (fatal).
pub(super) fn remove_blocking(&self) {
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
std::fs::remove_file(&self.local_path)
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting secondary layer")
}
}
#[derive(Debug, Clone, Default)]
@@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> {
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
match tokio::fs::metadata(&local_path).await {
match tokio::fs::metadata(&on_disk.local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
local_path,
on_disk.local_path,
meta.len(),
);
}
@@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
local_path,
on_disk.local_path,
e
);
debug_assert!(false);
@@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&t.name,
&t.metadata.generation,
);
e.insert(OnDiskState::new(
self.conf,
tenant_shard_id,
@@ -933,6 +947,7 @@ impl<'a> TenantDownloader<'a> {
t.name,
LayerFileMetadata::from(&t.metadata),
t.access_time,
local_path,
));
}
}
@@ -955,6 +970,14 @@ impl<'a> TenantDownloader<'a> {
&self.secondary_state.cancel
);
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
timeline_id,
&layer.name,
&layer.metadata.generation,
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
@@ -963,6 +986,7 @@ impl<'a> TenantDownloader<'a> {
*timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&local_path,
&self.secondary_state.cancel,
ctx,
)
@@ -1116,6 +1140,7 @@ async fn init_timeline_state(
name,
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
file_path,
),
);
}

View File

@@ -1108,6 +1108,7 @@ impl LayerInner {
.download_layer_file(
&self.desc.layer_name(),
&self.metadata(),
&self.path,
&timeline.cancel,
ctx,
)

View File

@@ -23,7 +23,7 @@ use pageserver_api::{
},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
models::{
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
TimelineState,
},
@@ -413,7 +413,11 @@ pub struct Timeline {
/// Keep the aux directory cache to avoid its reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
/// Size estimator for aux file v2
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
/// The last aux file policy applied on this timeline, indicating whether aux file v2 storage is enabled.
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
}
pub struct WalReceiverInfo {
@@ -2133,6 +2137,7 @@ impl Timeline {
resources: TimelineResources,
pg_version: u32,
state: TimelineState,
aux_file_policy: Option<AuxFilePolicy>,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2257,6 +2262,8 @@ impl Timeline {
}),
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
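The `AtomicAuxFilePolicy` field lets the ingest path read and update the persisted policy without taking a lock. One plausible shape for such a type, shown purely as an illustration (the real definition lives in `pageserver_api::models` and may differ), encodes `Option<AuxFilePolicy>` into an atomic integer:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum AuxFilePolicy {
    V1,
    V2,
}

/// Lock-free storage for an optional aux file policy; 0 encodes "not set yet".
pub struct AtomicAuxFilePolicy(AtomicUsize);

impl AtomicAuxFilePolicy {
    pub fn new(policy: Option<AuxFilePolicy>) -> Self {
        Self(AtomicUsize::new(Self::encode(policy)))
    }

    fn encode(policy: Option<AuxFilePolicy>) -> usize {
        match policy {
            None => 0,
            Some(AuxFilePolicy::V1) => 1,
            Some(AuxFilePolicy::V2) => 2,
        }
    }

    pub fn load(&self) -> Option<AuxFilePolicy> {
        match self.0.load(Ordering::Acquire) {
            0 => None,
            1 => Some(AuxFilePolicy::V1),
            _ => Some(AuxFilePolicy::V2),
        }
    }

    pub fn store(&self, policy: AuxFilePolicy) {
        self.0.store(Self::encode(Some(policy)), Ordering::Release);
    }
}
```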

View File

@@ -280,6 +280,8 @@ impl DeleteTimelineFlow {
// Important. We don't pass the ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
// Aux file policy is not needed for deletion, assuming deletion does not read the aux keyspace
None,
)
.context("create_timeline_struct")?;

View File

@@ -56,7 +56,7 @@ impl Default for Options {
fn default() -> Self {
Self {
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
}
}
}

View File

@@ -8,6 +8,7 @@ use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use std::sync::Arc;
use tracing::info;
use utils::lsn::AtomicLsn;
@@ -60,6 +61,9 @@ pub(crate) struct UploadQueueInitialized {
/// Part of the flattened "next" `index_part.json`.
pub(crate) latest_lineage: Lineage,
/// The last aux file policy used on this timeline.
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -189,6 +193,7 @@ impl UploadQueue {
dangling_files: HashMap::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: Default::default(),
};
*self = UploadQueue::Initialized(state);
@@ -239,6 +244,7 @@ impl UploadQueue {
dangling_files: HashMap::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: index_part.last_aux_file_policy(),
};
*self = UploadQueue::Initialized(state);

View File

@@ -26,6 +26,7 @@ clap.workspace = true
consumption_metrics.workspace = true
dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
@@ -35,7 +36,6 @@ hmac.workspace = true
hostname.workspace = true
http.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -76,7 +76,6 @@ smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
subtle.workspace = true
sync_wrapper.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
@@ -106,6 +105,7 @@ workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
fallible-iterator.workspace = true
tokio-tungstenite.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -102,7 +102,7 @@ pub async fn task_main(
let connections = tokio_util::task::task_tracker::TaskTracker::new();
connections.close(); // allows `connections.wait` to complete
let server = Builder::new(hyper_util::rt::TokioExecutor::new());
let server = Builder::new(TokioExecutor::new());
while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
let (conn, peer_addr) = res.context("could not accept TCP stream")?;
@@ -255,7 +255,6 @@ async fn connection_handler(
.in_current_span()
.map_ok_or_else(api_error_into_response, |r| r),
);
async move {
let res = handler.await;
cancel_request.disarm();
@@ -301,7 +300,7 @@ async fn request_handler(
.map(|s| s.to_string());
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
if framed_websockets::upgrade::is_upgrade_request(&request) {
let ctx = RequestMonitoring::new(
session_id,
peer_addr,
@@ -312,7 +311,7 @@ async fn request_handler(
let span = ctx.span.clone();
info!(parent: &span, "performing websocket upgrade");
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
.map_err(|e| ApiError::BadRequest(e.into()))?;
ws_connections.spawn(
@@ -334,7 +333,7 @@ async fn request_handler(
);
// Return the response so the spawned future can continue.
Ok(response)
Ok(response.map(|_: http_body_util::Empty<Bytes>| Full::new(Bytes::new())))
} else if request.uri().path() == "/sql" && *request.method() == Method::POST {
let ctx = RequestMonitoring::new(
session_id,

View File

@@ -7,10 +7,11 @@ use crate::{
proxy::{handle_client, ClientMode},
rate_limiter::EndpointRateLimiter,
};
use bytes::{Buf, Bytes};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use framed_websockets::{Frame, OpCode, WebSocketServer};
use futures::{Sink, Stream};
use hyper::upgrade::Upgraded;
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use hyper1::upgrade::OnUpgrade;
use hyper_util::rt::TokioIo;
use pin_project_lite::pin_project;
use std::{
@@ -21,25 +22,23 @@ use std::{
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
use tracing::warn;
// TODO: use `std::sync::Exclusive` once it's stabilized.
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
use sync_wrapper::SyncWrapper;
pin_project! {
/// This is a wrapper around a [`WebSocketStream`] that
/// implements [`AsyncRead`] and [`AsyncWrite`].
pub struct WebSocketRw<S = Upgraded> {
pub struct WebSocketRw<S> {
#[pin]
stream: SyncWrapper<WebSocketStream<S>>,
bytes: Bytes,
stream: WebSocketServer<S>,
recv: Bytes,
send: BytesMut,
}
}
impl<S> WebSocketRw<S> {
pub fn new(stream: WebSocketStream<S>) -> Self {
pub fn new(stream: WebSocketServer<S>) -> Self {
Self {
stream: stream.into(),
bytes: Bytes::new(),
stream,
recv: Bytes::new(),
send: BytesMut::new(),
}
}
}
@@ -50,22 +49,24 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<io::Result<usize>> {
let mut stream = self.project().stream.get_pin_mut();
let this = self.project();
let mut stream = this.stream;
this.send.put(buf);
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
match stream.as_mut().start_send(Message::Binary(buf.into())) {
match stream.as_mut().start_send(Frame::binary(this.send.split())) {
Ok(()) => Poll::Ready(Ok(buf.len())),
Err(e) => Poll::Ready(Err(io_error(e))),
}
}
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let stream = self.project().stream.get_pin_mut();
let stream = self.project().stream;
stream.poll_flush(cx).map_err(io_error)
}
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
let stream = self.project().stream.get_pin_mut();
let stream = self.project().stream;
stream.poll_close(cx).map_err(io_error)
}
}
@@ -76,13 +77,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
if buf.remaining() > 0 {
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
let len = std::cmp::min(bytes.len(), buf.remaining());
buf.put_slice(&bytes[..len]);
self.consume(len);
}
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
let len = std::cmp::min(bytes.len(), buf.remaining());
buf.put_slice(&bytes[..len]);
self.consume(len);
Poll::Ready(Ok(()))
}
}
@@ -94,31 +92,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
let mut this = self.project();
loop {
if !this.bytes.chunk().is_empty() {
let chunk = (*this.bytes).chunk();
if !this.recv.chunk().is_empty() {
let chunk = (*this.recv).chunk();
return Poll::Ready(Ok(chunk));
}
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
let res = ready!(this.stream.as_mut().poll_next(cx));
match res.transpose().map_err(io_error)? {
Some(message) => match message {
Message::Ping(_) => {}
Message::Pong(_) => {}
Message::Text(text) => {
Some(message) => match message.opcode {
OpCode::Ping => {}
OpCode::Pong => {}
OpCode::Text => {
// We expect to see only binary messages.
let error = "unexpected text message in the websocket";
warn!(length = text.len(), error);
warn!(length = message.payload.len(), error);
return Poll::Ready(Err(io_error(error)));
}
Message::Frame(_) => {
// This case is impossible according to Frame's doc.
panic!("unexpected raw frame in the websocket");
OpCode::Binary | OpCode::Continuation => {
debug_assert!(this.recv.is_empty());
*this.recv = message.payload.freeze();
}
Message::Binary(chunk) => {
assert!(this.bytes.is_empty());
*this.bytes = Bytes::from(chunk);
}
Message::Close(_) => return EOF,
OpCode::Close => return EOF,
},
None => return EOF,
}
@@ -126,19 +120,21 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
}
fn consume(self: Pin<&mut Self>, amount: usize) {
self.project().bytes.advance(amount);
self.project().recv.advance(amount);
}
}
pub async fn serve_websocket(
config: &'static ProxyConfig,
mut ctx: RequestMonitoring,
websocket: HyperWebsocket,
websocket: OnUpgrade,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
hostname: Option<String>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket));
let conn_gauge = Metrics::get()
.proxy
.client_connections
@@ -177,15 +173,16 @@ pub async fn serve_websocket(
mod tests {
use std::pin::pin;
use framed_websockets::WebSocketServer;
use futures::{SinkExt, StreamExt};
use hyper_tungstenite::{
tungstenite::{protocol::Role, Message},
WebSocketStream,
};
use tokio::{
io::{duplex, AsyncReadExt, AsyncWriteExt},
task::JoinSet,
};
use tokio_tungstenite::{
tungstenite::{protocol::Role, Message},
WebSocketStream,
};
use super::WebSocketRw;
@@ -210,9 +207,7 @@ mod tests {
});
js.spawn(async move {
let mut rw = pin!(WebSocketRw::new(
WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
));
let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2)));
let mut buf = vec![0; 1024];
let n = rw.read(&mut buf).await.unwrap();

View File

@@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants
#### S3
Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help).
Do `aws sso login --profile dev` to get the SSO access to the bucket to clean.
Also, set the following environment variables:
- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets
- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`)
- `REGION`: The region where the bucket is located.
- `BUCKET`: Bucket name
- `BUCKET_PREFIX` (optional): Prefix inside the bucket
#### Console API
@@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand.
Example:
`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
#### `purge-garbage`
@@ -59,7 +61,7 @@ to pass them on the command line
Example:
`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally
not provided inline in the example above to avoid accidents. Without the `--delete` flag
@@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout.
For pageserver:
```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
Timelines: 31106
With errors: 3

View File

@@ -200,30 +200,15 @@ impl RootTarget {
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct BucketConfig {
pub region: String,
pub bucket: String,
pub prefix_in_bucket: Option<String>,
/// Use SSO if this is set, else rely on AWS_* environment vars
pub sso_account_id: Option<String>,
}
impl Display for BucketConfig {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}/{}/{}",
self.sso_account_id.as_deref().unwrap_or("<none>"),
self.region,
self.bucket
)
}
}
impl BucketConfig {
pub fn from_env() -> anyhow::Result<Self> {
let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
let region = env::var("REGION").context("'REGION' param retrieval")?;
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
@@ -232,7 +217,6 @@ impl BucketConfig {
region,
bucket,
prefix_in_bucket,
sso_account_id,
})
}
}
@@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
guard
}
pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client {
pub fn init_s3_client(bucket_region: Region) -> Client {
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
let chain = CredentialsProviderChain::first_try(
@@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
);
// Use SSO if SSO_ACCOUNT_ID is set in the environment
match account_id {
match std::env::var("SSO_ACCOUNT_ID").ok() {
Some(sso_account) => chain.or_else(
"sso",
SsoCredentialsProvider::builder()
@@ -334,7 +318,7 @@ fn init_remote(
) -> anyhow::Result<(Arc<Client>, RootTarget)> {
let bucket_region = Region::new(bucket_config.region);
let delimiter = "/".to_string();
let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
let s3_client = Arc::new(init_s3_client(bucket_region));
let s3_root = match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {

View File

@@ -173,7 +173,7 @@ impl Persistence {
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let latency = &METRICS_REGISTRY
@@ -199,13 +199,48 @@ impl Persistence {
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
// A generous allowance for how many times we may retry serializable transactions
// before giving up. This is not expected to be hit: it is a defensive measure in case we
// somehow engineer a situation where duelling transactions might otherwise live-lock.
const MAX_RETRIES: usize = 128;
let mut conn = self.connection_pool.get()?;
tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
.await
.expect("Task panic")
tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
let mut retry_count = 0;
loop {
match conn.build_transaction().serializable().run(|c| func(c)) {
Ok(r) => break Ok(r),
Err(
err @ DatabaseError::Query(diesel::result::Error::DatabaseError(
diesel::result::DatabaseErrorKind::SerializationFailure,
_,
)),
) => {
retry_count += 1;
if retry_count > MAX_RETRIES {
tracing::error!(
"Exceeded max retries on SerializationFailure errors: {err:?}"
);
break Err(err);
} else {
// Retry on serialization errors: these are expected, because even though our
// transactions don't fight for the same rows, they will occasionally collide
// on index pages (e.g. increment_generation for unrelated shards can collide)
tracing::debug!(
"Retrying transaction on serialization failure {err:?}"
);
continue;
}
}
Err(e) => break Err(e),
}
}
})
.await
.expect("Task panic")
}
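The bound on `func` changes from `FnOnce` to `Fn` above because a serializable transaction may now be retried, so the same closure has to be callable more than once. A minimal sketch of that pattern, with made-up names and error type (not the storage controller's actual helpers):

```rust
/// Run `func` repeatedly until it succeeds, the error is not retryable,
/// or the retry budget is exhausted. `func` must be `Fn`, not `FnOnce`,
/// precisely because it can be invoked on every retry.
fn run_with_retries<F, R, E>(
    max_retries: usize,
    is_retryable: impl Fn(&E) -> bool,
    func: F,
) -> Result<R, E>
where
    F: Fn() -> Result<R, E>,
{
    let mut attempt = 0;
    loop {
        match func() {
            Ok(r) => return Ok(r),
            Err(e) if is_retryable(&e) && attempt < max_retries => {
                attempt += 1;
                continue;
            }
            Err(e) => return Err(e),
        }
    }
}

fn main() {
    use std::cell::Cell;
    let calls = Cell::new(0);
    // Fails twice with a retryable error, then succeeds on the third attempt.
    let result: Result<u32, &str> = run_with_retries(
        3,
        |e: &&str| *e == "serialization failure",
        || {
            calls.set(calls.get() + 1);
            if calls.get() < 3 {
                Err("serialization failure")
            } else {
                Ok(42)
            }
        },
    );
    assert_eq!(result, Ok(42));
}
```

In the real hunk above, the retry wrapper additionally restricts itself to Diesel `SerializationFailure` errors and logs an error once the retry budget is exceeded.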
/// When a node is first registered, persist it before using it for anything
@@ -358,14 +393,11 @@ impl Persistence {
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
},
)
@@ -533,8 +565,11 @@ impl Persistence {
let update = ShardUpdate {
generation: input_generation.map(|g| g.into().unwrap() as i32),
placement_policy: input_placement_policy
.as_ref()
.map(|p| serde_json::to_string(&p).unwrap()),
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
config: input_config
.as_ref()
.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
};
@@ -581,55 +616,51 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
// Mark parent shards as splitting
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.set((splitting.eq(1),))
.execute(conn)?;
if u8::try_from(updated)
.map_err(|_| DatabaseError::Logical(
format!("Overflow existing shard count {} while splitting", updated))
)? != old_shard_count.count() {
// Perhaps a deletion or another split raced with this attempt to split, mutating
// the parent shards that we intend to split. In this case the split request should fail.
return Err(DatabaseError::Logical(
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
));
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.set((splitting.eq(1),))
.execute(conn)?;
if u8::try_from(updated)
.map_err(|_| DatabaseError::Logical(
format!("Overflow existing shard count {} while splitting", updated))
)? != old_shard_count.count() {
// Perhaps a deletion or another split raced with this attempt to split, mutating
// the parent shards that we intend to split. In this case the split request should fail.
return Err(DatabaseError::Logical(
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
));
}
// FIXME: spurious clone to sidestep closure move rules
let parent_to_children = parent_to_children.clone();
// Insert child shards
for (parent_shard_id, children) in parent_to_children {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
"Parent shard {parent_shard_id} not found"
)));
} else {
parent.pop().unwrap()
};
for mut shard in children {
// Carry the parent's generation into the child
shard.generation = parent.generation;
debug_assert!(shard.splitting == SplitState::Splitting);
diesel::insert_into(tenant_shards)
.values(shard)
.execute(conn)?;
}
// FIXME: spurious clone to sidestep closure move rules
let parent_to_children = parent_to_children.clone();
// Insert child shards
for (parent_shard_id, children) in parent_to_children {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
"Parent shard {parent_shard_id} not found"
)));
} else {
parent.pop().unwrap()
};
for mut shard in children {
// Carry the parent's generation into the child
shard.generation = parent.generation;
debug_assert!(shard.splitting == SplitState::Splitting);
diesel::insert_into(tenant_shards)
.values(shard)
.execute(conn)?;
}
}
Ok(())
})?;
}
Ok(())
})
@@ -647,22 +678,18 @@ impl Persistence {
self.with_measured_conn(
DatabaseOperation::CompleteShardSplit,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
})?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
},
@@ -681,39 +708,34 @@ impl Persistence {
self.with_measured_conn(
DatabaseOperation::AbortShardSplit,
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted =
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
count {new_shard_count:?} on tenant {split_tenant_id}"
)));
}
)));
}
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
Ok(AbortShardSplitStatus::Aborted)
},
)
.await

View File

@@ -0,0 +1,23 @@
import requests
from requests.adapters import HTTPAdapter
class EndpointHttpClient(requests.Session):
def __init__(
self,
port: int,
):
super().__init__()
self.port = port
self.mount("http://", HTTPAdapter())
def dbs_and_roles(self):
res = self.get(f"http://localhost:{self.port}/dbs_and_roles")
res.raise_for_status()
return res.json()
def database_schema(self, database: str):
res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
res.raise_for_status()
return res.text

View File

@@ -48,6 +48,7 @@ from urllib3.util.retry import Retry
from fixtures import overlayfs
from fixtures.broker import NeonBroker
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
@@ -454,7 +455,7 @@ class NeonEnvBuilder:
test_overlay_dir: Optional[Path] = None,
pageserver_remote_storage: Optional[RemoteStorage] = None,
# toml that will be decomposed into `--config-override` flags during `pageserver --init`
pageserver_config_override: Optional[str] = None,
pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
num_safekeepers: int = 1,
num_pageservers: int = 1,
# Use non-standard SK ids to check for various parsing bugs
@@ -1126,10 +1127,14 @@ class NeonEnv:
)
if config.pageserver_config_override is not None:
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value
if callable(config.pageserver_config_override):
config.pageserver_config_override(ps_cfg)
else:
assert isinstance(config.pageserver_config_override, str)
for o in config.pageserver_config_override.split(";"):
override = toml.loads(o)
for key, value in override.items():
ps_cfg[key] = value
# Create a corresponding NeonPageserver object
self.pageservers.append(
@@ -2344,10 +2349,11 @@ class NeonStorageController(MetricsGetter, LogUtils):
log.info(f"reconcile_all waited for {n} shards")
return n
def reconcile_until_idle(self, timeout_secs=30, delay_max=5):
def reconcile_until_idle(self, timeout_secs=30):
start_at = time.time()
n = 1
delay_sec = 0.5
delay_max = 5
while n > 0:
n = self.reconcile_all()
if n == 0:
@@ -3372,6 +3378,13 @@ class Endpoint(PgProtocol):
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
def http_client(
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
) -> EndpointHttpClient:
return EndpointHttpClient(
port=self.http_port,
)
def create(
self,
branch_name: str,

View File

@@ -1,6 +1,5 @@
import concurrent.futures
import re
import threading
from pathlib import Path
import pytest
@@ -61,19 +60,21 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
]
)
env.storage_controller.allowed_errors.extend(
[
# The neon_local functionality for updating computes is flaky for unknown reasons
".*Local notification hook failed.*",
".*Marking shard.*for notification retry.*",
".*Failed to notify compute.*",
]
)
# Total tenants
tenant_count = 3
tenant_count = 4
# Transaction rate: we set this rather than running at full-speed because we
# might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently.
transaction_rate = 50
# Choose a pgbench scale that is just high enough to hit the split threshold around the time init
# finishes (we want splits going on during the main read/write bench)
pgbench_scale = 40
# Runtime selected to give storage controller time to do all the shard splits while it runs
pgbench_runtime = 180
transaction_rate = 100
class TenantState:
def __init__(self, timeline_id, endpoint):
@@ -84,9 +85,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
tenants = {}
for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)):
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}'
)
env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf)
endpoint = env.endpoints.create("main", tenant_id=tenant_id)
tenants[tenant_id] = TenantState(timeline_id, endpoint)
endpoint.start()
@@ -95,7 +94,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
pg_bin.run_capture(
[
"pgbench",
f"-s{pgbench_scale}",
"-s50",
"-i",
f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
]
@@ -140,10 +139,12 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
if low_watermark is None or low_watermark > tps:
low_watermark = tps
if tps < min_tps:
raise RuntimeError(
f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
)
# Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not
# at the same time as a shard split.
# if tps < min_tps:
# raise RuntimeError(
# f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
# )
log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}")
@@ -154,8 +155,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
out_path = pg_bin.run_capture(
[
"pgbench",
"-s50",
"-T",
f"{pgbench_runtime}",
"180",
"-R",
f"{transaction_rate}",
"-P",
@@ -170,8 +172,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
out_path = pg_bin.run_capture(
[
"pgbench",
"-s50",
"-T",
"60",
"30",
"-R",
f"{transaction_rate}",
"-S",
@@ -183,20 +186,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
check_pgbench_output(out_path)
background_reconcile_stop = threading.Event()
def background_reconcile_task():
# The controller will do all this autonomously, but with a 20 second wait between each
# time it considers doing a split/optimization. To enable a shorter test, actively
# poll the reconcile_all endpoint to make it all happen faster.
#
# Note that this is mainly to drain the post-split migrations faster, rather than to
# prompt the splits themselves.
while not background_reconcile_stop.is_set():
env.storage_controller.reconcile_until_idle(timeout_secs=pgbench_runtime, delay_max=0.5)
background_reconcile_stop.wait(5)
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads:
with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
pgbench_futs = []
for tenant_state in tenants.values():
fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
@@ -206,8 +196,6 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for fut in pgbench_futs:
fut.result()
reconcile_fut = pgbench_threads.submit(background_reconcile_task)
pgbench_futs = []
for tenant_state in tenants.values():
fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
@@ -217,10 +205,6 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for fut in pgbench_futs:
fut.result()
log.info("Waiting for background reconcile thread")
background_reconcile_stop.set()
reconcile_fut.result()
def assert_all_split():
for tenant_id in tenants.keys():
shards = tenant_get_shards(env, tenant_id)

View File

@@ -0,0 +1,72 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
logical_replication_sync,
)
def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_config = client.tenant_config(tenant_id).effective_config
tenant_config["switch_aux_file_policy"] = "V2"
client.set_tenant_config(tenant_id, tenant_config)
# aux file v2 is enabled on the write path, so for now, it should be unset (or null)
assert (
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
is None
)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("create table t(pk integer primary key, payload integer)")
cur.execute(
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
)
cur.execute("create publication pub1 for table t, replication_example")
# Now start the subscriber; aux files will be created at this point. TODO: find better ways of testing aux files (e.g., neon_test_utils)
# instead of going through the full logical replication process.
vanilla_pg.start()
vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
vanilla_pg.safe_psql(
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
)
connstr = endpoint.connstr().replace("'", "''")
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
# Wait for the logical replication channel to be established
logical_replication_sync(vanilla_pg, endpoint)
vanilla_pg.stop()
endpoint.stop()
with env.pageserver.http_client() as client:
# aux file v2 flag should be enabled at this point
assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
with env.pageserver.http_client() as client:
tenant_config = client.tenant_config(tenant_id).effective_config
tenant_config["switch_aux_file_policy"] = "V1"
client.set_tenant_config(tenant_id, tenant_config)
# the flag should still be enabled
assert (
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
"last_aux_file_policy"
]
== "V2"
)
env.pageserver.restart()
with env.pageserver.http_client() as client:
# aux file v2 flag should be persisted
assert (
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
"last_aux_file_policy"
]
== "V2"
)

View File

@@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_branch_behind(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
error_regexes = [
".*invalid branch start lsn.*",

View File

@@ -0,0 +1,34 @@
import requests
from fixtures.neon_fixtures import NeonEnv
def test_compute_catalog(neon_simple_env: NeonEnv):
env = neon_simple_env
env.neon_cli.create_branch("test_config", "empty")
endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
client = endpoint.http_client()
objects = client.dbs_and_roles()
# Assert that 'cloud_admin' role exists in the 'roles' list
assert any(
role["name"] == "cloud_admin" for role in objects["roles"]
), "The 'cloud_admin' role is missing"
# Assert that 'postgres' database exists in the 'databases' list
assert any(
db["name"] == "postgres" for db in objects["databases"]
), "The 'postgres' database is missing"
ddl = client.database_schema(database="postgres")
assert "-- PostgreSQL database dump" in ddl
try:
client.database_schema(database="nonexistentdb")
raise AssertionError("Expected HTTPError was not raised")
except requests.exceptions.HTTPError as e:
assert (
e.response.status_code == 404
), f"Expected 404 status code, but got {e.response.status_code}"

View File

@@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
#
def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
endpoint = env.endpoints.create_start("test_gc_aggressive")
@@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
#
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
-# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
-neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-env = neon_env_builder.init_start()
+# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
+env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
endpoint = env.endpoints.create_start("test_gc_index_upload")


@@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
-neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-env = neon_env_builder.init_start()
+env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
env.neon_cli.create_branch("test_old_request_lsn", "main")
endpoint = env.endpoints.create_start("test_old_request_lsn")


@@ -42,6 +42,8 @@ def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
"listen_http_addr",
"pg_auth_type",
"http_auth_type",
+# TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
+# "tenant_config",
]
required_config_overrides = [
f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys


@@ -25,6 +25,7 @@ from fixtures.neon_fixtures import (
S3Scrubber,
generate_uploads_and_deletions,
)
+from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_tenant_state,
@@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths(
generation numbers: it should accept these layer files, and avoid doing
a delete/download cycle on them.
"""
-env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-tenant_id = env.initial_tenant
-timeline_id = env.initial_timeline
+neon_env_builder.num_pageservers = 2
+env = neon_env_builder.init_configs()
+env.start()
+tenant_id = TenantId.generate()
+timeline_id = TimelineId.generate()
+env.neon_cli.create_tenant(
+tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+)
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(1000)
-env.pageserver.stop()
+attached_pageserver = env.get_tenant_pageserver(tenant_id)
+secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
+0
+]
+attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
+secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
# Rename the local paths to legacy format, to simulate what
-# we would see when upgrading
-timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
-files_renamed = 0
-for filename in os.listdir(timeline_dir):
-path = os.path.join(timeline_dir, filename)
-log.info(f"Found file {path}")
-if path.endswith("-v1-00000001"):
-new_path = path[:-12]
-os.rename(path, new_path)
-log.info(f"Renamed {path} -> {new_path}")
-files_renamed += 1
+# we would see when upgrading. Do this on both attached and secondary locations, as we will
+# test the behavior of both.
+for pageserver in env.pageservers:
+pageserver.stop()
+timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+files_renamed = 0
+for filename in os.listdir(timeline_dir):
+path = os.path.join(timeline_dir, filename)
+log.info(f"Found file {path}")
+if path.endswith("-v1-00000001"):
+new_path = path[:-12]
+os.rename(path, new_path)
+log.info(f"Renamed {path} -> {new_path}")
+files_renamed += 1
-assert files_renamed > 0
+assert files_renamed > 0
-env.pageserver.start()
+pageserver.start()
workload.validate()
# Assert that there were no on-demand downloads
assert (
-env.pageserver.http_client().get_metric_value(
+attached_pageserver.http_client().get_metric_value(
"pageserver_remote_ondemand_downloaded_layers_total"
)
== 0
)
+# Do a secondary download and ensure there were no layer downloads
+secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+assert (
+secondary_pageserver.http_client().get_metric_value(
+"pageserver_secondary_download_layer_total"
+)
+== 0
+)
+# Check that when we evict and promote one of the legacy-named layers, everything works as
+# expected
+local_layers = list(
+(
+parse_layer_file_name(path.name),
+os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
+)
+for path in attached_pageserver.list_layers(tenant_id, timeline_id)
+)
+(victim_layer_name, victim_path) = local_layers[0]
+assert os.path.exists(victim_path)
+attached_pageserver.http_client().evict_layer(
+tenant_id, timeline_id, victim_layer_name.to_str()
+)
+assert not os.path.exists(victim_path)
+attached_pageserver.http_client().download_layer(
+tenant_id, timeline_id, victim_layer_name.to_str()
+)
+# We should download into the same local path we started with
+assert os.path.exists(victim_path)
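
The slice path[:-12] above strips exactly the 12-character generation suffix matched by the endswith check. A small sketch with a named constant makes the intent explicit (the helper name is hypothetical; it assumes the suffix is always the literal "-v1-00000001" as in the check above):

    # Strip the generation suffix to recreate the legacy, pre-generation layer file name.
    GENERATION_SUFFIX = "-v1-00000001"  # 12 characters, hence path[:-12] above

    def legacy_layer_path(path: str) -> str:
        assert path.endswith(GENERATION_SUFFIX)
        return path[: -len(GENERATION_SUFFIX)]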


@@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar
#
def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
# Set pitr interval such that we need to keep the data
-neon_env_builder.pageserver_config_override = (
-"tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
+env = neon_env_builder.init_start(
+initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"}
)
-env = neon_env_builder.init_start()
endpoint_main = env.endpoints.create_start("main")
main_pg_conn = endpoint_main.connect()


@@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy):
query_message = "SELECT 1".encode("utf-8") + b"\0"
length2 = (4 + len(query_message)).to_bytes(4, byteorder="big")
await websocket.send(
-[length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
+length0
++ startup_message
++ b"p"
++ length1
++ auth_message
++ b"Q"
++ length2
++ query_message
)
startup_response = await websocket.recv()
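
The rewrite above replaces a list of byte fragments with a single concatenated payload, presumably so the pipelined startup, password, and query messages go out as one websocket binary frame rather than a fragmented message. An equivalent form using b"".join, assuming the framing variables built earlier in this test:

    # One binary frame carrying the pipelined startup, password, and query messages.
    frame = b"".join(
        [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
    )
    await websocket.send(frame)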


@@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder
#
def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
# Override default checkpointer settings to run it more often
-neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
-env = neon_env_builder.init_start()
+env = neon_env_builder.init_start(
+initial_tenant_conf={
+"checkpoint_distance": "1048576",
+}
+)
env.pageserver.is_testing_enabled_or_skip()
# We expect the pageserver to exit, which will cause the storage controller


@@ -1,5 +1,6 @@
import json
from contextlib import closing
+from typing import Any, Dict
import psycopg2.extras
from fixtures.common_types import Lsn
@@ -14,16 +15,22 @@ from fixtures.utils import wait_until
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
"""Test per tenant configuration"""
# set some non-default global config
-neon_env_builder.pageserver_config_override = """
-page_cache_size=444;
-wait_lsn_timeout='111 s';
-[tenant_config]
-checkpoint_distance = 10000
-compaction_target_size = 1048576
-evictions_low_residence_duration_metric_threshold = "2 days"
-eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
-"""
+def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]):
+ps_cfg["page_cache_size"] = 444
+ps_cfg["wait_lsn_timeout"] = "111 s"
+tenant_config = ps_cfg.setdefault("tenant_config", {})
+tenant_config["checkpoint_distance"] = 10000
+tenant_config["compaction_target_size"] = 1048576
+tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
+tenant_config["eviction_policy"] = {
+"kind": "LayerAccessThreshold",
+"period": "20s",
+"threshold": "23 hours",
+}
+neon_env_builder.pageserver_config_override = set_some_nondefault_global_config
env = neon_env_builder.init_start()
# we configure eviction but no remote storage, so there might be error lines
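
As the hunk above shows, pageserver_config_override now also accepts a callable that mutates the parsed pageserver TOML as a dict. A reusable sketch of that pattern (override_tenant_config is a hypothetical helper, not part of the fixtures):

    from typing import Any, Dict

    def override_tenant_config(**overrides: Any):
        # Returns a callable suitable for neon_env_builder.pageserver_config_override:
        # it merges the given keys into the [tenant_config] section of the pageserver TOML.
        def apply(ps_cfg: Dict[str, Any]) -> None:
            ps_cfg.setdefault("tenant_config", {}).update(overrides)
        return apply

    # e.g.: neon_env_builder.pageserver_config_override = override_tenant_config(
    #     walreceiver_connect_timeout="2s", lagging_wal_timeout="2s"
    # )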


@@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(
gc_horizon = 128 * 1024
-neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
-env = neon_env_builder.init_start()
+env = neon_env_builder.init_start(
+initial_tenant_conf={
+"compaction_period": "0s",
+"gc_period": "0s",
+"pitr_interval": "0sec",
+"gc_horizon": gc_horizon,
+}
+)
# FIXME: we have a race condition between GC and delete timeline. GC might fail with this
# error. Similar to https://github.com/neondatabase/neon/issues/2671


@@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
# Disable background compaction as we don't want it to happen after the `get_physical_size` request
# and before checking the expected size on disk, which would make the assertion fail
-neon_env_builder.pageserver_config_override = (
-"tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
+env = neon_env_builder.init_start(
+initial_tenant_conf={
+"checkpoint_distance": "100000",
+"compaction_period": "10m",
+}
)
-env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
@@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
# Disable background compaction and GC as we don't want them to happen after the `get_physical_size` request
# and before checking the expected size on disk, which would make the assertion fail
-neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
-env = neon_env_builder.init_start()
+env = neon_env_builder.init_start(
+initial_tenant_conf={
+"checkpoint_distance": "100000",
+"compaction_period": "0s",
+"gc_period": "0s",
+"pitr_interval": "1s",
+}
+)
pageserver_http = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")


@@ -1,4 +1,5 @@
import time
+from typing import Any, Dict
from fixtures.common_types import Lsn, TenantId
from fixtures.log_helper import log
@@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# Kills one of the safekeepers and ensures that only the active ones are printed in the state.
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
# Trigger WAL wait timeout faster
-neon_env_builder.pageserver_config_override = """
-wait_lsn_timeout = "1s"
-tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
-"""
+def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
+ps_cfg["wait_lsn_timeout"] = "1s"
+tenant_config = ps_cfg.setdefault("tenant_config", {})
+tenant_config["walreceiver_connect_timeout"] = "2s"
+tenant_config["lagging_wal_timeout"] = "2s"
+neon_env_builder.pageserver_config_override = customize_pageserver_toml
# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
neon_env_builder.safekeepers_id_start = 12345
neon_env_builder.num_safekeepers = 3


@@ -1,5 +1,5 @@
{
"v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
"v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"],
"v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"]
"v16": ["16.2", "1f63dd206a8aaa4727baad334c548219c52878e1"],
"v15": ["15.6", "6fd679f5154d12f4892ddd450cc6be28a8ac31b0"],
"v14": ["14.11", "a6dc3f010da31472a7ae9ab0ddfbf6e49131d93c"]
}
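
The entries above map a Postgres major version to a [minor version, vendored commit SHA] pair; this hunk only swaps the SHAs. A sketch of reading the pins (the vendor/revisions.json path is an assumption, as the diff does not show the file name):

    import json

    # Assumed file name; only the structure is taken from the hunk above.
    with open("vendor/revisions.json") as f:
        revisions = json.load(f)

    for major, (minor, sha) in revisions.items():
        print(f"{major}: postgres {minor} pinned at {sha}")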