Compare commits


4 Commits

Author SHA1 Message Date
Rahul Patil
3883d23316 ci: Trigger E2E test only after images available on registry 2024-07-10 14:54:58 +02:00
Conrad Ludgate
fe13fccdc2 proxy: pg17 fixes (#8321)
## Problem

#7809 - we do not support `sslnegotiation=direct`.
#7810 - we do not support negotiating the protocol extensions down.

## Summary of changes

1. As postgres does, check whether the first byte of the startup packet is the
TLS handshake header `0x16`, and check the ALPN.
2. Tell clients requesting a protocol version above 3.0 to downgrade.
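
For illustration, here is a minimal sketch of the detection logic (hypothetical
helper names; the real checks live in the `pq_proto` parser and the proxy
handshake changed below). A direct-TLS client opens with a raw TLS ClientHello,
whose first byte is the handshake record type `0x16`, while a legacy client
sends a length-prefixed `SSLRequest` packet; after the TLS handshake, any ALPN
the client offered must be `postgresql`.

```rust
/// Hypothetical helper: true if the peer opened with a raw TLS ClientHello
/// (sslnegotiation=direct) rather than a length-prefixed Postgres SSLRequest.
fn is_direct_tls(first_bytes: &[u8]) -> bool {
    // A TLS record starts with the handshake content type 0x16. A Postgres
    // startup packet starts with a 4-byte big-endian length, which would have
    // to be hundreds of megabytes long for its first byte to be 0x16.
    first_bytes.first() == Some(&0x16)
}

/// Hypothetical helper: after the TLS handshake, the negotiated ALPN protocol
/// (if the client sent one) must be "postgresql"; anything else is rejected.
fn alpn_ok(alpn: Option<&[u8]>) -> bool {
    const PG_ALPN_PROTOCOL: &[u8] = b"postgresql";
    match alpn {
        None => true,
        Some(proto) => proto == PG_ALPN_PROTOCOL,
    }
}
```
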
2024-07-10 09:10:29 +01:00
Christian Schwarz
1a49f1c15c pageserver: move page_service's import basebackup / import wal to mgmt API (#8292)
I want to fix bugs in `page_service`
([issue](https://github.com/neondatabase/neon/issues/7427)), and the
`import basebackup` / `import wal` commands stand in the way and make the
refactoring more complicated.

We don't use these methods in practice, but there have been some
objections to removing the functionality completely.

So, this PR preserves the existing functionality but moves it into the
HTTP management API.

Note that I don't try to fix existing bugs in the code; specifically, I am not
fixing the following:
* it only ever worked correctly for unsharded tenants
* it doesn't clean up on error

All errors are mapped to `ApiError::InternalServerError`.
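
For illustration, a sketch of how a caller could drive the relocated
functionality over HTTP (the PUT method, route, and query parameters mirror the
handlers added here; the pageserver address, IDs, LSNs, and file path are
placeholders).

```rust
// Sketch only: import a basebackup through the HTTP management API instead of
// the removed `import basebackup` libpq command. Assumes `reqwest` (with the
// "stream" feature), `tokio`, `tokio-util`, and `anyhow` as dependencies.
use tokio_util::io::ReaderStream;

async fn import_basebackup_via_http() -> anyhow::Result<()> {
    let endpoint = "http://pageserver:9898"; // mgmt API base URL (placeholder)
    let (tenant_id, timeline_id) = ("<tenant_id>", "<timeline_id>");
    let (base_lsn, end_lsn, pg_version) = ("0/149F0D8", "0/149F0D8", 16);

    // Stream base.tar as the request body instead of buffering it in memory.
    let tar = tokio::fs::File::open("my_backup/base.tar").await?;
    let body = reqwest::Body::wrap_stream(ReaderStream::new(tar));

    let url = format!(
        "{endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup\
         ?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}"
    );
    let resp = reqwest::Client::new().put(url).body(body).send().await?;
    anyhow::ensure!(resp.status().is_success(), "import failed: {}", resp.status());
    Ok(())
}
```

Importing WAL works the same way against the `import_wal` route, with
`start_lsn`/`end_lsn` query parameters.
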
2024-07-09 23:17:42 +02:00
Christian Schwarz
9bb16c8780 fix(l0_flush): drops permit before fsync, potential cause for OOMs (#8327)
## Problem

Slack thread:
https://neondb.slack.com/archives/C033RQ5SPDH/p1720511577862519

We're seeing OOMs in staging on a pageserver that has
`l0_flush.mode=Direct` enabled.

There's a strong correlation between jumps in `maxrss_kb` and
`pageserver_timeline_ephemeral_bytes`, so it's quite likely that
`l0_flush.mode=Direct` is the culprit.

Notably, the expected max memory usage from `l0_flush.mode=Direct` on that
staging server is ~2GiB, but we're seeing as much as 24GiB max RSS before the
OOM kill.

One hypothesis is that we're dropping the semaphore permit before all
the dirtied pages have been flushed to disk. (The flushing to disk
likely happens in the fsync inside the `.finish()` call, because we're
using ext4 in data=ordered mode).

## Summary of changes

Hold the permit until after we're done with `.finish()`.
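
For illustration, a minimal sketch of the pattern (hypothetical function and
path; the actual change is a one-line move of `drop(_concurrency_permit)` in
`InMemoryLayer`, shown in the diff below): the permit that bounds the number of
in-flight flush buffers has to stay alive across the fsync, not just until the
buffer has been handed to the filesystem.

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

// Hypothetical flush routine: the semaphore bounds how many flush buffers may
// exist concurrently, so the permit must be held until *all* IO is done,
// including the fsync, rather than being dropped as soon as the write is issued.
async fn flush_one_layer(semaphore: Arc<Semaphore>, buf: Vec<u8>) -> std::io::Result<()> {
    let permit = semaphore.acquire_owned().await.expect("semaphore closed");

    let mut file = tokio::fs::File::create("/tmp/layer.tmp").await?;
    tokio::io::AsyncWriteExt::write_all(&mut file, &buf).await?;

    // On ext4 with data=ordered, the dirtied page cache pages are only written
    // back during the fsync; releasing the permit before this point lets more
    // buffers accumulate in memory than the semaphore is supposed to allow.
    file.sync_all().await?;

    drop(permit); // release only after the fsync has completed
    Ok(())
}
```
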
2024-07-09 19:58:48 +01:00
20 changed files with 546 additions and 474 deletions

View File

@@ -6,6 +6,11 @@ on:
- ready_for_review
workflow_call:
workflow_run:
workflows: ["Build and Test"]
types:
- completed
defaults:
run:
shell: bash -euxo pipefail {0}

View File

@@ -15,7 +15,6 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -566,60 +565,39 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
let client = std::pin::pin!(client);
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
let base_tarfile =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
let wal_reader =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};
// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
self.http_client
.import_basebackup(
tenant_id,
timeline_id,
start_lsn,
end_lsn,
pg_version,
base_tarfile,
)
.await?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
self.http_client
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
.await?;
}
Ok(())

View File

@@ -663,11 +663,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
assert!(self.state < ProtoState::Authentication);
let have_tls = self.tls_config.is_some();
match msg {
FeStartupPacket::SslRequest => {
FeStartupPacket::SslRequest { direct } => {
debug!("SSL requested");
self.write_message(&BeMessage::EncryptionResponse(have_tls))
.await?;
if !direct {
self.write_message(&BeMessage::EncryptionResponse(have_tls))
.await?;
} else if !have_tls {
return Err(QueryError::Other(anyhow::anyhow!(
"direct SSL negotiation but no TLS support"
)));
}
if have_tls {
self.start_tls().await?;

View File

@@ -44,9 +44,9 @@ impl ConnectionError {
/// Wraps async io `stream`, providing messages to write/flush + read Postgres
/// messages.
pub struct Framed<S> {
stream: S,
read_buf: BytesMut,
write_buf: BytesMut,
pub stream: S,
pub read_buf: BytesMut,
pub write_buf: BytesMut,
}
impl<S> Framed<S> {

View File

@@ -39,14 +39,39 @@ pub enum FeMessage {
PasswordMessage(Bytes),
}
#[derive(Clone, Copy, PartialEq, PartialOrd)]
pub struct ProtocolVersion(u32);
impl ProtocolVersion {
pub const fn new(major: u16, minor: u16) -> Self {
Self((major as u32) << 16 | minor as u32)
}
pub const fn minor(self) -> u16 {
self.0 as u16
}
pub const fn major(self) -> u16 {
(self.0 >> 16) as u16
}
}
impl fmt::Debug for ProtocolVersion {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_list()
.entry(&self.major())
.entry(&self.minor())
.finish()
}
}
#[derive(Debug)]
pub enum FeStartupPacket {
CancelRequest(CancelKeyData),
SslRequest,
SslRequest {
direct: bool,
},
GssEncRequest,
StartupMessage {
major_version: u32,
minor_version: u32,
version: ProtocolVersion,
params: StartupMessageParams,
},
}
@@ -301,11 +326,23 @@ impl FeStartupPacket {
/// different from [`FeMessage::parse`] because startup messages don't have
/// message type byte; otherwise, its comments apply.
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
const CANCEL_REQUEST_CODE: u32 = 5678;
const NEGOTIATE_SSL_CODE: u32 = 5679;
const NEGOTIATE_GSS_CODE: u32 = 5680;
const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
// <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
// First byte indicates standard SSL handshake message
// (It can't be a Postgres startup length because in network byte order
// that would be a startup packet hundreds of megabytes long)
if buf.first() == Some(&0x16) {
return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
}
// need at least 4 bytes with packet len
if buf.len() < 4 {
@@ -338,12 +375,10 @@ impl FeStartupPacket {
let mut msg = buf.split_to(len).freeze();
msg.advance(4); // consume len
let request_code = msg.get_u32();
let req_hi = request_code >> 16;
let req_lo = request_code & ((1 << 16) - 1);
let request_code = ProtocolVersion(msg.get_u32());
// StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
let message = match (req_hi, req_lo) {
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
let message = match request_code {
CANCEL_REQUEST_CODE => {
if msg.remaining() != 8 {
return Err(ProtocolError::BadMessage(
"CancelRequest message is malformed, backend PID / secret key missing"
@@ -355,21 +390,22 @@ impl FeStartupPacket {
cancel_key: msg.get_i32(),
})
}
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
NEGOTIATE_SSL_CODE => {
// Requested upgrade to SSL (aka TLS)
FeStartupPacket::SslRequest
FeStartupPacket::SslRequest { direct: false }
}
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
NEGOTIATE_GSS_CODE => {
// Requested upgrade to GSSAPI
FeStartupPacket::GssEncRequest
}
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
return Err(ProtocolError::Protocol(format!(
"Unrecognized request code {unrecognized_code}"
"Unrecognized request code {}",
version.minor()
)));
}
// TODO bail if protocol major_version is not 3?
(major_version, minor_version) => {
version => {
// StartupMessage
let s = str::from_utf8(&msg).map_err(|_e| {
@@ -382,8 +418,7 @@ impl FeStartupPacket {
})?;
FeStartupPacket::StartupMessage {
major_version,
minor_version,
version,
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
@@ -522,6 +557,10 @@ pub enum BeMessage<'a> {
RowDescription(&'a [RowDescriptor<'a>]),
XLogData(XLogDataBody<'a>),
NoticeResponse(&'a str),
NegotiateProtocolVersion {
version: ProtocolVersion,
options: &'a [&'a str],
},
KeepAlive(WalSndKeepAlive),
}
@@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> {
buf.put_u8(u8::from(req.request_reply));
});
}
BeMessage::NegotiateProtocolVersion { version, options } => {
buf.put_u8(b'v');
write_body(buf, |buf| {
buf.put_u32(version.0);
buf.put_u32(options.len() as u32);
for option in options.iter() {
write_cstr(option, buf)?;
}
Ok(())
})?
}
}
Ok(())
}

View File

@@ -74,6 +74,15 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
.transpose()
}
pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
request: &Request<Body>,
param_name: &str,
) -> Result<T, ApiError> {
parse_query_param(request, param_name)?.ok_or_else(|| {
ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
})
}
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
match request.body_mut().data().await {
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),

View File

@@ -8,7 +8,7 @@ license.workspace = true
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -9,6 +9,8 @@ use utils::{
lsn::Lsn,
};
pub use reqwest::Body as ReqwestBody;
pub mod util;
#[derive(Debug, Clone)]
@@ -20,6 +22,9 @@ pub struct Client {
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("send request: {0}")]
SendRequest(reqwest::Error),
#[error("receive body: {0}")]
ReceiveBody(reqwest::Error),
@@ -173,19 +178,30 @@ impl Client {
self.request(Method::GET, uri, ()).await
}
fn start_request<U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
) -> reqwest::RequestBuilder {
let req = self.client.request(method, uri);
if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
}
}
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
let req = self.client.request(method, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.json(&body).send().await.map_err(Error::ReceiveBody)
self.start_request(method, uri)
.json(&body)
.send()
.await
.map_err(Error::ReceiveBody)
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -609,4 +625,53 @@ impl Client {
}),
}
}
pub async fn import_basebackup(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
base_lsn: Lsn,
end_lsn: Lsn,
pg_version: u32,
basebackup_tarball: ReqwestBody,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
self.mgmt_api_endpoint,
);
self.start_request(Method::PUT, uri)
.body(basebackup_tarball)
.send()
.await
.map_err(Error::SendRequest)?
.error_from_body()
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn import_wal(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
wal_tarball: ReqwestBody,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
self.mgmt_api_endpoint,
);
self.start_request(Method::PUT, uri)
.body(wal_tarball)
.send()
.await
.map_err(Error::SendRequest)?
.error_from_body()
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -660,7 +660,6 @@ fn start_pageserver(
async move {
page_service::libpq_listener_main(
tenant_manager,
broker_client,
pg_auth,
pageserver_listener,
conf.pg_auth_type,

View File

@@ -10,6 +10,7 @@ use std::time::Duration;
use anyhow::{anyhow, Context, Result};
use enumset::EnumSet;
use futures::StreamExt;
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
@@ -44,12 +45,14 @@ use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::request::must_parse_query_param;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use crate::context::{DownloadBehavior, RequestContext};
@@ -2404,6 +2407,189 @@ async fn post_top_tenants(
)
}
async fn put_tenant_timeline_import_basebackup(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
async move {
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
let broker_client = state.broker_client.clone();
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.map_err(ApiError::InternalServerError)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
// from connecting before that and writing conflicting wal.
//
// This is not relevant for pageserver->pageserver migrations, since there's
// no wal to import. But should be fixed if we want to import from postgres.
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import basebackup provided via CopyData
info!("importing basebackup");
timeline
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
// Read the end of the tar archive.
read_tar_eof(body)
.await
.map_err(ApiError::InternalServerError)?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
// and checking that it matches in size with what was imported.
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
info!("done");
json_response(StatusCode::OK, ())
}
.instrument(span)
.await
}
async fn put_tenant_timeline_import_wal(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
async move {
let state = get_state(&request);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import wal provided via CopyData
info!("importing wal");
crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
info!("wal import complete");
// Read the end of the tar archive.
read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
// TODO Does it make sense to overshoot?
if timeline.get_last_record_lsn() < end_lsn {
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})?;
info!("done");
json_response(StatusCode::OK, ())
}.instrument(span).await
}
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
use tokio::io::AsyncReadExt;
let mut buf = [0u8; 512];
// Read the all-zeros block, and verify it
let mut total_bytes = 0;
while total_bytes < 512 {
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
total_bytes += nbytes;
if nbytes == 0 {
break;
}
}
if total_bytes < 512 {
anyhow::bail!("incomplete or invalid tar EOF marker");
}
if !buf.iter().all(|&x| x == 0) {
anyhow::bail!("invalid tar EOF marker");
}
// Drain any extra zero-blocks after the EOF marker
let mut trailing_bytes = 0;
let mut seen_nonzero_bytes = false;
loop {
let nbytes = reader.read(&mut buf).await?;
trailing_bytes += nbytes;
if !buf.iter().all(|&x| x == 0) {
seen_nonzero_bytes = true;
}
if nbytes == 0 {
break;
}
}
if seen_nonzero_bytes {
anyhow::bail!("unexpected non-zero bytes after the tar archive");
}
if trailing_bytes % 512 != 0 {
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
}
Ok(())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -2698,5 +2884,13 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|r| testing_api_handler("perf_info", r, perf_info),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
|r| api_handler(r, put_tenant_timeline_import_basebackup),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
|r| api_handler(r, put_tenant_timeline_import_wal),
)
.any(handler_404))
}

View File

@@ -1473,8 +1473,6 @@ pub(crate) enum ComputeCommandKind {
PageStream,
Basebackup,
Fullbackup,
ImportBasebackup,
ImportWal,
LeaseLsn,
Show,
}

View File

@@ -4,9 +4,7 @@
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::TenantState;
@@ -28,7 +26,6 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
@@ -37,7 +34,6 @@ use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::ConnectionId;
@@ -53,7 +49,6 @@ use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
use crate::pgdatadir_mapping::Version;
@@ -66,7 +61,6 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -82,56 +76,6 @@ use postgres_ffi::BLCKSZ;
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
use tokio::io::AsyncReadExt;
let mut buf = [0u8; 512];
// Read the all-zeros block, and verify it
let mut total_bytes = 0;
while total_bytes < 512 {
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
total_bytes += nbytes;
if nbytes == 0 {
break;
}
}
if total_bytes < 512 {
anyhow::bail!("incomplete or invalid tar EOF marker");
}
if !buf.iter().all(|&x| x == 0) {
anyhow::bail!("invalid tar EOF marker");
}
// Drain any extra zero-blocks after the EOF marker
let mut trailing_bytes = 0;
let mut seen_nonzero_bytes = false;
loop {
let nbytes = reader.read(&mut buf).await?;
trailing_bytes += nbytes;
if !buf.iter().all(|&x| x == 0) {
seen_nonzero_bytes = true;
}
if nbytes == 0 {
break;
}
}
if seen_nonzero_bytes {
anyhow::bail!("unexpected non-zero bytes after the tar archive");
}
if trailing_bytes % 512 != 0 {
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
}
Ok(())
}
///////////////////////////////////////////////////////////////////////////////
///
@@ -141,7 +85,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
///
pub async fn libpq_listener_main(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
listener: TcpListener,
auth_type: AuthType,
@@ -186,7 +129,6 @@ pub async fn libpq_listener_main(
false,
page_service_conn_main(
tenant_manager.clone(),
broker_client.clone(),
local_auth,
socket,
auth_type,
@@ -209,7 +151,6 @@ pub async fn libpq_listener_main(
#[instrument(skip_all, fields(peer_addr))]
async fn page_service_conn_main(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
@@ -262,8 +203,7 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler =
PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
@@ -294,7 +234,6 @@ struct HandlerTimeline {
}
struct PageServerHandler {
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -386,13 +325,11 @@ impl From<WaitLsnError> for QueryError {
impl PageServerHandler {
pub fn new(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
connection_ctx: RequestContext,
) -> Self {
PageServerHandler {
tenant_manager,
broker_client,
auth,
claims: None,
connection_ctx,
@@ -475,73 +412,6 @@ impl PageServerHandler {
)
}
fn copyin_stream<'a, IO>(
&'a self,
pgb: &'a mut PostgresBackend<IO>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> + 'a
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
async_stream::try_stream! {
loop {
let msg = tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
Err(QueryError::Shutdown)
}
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
};
match msg {
Ok(Some(message)) => {
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = "client terminated connection with Terminate message during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {m:?}");
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
};
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
}
};
}
}
}
#[instrument(skip_all)]
async fn handle_pagerequests<IO>(
&mut self,
@@ -713,128 +583,6 @@ impl PageServerHandler {
Ok(())
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
async fn handle_import_basebackup<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
base_lsn: Lsn,
_end_lsn: Lsn,
pg_version: u32,
ctx: RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
// Create empty timeline
info!("creating new timeline");
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
.await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
// from connecting before that and writing conflicting wal.
//
// This is not relevant for pageserver->pageserver migrations, since there's
// no wal to import. But should be fixed if we want to import from postgres.
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import basebackup provided via CopyData
info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
timeline
.import_basebackup_from_tar(
tenant.clone(),
&mut copyin_reader,
base_lsn,
self.broker_client.clone(),
&ctx,
)
.await?;
// Read the end of the tar archive.
read_tar_eof(copyin_reader).await?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
// and checking that it matches in size with what was imported.
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
info!("done");
Ok(())
}
#[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
async fn handle_import_wal<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
ctx: RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import wal provided via CopyData
info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");
// Read the end of the tar archive.
read_tar_eof(copyin_reader).await?;
// TODO Does it make sense to overshoot?
if timeline.get_last_record_lsn() < end_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
info!("done");
Ok(())
}
/// Helper function to handle the LSN from client request.
///
/// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1705,109 +1453,6 @@ where
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("import basebackup ") {
// Import the `base` section (everything but the wal) of a basebackup.
// Assumes the tenant already exists on this pageserver.
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
//
// Example import command:
// 1. Get start/end LSN from backup_manifest file
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let params = &parts[2..];
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let base_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
let pg_version = u32::from_str(params[4])
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportBasebackup)
.inc();
match self
.handle_import_basebackup(
pgb,
tenant_id,
timeline_id,
base_lsn,
end_lsn,
pg_version,
ctx,
)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.starts_with("import wal ") {
// Import the `pg_wal` section of a basebackup.
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
let params = &parts[2..];
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let start_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportWal)
.inc();
match self
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect

View File

@@ -715,16 +715,22 @@ impl InMemoryLayer {
res?;
}
}
// Hold the permit until the IO is done; if we didn't, one could drop this future,
// thereby releasing the permit, but the Vec<u8> remains allocated until the IO completes.
// => we'd have more concurrent Vec<u8> than allowed as per the semaphore.
drop(_concurrency_permit);
}
}
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
//
// If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
// the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
// Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
//
// We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
drop(_concurrency_permit);
Ok(Some(delta_layer))
}
}

poetry.lock (generated)
View File

@@ -3133,18 +3133,18 @@ multidict = ">=4.0"
[[package]]
name = "zipp"
version = "3.19.1"
version = "3.8.1"
description = "Backport of pathlib-compatible object wrapper for zip files"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
{file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
{file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"},
{file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"},
]
[package.extras]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"]
testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
[[package]]
name = "zstandard"

View File

@@ -216,10 +216,11 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
use pq_proto::FeStartupPacket::*;
match msg {
SslRequest => {
SslRequest { direct: false } => {
stream
.write_message(&pq_proto::BeMessage::EncryptionResponse(true))
.await?;
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.

View File

@@ -75,6 +75,9 @@ impl TlsConfig {
}
}
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L159>
pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql";
/// Configure TLS for the main endpoint.
pub fn configure_tls(
key_path: &str,
@@ -111,16 +114,17 @@ pub fn configure_tls(
let cert_resolver = Arc::new(cert_resolver);
// allow TLS 1.2 to be compatible with older client libraries
let config = rustls::ServerConfig::builder_with_protocol_versions(&[
let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[
&rustls::version::TLS13,
&rustls::version::TLS12,
])
.with_no_client_auth()
.with_cert_resolver(cert_resolver.clone())
.into();
.with_cert_resolver(cert_resolver.clone());
config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()];
Ok(TlsConfig {
config,
config: Arc::new(config),
common_names,
cert_resolver,
})

View File

@@ -1,11 +1,17 @@
use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams};
use bytes::Buf;
use pq_proto::{
framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion,
StartupMessageParams,
};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use tracing::{info, warn};
use crate::{
config::TlsConfig,
auth::endpoint_sni,
config::{TlsConfig, PG_ALPN_PROTOCOL},
error::ReportableError,
metrics::Metrics,
proxy::ERR_INSECURE_CONNECTION,
stream::{PqStream, Stream, StreamUpgradeError},
};
@@ -68,6 +74,9 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0);
const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
@@ -75,40 +84,96 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
use FeStartupPacket::*;
match msg {
SslRequest => match stream.get_ref() {
SslRequest { direct } => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
let have_tls = tls.is_some();
if !direct {
stream
.write_message(&Be::EncryptionResponse(have_tls))
.await?;
} else if !have_tls {
return Err(HandshakeError::ProtocolViolation);
}
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says TLS handshake is ok and read_buf is empty.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
let Framed {
stream: raw,
read_buf,
write_buf,
} = stream.framed;
let Stream::Raw { raw } = raw else {
return Err(HandshakeError::StreamUpgradeError(
StreamUpgradeError::AlreadyTls,
));
};
let mut read_buf = read_buf.reader();
let mut res = Ok(());
let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config())
.accept_with(raw, |session| {
// push the early data to the tls session
while !read_buf.get_ref().is_empty() {
match session.read_tls(&mut read_buf) {
Ok(_) => {}
Err(e) => {
res = Err(e);
break;
}
}
}
});
res?;
let read_buf = read_buf.into_inner();
if !read_buf.is_empty() {
return Err(HandshakeError::EarlyData);
}
let tls_stream = raw
.upgrade(tls.to_server_config(), record_handshake_error)
.await?;
let tls_stream = accept.await.inspect_err(|_| {
if record_handshake_error {
Metrics::get().proxy.tls_handshake_failures.inc()
}
})?;
let conn_info = tls_stream.get_ref().1;
// check the ALPN, if exists, as required.
match conn_info.alpn_protocol() {
None | Some(PG_ALPN_PROTOCOL) => {}
Some(other) => {
// try parse ep for better error
let ep = conn_info.server_name().and_then(|sni| {
endpoint_sni(sni, &tls.common_names).ok().flatten()
});
let alpn = String::from_utf8_lossy(other);
warn!(?ep, %alpn, "unexpected ALPN");
return Err(HandshakeError::ProtocolViolation);
}
}
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(tls_stream.get_ref().1.server_name())
.resolve(conn_info.server_name())
.ok_or(HandshakeError::MissingCertificate)?;
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
stream = PqStream {
framed: Framed {
stream: Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
},
read_buf,
write_buf,
},
};
}
}
_ => return Err(HandshakeError::ProtocolViolation),
@@ -122,7 +187,9 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
_ => return Err(HandshakeError::ProtocolViolation),
},
StartupMessage { params, .. } => {
StartupMessage { params, version }
if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
{
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
@@ -131,9 +198,48 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
.await?;
}
info!(session_type = "normal", "successful handshake");
info!(?version, session_type = "normal", "successful handshake");
break Ok(HandshakeData::Startup(stream, params));
}
// downgrade protocol version
StartupMessage { params, version }
if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
{
warn!(?version, "unsupported minor version");
// no protocol extensions are supported.
// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/backend/tcop/backend_startup.c#L744-L753>
let mut unsupported = vec![];
for (k, _) in params.iter() {
if k.starts_with("_pq_.") {
unsupported.push(k);
}
}
// TODO: remove unsupported options so we don't send them to compute.
stream
.write_message(&Be::NegotiateProtocolVersion {
version: PG_PROTOCOL_LATEST,
options: &unsupported,
})
.await?;
info!(
?version,
session_type = "normal",
"successful handshake; unsupported minor version requested"
);
break Ok(HandshakeData::Startup(stream, params));
}
StartupMessage { version, .. } => {
warn!(
?version,
session_type = "normal",
"unsuccessful handshake; unsupported version"
);
return Err(HandshakeError::ProtocolViolation);
}
CancelRequest(cancel_key_data) => {
info!(session_type = "cancellation", "successful handshake");
break Ok(HandshakeData::Cancel(cancel_key_data));

View File

@@ -226,7 +226,7 @@ impl Node {
fn is_fatal(e: &mgmt_api::Error) -> bool {
use mgmt_api::Error::*;
match e {
ReceiveBody(_) | ReceiveErrorBody(_) => false,
SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false,
ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
| ApiError(StatusCode::GATEWAY_TIMEOUT, _)
| ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,

View File

@@ -151,6 +151,10 @@ struct ServiceState {
/// controller API.
fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
match e {
mgmt_api::Error::SendRequest(e) => {
// Presume errors sending requests are connectivity/availability issues
ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into())
}
mgmt_api::Error::ReceiveErrorBody(str) => {
// Presume errors receiving body are connectivity/availability issues
ApiError::ResourceUnavailable(

View File

@@ -88,7 +88,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
env.pageserver.allowed_errors.extend(
[
".*error importing base backup .*",
".*Failed to import basebackup.*",
".*unexpected non-zero bytes after the tar archive.*",
".*Timeline got dropped without initializing, cleaning its files.*",
".*InternalServerError.*timeline not found.*",
".*InternalServerError.*Tenant .* not found.*",