Compare commits

..

5 Commits

Author SHA1 Message Date
Conrad Ludgate
b66e545e26 a little more type-safety, a little more verbose... 2024-10-24 12:33:10 +01:00
Conrad Ludgate
c8108a4b84 make ComputeConnectBackend dyn 2024-10-24 11:55:31 +01:00
Conrad Ludgate
2d34fec39b minor changes 2024-10-24 11:48:43 +01:00
Conrad Ludgate
3da4705775 rename to serverless backend 2024-10-24 11:44:15 +01:00
Conrad Ludgate
80c5576816 proxy: continue streamlining auth::Backend 2024-10-24 11:43:46 +01:00
81 changed files with 1891 additions and 4039 deletions

View File

@@ -53,6 +53,20 @@ jobs:
BUILD_TAG: ${{ inputs.build-tag }} BUILD_TAG: ${{ inputs.build-tag }}
steps: steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16 17; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
submodules: true submodules: true

View File

@@ -839,7 +839,6 @@ jobs:
- name: Build vm image - name: Build vm image
run: | run: |
./vm-builder \ ./vm-builder \
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
@@ -1079,6 +1078,20 @@ jobs:
runs-on: [ self-hosted, small ] runs-on: [ self-hosted, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
steps: steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16 17; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Trigger deploy workflow - name: Trigger deploy workflow
@@ -1117,10 +1130,7 @@ jobs:
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \ -f deployPgSniRouter=true \
-f deployProxyLink=true \ -f deployProxy=true \
-f deployPrivatelinkProxy=true \
-f deployProxyScram=true \
-f deployProxyAuthBroker=true \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} -f dockerTag=${{needs.tag.outputs.build-tag}}
else else

2
.gitignore vendored
View File

@@ -6,8 +6,6 @@ __pycache__/
test_output/ test_output/
.vscode .vscode
.idea .idea
*.swp
tags
neon.iml neon.iml
/.neon /.neon
/integration_tests/.neon /integration_tests/.neon

4
Cargo.lock generated
View File

@@ -6272,7 +6272,7 @@ dependencies = [
[[package]] [[package]]
name = "tokio-epoll-uring" name = "tokio-epoll-uring"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
dependencies = [ dependencies = [
"futures", "futures",
"nix 0.26.4", "nix 0.26.4",
@@ -6788,7 +6788,7 @@ dependencies = [
[[package]] [[package]]
name = "uring-common" name = "uring-common"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
dependencies = [ dependencies = [
"bytes", "bytes",
"io-uring", "io-uring",

View File

@@ -666,7 +666,7 @@ RUN apt-get update && \
# #
# Use new version only for v17 # Use new version only for v17
# because Release_2024_09_1 has some backward incompatible changes # because Release_2024_09_1 has some backward incompatible changes
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN case "${PG_VERSION}" in \ RUN case "${PG_VERSION}" in \
"v17") \ "v17") \
@@ -860,14 +860,13 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot USER nonroot
WORKDIR /home/nonroot WORKDIR /home/nonroot
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ RUN case "${PG_VERSION}" in "v17") \
echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \ chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \ rm rustup-init && \
case "${PG_VERSION}" in \
'v17') \
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
esac && \
cargo install --locked --version 0.11.3 cargo-pgrx && \ cargo install --locked --version 0.11.3 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
@@ -1042,31 +1041,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
make -j $(getconf _NPROCESSORS_ONLN) install && \ make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
#
# Layer "pg_mooncake"
# compile pg_mooncake extension
#
#########################################################################################
FROM rust-extensions-build AS pg-mooncake-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
'v14') \
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
esac && \
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
cd pg_mooncake-src && \
git checkout "${PG_MOONCAKE_VERSION}" && \
git submodule update --init --depth 1 --recursive && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
######################################################################################### #########################################################################################
# #
# Layer "neon-pg-ext-build" # Layer "neon-pg-ext-build"
@@ -1110,7 +1084,6 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/ COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \ RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -18,7 +18,7 @@ commands:
- name: pgbouncer - name: pgbouncer
user: postgres user: postgres
sysvInitAction: respawn sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: local_proxy - name: local_proxy
user: postgres user: postgres
sysvInitAction: respawn sysvInitAction: respawn

View File

@@ -18,7 +18,7 @@ commands:
- name: pgbouncer - name: pgbouncer
user: postgres user: postgres
sysvInitAction: respawn sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: local_proxy - name: local_proxy
user: postgres user: postgres
sysvInitAction: respawn sysvInitAction: respawn

View File

@@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
tenant_id, tenant_id,
TimelineCreateRequest { TimelineCreateRequest {
new_timeline_id, new_timeline_id,
mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap { ancestor_timeline_id: None,
existing_initdb_timeline_id: None, ancestor_start_lsn: None,
pg_version: Some(args.pg_version), existing_initdb_timeline_id: None,
}, pg_version: Some(args.pg_version),
}, },
) )
.await?; .await?;
@@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
let storage_controller = StorageController::from_env(env); let storage_controller = StorageController::from_env(env);
let create_req = TimelineCreateRequest { let create_req = TimelineCreateRequest {
new_timeline_id, new_timeline_id,
mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap { ancestor_timeline_id: None,
existing_initdb_timeline_id: None, existing_initdb_timeline_id: None,
pg_version: Some(args.pg_version), ancestor_start_lsn: None,
}, pg_version: Some(args.pg_version),
}; };
let timeline_info = storage_controller let timeline_info = storage_controller
.tenant_timeline_create(tenant_id, create_req) .tenant_timeline_create(tenant_id, create_req)
@@ -1189,11 +1189,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
let storage_controller = StorageController::from_env(env); let storage_controller = StorageController::from_env(env);
let create_req = TimelineCreateRequest { let create_req = TimelineCreateRequest {
new_timeline_id, new_timeline_id,
mode: pageserver_api::models::TimelineCreateRequestMode::Branch { ancestor_timeline_id: Some(ancestor_timeline_id),
ancestor_timeline_id, existing_initdb_timeline_id: None,
ancestor_start_lsn: start_lsn, ancestor_start_lsn: start_lsn,
pg_version: None, pg_version: None,
},
}; };
let timeline_info = storage_controller let timeline_info = storage_controller
.tenant_timeline_create(tenant_id, create_req) .tenant_timeline_create(tenant_id, create_req)

View File

@@ -529,6 +529,28 @@ impl PageServerNode {
Ok(self.http_client.list_timelines(*tenant_shard_id).await?) Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
} }
pub async fn timeline_create(
&self,
tenant_shard_id: TenantShardId,
new_timeline_id: TimelineId,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
let req = models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
ancestor_timeline_id,
pg_version,
existing_initdb_timeline_id,
};
Ok(self
.http_client
.timeline_create(tenant_shard_id, &req)
.await?)
}
/// Import a basebackup prepared using either: /// Import a basebackup prepared using either:
/// a) `pg_basebackup -F tar`, or /// a) `pg_basebackup -F tar`, or
/// b) The `fullbackup` pageserver endpoint /// b) The `fullbackup` pageserver endpoint

View File

@@ -111,11 +111,6 @@ enum Command {
#[arg(long)] #[arg(long)]
node: NodeId, node: NodeId,
}, },
/// Cancel any ongoing reconciliation for this shard
TenantShardCancelReconcile {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior. /// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig { TenantConfig {
@@ -540,15 +535,6 @@ async fn main() -> anyhow::Result<()> {
) )
.await?; .await?;
} }
Command::TenantShardCancelReconcile { tenant_shard_id } => {
storcon_client
.dispatch::<(), ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
None,
)
.await?;
}
Command::TenantConfig { tenant_id, config } => { Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?; let tenant_conf = serde_json::from_str(&config)?;

View File

@@ -19,7 +19,6 @@ use once_cell::sync::Lazy;
use prometheus::core::{ use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
}; };
pub use prometheus::local::LocalHistogram;
pub use prometheus::opts; pub use prometheus::opts;
pub use prometheus::register; pub use prometheus::register;
pub use prometheus::Error; pub use prometheus::Error;

View File

@@ -211,30 +211,13 @@ pub enum TimelineState {
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]
pub struct TimelineCreateRequest { pub struct TimelineCreateRequest {
pub new_timeline_id: TimelineId, pub new_timeline_id: TimelineId,
#[serde(flatten)] #[serde(default)]
pub mode: TimelineCreateRequestMode, pub ancestor_timeline_id: Option<TimelineId>,
} #[serde(default)]
pub existing_initdb_timeline_id: Option<TimelineId>,
#[derive(Serialize, Deserialize, Clone)] #[serde(default)]
#[serde(untagged)] pub ancestor_start_lsn: Option<Lsn>,
pub enum TimelineCreateRequestMode { pub pg_version: Option<u32>,
Branch {
ancestor_timeline_id: TimelineId,
#[serde(default)]
ancestor_start_lsn: Option<Lsn>,
// TODO: cplane sets this, but, the branching code always
// inherits the ancestor's pg_version. Earlier code wasn't
// using a flattened enum, so, it was an accepted field, and
// we continue to accept it by having it here.
pg_version: Option<u32>,
},
// NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
// (serde picks the first matching enum variant, in declaration order).
Bootstrap {
#[serde(default)]
existing_initdb_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
},
} }
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]
@@ -1068,12 +1051,6 @@ pub mod virtual_file {
} }
} }
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanDisposableKeysResponse {
pub disposable_count: usize,
pub not_disposable_count: usize,
}
// Wrapped in libpq CopyData // Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)] #[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage { pub enum PagestreamFeMessage {

View File

@@ -357,20 +357,22 @@ impl RemoteStorage for LocalFs {
.list_recursive(prefix) .list_recursive(prefix)
.await .await
.map_err(DownloadError::Other)?; .map_err(DownloadError::Other)?;
let mut objects = Vec::with_capacity(keys.len()); let objects = keys
for key in keys { .into_iter()
let path = key.with_base(&self.storage_root); .filter_map(|k| {
let metadata = file_metadata(&path).await?; let path = k.with_base(&self.storage_root);
if metadata.is_dir() { if path.is_dir() {
continue; None
} } else {
objects.push(ListingObject { Some(ListingObject {
key: key.clone(), key: k.clone(),
last_modified: metadata.modified()?, // LocalFs is just for testing, so just specify a dummy time
size: metadata.len(), last_modified: SystemTime::now(),
}); size: 0,
} })
let objects = objects; }
})
.collect();
if let ListingMode::NoDelimiter = mode { if let ListingMode::NoDelimiter = mode {
result.keys = objects; result.keys = objects;
@@ -408,8 +410,9 @@ impl RemoteStorage for LocalFs {
} else { } else {
result.keys.push(ListingObject { result.keys.push(ListingObject {
key: RemotePath::from_string(&relative_key).unwrap(), key: RemotePath::from_string(&relative_key).unwrap(),
last_modified: object.last_modified, // LocalFs is just for testing
size: object.size, last_modified: SystemTime::now(),
size: 0,
}); });
} }
} }

View File

@@ -597,10 +597,6 @@ paths:
Create a timeline. Returns new timeline id on success. Create a timeline. Returns new timeline id on success.
Recreating the same timeline will succeed if the parameters match the existing timeline. Recreating the same timeline will succeed if the parameters match the existing timeline.
If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
To ensure durability, the caller must retry the creation until success.
Just because the timeline is visible via other endpoints does not mean it is durable.
Future versions may stop showing timelines that are not yet durable.
requestBody: requestBody:
content: content:
application/json: application/json:

View File

@@ -38,7 +38,6 @@ use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantSorting;
use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TimelineCreateRequestMode;
use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TimelinesInfoAndOffloaded;
use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsRequest;
@@ -86,7 +85,6 @@ use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError; use crate::tenant::GetTimelineError;
use crate::tenant::OffloadedTimeline; use crate::tenant::OffloadedTimeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::DEFAULT_PG_VERSION;
use crate::{disk_usage_eviction_task, tenant}; use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{ use pageserver_api::models::{
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -549,26 +547,6 @@ async fn timeline_create_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?; check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let new_timeline_id = request_data.new_timeline_id; let new_timeline_id = request_data.new_timeline_id;
// fill in the default pg_version if not provided & convert request into domain model
let params: tenant::CreateTimelineParams = match request_data.mode {
TimelineCreateRequestMode::Bootstrap {
existing_initdb_timeline_id,
pg_version,
} => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap {
new_timeline_id,
existing_initdb_timeline_id,
pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION),
}),
TimelineCreateRequestMode::Branch {
ancestor_timeline_id,
ancestor_start_lsn,
pg_version: _,
} => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
new_timeline_id,
ancestor_timeline_id,
ancestor_start_lsn,
}),
};
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
@@ -581,12 +559,22 @@ async fn timeline_create_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// earlier versions of the code had pg_version and ancestor_lsn in the span if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
// => continue to provide that information, but, through a log message that doesn't require us to destructure tracing::info!(%ancestor_id, "starting to branch");
tracing::info!(?params, "creating timeline"); } else {
tracing::info!("bootstrapping");
}
match tenant match tenant
.create_timeline(params, state.broker_client.clone(), &ctx) .create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id,
request_data.ancestor_start_lsn,
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
request_data.existing_initdb_timeline_id,
state.broker_client.clone(),
&ctx,
)
.await .await
{ {
Ok(new_timeline) => { Ok(new_timeline) => {
@@ -637,6 +625,8 @@ async fn timeline_create_handler(
tenant_id = %tenant_shard_id.tenant_id, tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(), shard_id = %tenant_shard_id.shard_slug(),
timeline_id = %new_timeline_id, timeline_id = %new_timeline_id,
lsn=?request_data.ancestor_start_lsn,
pg_version=?request_data.pg_version
)) ))
.await .await
} }
@@ -1293,99 +1283,6 @@ async fn layer_map_info_handler(
json_response(StatusCode::OK, layer_map_info) json_response(StatusCode::OK, layer_map_info)
} }
#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))]
async fn timeline_layer_scan_disposable_keys(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_name: LayerName = parse_request_param(&request, "layer_name")?;
tracing::Span::current().record(
"tenant_id",
tracing::field::display(&tenant_shard_id.tenant_id),
);
tracing::Span::current().record(
"shard_id",
tracing::field::display(tenant_shard_id.shard_slug()),
);
tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id));
tracing::Span::current().record("layer_name", tracing::field::display(&layer_name));
let state = get_state(&request);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
// technically the timeline need not be active for this scan to complete
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let guard = timeline.layers.read().await;
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
));
};
let resident_layer = layer
.download_and_keep_resident()
.await
.map_err(|err| match err {
tenant::storage_layer::layer::DownloadError::TimelineShutdown
| tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
ApiError::ShuttingDown
}
tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
| tenant::storage_layer::layer::DownloadError::DownloadRequired
| tenant::storage_layer::layer::DownloadError::NotFile(_)
| tenant::storage_layer::layer::DownloadError::DownloadFailed
| tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => {
ApiError::InternalServerError(err.into())
}
#[cfg(test)]
tenant::storage_layer::layer::DownloadError::Failpoint(_) => {
ApiError::InternalServerError(err.into())
}
})?;
let keys = resident_layer
.load_keys(&ctx)
.await
.map_err(ApiError::InternalServerError)?;
let shard_identity = timeline.get_shard_identity();
let mut disposable_count = 0;
let mut not_disposable_count = 0;
let cancel = cancel.clone();
for (i, key) in keys.into_iter().enumerate() {
if shard_identity.is_key_disposable(&key) {
disposable_count += 1;
tracing::debug!(key = %key, key.dbg=?key, "disposable key");
} else {
not_disposable_count += 1;
}
#[allow(clippy::collapsible_if)]
if i % 10000 == 0 {
if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() {
return Err(ApiError::ShuttingDown);
}
}
}
json_response(
StatusCode::OK,
pageserver_api::models::ScanDisposableKeysResponse {
disposable_count,
not_disposable_count,
},
)
}
async fn layer_download_handler( async fn layer_download_handler(
request: Request<Body>, request: Request<Body>,
_cancel: CancellationToken, _cancel: CancellationToken,
@@ -3248,10 +3145,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler), |r| api_handler(r, evict_timeline_layer_handler),
) )
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys",
|r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys),
)
.post( .post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|r| api_handler(r, timeline_gc_blocking_handler), |r| api_handler(r, timeline_gc_blocking_handler),

View File

@@ -3040,111 +3040,13 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
} }
pub mod tokio_epoll_uring { pub mod tokio_epoll_uring {
use std::{ use metrics::{register_int_counter, UIntGauge};
collections::HashMap,
sync::{Arc, Mutex},
};
use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
/// Shared storage for tokio-epoll-uring thread local metrics.
pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
Lazy::new(|| {
let slots_submission_queue_depth = register_histogram!(
"pageserver_tokio_epoll_uring_slots_submission_queue_depth",
"The slots waiters queue depth of each tokio_epoll_uring system",
vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)
.expect("failed to define a metric");
ThreadLocalMetricsStorage {
observers: Mutex::new(HashMap::new()),
slots_submission_queue_depth,
}
});
pub struct ThreadLocalMetricsStorage {
/// List of thread local metrics observers.
observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
/// A histogram shared between all thread local systems
/// for collecting slots submission queue depth.
slots_submission_queue_depth: Histogram,
}
/// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
/// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
///
/// The System makes observations into [`Self`] and periodically, the collector
/// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
///
/// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
/// But except for the periodic flush, the lock is uncontended so there's no waiting
/// for cache coherence protocol to get an exclusive cache line.
pub struct ThreadLocalMetrics {
/// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
slots_submission_queue_depth: Mutex<LocalHistogram>,
}
impl ThreadLocalMetricsStorage {
/// Registers a new thread local system. Returns a thread local metrics observer.
pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
self.slots_submission_queue_depth.local(),
));
let mut g = self.observers.lock().unwrap();
g.insert(id, Arc::clone(&per_system_metrics));
per_system_metrics
}
/// Removes metrics observer for a thread local system.
/// This should be called before dropping a thread local system.
pub fn remove_system(&self, id: u64) {
let mut g = self.observers.lock().unwrap();
g.remove(&id);
}
/// Flush all thread local metrics to the shared storage.
pub fn flush_thread_local_metrics(&self) {
let g = self.observers.lock().unwrap();
g.values().for_each(|local| {
local.flush();
});
}
}
impl ThreadLocalMetrics {
pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
ThreadLocalMetrics {
slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
}
}
/// Flushes the thread local metrics to shared aggregator.
pub fn flush(&self) {
let Self {
slots_submission_queue_depth,
} = self;
slots_submission_queue_depth.lock().unwrap().flush();
}
}
impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
let Self {
slots_submission_queue_depth,
} = self;
slots_submission_queue_depth
.lock()
.unwrap()
.observe(queue_depth as f64);
}
}
pub struct Collector { pub struct Collector {
descs: Vec<metrics::core::Desc>, descs: Vec<metrics::core::Desc>,
systems_created: UIntGauge, systems_created: UIntGauge,
systems_destroyed: UIntGauge, systems_destroyed: UIntGauge,
thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
} }
impl metrics::core::Collector for Collector { impl metrics::core::Collector for Collector {
@@ -3154,7 +3056,7 @@ pub mod tokio_epoll_uring {
fn collect(&self) -> Vec<metrics::proto::MetricFamily> { fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
let mut mfs = Vec::with_capacity(Self::NMETRICS); let mut mfs = Vec::with_capacity(Self::NMETRICS);
let tokio_epoll_uring::metrics::GlobalMetrics { let tokio_epoll_uring::metrics::Metrics {
systems_created, systems_created,
systems_destroyed, systems_destroyed,
} = tokio_epoll_uring::metrics::global(); } = tokio_epoll_uring::metrics::global();
@@ -3162,21 +3064,12 @@ pub mod tokio_epoll_uring {
mfs.extend(self.systems_created.collect()); mfs.extend(self.systems_created.collect());
self.systems_destroyed.set(systems_destroyed); self.systems_destroyed.set(systems_destroyed);
mfs.extend(self.systems_destroyed.collect()); mfs.extend(self.systems_destroyed.collect());
self.thread_local_metrics_storage
.flush_thread_local_metrics();
mfs.extend(
self.thread_local_metrics_storage
.slots_submission_queue_depth
.collect(),
);
mfs mfs
} }
} }
impl Collector { impl Collector {
const NMETRICS: usize = 3; const NMETRICS: usize = 2;
#[allow(clippy::new_without_default)] #[allow(clippy::new_without_default)]
pub fn new() -> Self { pub fn new() -> Self {
@@ -3208,7 +3101,6 @@ pub mod tokio_epoll_uring {
descs, descs,
systems_created, systems_created,
systems_destroyed, systems_destroyed,
thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
} }
} }
} }
@@ -3568,7 +3460,6 @@ pub fn preinitialize_metrics() {
Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&BASEBACKUP_QUERY_TIME);
Lazy::force(&COMPUTE_COMMANDS_COUNTERS); Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
tenant_throttling::preinitialize_global_metrics(); tenant_throttling::preinitialize_global_metrics();
} }

View File

@@ -1506,42 +1506,35 @@ impl<'a> DatadirModification<'a> {
Ok(()) Ok(())
} }
/// Drop some relations /// Drop a relation.
pub(crate) async fn put_rel_drops( pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
&mut self, anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
let mut dir = RelDirectory::des(&buf)?;
let mut dirty = false; // Remove it from the directory entry
for rel_tag in rel_tags { let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { let buf = self.get(dir_key, ctx).await?;
dirty = true; let mut dir = RelDirectory::des(&buf)?;
// update logical size self.pending_directory_entries
let size_key = rel_size_to_key(rel_tag); .push((DirectoryKind::Rel, dir.rels.len()));
let old_size = self.get(size_key, ctx).await?.get_u32_le();
self.pending_nblocks -= old_size as i64;
// Remove entry from relation size cache if dir.rels.remove(&(rel.relnode, rel.forknum)) {
self.tline.remove_cached_rel_size(&rel_tag); self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
} else {
// Delete size entry, as well as all blocks warn!("dropped rel {} did not exist in rel directory", rel);
self.delete(rel_key_range(rel_tag));
}
}
if dirty {
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
self.pending_directory_entries
.push((DirectoryKind::Rel, dir.rels.len()));
}
} }
// update logical size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key, ctx).await?.get_u32_le();
self.pending_nblocks -= old_size as i64;
// Remove enty from relation size cache
self.tline.remove_cached_rel_size(&rel);
// Delete size entry, as well as all blocks
self.delete(rel_key_range(rel));
Ok(()) Ok(())
} }

View File

@@ -294,11 +294,11 @@ pub struct Tenant {
/// During timeline creation, we first insert the TimelineId to the /// During timeline creation, we first insert the TimelineId to the
/// creating map, then `timelines`, then remove it from the creating map. /// creating map, then `timelines`, then remove it from the creating map.
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_creating`
timelines_creating: std::sync::Mutex<HashSet<TimelineId>>, timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
/// Possibly offloaded and archived timelines /// Possibly offloaded and archived timelines
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_offloaded`
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>, timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
// This mutex prevents creation of new timelines during GC. // This mutex prevents creation of new timelines during GC.
@@ -584,40 +584,30 @@ impl OffloadedTimeline {
} }
} }
impl fmt::Debug for OffloadedTimeline {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "OffloadedTimeline<{}>", self.timeline_id)
}
}
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum MaybeOffloaded { pub enum MaybeOffloaded {
Yes, Yes,
No, No,
} }
#[derive(Clone, Debug)] #[derive(Clone)]
pub enum TimelineOrOffloaded { pub enum TimelineOrOffloaded {
Timeline(Arc<Timeline>), Timeline(Arc<Timeline>),
Offloaded(Arc<OffloadedTimeline>), Offloaded(Arc<OffloadedTimeline>),
} }
impl TimelineOrOffloaded { impl TimelineOrOffloaded {
pub fn arc_ref(&self) -> TimelineOrOffloadedArcRef<'_> { pub fn tenant_shard_id(&self) -> TenantShardId {
match self { match self {
TimelineOrOffloaded::Timeline(timeline) => { TimelineOrOffloaded::Timeline(timeline) => timeline.tenant_shard_id,
TimelineOrOffloadedArcRef::Timeline(timeline) TimelineOrOffloaded::Offloaded(offloaded) => offloaded.tenant_shard_id,
}
TimelineOrOffloaded::Offloaded(offloaded) => {
TimelineOrOffloadedArcRef::Offloaded(offloaded)
}
} }
} }
pub fn tenant_shard_id(&self) -> TenantShardId {
self.arc_ref().tenant_shard_id()
}
pub fn timeline_id(&self) -> TimelineId { pub fn timeline_id(&self) -> TimelineId {
self.arc_ref().timeline_id() match self {
TimelineOrOffloaded::Timeline(timeline) => timeline.timeline_id,
TimelineOrOffloaded::Offloaded(offloaded) => offloaded.timeline_id,
}
} }
pub fn delete_progress(&self) -> &Arc<tokio::sync::Mutex<DeleteTimelineFlow>> { pub fn delete_progress(&self) -> &Arc<tokio::sync::Mutex<DeleteTimelineFlow>> {
match self { match self {
@@ -625,7 +615,7 @@ impl TimelineOrOffloaded {
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
} }
} }
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> { pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
match self { match self {
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
@@ -642,38 +632,6 @@ impl TimelineOrOffloaded {
} }
} }
pub enum TimelineOrOffloadedArcRef<'a> {
Timeline(&'a Arc<Timeline>),
Offloaded(&'a Arc<OffloadedTimeline>),
}
impl TimelineOrOffloadedArcRef<'_> {
pub fn tenant_shard_id(&self) -> TenantShardId {
match self {
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id,
TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id,
}
}
pub fn timeline_id(&self) -> TimelineId {
match self {
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id,
TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id,
}
}
}
impl<'a> From<&'a Arc<Timeline>> for TimelineOrOffloadedArcRef<'a> {
fn from(timeline: &'a Arc<Timeline>) -> Self {
Self::Timeline(timeline)
}
}
impl<'a> From<&'a Arc<OffloadedTimeline>> for TimelineOrOffloadedArcRef<'a> {
fn from(timeline: &'a Arc<OffloadedTimeline>) -> Self {
Self::Offloaded(timeline)
}
}
#[derive(Debug, thiserror::Error, PartialEq, Eq)] #[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError { pub enum GetTimelineError {
#[error("Timeline is shutting down")] #[error("Timeline is shutting down")]
@@ -779,99 +737,6 @@ impl Debug for SetStoppingError {
} }
} }
/// Arguments to [`Tenant::create_timeline`].
///
/// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
/// is `None`, the result of the timeline create call is not deterministic.
///
/// See [`CreateTimelineIdempotency`] for an idempotency key.
#[derive(Debug)]
pub(crate) enum CreateTimelineParams {
Bootstrap(CreateTimelineParamsBootstrap),
Branch(CreateTimelineParamsBranch),
}
#[derive(Debug)]
pub(crate) struct CreateTimelineParamsBootstrap {
pub(crate) new_timeline_id: TimelineId,
pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
pub(crate) pg_version: u32,
}
/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
#[derive(Debug)]
pub(crate) struct CreateTimelineParamsBranch {
pub(crate) new_timeline_id: TimelineId,
pub(crate) ancestor_timeline_id: TimelineId,
pub(crate) ancestor_start_lsn: Option<Lsn>,
}
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
///
/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
///
/// We lower timeline creation requests to [`Self`], and then use [`PartialEq::eq`] to compare [`Timeline::create_idempotency`] with the request.
/// If they are equal, we return a reference to the existing timeline, otherwise it's an idempotency conflict.
///
/// There is special treatment for [`Self::FailWithConflict`] to always return an idempotency conflict.
/// It would be nice to have more advanced derive macros to make that special treatment declarative.
///
/// Notes:
/// - Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN.
/// - We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`]
/// is not considered for idempotency. We can improve on this over time if we deem it necessary.
///
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum CreateTimelineIdempotency {
/// NB: special treatment, see comment in [`Self`].
FailWithConflict,
Bootstrap {
pg_version: u32,
},
/// NB: branches always have the same `pg_version` as their ancestor.
/// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
/// exists as a field, and is set by cplane, it has always been ignored by pageserver when
/// determining the child branch pg_version.
Branch {
ancestor_timeline_id: TimelineId,
ancestor_start_lsn: Lsn,
},
}
/// What is returned by [`Tenant::start_creating_timeline`].
#[must_use]
enum StartCreatingTimelineResult<'t> {
CreateGuard(TimelineCreateGuard<'t>),
Idempotent(Arc<Timeline>),
}
/// What is returned by [`Tenant::create_timeline`].
enum CreateTimelineResult {
Created(Arc<Timeline>),
Idempotent(Arc<Timeline>),
}
impl CreateTimelineResult {
fn discriminant(&self) -> &'static str {
match self {
Self::Created(_) => "Created",
Self::Idempotent(_) => "Idempotent",
}
}
fn timeline(&self) -> &Arc<Timeline> {
match self {
Self::Created(t) | Self::Idempotent(t) => t,
}
}
/// Unit test timelines aren't activated, test has to do it if it needs to.
#[cfg(test)]
fn into_timeline_for_test(self) -> Arc<Timeline> {
match self {
Self::Created(t) | Self::Idempotent(t) => t,
}
}
}
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError { pub enum CreateTimelineError {
#[error("creation of timeline with the given ID is in progress")] #[error("creation of timeline with the given ID is in progress")]
@@ -1011,24 +876,12 @@ impl Tenant {
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let tenant_id = self.tenant_shard_id; let tenant_id = self.tenant_shard_id;
let idempotency = if metadata.ancestor_timeline().is_none() {
CreateTimelineIdempotency::Bootstrap {
pg_version: metadata.pg_version(),
}
} else {
CreateTimelineIdempotency::Branch {
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
ancestor_start_lsn: metadata.ancestor_lsn(),
}
};
let timeline = self.create_timeline_struct( let timeline = self.create_timeline_struct(
timeline_id, timeline_id,
&metadata, &metadata,
ancestor.clone(), ancestor.clone(),
resources, resources,
CreateTimelineCause::Load, CreateTimelineCause::Load,
idempotency.clone(),
)?; )?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!( anyhow::ensure!(
@@ -1821,8 +1674,6 @@ impl Tenant {
} }
/// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline /// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline
///
/// Counterpart to [`offload_timeline`].
async fn unoffload_timeline( async fn unoffload_timeline(
self: &Arc<Self>, self: &Arc<Self>,
timeline_id: TimelineId, timeline_id: TimelineId,
@@ -1831,24 +1682,6 @@ impl Tenant {
) -> Result<Arc<Timeline>, TimelineArchivalError> { ) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline"); info!("unoffloading timeline");
let cancel = self.cancel.clone(); let cancel = self.cancel.clone();
// Protect against concurrent attempts to use this TimelineId
// We don't care much about idempotency, as it's ensured a layer above.
let allow_offloaded = true;
let _create_guard = self
.create_timeline_create_guard(
timeline_id,
CreateTimelineIdempotency::FailWithConflict,
allow_offloaded,
)
.map_err(|err| match err {
TimelineExclusionError::AlreadyCreating => TimelineArchivalError::AlreadyInProgress,
TimelineExclusionError::AlreadyExists { .. } => {
TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
}
TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
})?;
let timeline_preload = self let timeline_preload = self
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
.await; .await;
@@ -2115,17 +1948,16 @@ impl Tenant {
self.timelines.lock().unwrap().keys().cloned().collect() self.timelines.lock().unwrap().keys().cloned().collect()
} }
/// This is used by tests & import-from-basebackup. /// This is used to create the initial 'main' timeline during bootstrapping,
/// or when importing a new base backup. The caller is expected to load an
/// initial image of the datadir to the new timeline after this.
/// ///
/// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
/// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`. /// and the timeline will fail to load at a restart.
/// ///
/// The caller is responsible for getting the timeline into a state that will be accepted /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
/// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`]. /// minimum amount of keys required to get a writable timeline.
/// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline /// (Without it, `put` might fail due to `repartition` failing.)
/// to the [`Tenant::timelines`].
///
/// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
pub(crate) async fn create_empty_timeline( pub(crate) async fn create_empty_timeline(
&self, &self,
new_timeline_id: TimelineId, new_timeline_id: TimelineId,
@@ -2139,15 +1971,7 @@ impl Tenant {
); );
// Protect against concurrent attempts to use this TimelineId // Protect against concurrent attempts to use this TimelineId
let create_guard = match self let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
.start_creating_timeline(new_timeline_id, CreateTimelineIdempotency::FailWithConflict)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(_) => {
unreachable!("FailWithConflict implies we get an error instead")
}
};
let new_metadata = TimelineMetadata::new( let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to // Initialize disk_consistent LSN to 0, The caller must import some data to
@@ -2266,7 +2090,11 @@ impl Tenant {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline( pub(crate) async fn create_timeline(
self: &Arc<Tenant>, self: &Arc<Tenant>,
params: CreateTimelineParams, new_timeline_id: TimelineId,
ancestor_timeline_id: Option<TimelineId>,
mut ancestor_start_lsn: Option<Lsn>,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> { ) -> Result<Arc<Timeline>, CreateTimelineError> {
@@ -2285,25 +2113,54 @@ impl Tenant {
.enter() .enter()
.map_err(|_| CreateTimelineError::ShuttingDown)?; .map_err(|_| CreateTimelineError::ShuttingDown)?;
let result: CreateTimelineResult = match params { // Get exclusive access to the timeline ID: this ensures that it does not already exist,
CreateTimelineParams::Bootstrap(CreateTimelineParamsBootstrap { // and that no other creation attempts will be allowed in while we are working.
new_timeline_id, let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
existing_initdb_timeline_id, Ok(m) => m,
pg_version, Err(TimelineExclusionError::AlreadyCreating) => {
}) => { // Creation is in progress, we cannot create it again, and we cannot
self.bootstrap_timeline( // check if this request matches the existing one, so caller must try
new_timeline_id, // again later.
pg_version, return Err(CreateTimelineError::AlreadyCreating);
existing_initdb_timeline_id,
ctx,
)
.await?
} }
CreateTimelineParams::Branch(CreateTimelineParamsBranch { Err(TimelineExclusionError::Other(e)) => {
new_timeline_id, return Err(CreateTimelineError::Other(e));
ancestor_timeline_id, }
mut ancestor_start_lsn, Err(TimelineExclusionError::AlreadyExists(existing)) => {
}) => { debug!("timeline {new_timeline_id} already exists");
// Idempotency: creating the same timeline twice is not an error, unless
// the second creation has different parameters.
if existing.get_ancestor_timeline_id() != ancestor_timeline_id
|| existing.pg_version != pg_version
|| (ancestor_start_lsn.is_some()
&& ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
{
return Err(CreateTimelineError::Conflict);
}
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
existing
.remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
return Ok(existing);
}
};
pausable_failpoint!("timeline-creation-after-uninit");
let loaded_timeline = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = self let ancestor_timeline = self
.get_timeline(ancestor_timeline_id, false) .get_timeline(ancestor_timeline_id, false)
.context("Cannot branch off the timeline that's not present in pageserver")?; .context("Cannot branch off the timeline that's not present in pageserver")?;
@@ -2350,48 +2207,43 @@ impl Tenant {
})?; })?;
} }
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) self.branch_timeline(
.await? &ancestor_timeline,
new_timeline_id,
ancestor_start_lsn,
create_guard,
ctx,
)
.await?
}
None => {
self.bootstrap_timeline(
new_timeline_id,
pg_version,
load_existing_initdb,
create_guard,
ctx,
)
.await?
} }
}; };
// At this point we have dropped our guard on [`Self::timelines_creating`], and // At this point we have dropped our guard on [`Self::timelines_creating`], and
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
// not send a success to the caller until it is. The same applies to idempotent retries. // not send a success to the caller until it is. The same applies to handling retries,
// // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
// TODO: the timeline is already visible in [`Self::timelines`]; a caller could incorrectly let kind = ancestor_timeline_id
// assume that, because they can see the timeline via API, that the creation is done and .map(|_| "branched")
// that it is durable. Ideally, we would keep the timeline hidden (in [`Self::timelines_creating`]) .unwrap_or("bootstrapped");
// until it is durable, e.g., by extending the time we hold the creation guard. This also loaded_timeline
// interacts with UninitializedTimeline and is generally a bit tricky.
//
// To re-emphasize: the only correct way to create a timeline is to repeat calling the
// creation API until it returns success. Only then is durability guaranteed.
info!(creation_result=%result.discriminant(), "waiting for timeline to be durable");
result
.timeline()
.remote_client .remote_client
.wait_completion() .wait_completion()
.await .await
.context("wait for timeline initial uploads to complete")?; .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
// The creating task is responsible for activating the timeline. loaded_timeline.activate(self.clone(), broker_client, None, ctx);
// We do this after `wait_completion()` so that we don't spin up tasks that start
// doing stuff before the IndexPart is durable in S3, which is done by the previous section.
let activated_timeline = match result {
CreateTimelineResult::Created(timeline) => {
timeline.activate(self.clone(), broker_client, None, ctx);
timeline
}
CreateTimelineResult::Idempotent(timeline) => {
info!(
"request was deemed idempotent, activation will be done by the creating task"
);
timeline
}
};
Ok(activated_timeline) Ok(loaded_timeline)
} }
pub(crate) async fn delete_timeline( pub(crate) async fn delete_timeline(
@@ -3048,58 +2900,33 @@ impl Tenant {
&self, &self,
child_shards: &Vec<TenantShardId>, child_shards: &Vec<TenantShardId>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let (timelines, offloaded) = { let timelines = self.timelines.lock().unwrap().clone();
let timelines = self.timelines.lock().unwrap(); for timeline in timelines.values() {
let offloaded = self.timelines_offloaded.lock().unwrap();
(timelines.clone(), offloaded.clone())
};
let timelines_iter = timelines
.values()
.map(TimelineOrOffloadedArcRef::<'_>::from)
.chain(
offloaded
.values()
.map(TimelineOrOffloadedArcRef::<'_>::from),
);
for timeline in timelines_iter {
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
// to ensure that they do not start a split if currently in the process of doing these. // to ensure that they do not start a split if currently in the process of doing these.
let timeline_id = timeline.timeline_id(); // Upload an index from the parent: this is partly to provide freshness for the
// child tenants that will copy it, and partly for general ease-of-debugging: there will
if let TimelineOrOffloadedArcRef::Timeline(timeline) = timeline { // always be a parent shard index in the same generation as we wrote the child shard index.
// Upload an index from the parent: this is partly to provide freshness for the tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index");
// child tenants that will copy it, and partly for general ease-of-debugging: there will timeline
// always be a parent shard index in the same generation as we wrote the child shard index. .remote_client
tracing::info!(%timeline_id, "Uploading index"); .schedule_index_upload_for_file_changes()?;
timeline timeline.remote_client.wait_completion().await?;
.remote_client
.schedule_index_upload_for_file_changes()?;
timeline.remote_client.wait_completion().await?;
}
let remote_client = match timeline {
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.remote_client.clone(),
TimelineOrOffloadedArcRef::Offloaded(offloaded) => {
let remote_client = self
.build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
Arc::new(remote_client)
}
};
// Shut down the timeline's remote client: this means that the indices we write // Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers. // for child shards will not be invalidated by the parent shard deleting layers.
tracing::info!(%timeline_id, "Shutting down remote storage client"); tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
remote_client.shutdown().await; timeline.remote_client.shutdown().await;
// Download methods can still be used after shutdown, as they don't flow through the remote client's // Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
// we use here really is the remotely persistent one). // we use here really is the remotely persistent one).
tracing::info!(%timeline_id, "Downloading index_part from parent"); tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
let result = remote_client let result = timeline.remote_client
.download_index_file(&self.cancel) .download_index_file(&self.cancel)
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id)) .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
.await?; .await?;
let index_part = match result { let index_part = match result {
MaybeDeletedIndexPart::Deleted(_) => { MaybeDeletedIndexPart::Deleted(_) => {
@@ -3109,11 +2936,11 @@ impl Tenant {
}; };
for child_shard in child_shards { for child_shard in child_shards {
tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index()); tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
upload_index_part( upload_index_part(
&self.remote_storage, &self.remote_storage,
child_shard, child_shard,
&timeline_id, &timeline.timeline_id,
self.generation, self.generation,
&index_part, &index_part,
&self.cancel, &self.cancel,
@@ -3122,6 +2949,8 @@ impl Tenant {
} }
} }
// TODO: also copy index files of offloaded timelines
let tenant_manifest = self.tenant_manifest(); let tenant_manifest = self.tenant_manifest();
// TODO: generation support // TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
@@ -3404,7 +3233,6 @@ impl Tenant {
ancestor: Option<Arc<Timeline>>, ancestor: Option<Arc<Timeline>>,
resources: TimelineResources, resources: TimelineResources,
cause: CreateTimelineCause, cause: CreateTimelineCause,
create_idempotency: CreateTimelineIdempotency,
) -> anyhow::Result<Arc<Timeline>> { ) -> anyhow::Result<Arc<Timeline>> {
let state = match cause { let state = match cause {
CreateTimelineCause::Load => { CreateTimelineCause::Load => {
@@ -3434,7 +3262,6 @@ impl Tenant {
pg_version, pg_version,
state, state,
self.attach_wal_lag_cooldown.clone(), self.attach_wal_lag_cooldown.clone(),
create_idempotency,
self.cancel.child_token(), self.cancel.child_token(),
); );
@@ -3920,16 +3747,16 @@ impl Tenant {
/// timeline background tasks are launched, except the flush loop. /// timeline background tasks are launched, except the flush loop.
#[cfg(test)] #[cfg(test)]
async fn branch_timeline_test( async fn branch_timeline_test(
self: &Arc<Self>, &self,
src_timeline: &Arc<Timeline>, src_timeline: &Arc<Timeline>,
dst_id: TimelineId, dst_id: TimelineId,
ancestor_lsn: Option<Lsn>, ancestor_lsn: Option<Lsn>,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> { ) -> Result<Arc<Timeline>, CreateTimelineError> {
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
let tl = self let tl = self
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, ctx) .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
.await? .await?;
.into_timeline_for_test();
tl.set_state(TimelineState::Active); tl.set_state(TimelineState::Active);
Ok(tl) Ok(tl)
} }
@@ -3938,7 +3765,7 @@ impl Tenant {
#[cfg(test)] #[cfg(test)]
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub async fn branch_timeline_test_with_layers( pub async fn branch_timeline_test_with_layers(
self: &Arc<Self>, &self,
src_timeline: &Arc<Timeline>, src_timeline: &Arc<Timeline>,
dst_id: TimelineId, dst_id: TimelineId,
ancestor_lsn: Option<Lsn>, ancestor_lsn: Option<Lsn>,
@@ -3986,24 +3813,28 @@ impl Tenant {
} }
/// Branch an existing timeline. /// Branch an existing timeline.
///
/// The caller is responsible for activating the returned timeline.
async fn branch_timeline( async fn branch_timeline(
self: &Arc<Self>, &self,
src_timeline: &Arc<Timeline>, src_timeline: &Arc<Timeline>,
dst_id: TimelineId, dst_id: TimelineId,
start_lsn: Option<Lsn>, start_lsn: Option<Lsn>,
timeline_create_guard: TimelineCreateGuard<'_>,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> { ) -> Result<Arc<Timeline>, CreateTimelineError> {
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
.await .await
} }
async fn branch_timeline_impl( async fn branch_timeline_impl(
self: &Arc<Self>, &self,
src_timeline: &Arc<Timeline>, src_timeline: &Arc<Timeline>,
dst_id: TimelineId, dst_id: TimelineId,
start_lsn: Option<Lsn>, start_lsn: Option<Lsn>,
timeline_create_guard: TimelineCreateGuard<'_>,
_ctx: &RequestContext, _ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> { ) -> Result<Arc<Timeline>, CreateTimelineError> {
let src_id = src_timeline.timeline_id; let src_id = src_timeline.timeline_id;
// We will validate our ancestor LSN in this function. Acquire the GC lock so that // We will validate our ancestor LSN in this function. Acquire the GC lock so that
@@ -4018,23 +3849,6 @@ impl Tenant {
lsn lsn
}); });
// we finally have determined the ancestor_start_lsn, so we can get claim exclusivity now
let timeline_create_guard = match self
.start_creating_timeline(
dst_id,
CreateTimelineIdempotency::Branch {
ancestor_timeline_id: src_timeline.timeline_id,
ancestor_start_lsn: start_lsn,
},
)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(timeline) => {
return Ok(CreateTimelineResult::Idempotent(timeline));
}
};
// Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
// horizon on the source timeline // horizon on the source timeline
// //
@@ -4120,92 +3934,28 @@ impl Tenant {
.schedule_index_upload_for_full_metadata_update(&metadata) .schedule_index_upload_for_full_metadata_update(&metadata)
.context("branch initial metadata upload")?; .context("branch initial metadata upload")?;
// Callers are responsible to wait for uploads to complete and for activating the timeline. Ok(new_timeline)
Ok(CreateTimelineResult::Created(new_timeline))
} }
/// For unit tests, make this visible so that other modules can directly create timelines /// For unit tests, make this visible so that other modules can directly create timelines
#[cfg(test)] #[cfg(test)]
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn bootstrap_timeline_test( pub(crate) async fn bootstrap_timeline_test(
self: &Arc<Self>, &self,
timeline_id: TimelineId, timeline_id: TimelineId,
pg_version: u32, pg_version: u32,
load_existing_initdb: Option<TimelineId>, load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> { ) -> anyhow::Result<Arc<Timeline>> {
self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, ctx) let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
.await self.bootstrap_timeline(
.map_err(anyhow::Error::new) timeline_id,
.map(|r| r.into_timeline_for_test()) pg_version,
} load_existing_initdb,
create_guard,
/// Get exclusive access to the timeline ID for creation. ctx,
/// )
/// Timeline-creating code paths must use this function before making changes .await
/// to in-memory or persistent state.
///
/// The `state` parameter is a description of the timeline creation operation
/// we intend to perform.
/// If the timeline was already created in the meantime, we check whether this
/// request conflicts or is idempotent , based on `state`.
async fn start_creating_timeline(
&self,
new_timeline_id: TimelineId,
idempotency: CreateTimelineIdempotency,
) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
let allow_offloaded = false;
match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
Ok(create_guard) => {
pausable_failpoint!("timeline-creation-after-uninit");
Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
}
Err(TimelineExclusionError::AlreadyCreating) => {
// Creation is in progress, we cannot create it again, and we cannot
// check if this request matches the existing one, so caller must try
// again later.
Err(CreateTimelineError::AlreadyCreating)
}
Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)),
Err(TimelineExclusionError::AlreadyExists {
existing: TimelineOrOffloaded::Offloaded(_existing),
..
}) => {
info!("timeline already exists but is offloaded");
Err(CreateTimelineError::Conflict)
}
Err(TimelineExclusionError::AlreadyExists {
existing: TimelineOrOffloaded::Timeline(existing),
arg,
}) => {
{
let existing = &existing.create_idempotency;
let _span = info_span!("idempotency_check", ?existing, ?arg).entered();
debug!("timeline already exists");
match (existing, &arg) {
// FailWithConflict => no idempotency check
(CreateTimelineIdempotency::FailWithConflict, _)
| (_, CreateTimelineIdempotency::FailWithConflict) => {
warn!("timeline already exists, failing request");
return Err(CreateTimelineError::Conflict);
}
// Idempotent <=> CreateTimelineIdempotency is identical
(x, y) if x == y => {
info!("timeline already exists and idempotency matches, succeeding request");
// fallthrough
}
(_, _) => {
warn!("idempotency conflict, failing request");
return Err(CreateTimelineError::Conflict);
}
}
}
Ok(StartCreatingTimelineResult::Idempotent(existing))
}
}
} }
async fn upload_initdb( async fn upload_initdb(
@@ -4259,26 +4009,16 @@ impl Tenant {
/// - run initdb to init temporary instance and get bootstrap data /// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3. /// - after initialization completes, tar up the temp dir and upload it to S3.
///
/// The caller is responsible for activating the returned timeline.
async fn bootstrap_timeline( async fn bootstrap_timeline(
self: &Arc<Self>, &self,
timeline_id: TimelineId, timeline_id: TimelineId,
pg_version: u32, pg_version: u32,
load_existing_initdb: Option<TimelineId>, load_existing_initdb: Option<TimelineId>,
timeline_create_guard: TimelineCreateGuard<'_>,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> { ) -> anyhow::Result<Arc<Timeline>> {
let timeline_create_guard = match self
.start_creating_timeline(
timeline_id,
CreateTimelineIdempotency::Bootstrap { pg_version },
)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(timeline) => {
return Ok(CreateTimelineResult::Idempotent(timeline))
}
};
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline. // temporary directory for basebackup files for the given timeline.
@@ -4342,9 +4082,7 @@ impl Tenant {
.context("extract initdb tar")?; .context("extract initdb tar")?;
} else { } else {
// Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path // Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel) run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
.await
.context("run initdb")?;
// Upload the created data dir to S3 // Upload the created data dir to S3
if self.tenant_shard_id().is_shard_zero() { if self.tenant_shard_id().is_shard_zero() {
@@ -4398,9 +4136,7 @@ impl Tenant {
})?; })?;
fail::fail_point!("before-checkpoint-new-timeline", |_| { fail::fail_point!("before-checkpoint-new-timeline", |_| {
Err(CreateTimelineError::Other(anyhow::anyhow!( anyhow::bail!("failpoint before-checkpoint-new-timeline");
"failpoint before-checkpoint-new-timeline"
)))
}); });
unfinished_timeline unfinished_timeline
@@ -4415,9 +4151,7 @@ impl Tenant {
// All done! // All done!
let timeline = raw_timeline.finish_creation()?; let timeline = raw_timeline.finish_creation()?;
// Callers are responsible to wait for uploads to complete and for activating the timeline. Ok(timeline)
Ok(CreateTimelineResult::Created(timeline))
} }
fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient { fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
@@ -4467,7 +4201,6 @@ impl Tenant {
ancestor, ancestor,
resources, resources,
CreateTimelineCause::Load, CreateTimelineCause::Load,
create_guard.idempotency.clone(),
) )
.context("Failed to create timeline data structure")?; .context("Failed to create timeline data structure")?;
@@ -4505,26 +4238,15 @@ impl Tenant {
/// Get a guard that provides exclusive access to the timeline directory, preventing /// Get a guard that provides exclusive access to the timeline directory, preventing
/// concurrent attempts to create the same timeline. /// concurrent attempts to create the same timeline.
///
/// The `allow_offloaded` parameter controls whether to tolerate the existence of
/// offloaded timelines or not.
fn create_timeline_create_guard( fn create_timeline_create_guard(
&self, &self,
timeline_id: TimelineId, timeline_id: TimelineId,
idempotency: CreateTimelineIdempotency,
allow_offloaded: bool,
) -> Result<TimelineCreateGuard, TimelineExclusionError> { ) -> Result<TimelineCreateGuard, TimelineExclusionError> {
let tenant_shard_id = self.tenant_shard_id; let tenant_shard_id = self.tenant_shard_id;
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
let create_guard = TimelineCreateGuard::new( let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
self,
timeline_id,
timeline_path.clone(),
idempotency,
allow_offloaded,
)?;
// At this stage, we have got exclusive access to in-memory state for this timeline ID // At this stage, we have got exclusive access to in-memory state for this timeline ID
// for creation. // for creation.
@@ -5160,10 +4882,7 @@ mod tests {
.await .await
{ {
Ok(_) => panic!("duplicate timeline creation should fail"), Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!( Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
e.to_string(),
"timeline already exists with different parameters".to_string()
),
} }
Ok(()) Ok(())

View File

@@ -1278,14 +1278,10 @@ impl RemoteTimelineClient {
let fut = { let fut = {
let mut guard = self.upload_queue.lock().unwrap(); let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = match &mut *guard { let upload_queue = match &mut *guard {
UploadQueue::Stopped(_) => { UploadQueue::Stopped(_) => return,
scopeguard::ScopeGuard::into_inner(sg);
return;
}
UploadQueue::Uninitialized => { UploadQueue::Uninitialized => {
// transition into Stopped state // transition into Stopped state
self.stop_impl(&mut guard); self.stop_impl(&mut guard);
scopeguard::ScopeGuard::into_inner(sg);
return; return;
} }
UploadQueue::Initialized(ref mut init) => init, UploadQueue::Initialized(ref mut init) => init,

View File

@@ -187,8 +187,6 @@ pub(super) async fn gather_inputs(
// but it is unlikely to cause any issues. In the worst case, // but it is unlikely to cause any issues. In the worst case,
// the calculation will error out. // the calculation will error out.
timelines.retain(|t| t.is_active()); timelines.retain(|t| t.is_active());
// Also filter out archived timelines.
timelines.retain(|t| t.is_archived() != Some(true));
// Build a map of branch points. // Build a map of branch points.
let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new(); let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();

View File

@@ -1,6 +1,5 @@
//! Common traits and structs for layers //! Common traits and structs for layers
pub mod batch_split_writer;
pub mod delta_layer; pub mod delta_layer;
pub mod filter_iterator; pub mod filter_iterator;
pub mod image_layer; pub mod image_layer;
@@ -9,6 +8,7 @@ pub(crate) mod layer;
mod layer_desc; mod layer_desc;
mod layer_name; mod layer_name;
pub mod merge_iterator; pub mod merge_iterator;
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext}; use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value; use crate::repository::Value;

View File

@@ -1084,7 +1084,7 @@ impl DeltaLayerInner {
} }
} }
pub(crate) async fn index_entries<'a>( pub(super) async fn load_keys<'a>(
&'a self, &'a self,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<Vec<DeltaEntry<'a>>> { ) -> Result<Vec<DeltaEntry<'a>>> {
@@ -1346,7 +1346,7 @@ impl DeltaLayerInner {
tree_reader.dump().await?; tree_reader.dump().await?;
let keys = self.index_entries(ctx).await?; let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> { async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.load_raw(ctx).await?; let buf = val.load_raw(ctx).await?;
@@ -1453,16 +1453,6 @@ impl DeltaLayerInner {
), ),
} }
} }
/// NB: not super efficient, but not terrible either. Should prob be an iterator.
//
// We're reusing the index traversal logical in plan_reads; would be nice to
// factor that out.
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
self.index_entries(ctx)
.await
.map(|entries| entries.into_iter().map(|entry| entry.key).collect())
}
} }
/// A set of data associated with a delta layer key and its value /// A set of data associated with a delta layer key and its value

View File

@@ -673,21 +673,6 @@ impl ImageLayerInner {
), ),
} }
} }
/// NB: not super efficient, but not terrible either. Should prob be an iterator.
//
// We're reusing the index traversal logical in plan_reads; would be nice to
// factor that out.
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
let plan = self
.plan_reads(KeySpace::single(self.key_range.clone()), None, ctx)
.await?;
Ok(plan
.into_iter()
.flat_map(|read| read.blobs_at)
.map(|(_, blob_meta)| blob_meta.key)
.collect())
}
} }
/// A builder object for constructing a new image layer. /// A builder object for constructing a new image layer.
@@ -1024,7 +1009,7 @@ impl ImageLayerWriter {
self.inner.take().unwrap().finish(ctx, None).await self.inner.take().unwrap().finish(ctx, None).await
} }
/// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key( pub(super) async fn finish_with_end_key(
mut self, mut self,
end_key: Key, end_key: Key,

View File

@@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind;
use crate::tenant::timeline::{CompactionError, GetVectoredError}; use crate::tenant::timeline::{CompactionError, GetVectoredError};
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self}; use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self}; use super::image_layer::{self};
use super::{ use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
@@ -1841,22 +1841,23 @@ impl ResidentLayer {
pub(crate) async fn load_keys<'a>( pub(crate) async fn load_keys<'a>(
&'a self, &'a self,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<Vec<pageserver_api::key::Key>> { ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
use LayerKind::*; use LayerKind::*;
let owner = &self.owner.0; let owner = &self.owner.0;
let inner = self.downloaded.get(owner, ctx).await?; match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
self.owner.record_access(ctx);
// this is valid because the DownloadedLayer::kind is a OnceCell, not a delta_layer::DeltaLayerInner::load_keys(d, ctx)
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take .await
// while it's being held. .with_context(|| format!("Layer index is corrupted for {self}"))
self.owner.record_access(ctx); }
Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")),
let res = match inner { }
Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
};
res.with_context(|| format!("Layer index is corrupted for {self}"))
} }
/// Read all they keys in this layer which match the ShardIdentity, and write them all to /// Read all they keys in this layer which match the ShardIdentity, and write them all to

View File

@@ -57,34 +57,6 @@ impl std::fmt::Display for PersistentLayerKey {
} }
} }
impl From<ImageLayerName> for PersistentLayerKey {
fn from(image_layer_name: ImageLayerName) -> Self {
Self {
key_range: image_layer_name.key_range,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
is_delta: false,
}
}
}
impl From<DeltaLayerName> for PersistentLayerKey {
fn from(delta_layer_name: DeltaLayerName) -> Self {
Self {
key_range: delta_layer_name.key_range,
lsn_range: delta_layer_name.lsn_range,
is_delta: true,
}
}
}
impl From<LayerName> for PersistentLayerKey {
fn from(layer_name: LayerName) -> Self {
match layer_name {
LayerName::Image(i) => i.into(),
LayerName::Delta(d) => d.into(),
}
}
}
impl PersistentLayerDesc { impl PersistentLayerDesc {
pub fn key(&self) -> PersistentLayerKey { pub fn key(&self) -> PersistentLayerKey {
PersistentLayerKey { PersistentLayerKey {

View File

@@ -12,154 +12,41 @@ use super::{
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
}; };
pub(crate) enum BatchWriterResult { pub(crate) enum SplitWriterResult {
Produced(ResidentLayer), Produced(ResidentLayer),
Discarded(PersistentLayerKey), Discarded(PersistentLayerKey),
} }
#[cfg(test)] #[cfg(test)]
impl BatchWriterResult { impl SplitWriterResult {
fn into_resident_layer(self) -> ResidentLayer { fn into_resident_layer(self) -> ResidentLayer {
match self { match self {
BatchWriterResult::Produced(layer) => layer, SplitWriterResult::Produced(layer) => layer,
BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"), SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
} }
} }
fn into_discarded_layer(self) -> PersistentLayerKey { fn into_discarded_layer(self) -> PersistentLayerKey {
match self { match self {
BatchWriterResult::Produced(_) => panic!("unexpected produced layer"), SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
BatchWriterResult::Discarded(layer) => layer, SplitWriterResult::Discarded(layer) => layer,
} }
} }
} }
enum LayerWriterWrapper {
Image(ImageLayerWriter),
Delta(DeltaLayerWriter),
}
/// An layer writer that takes unfinished layers and finish them atomically.
#[must_use]
pub struct BatchLayerWriter {
generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>,
conf: &'static PageServerConf,
}
impl BatchLayerWriter {
pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
Ok(Self {
generated_layer_writers: Vec::new(),
conf,
})
}
pub fn add_unfinished_image_writer(
&mut self,
writer: ImageLayerWriter,
key_range: Range<Key>,
lsn: Lsn,
) {
self.generated_layer_writers.push((
LayerWriterWrapper::Image(writer),
PersistentLayerKey {
key_range,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
is_delta: false,
},
));
}
pub fn add_unfinished_delta_writer(
&mut self,
writer: DeltaLayerWriter,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
) {
self.generated_layer_writers.push((
LayerWriterWrapper::Delta(writer),
PersistentLayerKey {
key_range,
lsn_range,
is_delta: true,
},
));
}
pub(crate) async fn finish_with_discard_fn<D, F>(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
discard_fn: D,
) -> anyhow::Result<Vec<BatchWriterResult>>
where
D: Fn(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
let Self {
generated_layer_writers,
..
} = self;
let clean_up_layers = |generated_layers: Vec<BatchWriterResult>| {
for produced_layer in generated_layers {
if let BatchWriterResult::Produced(resident_layer) = produced_layer {
let layer: Layer = resident_layer.into();
layer.delete_on_drop();
}
}
};
// BEGIN: catch every error and do the recovery in the below section
let mut generated_layers: Vec<BatchWriterResult> = Vec::new();
for (inner, layer_key) in generated_layer_writers {
if discard_fn(&layer_key).await {
generated_layers.push(BatchWriterResult::Discarded(layer_key));
} else {
let res = match inner {
LayerWriterWrapper::Delta(writer) => {
writer.finish(layer_key.key_range.end, ctx).await
}
LayerWriterWrapper::Image(writer) => {
writer
.finish_with_end_key(layer_key.key_range.end, ctx)
.await
}
};
let layer = match res {
Ok((desc, path)) => {
match Layer::finish_creating(self.conf, tline, desc, &path) {
Ok(layer) => layer,
Err(e) => {
tokio::fs::remove_file(&path).await.ok();
clean_up_layers(generated_layers);
return Err(e);
}
}
}
Err(e) => {
// Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
// so we don't need to remove the layer we just failed to create by ourselves.
clean_up_layers(generated_layers);
return Err(e);
}
};
generated_layers.push(BatchWriterResult::Produced(layer));
}
}
// END: catch every error and do the recovery in the above section
Ok(generated_layers)
}
}
/// An image writer that takes images and produces multiple image layers. /// An image writer that takes images and produces multiple image layers.
///
/// The interface does not guarantee atomicity (i.e., if the image layer generation
/// fails, there might be leftover files to be cleaned up)
#[must_use] #[must_use]
pub struct SplitImageLayerWriter { pub struct SplitImageLayerWriter {
inner: ImageLayerWriter, inner: ImageLayerWriter,
target_layer_size: u64, target_layer_size: u64,
lsn: Lsn, generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>,
conf: &'static PageServerConf, conf: &'static PageServerConf,
timeline_id: TimelineId, timeline_id: TimelineId,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
batches: BatchLayerWriter, lsn: Lsn,
start_key: Key, start_key: Key,
} }
@@ -184,10 +71,10 @@ impl SplitImageLayerWriter {
ctx, ctx,
) )
.await?, .await?,
generated_layer_writers: Vec::new(),
conf, conf,
timeline_id, timeline_id,
tenant_shard_id, tenant_shard_id,
batches: BatchLayerWriter::new(conf).await?,
lsn, lsn,
start_key, start_key,
}) })
@@ -215,13 +102,16 @@ impl SplitImageLayerWriter {
ctx, ctx,
) )
.await?; .await?;
let layer_key = PersistentLayerKey {
key_range: self.start_key..key,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
is_delta: false,
};
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
self.batches.add_unfinished_image_writer(
prev_image_writer,
self.start_key..key,
self.lsn,
);
self.start_key = key; self.start_key = key;
self.generated_layer_writers
.push((prev_image_writer, layer_key));
} }
self.inner.put_image(key, img, ctx).await self.inner.put_image(key, img, ctx).await
} }
@@ -232,18 +122,64 @@ impl SplitImageLayerWriter {
ctx: &RequestContext, ctx: &RequestContext,
end_key: Key, end_key: Key,
discard_fn: D, discard_fn: D,
) -> anyhow::Result<Vec<BatchWriterResult>> ) -> anyhow::Result<Vec<SplitWriterResult>>
where where
D: Fn(&PersistentLayerKey) -> F, D: Fn(&PersistentLayerKey) -> F,
F: Future<Output = bool>, F: Future<Output = bool>,
{ {
let Self { let Self {
mut batches, inner, .. mut generated_layer_writers,
inner,
..
} = self; } = self;
if inner.num_keys() != 0 { if inner.num_keys() != 0 {
batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); let layer_key = PersistentLayerKey {
key_range: self.start_key..end_key,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
is_delta: false,
};
generated_layer_writers.push((inner, layer_key));
} }
batches.finish_with_discard_fn(tline, ctx, discard_fn).await let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
for produced_layer in generated_layers {
if let SplitWriterResult::Produced(image_layer) = produced_layer {
let layer: Layer = image_layer.into();
layer.delete_on_drop();
}
}
};
// BEGIN: catch every error and do the recovery in the below section
let mut generated_layers = Vec::new();
for (inner, layer_key) in generated_layer_writers {
if discard_fn(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
let layer = match inner
.finish_with_end_key(layer_key.key_range.end, ctx)
.await
{
Ok((desc, path)) => {
match Layer::finish_creating(self.conf, tline, desc, &path) {
Ok(layer) => layer,
Err(e) => {
tokio::fs::remove_file(&path).await.ok();
clean_up_layers(generated_layers);
return Err(e);
}
}
}
Err(e) => {
// ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong,
// so we don't need to remove the layer we just failed to create by ourselves.
clean_up_layers(generated_layers);
return Err(e);
}
};
generated_layers.push(SplitWriterResult::Produced(layer));
}
}
// END: catch every error and do the recovery in the above section
Ok(generated_layers)
} }
#[cfg(test)] #[cfg(test)]
@@ -252,7 +188,7 @@ impl SplitImageLayerWriter {
tline: &Arc<Timeline>, tline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
end_key: Key, end_key: Key,
) -> anyhow::Result<Vec<BatchWriterResult>> { ) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
.await .await
} }
@@ -260,6 +196,9 @@ impl SplitImageLayerWriter {
/// A delta writer that takes key-lsn-values and produces multiple delta layers. /// A delta writer that takes key-lsn-values and produces multiple delta layers.
/// ///
/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
/// there might be leftover files to be cleaned up).
///
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
/// will split them into multiple files based on size. /// will split them into multiple files based on size.
@@ -267,12 +206,12 @@ impl SplitImageLayerWriter {
pub struct SplitDeltaLayerWriter { pub struct SplitDeltaLayerWriter {
inner: Option<(Key, DeltaLayerWriter)>, inner: Option<(Key, DeltaLayerWriter)>,
target_layer_size: u64, target_layer_size: u64,
generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>,
conf: &'static PageServerConf, conf: &'static PageServerConf,
timeline_id: TimelineId, timeline_id: TimelineId,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>, lsn_range: Range<Lsn>,
last_key_written: Key, last_key_written: Key,
batches: BatchLayerWriter,
} }
impl SplitDeltaLayerWriter { impl SplitDeltaLayerWriter {
@@ -286,12 +225,12 @@ impl SplitDeltaLayerWriter {
Ok(Self { Ok(Self {
target_layer_size, target_layer_size,
inner: None, inner: None,
generated_layer_writers: Vec::new(),
conf, conf,
timeline_id, timeline_id,
tenant_shard_id, tenant_shard_id,
lsn_range, lsn_range,
last_key_written: Key::MIN, last_key_written: Key::MIN,
batches: BatchLayerWriter::new(conf).await?,
}) })
} }
@@ -340,11 +279,13 @@ impl SplitDeltaLayerWriter {
.await?; .await?;
let (start_key, prev_delta_writer) = let (start_key, prev_delta_writer) =
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
self.batches.add_unfinished_delta_writer( let layer_key = PersistentLayerKey {
prev_delta_writer, key_range: start_key..key,
start_key..key, lsn_range: self.lsn_range.clone(),
self.lsn_range.clone(), is_delta: true,
); };
self.generated_layer_writers
.push((prev_delta_writer, layer_key));
} else if inner.estimated_size() >= S3_UPLOAD_LIMIT { } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
// We have to produce a very large file b/c a key is updated too often. // We have to produce a very large file b/c a key is updated too often.
anyhow::bail!( anyhow::bail!(
@@ -364,25 +305,64 @@ impl SplitDeltaLayerWriter {
tline: &Arc<Timeline>, tline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
discard_fn: D, discard_fn: D,
) -> anyhow::Result<Vec<BatchWriterResult>> ) -> anyhow::Result<Vec<SplitWriterResult>>
where where
D: Fn(&PersistentLayerKey) -> F, D: Fn(&PersistentLayerKey) -> F,
F: Future<Output = bool>, F: Future<Output = bool>,
{ {
let Self { let Self {
mut batches, inner, .. mut generated_layer_writers,
inner,
..
} = self; } = self;
if let Some((start_key, writer)) = inner { if let Some((start_key, writer)) = inner {
if writer.num_keys() != 0 { if writer.num_keys() != 0 {
let end_key = self.last_key_written.next(); let end_key = self.last_key_written.next();
batches.add_unfinished_delta_writer( let layer_key = PersistentLayerKey {
writer, key_range: start_key..end_key,
start_key..end_key, lsn_range: self.lsn_range.clone(),
self.lsn_range.clone(), is_delta: true,
); };
generated_layer_writers.push((writer, layer_key));
} }
} }
batches.finish_with_discard_fn(tline, ctx, discard_fn).await let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
for produced_layer in generated_layers {
if let SplitWriterResult::Produced(delta_layer) = produced_layer {
let layer: Layer = delta_layer.into();
layer.delete_on_drop();
}
}
};
// BEGIN: catch every error and do the recovery in the below section
let mut generated_layers = Vec::new();
for (inner, layer_key) in generated_layer_writers {
if discard_fn(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
let layer = match inner.finish(layer_key.key_range.end, ctx).await {
Ok((desc, path)) => {
match Layer::finish_creating(self.conf, tline, desc, &path) {
Ok(layer) => layer,
Err(e) => {
tokio::fs::remove_file(&path).await.ok();
clean_up_layers(generated_layers);
return Err(e);
}
}
}
Err(e) => {
// DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
// so we don't need to remove the layer we just failed to create by ourselves.
clean_up_layers(generated_layers);
return Err(e);
}
};
generated_layers.push(SplitWriterResult::Produced(layer));
}
}
// END: catch every error and do the recovery in the above section
Ok(generated_layers)
} }
#[cfg(test)] #[cfg(test)]
@@ -390,7 +370,7 @@ impl SplitDeltaLayerWriter {
self, self,
tline: &Arc<Timeline>, tline: &Arc<Timeline>,
ctx: &RequestContext, ctx: &RequestContext,
) -> anyhow::Result<Vec<BatchWriterResult>> { ) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, |_| async { false }) self.finish_with_discard_fn(tline, ctx, |_| async { false })
.await .await
} }

View File

@@ -424,9 +424,6 @@ pub struct Timeline {
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>, pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>, pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
/// Cf. [`crate::tenant::CreateTimelineIdempotency`].
pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
} }
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>; pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -2139,7 +2136,6 @@ impl Timeline {
pg_version: u32, pg_version: u32,
state: TimelineState, state: TimelineState,
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>, attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
create_idempotency: crate::tenant::CreateTimelineIdempotency,
cancel: CancellationToken, cancel: CancellationToken,
) -> Arc<Self> { ) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn(); let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2278,8 +2274,6 @@ impl Timeline {
handles: Default::default(), handles: Default::default(),
attach_wal_lag_cooldown, attach_wal_lag_cooldown,
create_idempotency,
}; };
result.repartition_threshold = result.repartition_threshold =

View File

@@ -32,11 +32,11 @@ use crate::page_cache;
use crate::statvfs::Statvfs; use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap; use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
};
use crate::tenant::storage_layer::filter_iterator::FilterIterator; use crate::tenant::storage_layer::filter_iterator::FilterIterator;
use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
};
use crate::tenant::storage_layer::{ use crate::tenant::storage_layer::{
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
}; };
@@ -834,12 +834,7 @@ impl Timeline {
if self.cancel.is_cancelled() { if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown); return Err(CompactionError::ShuttingDown);
} }
let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
let keys = delta
.index_entries(ctx)
.await
.map_err(CompactionError::Other)?;
all_keys.extend(keys);
} }
// The current stdlib sorting implementation is designed in a way where it is // The current stdlib sorting implementation is designed in a way where it is
// particularly fast where the slice is made up of sorted sub-ranges. // particularly fast where the slice is made up of sorted sub-ranges.
@@ -2043,11 +2038,11 @@ impl Timeline {
let produced_image_layers_len = produced_image_layers.len(); let produced_image_layers_len = produced_image_layers.len();
for action in produced_delta_layers { for action in produced_delta_layers {
match action { match action {
BatchWriterResult::Produced(layer) => { SplitWriterResult::Produced(layer) => {
stat.produce_delta_layer(layer.layer_desc().file_size()); stat.produce_delta_layer(layer.layer_desc().file_size());
compact_to.push(layer); compact_to.push(layer);
} }
BatchWriterResult::Discarded(l) => { SplitWriterResult::Discarded(l) => {
keep_layers.insert(l); keep_layers.insert(l);
stat.discard_delta_layer(); stat.discard_delta_layer();
} }
@@ -2055,11 +2050,11 @@ impl Timeline {
} }
for action in produced_image_layers { for action in produced_image_layers {
match action { match action {
BatchWriterResult::Produced(layer) => { SplitWriterResult::Produced(layer) => {
stat.produce_image_layer(layer.layer_desc().file_size()); stat.produce_image_layer(layer.layer_desc().file_size());
compact_to.push(layer); compact_to.push(layer);
} }
BatchWriterResult::Discarded(l) => { SplitWriterResult::Discarded(l) => {
keep_layers.insert(l); keep_layers.insert(l);
stat.discard_image_layer(); stat.discard_image_layer();
} }
@@ -2443,7 +2438,7 @@ impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>; type DeltaEntry<'a> = DeltaEntry<'a>;
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> { async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
self.0.get_as_delta(ctx).await?.index_entries(ctx).await self.0.load_keys(ctx).await
} }
} }

View File

@@ -313,7 +313,6 @@ impl DeleteTimelineFlow {
// Important. We dont pass ancestor above because it can be missing. // Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here. // Thus we need to skip the validation here.
CreateTimelineCause::Delete, CreateTimelineCause::Delete,
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
) )
.context("create_timeline_struct")?; .context("create_timeline_struct")?;

View File

@@ -45,16 +45,13 @@ impl LayerManager {
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
// The assumption for the `expect()` is that all code maintains the following invariant: // The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.try_get_from_key(key) self.layers()
.get(key)
.with_context(|| format!("get layer from key: {key}")) .with_context(|| format!("get layer from key: {key}"))
.expect("not found") .expect("not found")
.clone() .clone()
} }
pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> {
self.layers().get(key)
}
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
self.get_from_key(&desc.key()) self.get_from_key(&desc.key())
} }

View File

@@ -5,11 +5,7 @@ use camino::Utf8PathBuf;
use tracing::{error, info, info_span}; use tracing::{error, info, info_span};
use utils::{fs_ext, id::TimelineId, lsn::Lsn}; use utils::{fs_ext, id::TimelineId, lsn::Lsn};
use crate::{ use crate::{context::RequestContext, import_datadir, tenant::Tenant};
context::RequestContext,
import_datadir,
tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
};
use super::Timeline; use super::Timeline;
@@ -169,17 +165,13 @@ pub(crate) struct TimelineCreateGuard<'t> {
owning_tenant: &'t Tenant, owning_tenant: &'t Tenant,
timeline_id: TimelineId, timeline_id: TimelineId,
pub(crate) timeline_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf,
pub(crate) idempotency: CreateTimelineIdempotency,
} }
/// Errors when acquiring exclusive access to a timeline ID for creation /// Errors when acquiring exclusive access to a timeline ID for creation
#[derive(thiserror::Error, Debug)] #[derive(thiserror::Error, Debug)]
pub(crate) enum TimelineExclusionError { pub(crate) enum TimelineExclusionError {
#[error("Already exists")] #[error("Already exists")]
AlreadyExists { AlreadyExists(Arc<Timeline>),
existing: TimelineOrOffloaded,
arg: CreateTimelineIdempotency,
},
#[error("Already creating")] #[error("Already creating")]
AlreadyCreating, AlreadyCreating,
@@ -193,42 +185,27 @@ impl<'t> TimelineCreateGuard<'t> {
owning_tenant: &'t Tenant, owning_tenant: &'t Tenant,
timeline_id: TimelineId, timeline_id: TimelineId,
timeline_path: Utf8PathBuf, timeline_path: Utf8PathBuf,
idempotency: CreateTimelineIdempotency,
allow_offloaded: bool,
) -> Result<Self, TimelineExclusionError> { ) -> Result<Self, TimelineExclusionError> {
// Lock order: this is the only place we take both locks. During drop() we only // Lock order: this is the only place we take both locks. During drop() we only
// lock creating_timelines // lock creating_timelines
let timelines = owning_tenant.timelines.lock().unwrap(); let timelines = owning_tenant.timelines.lock().unwrap();
let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
let mut creating_timelines: std::sync::MutexGuard< let mut creating_timelines: std::sync::MutexGuard<
'_, '_,
std::collections::HashSet<TimelineId>, std::collections::HashSet<TimelineId>,
> = owning_tenant.timelines_creating.lock().unwrap(); > = owning_tenant.timelines_creating.lock().unwrap();
if let Some(existing) = timelines.get(&timeline_id) { if let Some(existing) = timelines.get(&timeline_id) {
return Err(TimelineExclusionError::AlreadyExists { Err(TimelineExclusionError::AlreadyExists(existing.clone()))
existing: TimelineOrOffloaded::Timeline(existing.clone()), } else if creating_timelines.contains(&timeline_id) {
arg: idempotency, Err(TimelineExclusionError::AlreadyCreating)
}); } else {
creating_timelines.insert(timeline_id);
Ok(Self {
owning_tenant,
timeline_id,
timeline_path,
})
} }
if !allow_offloaded {
if let Some(existing) = timelines_offloaded.get(&timeline_id) {
return Err(TimelineExclusionError::AlreadyExists {
existing: TimelineOrOffloaded::Offloaded(existing.clone()),
arg: idempotency,
});
}
}
if creating_timelines.contains(&timeline_id) {
return Err(TimelineExclusionError::AlreadyCreating);
}
creating_timelines.insert(timeline_id);
Ok(Self {
owning_tenant,
timeline_id,
timeline_path,
idempotency,
})
} }
} }

View File

@@ -16,24 +16,18 @@ use tokio_epoll_uring::{System, SystemHandle};
use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::on_fatal_io_error;
use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; use crate::metrics::tokio_epoll_uring as metrics;
#[derive(Clone)] #[derive(Clone)]
struct ThreadLocalState(Arc<ThreadLocalStateInner>); struct ThreadLocalState(Arc<ThreadLocalStateInner>);
struct ThreadLocalStateInner { struct ThreadLocalStateInner {
cell: tokio::sync::OnceCell<SystemHandle<metrics::ThreadLocalMetrics>>, cell: tokio::sync::OnceCell<SystemHandle>,
launch_attempts: AtomicU32, launch_attempts: AtomicU32,
/// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
thread_local_state_id: u64, thread_local_state_id: u64,
} }
impl Drop for ThreadLocalStateInner {
fn drop(&mut self) {
THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id);
}
}
impl ThreadLocalState { impl ThreadLocalState {
pub fn new() -> Self { pub fn new() -> Self {
Self(Arc::new(ThreadLocalStateInner { Self(Arc::new(ThreadLocalStateInner {
@@ -77,8 +71,7 @@ pub async fn thread_local_system() -> Handle {
&fake_cancel, &fake_cancel,
) )
.await; .await;
let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id); let res = System::launch()
let res = System::launch_with_metrics(per_system_metrics)
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
.await; .await;
match res { match res {
@@ -93,7 +86,6 @@ pub async fn thread_local_system() -> Handle {
emit_launch_failure_process_stats(); emit_launch_failure_process_stats();
}); });
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
Err(()) Err(())
} }
// abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
@@ -123,7 +115,7 @@ fn emit_launch_failure_process_stats() {
// number of threads // number of threads
// rss / system memory usage generally // rss / system memory usage generally
let tokio_epoll_uring::metrics::GlobalMetrics { let tokio_epoll_uring::metrics::Metrics {
systems_created, systems_created,
systems_destroyed, systems_destroyed,
} = tokio_epoll_uring::metrics::global(); } = tokio_epoll_uring::metrics::global();
@@ -190,7 +182,7 @@ fn emit_launch_failure_process_stats() {
pub struct Handle(ThreadLocalState); pub struct Handle(ThreadLocalState);
impl std::ops::Deref for Handle { impl std::ops::Deref for Handle {
type Target = SystemHandle<metrics::ThreadLocalMetrics>; type Target = SystemHandle;
fn deref(&self) -> &Self::Target { fn deref(&self) -> &Self::Target {
self.0 self.0

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,6 @@ OBJS = \
file_cache.o \ file_cache.o \
hll.o \ hll.o \
libpagestore.o \ libpagestore.o \
logical_replication_monitor.o \
neon.o \ neon.o \
neon_pgversioncompat.o \ neon_pgversioncompat.o \
neon_perf_counters.o \ neon_perf_counters.o \

View File

@@ -1,253 +0,0 @@
#include <limits.h>
#include <string.h>
#include <dirent.h>
#include <signal.h>
#include "postgres.h"
#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/slot.h"
#include "storage/fd.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/guc.h"
#include "utils/wait_event.h"
#include "logical_replication_monitor.h"
#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
static int logical_replication_max_snap_files = 300;
PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
/*
 * qsort() comparator that orders XLogRecPtr values from highest to lowest.
 *
 * Returns 1 when *a is smaller than *b, -1 when it is larger, and 0 when
 * both are equal, so the largest LSN ends up first in the sorted array.
 */
static int
LsnDescComparator(const void *a, const void *b)
{
	const XLogRecPtr left = *(const XLogRecPtr *) a;
	const XLogRecPtr right = *(const XLogRecPtr *) b;

	/* (left < right) yields 1, (left > right) yields -1, equality yields 0 */
	return (left < right) - (left > right);
}
/*
 * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
 * next gc would leave not more than logical_replication_max_snap_files; all
 * slots having lower restart_lsn should be dropped.
 *
 * The LSN of every file is parsed from its "%X-%X.snap" name; names that do
 * not parse are logged and ignored.
 *
 * Returns 0 when nothing has to be dropped: the limit is disabled (negative
 * setting), the directory holds no snapshot files, or fewer files than the
 * limit were found. Otherwise returns the cutoff LSN.
 */
static XLogRecPtr
get_num_snap_files_lsn_threshold(void)
{
	DIR		   *dirdesc;
	struct dirent *de;
	char	   *snap_path = "pg_logical/snapshots/";
	int			lsns_allocated = 1024;
	int			lsns_num = 0;
	XLogRecPtr *lsns;
	XLogRecPtr	cutoff;

	/* a negative limit disables the check altogether */
	if (logical_replication_max_snap_files < 0)
		return 0;

	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);

	/* find all .snap files and get their lsns */
	dirdesc = AllocateDir(snap_path);
	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
	{
		XLogRecPtr	lsn;
		uint32		hi;
		uint32		lo;

		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;

		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
		{
			ereport(LOG,
					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
			continue;
		}

		lsn = ((uint64) hi) << 32 | lo;
		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));

		/* grow the array geometrically once it is full */
		if (lsns_allocated == lsns_num)
		{
			lsns_allocated *= 2;
			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
		}
		lsns[lsns_num++] = lsn;
	}

	/* sort by lsn desc */
	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);

	/* and take cutoff at logical_replication_max_snap_files */
	if (lsns_num == 0 || logical_replication_max_snap_files > lsns_num)
	{
		/* have less files than cutoff (or none at all): nothing to drop */
		cutoff = 0;
	}
	else
	{
		/*
		 * A limit of 0 is a valid setting (the GUC minimum is -1) and must
		 * not index lsns[-1]. No file may remain in that case, so place the
		 * cutoff just past the newest snapshot.
		 */
		if (logical_replication_max_snap_files == 0)
			cutoff = lsns[0] + 1;
		else
			cutoff = lsns[logical_replication_max_snap_files - 1];

		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
	}

	pfree(lsns);
	FreeDir(dirdesc);
	return cutoff;
}
/*
 * Set up the logical replication monitor.
 *
 * Defines the neon.logical_replication_max_snap_files setting (default 300,
 * range -1..INT_MAX, PGC_SIGHUP context) and registers the background worker
 * whose entry point is LogicalSlotsMonitorMain in the "neon" library.
 */
void
InitLogicalReplicationMonitor(void)
{
	BackgroundWorker worker;

	DefineCustomIntVariable("neon.logical_replication_max_snap_files",
							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
							NULL,
							&logical_replication_max_snap_files,
							300, -1, INT_MAX,
							PGC_SIGHUP,
							0,
							NULL, NULL, NULL);

	/* start from an all-zero descriptor, then fill in what we need */
	memset(&worker, 0, sizeof(worker));

	/* identity of the worker and where to find its entry point */
	snprintf(worker.bgw_name, BGW_MAXLEN, "Logical replication monitor");
	snprintf(worker.bgw_type, BGW_MAXLEN, "Logical replication monitor");
	snprintf(worker.bgw_library_name, BGW_MAXLEN, "neon");
	snprintf(worker.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");

	/* scheduling: shared memory access, started once recovery has finished */
	worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
	worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
	worker.bgw_restart_time = 5;

	/* nobody is notified and no argument is passed to the entry point */
	worker.bgw_notify_pid = 0;
	worker.bgw_main_arg = (Datum) 0;

	RegisterBackgroundWorker(&worker);
}
/*
 * Background worker entry point of the logical replication monitor.
 *
 * Unused logical replication slots pin WAL and prevent deletion of snapshots.
 * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
 * need too many .snap files.
 *
 * Each iteration reloads the configuration if a reload is pending, asks
 * get_num_snap_files_lsn_threshold() for the restart_lsn horizon and drops
 * every in-use logical slot whose restart_lsn is below it, sending SIGTERM to
 * the slot's owner first when the slot is active. It then waits on the
 * process latch with a timeout of LS_MONITOR_CHECK_INTERVAL ms. main_arg is
 * not used. The function never returns normally.
 */
void
LogicalSlotsMonitorMain(Datum main_arg)
{
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
for (;;)
{
XLogRecPtr cutoff_lsn;
/* In case of a SIGHUP, just reload the configuration. */
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
}
/*
 * If there are too many .snap files, drop the logical slots whose
 * restart_lsn lies below the computed horizon to prevent aux files
 * bloat. A horizon of 0 means there is nothing to drop.
 */
cutoff_lsn = get_num_snap_files_lsn_threshold();
if (cutoff_lsn > 0)
{
for (int i = 0; i < max_replication_slots; i++)
{
char slot_name[NAMEDATALEN];
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
XLogRecPtr restart_lsn;
/* find the name; the control lock is held while the slot entry is inspected */
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
/* Consider only logical replication slots */
if (!s->in_use || !SlotIsLogical(s))
{
LWLockRelease(ReplicationSlotControlLock);
continue;
}
/* do we need to drop it? restart_lsn is read under the slot's spinlock */
SpinLockAcquire(&s->mutex);
restart_lsn = s->data.restart_lsn;
SpinLockRelease(&s->mutex);
if (restart_lsn >= cutoff_lsn)
{
LWLockRelease(ReplicationSlotControlLock);
continue;
}
/* copy the name into a local buffer; the drop below goes by name */
strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
LWLockRelease(ReplicationSlotControlLock);
/* now try to drop it, killing owner before if any */
for (;;)
{
pid_t active_pid;
SpinLockAcquire(&s->mutex);
active_pid = s->active_pid;
SpinLockRelease(&s->mutex);
if (active_pid == 0)
{
/*
 * Slot is released, try to drop it. Though of course
 * it could have been reacquired, so drop can ERROR
 * out. Similarly it could have been dropped in the
 * meanwhile.
 *
 * In principle we could remove pg_try/pg_catch, that
 * would restart the whole bgworker.
 */
/* leave any sleep started by the timed wait in the else-branch below */
ConditionVariableCancelSleep();
PG_TRY();
{
ReplicationSlotDrop(slot_name, true);
elog(LOG, "ls_monitor: slot %s dropped", slot_name);
}
PG_CATCH();
{
/* log ERROR and reset elog stack */
EmitErrorReport();
FlushErrorState();
elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
}
PG_END_TRY();
/* one drop attempt per slot and pass, whether or not it succeeded */
break;
}
else
{
/* kill the owner and wait for release */
elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
(void) kill(active_pid, SIGTERM);
/* We shouldn't get stuck, but to be safe add timeout. */
ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
}
}
}
}
/* sleep until the latch is set or the check interval elapses */
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
LS_MONITOR_CHECK_INTERVAL,
PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
}
}

View File

@@ -1,6 +0,0 @@
#ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__
#define __NEON_LOGICAL_REPLICATION_MONITOR_H__
void InitLogicalReplicationMonitor(void);
#endif

View File

@@ -14,22 +14,32 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "access/subtrans.h" #include "access/subtrans.h"
#include "access/twophase.h" #include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog.h" #include "access/xlog.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "catalog/pg_type.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/logical.h" #include "replication/logical.h"
#include "replication/slot.h" #include "replication/slot.h"
#include "replication/walsender.h" #include "replication/walsender.h"
#include "storage/proc.h" #include "storage/proc.h"
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "funcapi.h" #include "funcapi.h"
#include "access/htup_details.h" #include "access/htup_details.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#include "utils/pg_lsn.h" #include "utils/pg_lsn.h"
#include "utils/guc.h" #include "utils/guc.h"
#include "utils/guc_tables.h" #include "utils/guc_tables.h"
#include "utils/wait_event.h"
#include "extension_server.h" #include "extension_server.h"
#include "neon.h" #include "neon.h"
#include "walproposer.h"
#include "pagestore_client.h"
#include "control_plane_connector.h" #include "control_plane_connector.h"
#include "logical_replication_monitor.h"
#include "walsender_hooks.h" #include "walsender_hooks.h"
#if PG_MAJORVERSION_NUM >= 16 #if PG_MAJORVERSION_NUM >= 16
#include "storage/ipc.h" #include "storage/ipc.h"
@@ -38,6 +48,7 @@
PG_MODULE_MAGIC; PG_MODULE_MAGIC;
void _PG_init(void); void _PG_init(void);
static int logical_replication_max_snap_files = 300;
static int running_xacts_overflow_policy; static int running_xacts_overflow_policy;
@@ -71,6 +82,237 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
{NULL, 0, false} {NULL, 0, false}
}; };
/*
 * Set up the logical replication monitor.
 *
 * Defines the neon.logical_replication_max_snap_files setting (default 300,
 * range -1..INT_MAX, PGC_SIGHUP context) and registers the background worker
 * whose entry point is LogicalSlotsMonitorMain in the "neon" library.
 */
static void
InitLogicalReplicationMonitor(void)
{
	BackgroundWorker worker;

	DefineCustomIntVariable("neon.logical_replication_max_snap_files",
							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
							NULL,
							&logical_replication_max_snap_files,
							300, -1, INT_MAX,
							PGC_SIGHUP,
							0,
							NULL, NULL, NULL);

	/* start from an all-zero descriptor, then fill in what we need */
	memset(&worker, 0, sizeof(worker));

	/* identity of the worker and where to find its entry point */
	snprintf(worker.bgw_name, BGW_MAXLEN, "Logical replication monitor");
	snprintf(worker.bgw_type, BGW_MAXLEN, "Logical replication monitor");
	snprintf(worker.bgw_library_name, BGW_MAXLEN, "neon");
	snprintf(worker.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");

	/* scheduling: shared memory access, started once recovery has finished */
	worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
	worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
	worker.bgw_restart_time = 5;

	/* nobody is notified and no argument is passed to the entry point */
	worker.bgw_notify_pid = 0;
	worker.bgw_main_arg = (Datum) 0;

	RegisterBackgroundWorker(&worker);
}
/*
 * qsort() comparator that orders XLogRecPtr values from highest to lowest.
 *
 * Returns 1 when *a is smaller than *b, -1 when it is larger, and 0 when
 * both are equal, so the largest LSN ends up first in the sorted array.
 */
static int
LsnDescComparator(const void *a, const void *b)
{
	const XLogRecPtr left = *(const XLogRecPtr *) a;
	const XLogRecPtr right = *(const XLogRecPtr *) b;

	/* (left < right) yields 1, (left > right) yields -1, equality yields 0 */
	return (left < right) - (left > right);
}
/*
 * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
 * next gc would leave not more than logical_replication_max_snap_files; all
 * slots having lower restart_lsn should be dropped.
 *
 * The LSN of every file is parsed from its "%X-%X.snap" name; names that do
 * not parse are logged and ignored.
 *
 * Returns 0 when nothing has to be dropped: the limit is disabled (negative
 * setting), the directory holds no snapshot files, or fewer files than the
 * limit were found. Otherwise returns the cutoff LSN.
 */
static XLogRecPtr
get_num_snap_files_lsn_threshold(void)
{
	DIR		   *dirdesc;
	struct dirent *de;
	char	   *snap_path = "pg_logical/snapshots/";
	int			lsns_allocated = 1024;
	int			lsns_num = 0;
	XLogRecPtr *lsns;
	XLogRecPtr	cutoff;

	/* a negative limit disables the check altogether */
	if (logical_replication_max_snap_files < 0)
		return 0;

	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);

	/* find all .snap files and get their lsns */
	dirdesc = AllocateDir(snap_path);
	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
	{
		XLogRecPtr	lsn;
		uint32		hi;
		uint32		lo;

		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;

		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
		{
			ereport(LOG,
					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
			continue;
		}

		lsn = ((uint64) hi) << 32 | lo;
		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));

		/* grow the array geometrically once it is full */
		if (lsns_allocated == lsns_num)
		{
			lsns_allocated *= 2;
			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
		}
		lsns[lsns_num++] = lsn;
	}

	/* sort by lsn desc */
	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);

	/* and take cutoff at logical_replication_max_snap_files */
	if (lsns_num == 0 || logical_replication_max_snap_files > lsns_num)
	{
		/* have less files than cutoff (or none at all): nothing to drop */
		cutoff = 0;
	}
	else
	{
		/*
		 * A limit of 0 is a valid setting (the GUC minimum is -1) and must
		 * not index lsns[-1]. No file may remain in that case, so place the
		 * cutoff just past the newest snapshot.
		 */
		if (logical_replication_max_snap_files == 0)
			cutoff = lsns[0] + 1;
		else
			cutoff = lsns[logical_replication_max_snap_files - 1];

		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
	}

	pfree(lsns);
	FreeDir(dirdesc);
	return cutoff;
}
#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
/*
 * Background worker entry point of the logical replication monitor.
 *
 * Unused logical replication slots pin WAL and prevent deletion of snapshots.
 * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
 * need too many .snap files.
 *
 * Each iteration reloads the configuration if a reload is pending, asks
 * get_num_snap_files_lsn_threshold() for the restart_lsn horizon and drops
 * every in-use logical slot whose restart_lsn is below it, sending SIGTERM to
 * the slot's owner first when the slot is active. It then waits on the
 * process latch with a timeout of LS_MONITOR_CHECK_INTERVAL ms. main_arg is
 * not used. The function never returns normally.
 */
PGDLLEXPORT void
LogicalSlotsMonitorMain(Datum main_arg)
{
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
for (;;)
{
XLogRecPtr cutoff_lsn;
/* In case of a SIGHUP, just reload the configuration. */
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
}
/*
 * If there are too many .snap files, drop the logical slots whose
 * restart_lsn lies below the computed horizon to prevent aux files
 * bloat. A horizon of 0 means there is nothing to drop.
 */
cutoff_lsn = get_num_snap_files_lsn_threshold();
if (cutoff_lsn > 0)
{
for (int i = 0; i < max_replication_slots; i++)
{
char slot_name[NAMEDATALEN];
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
XLogRecPtr restart_lsn;
/* find the name; the control lock is held while the slot entry is inspected */
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
/* Consider only logical replication slots */
if (!s->in_use || !SlotIsLogical(s))
{
LWLockRelease(ReplicationSlotControlLock);
continue;
}
/* do we need to drop it? restart_lsn is read under the slot's spinlock */
SpinLockAcquire(&s->mutex);
restart_lsn = s->data.restart_lsn;
SpinLockRelease(&s->mutex);
if (restart_lsn >= cutoff_lsn)
{
LWLockRelease(ReplicationSlotControlLock);
continue;
}
/* copy the name into a local buffer; the drop below goes by name */
strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
LWLockRelease(ReplicationSlotControlLock);
/* now try to drop it, killing owner before if any */
for (;;)
{
pid_t active_pid;
SpinLockAcquire(&s->mutex);
active_pid = s->active_pid;
SpinLockRelease(&s->mutex);
if (active_pid == 0)
{
/*
 * Slot is released, try to drop it. Though of course
 * it could have been reacquired, so drop can ERROR
 * out. Similarly it could have been dropped in the
 * meanwhile.
 *
 * In principle we could remove pg_try/pg_catch, that
 * would restart the whole bgworker.
 */
/* leave any sleep started by the timed wait in the else-branch below */
ConditionVariableCancelSleep();
PG_TRY();
{
ReplicationSlotDrop(slot_name, true);
elog(LOG, "ls_monitor: slot %s dropped", slot_name);
}
PG_CATCH();
{
/* log ERROR and reset elog stack */
EmitErrorReport();
FlushErrorState();
elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
}
PG_END_TRY();
/* one drop attempt per slot and pass, whether or not it succeeded */
break;
}
else
{
/* kill the owner and wait for release */
elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
(void) kill(active_pid, SIGTERM);
/* We shouldn't get stuck, but to be safe add timeout. */
ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
}
}
}
}
/* sleep until the latch is set or the check interval elapses */
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
LS_MONITOR_CHECK_INTERVAL,
PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
}
}
/* /*
* XXX: These private to procarray.c, but we need them here. * XXX: These private to procarray.c, but we need them here.
*/ */
@@ -425,6 +667,7 @@ _PG_init(void)
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitLogicalReplicationMonitor(); InitLogicalReplicationMonitor();
InitControlPlaneConnector(); InitControlPlaneConnector();
pg_init_extension_server(); pg_init_extension_server();

10
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]] [[package]]
name = "aiohappyeyeballs" name = "aiohappyeyeballs"
@@ -3118,13 +3118,13 @@ files = [
[[package]] [[package]]
name = "werkzeug" name = "werkzeug"
version = "3.0.6" version = "3.0.3"
description = "The comprehensive WSGI web application library." description = "The comprehensive WSGI web application library."
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
{file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
] ]
[package.dependencies] [package.dependencies]
@@ -3406,4 +3406,4 @@ cffi = ["cffi (>=1.11)"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf" content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"

View File

@@ -1,5 +1,5 @@
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info}; use tracing::{info, warn};
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
use crate::auth::{self, AuthFlow}; use crate::auth::{self, AuthFlow};
@@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext(
secret: AuthSecret, secret: AuthSecret,
config: &'static AuthenticationConfig, config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> { ) -> auth::Result<ComputeCredentials> {
debug!("cleartext auth flow override is enabled, proceeding"); warn!("cleartext auth flow override is enabled, proceeding");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext); ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client // pause the timer while we communicate with the client
@@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
info: ComputeUserInfoNoEndpoint, info: ComputeUserInfoNoEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> { ) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
debug!("project not specified, resorting to the password hack auth flow"); warn!("project not specified, resorting to the password hack auth flow");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext); ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client // pause the timer while we communicate with the client

View File

@@ -5,7 +5,6 @@ use std::time::{Duration, SystemTime};
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
use dashmap::DashMap; use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo; use jose_jwk::crypto::KeyInfo;
use reqwest::{redirect, Client};
use serde::de::Visitor; use serde::de::Visitor;
use serde::{Deserialize, Deserializer}; use serde::{Deserialize, Deserializer};
use signature::Verifier; use signature::Verifier;
@@ -25,7 +24,6 @@ const MIN_RENEW: Duration = Duration::from_secs(30);
const AUTO_RENEW: Duration = Duration::from_secs(300); const AUTO_RENEW: Duration = Duration::from_secs(300);
const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_RENEW: Duration = Duration::from_secs(3600);
const MAX_JWK_BODY_SIZE: usize = 64 * 1024; const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
const JWKS_USER_AGENT: &str = "neon-proxy";
/// How to get the JWT auth rules /// How to get the JWT auth rules
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
@@ -52,6 +50,7 @@ pub(crate) struct AuthRule {
pub(crate) role_names: Vec<RoleNameInt>, pub(crate) role_names: Vec<RoleNameInt>,
} }
#[derive(Default)]
pub struct JwkCache { pub struct JwkCache {
client: reqwest::Client, client: reqwest::Client,
@@ -358,20 +357,6 @@ impl JwkCache {
} }
} }
impl Default for JwkCache {
fn default() -> Self {
let client = Client::builder()
.user_agent(JWKS_USER_AGENT)
.redirect(redirect::Policy::none())
.build()
.expect("using &str and standard redirect::Policy");
JwkCache {
client,
map: DashMap::default(),
}
}
}
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> {
use ecdsa::Signature; use ecdsa::Signature;
use signature::Verifier; use signature::Verifier;

View File

@@ -21,10 +21,7 @@ use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserIn
use crate::cache::Cached; use crate::cache::Cached;
use crate::config::AuthenticationConfig; use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::provider::{CachedNodeInfo, ControlPlaneBackend};
use crate::control_plane::provider::{
CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend,
};
use crate::control_plane::{self, Api, AuthSecret}; use crate::control_plane::{self, Api, AuthSecret};
use crate::intern::EndpointIdInt; use crate::intern::EndpointIdInt;
use crate::metrics::Metrics; use crate::metrics::Metrics;
@@ -35,38 +32,19 @@ use crate::stream::Stream;
use crate::types::{EndpointCacheKey, EndpointId, RoleName}; use crate::types::{EndpointCacheKey, EndpointId, RoleName};
use crate::{scram, stream}; use crate::{scram, stream};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality /// The [crate::serverless] module can authenticate either using control-plane
pub enum MaybeOwned<'a, T> { /// to get authentication state, or by using JWKs stored in the filesystem.
Owned(T), #[derive(Clone, Copy)]
Borrowed(&'a T), pub enum ServerlessBackend<'a> {
}
impl<T> std::ops::Deref for MaybeOwned<'_, T> {
type Target = T;
fn deref(&self) -> &Self::Target {
match self {
MaybeOwned::Owned(t) => t,
MaybeOwned::Borrowed(t) => t,
}
}
}
/// This type serves two purposes:
///
/// * When `T` is `()`, it's just a regular auth backend selector
/// which we use in [`crate::config::ProxyConfig`].
///
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
/// this helps us provide the credentials only to those auth
/// backends which require them for the authentication process.
pub enum Backend<'a, T> {
/// Cloud API (V2). /// Cloud API (V2).
ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), ControlPlane(&'a ControlPlaneBackend),
/// Local proxy uses configured auth credentials and does not wake compute /// Local proxy uses configured auth credentials and does not wake compute
Local(MaybeOwned<'a, LocalBackend>), Local(&'a LocalBackend),
} }
#[cfg(test)]
use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret};
#[cfg(test)] #[cfg(test)]
pub(crate) trait TestBackend: Send + Sync + 'static { pub(crate) trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>; fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
@@ -83,56 +61,20 @@ impl Clone for Box<dyn TestBackend> {
} }
} }
impl std::fmt::Display for Backend<'_, ()> { impl std::fmt::Display for ControlPlaneBackend {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
Self::ControlPlane(api, ()) => match &**api { ControlPlaneBackend::Management(endpoint) => fmt
ControlPlaneBackend::Management(endpoint) => fmt .debug_tuple("ControlPlane::Management")
.debug_tuple("ControlPlane::Management") .field(&endpoint.url())
.field(&endpoint.url()) .finish(),
.finish(), #[cfg(any(test, feature = "testing"))]
#[cfg(any(test, feature = "testing"))] ControlPlaneBackend::PostgresMock(endpoint) => fmt
ControlPlaneBackend::PostgresMock(endpoint) => fmt .debug_tuple("ControlPlane::PostgresMock")
.debug_tuple("ControlPlane::PostgresMock") .field(&endpoint.url())
.field(&endpoint.url()) .finish(),
.finish(), #[cfg(test)]
#[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
},
Self::Local(_) => fmt.debug_tuple("Local").finish(),
}
}
}
impl<T> Backend<'_, T> {
/// Very similar to [`std::option::Option::as_ref`].
/// This helps us pass structured config to async tasks.
pub(crate) fn as_ref(&self) -> Backend<'_, &T> {
match self {
Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x),
Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
}
}
}
impl<'a, T> Backend<'a, T> {
/// Very similar to [`std::option::Option::map`].
/// Maps [`Backend<T>`] to [`Backend<R>`] by applying
/// a function to a contained value.
pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> {
match self {
Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)),
Self::Local(l) => Backend::Local(l),
}
}
}
impl<'a, T, E> Backend<'a, Result<T, E>> {
/// Very similar to [`std::option::Option::transpose`].
/// This is most useful for error handling.
pub(crate) fn transpose(self) -> Result<Backend<'a, T>, E> {
match self {
Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)),
Self::Local(l) => Ok(Backend::Local(l)),
} }
} }
} }
@@ -399,96 +341,79 @@ async fn authenticate_with_secret(
classic::authenticate(ctx, info, client, config, secret).await classic::authenticate(ctx, info, client, config, secret).await
} }
impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { impl ControlPlaneBackend {
/// Get username from the credentials.
pub(crate) fn get_user(&self) -> &str {
match self {
Self::ControlPlane(_, user_info) => &user_info.user,
Self::Local(_) => "local",
}
}
/// Authenticate the client via the requested backend, possibly using credentials.
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub(crate) async fn authenticate( pub(crate) async fn authenticate(
self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>, client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool, allow_cleartext: bool,
config: &'static AuthenticationConfig, config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>, endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<Backend<'a, ComputeCredentials>> { ) -> auth::Result<ControlPlaneComputeBackend> {
let res = match self { info!(
Self::ControlPlane(api, user_info) => { user = &*user_info.user,
info!( project = user_info.endpoint(),
user = &*user_info.user, "performing authentication using the console"
project = user_info.endpoint(), );
"performing authentication using the console"
);
let credentials = auth_quirks( let credentials = auth_quirks(
ctx, ctx,
&*api, self,
user_info, user_info,
client, client,
allow_cleartext, allow_cleartext,
config, config,
endpoint_rate_limiter, endpoint_rate_limiter,
) )
.await?; .await?;
Backend::ControlPlane(api, credentials)
}
Self::Local(_) => {
return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
}
};
info!("user successfully authenticated"); info!("user successfully authenticated");
Ok(res) Ok(ControlPlaneComputeBackend {
api: self,
creds: credentials,
})
}
pub(crate) fn attach_to_credentials(
&self,
creds: ComputeCredentials,
) -> ControlPlaneComputeBackend {
ControlPlaneComputeBackend { api: self, creds }
} }
} }
impl Backend<'_, ComputeUserInfo> { pub struct ControlPlaneComputeBackend<'a> {
pub(crate) async fn get_role_secret( api: &'a ControlPlaneBackend,
&self, creds: ComputeCredentials,
ctx: &RequestMonitoring,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
match self {
Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await,
Self::Local(_) => Ok(Cached::new_uncached(None)),
}
}
pub(crate) async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
match self {
Self::ControlPlane(api, user_info) => {
api.get_allowed_ips_and_secret(ctx, user_info).await
}
Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
}
}
} }
#[async_trait::async_trait] #[async_trait::async_trait]
impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { impl ComputeConnectBackend for ControlPlaneComputeBackend<'static> {
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> { ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
match self { self.api.wake_compute(ctx, &self.creds.info).await
Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
}
} }
fn get_keys(&self) -> &ComputeCredentialKeys { fn get_keys(&self) -> &ComputeCredentialKeys {
match self { &self.creds.keys
Self::ControlPlane(_, creds) => &creds.keys, }
Self::Local(_) => &ComputeCredentialKeys::None, }
}
#[async_trait::async_trait]
impl ComputeConnectBackend for LocalBackend {
async fn wake_compute(
&self,
_ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
Ok(Cached::new_uncached(self.node_info.clone()))
}
fn get_keys(&self) -> &ComputeCredentialKeys {
&ComputeCredentialKeys::None
} }
} }

View File

@@ -1,7 +1,7 @@
//! Client authentication mechanisms. //! Client authentication mechanisms.
pub mod backend; pub mod backend;
pub use backend::Backend; pub use backend::ServerlessBackend;
mod credentials; mod credentials;
pub(crate) use credentials::{ pub(crate) use credentials::{

View File

@@ -203,7 +203,7 @@ async fn main() -> anyhow::Result<()> {
let task = serverless::task_main( let task = serverless::task_main(
config, config,
auth_backend, auth::ServerlessBackend::Local(auth_backend),
http_listener, http_listener,
shutdown.clone(), shutdown.clone(),
Arc::new(CancellationHandlerMain::new( Arc::new(CancellationHandlerMain::new(
@@ -295,12 +295,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
} }
/// auth::Backend is created at proxy startup, and lives forever. /// auth::Backend is created at proxy startup, and lives forever.
fn build_auth_backend( fn build_auth_backend(args: &LocalProxyCliArgs) -> anyhow::Result<&'static LocalBackend> {
args: &LocalProxyCliArgs, let auth_backend = LocalBackend::new(args.postgres, args.compute_ctl.clone());
) -> anyhow::Result<&'static auth::Backend<'static, ()>> {
let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
LocalBackend::new(args.postgres, args.compute_ctl.clone()),
));
Ok(Box::leak(Box::new(auth_backend))) Ok(Box::leak(Box::new(auth_backend)))
} }

View File

@@ -13,13 +13,14 @@ use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_config::Region; use aws_config::Region;
use futures::future::Either; use futures::future::Either;
use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::jwt::JwkCache;
use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend};
use proxy::cancellation::{CancelMap, CancellationHandler}; use proxy::cancellation::{CancelMap, CancellationHandler};
use proxy::config::{ use proxy::config::{
self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig,
ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2,
}; };
use proxy::context::parquet::ParquetUploadArgs; use proxy::context::parquet::ParquetUploadArgs;
use proxy::control_plane::provider::ControlPlaneBackend;
use proxy::http::health_server::AppMetrics; use proxy::http::health_server::AppMetrics;
use proxy::metrics::Metrics; use proxy::metrics::Metrics;
use proxy::rate_limiter::{ use proxy::rate_limiter::{
@@ -137,6 +138,9 @@ struct ProxyCliArgs {
/// size of the threadpool for password hashing /// size of the threadpool for password hashing
#[clap(long, default_value_t = 4)] #[clap(long, default_value_t = 4)]
scram_thread_pool_size: u8, scram_thread_pool_size: u8,
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_dynamic_rate_limiter: bool,
/// Endpoint rate limiter max number of requests per second. /// Endpoint rate limiter max number of requests per second.
/// ///
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`. /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
@@ -464,7 +468,7 @@ async fn main() -> anyhow::Result<()> {
if let Some(serverless_listener) = serverless_listener { if let Some(serverless_listener) = serverless_listener {
client_tasks.spawn(serverless::task_main( client_tasks.spawn(serverless::task_main(
config, config,
auth_backend, auth::ServerlessBackend::ControlPlane(auth_backend),
serverless_listener, serverless_listener,
cancellation_token.clone(), cancellation_token.clone(),
cancellation_handler.clone(), cancellation_handler.clone(),
@@ -512,40 +516,38 @@ async fn main() -> anyhow::Result<()> {
)); ));
} }
if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { if let Either::Left(ControlPlaneBackend::Management(api)) = &auth_backend {
if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) {
match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {}
(None, None) => {} (client1, client2) => {
(client1, client2) => { let cache = api.caches.project_info.clone();
let cache = api.caches.project_info.clone(); if let Some(client) = client1 {
if let Some(client) = client1 { maintenance_tasks.spawn(notifications::task_main(
maintenance_tasks.spawn(notifications::task_main( client,
client, cache.clone(),
cache.clone(), cancel_map.clone(),
cancel_map.clone(), args.region.clone(),
args.region.clone(), ));
));
}
if let Some(client) = client2 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
} }
if let Some(client) = client2 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
} }
if let Some(regional_redis_client) = regional_redis_client { }
let cache = api.caches.endpoints_cache.clone(); if let Some(regional_redis_client) = regional_redis_client {
let con = regional_redis_client; let cache = api.caches.endpoints_cache.clone();
let span = tracing::info_span!("endpoints_cache"); let con = regional_redis_client;
maintenance_tasks.spawn( let span = tracing::info_span!("endpoints_cache");
async move { cache.do_read(con, cancellation_token.clone()).await } maintenance_tasks.spawn(
.instrument(span), async move { cache.do_read(con, cancellation_token.clone()).await }
); .instrument(span),
} );
} }
} }
@@ -612,6 +614,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
and metric-collection-interval must be specified" and metric-collection-interval must be specified"
), ),
}; };
if !args.disable_dynamic_rate_limiter {
bail!("dynamic rate limiter should be disabled");
}
let config::ConcurrencyLockOptions { let config::ConcurrencyLockOptions {
shards, shards,
@@ -688,7 +693,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
/// auth::Backend is created at proxy startup, and lives forever. /// auth::Backend is created at proxy startup, and lives forever.
fn build_auth_backend( fn build_auth_backend(
args: &ProxyCliArgs, args: &ProxyCliArgs,
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> { ) -> anyhow::Result<Either<&'static ControlPlaneBackend, &'static ConsoleRedirectBackend>> {
match &args.auth_backend { match &args.auth_backend {
AuthBackendType::Console => { AuthBackendType::Console => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
@@ -738,8 +743,7 @@ fn build_auth_backend(
locks, locks,
wake_compute_endpoint_rate_limiter, wake_compute_endpoint_rate_limiter,
); );
let api = control_plane::provider::ControlPlaneBackend::Management(api); let auth_backend = control_plane::provider::ControlPlaneBackend::Management(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
let config = Box::leak(Box::new(auth_backend)); let config = Box::leak(Box::new(auth_backend));
@@ -750,9 +754,7 @@ fn build_auth_backend(
AuthBackendType::Postgres => { AuthBackendType::Postgres => {
let url = args.auth_endpoint.parse()?; let url = args.auth_endpoint.parse()?;
let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); let auth_backend = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
let config = Box::leak(Box::new(auth_backend)); let config = Box::leak(Box::new(auth_backend));

View File

@@ -56,7 +56,7 @@ pub(crate) trait ConnectMechanism {
} }
#[async_trait] #[async_trait]
pub(crate) trait ComputeConnectBackend { pub(crate) trait ComputeConnectBackend: Send + Sync + 'static {
async fn wake_compute( async fn wake_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
@@ -98,10 +98,10 @@ impl ConnectMechanism for TcpMechanism<'_> {
/// Try to connect to the compute node, retrying if necessary. /// Try to connect to the compute node, retrying if necessary.
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>( pub(crate) async fn connect_to_compute<M: ConnectMechanism>(
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
mechanism: &M, mechanism: &M,
user_info: &B, user_info: &dyn ComputeConnectBackend,
allow_self_signed_compute: bool, allow_self_signed_compute: bool,
wake_compute_retry_config: RetryConfig, wake_compute_retry_config: RetryConfig,
connect_to_compute_retry_config: RetryConfig, connect_to_compute_retry_config: RetryConfig,

View File

@@ -26,6 +26,7 @@ use self::passthrough::ProxyPassthrough;
use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal};
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::control_plane::provider::ControlPlaneBackend;
use crate::error::ReportableError; use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard}; use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::protocol2::read_proxy_protocol; use crate::protocol2::read_proxy_protocol;
@@ -54,7 +55,7 @@ pub async fn run_until_cancelled<F: std::future::Future>(
pub async fn task_main( pub async fn task_main(
config: &'static ProxyConfig, config: &'static ProxyConfig,
auth_backend: &'static auth::Backend<'static, ()>, auth_backend: &'static ControlPlaneBackend,
listener: tokio::net::TcpListener, listener: tokio::net::TcpListener,
cancellation_token: CancellationToken, cancellation_token: CancellationToken,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
@@ -241,7 +242,7 @@ impl ReportableError for ClientRequestError {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>( pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig, config: &'static ProxyConfig,
auth_backend: &'static auth::Backend<'static, ()>, auth_backend: &'static ControlPlaneBackend,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
stream: S, stream: S,
@@ -282,20 +283,17 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let common_names = tls.map(|tls| &tls.common_names); let common_names = tls.map(|tls| &tls.common_names);
// Extract credentials which we're going to use for auth. // Extract credentials which we're going to use for auth.
let result = auth_backend let result = auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names);
.as_ref()
.map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
.transpose();
let user_info = match result { let user_info = match result {
Ok(user_info) => user_info, Ok(user_info) => user_info,
Err(e) => stream.throw_error(e).await?, Err(e) => stream.throw_error(e).await?,
}; };
let user = user_info.get_user().to_owned(); let user = user_info.user.clone();
let user_info = match user_info let user_info = match auth_backend
.authenticate( .authenticate(
ctx, ctx,
user_info,
&mut stream, &mut stream,
mode.allow_cleartext(), mode.allow_cleartext(),
&config.authentication_config, &config.authentication_config,

View File

@@ -6,6 +6,7 @@ use std::time::Duration;
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use async_trait::async_trait; use async_trait::async_trait;
use auth::backend::ControlPlaneComputeBackend;
use http::StatusCode; use http::StatusCode;
use retry::{retry_after, ShouldRetryWakeCompute}; use retry::{retry_after, ShouldRetryWakeCompute};
use rstest::rstest; use rstest::rstest;
@@ -19,7 +20,7 @@ use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry; use super::retry::CouldRetry;
use super::*; use super::*;
use crate::auth::backend::{ use crate::auth::backend::{
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, TestBackend,
}; };
use crate::config::{CertResolver, RetryConfig}; use crate::config::{CertResolver, RetryConfig};
use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status}; use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status};
@@ -566,19 +567,21 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
fn helper_create_connect_info( fn helper_create_connect_info(
mechanism: &TestConnectMechanism, mechanism: &TestConnectMechanism,
) -> auth::Backend<'static, ComputeCredentials> { ) -> ControlPlaneComputeBackend<'static> {
let user_info = auth::Backend::ControlPlane( let api = Box::leak(Box::new(ControlPlaneBackend::Test(Box::new(
MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), mechanism.clone(),
ComputeCredentials { ))));
info: ComputeUserInfo {
endpoint: "endpoint".into(), let creds = ComputeCredentials {
user: "user".into(), info: ComputeUserInfo {
options: NeonOptions::parse_options_raw(""), endpoint: "endpoint".into(),
}, user: "user".into(),
keys: ComputeCredentialKeys::Password("password".into()), options: NeonOptions::parse_options_raw(""),
}, },
); keys: ComputeCredentialKeys::Password("password".into()),
user_info };
api.attach_to_credentials(creds)
} }
#[tokio::test] #[tokio::test]

View File

@@ -11,10 +11,10 @@ use crate::metrics::{
}; };
use crate::proxy::retry::{retry_after, should_retry}; use crate::proxy::retry::{retry_after, should_retry};
pub(crate) async fn wake_compute<B: ComputeConnectBackend>( pub(crate) async fn wake_compute(
num_retries: &mut u32, num_retries: &mut u32,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
api: &B, api: &dyn ComputeConnectBackend,
config: RetryConfig, config: RetryConfig,
) -> Result<CachedNodeInfo, WakeComputeError> { ) -> Result<CachedNodeInfo, WakeComputeError> {
let retry_type = RetryType::WakeCompute; let retry_type = RetryType::WakeCompute;

View File

@@ -15,9 +15,9 @@ use super::conn_pool::poll_client;
use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
use super::http_conn_pool::{self, poll_http2_client, Send}; use super::http_conn_pool::{self, poll_http2_client, Send};
use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::local::{LocalBackend, StaticAuthRules};
use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; use crate::auth::{check_peer_addr_is_in_list, AuthError, ServerlessBackend};
use crate::compute; use crate::compute;
use crate::compute_ctl::{ use crate::compute_ctl::{
ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
@@ -26,11 +26,11 @@ use crate::config::ProxyConfig;
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
use crate::control_plane::locks::ApiLocks; use crate::control_plane::locks::ApiLocks;
use crate::control_plane::provider::ApiLockError; use crate::control_plane::provider::{ApiLockError, ControlPlaneBackend};
use crate::control_plane::CachedNodeInfo; use crate::control_plane::{Api, CachedNodeInfo};
use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::intern::EndpointIdInt; use crate::intern::EndpointIdInt;
use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::connect_compute::{ComputeConnectBackend, ConnectMechanism};
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
use crate::rate_limiter::EndpointRateLimiter; use crate::rate_limiter::EndpointRateLimiter;
use crate::types::{EndpointId, Host}; use crate::types::{EndpointId, Host};
@@ -41,7 +41,6 @@ pub(crate) struct PoolingBackend {
pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>, pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
pub(crate) config: &'static ProxyConfig, pub(crate) config: &'static ProxyConfig,
pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>, pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
} }
@@ -49,12 +48,13 @@ impl PoolingBackend {
pub(crate) async fn authenticate_with_password( pub(crate) async fn authenticate_with_password(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
auth_backend: &ControlPlaneBackend,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
password: &[u8], password: &[u8],
) -> Result<ComputeCredentials, AuthError> { ) -> Result<ComputeCredentials, AuthError> {
let user_info = user_info.clone(); let (allowed_ips, maybe_secret) = auth_backend
let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); .get_allowed_ips_and_secret(ctx, user_info)
let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; .await?;
if self.config.authentication_config.ip_allowlist_check_enabled if self.config.authentication_config.ip_allowlist_check_enabled
&& !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
{ {
@@ -68,7 +68,7 @@ impl PoolingBackend {
} }
let cached_secret = match maybe_secret { let cached_secret = match maybe_secret {
Some(secret) => secret, Some(secret) => secret,
None => backend.get_role_secret(ctx).await?, None => auth_backend.get_role_secret(ctx, user_info).await?,
}; };
let secret = match cached_secret.value.clone() { let secret = match cached_secret.value.clone() {
@@ -103,7 +103,7 @@ impl PoolingBackend {
} }
}; };
res.map(|key| ComputeCredentials { res.map(|key| ComputeCredentials {
info: user_info, info: user_info.clone(),
keys: key, keys: key,
}) })
} }
@@ -111,11 +111,12 @@ impl PoolingBackend {
pub(crate) async fn authenticate_with_jwt( pub(crate) async fn authenticate_with_jwt(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
auth_backend: ServerlessBackend<'static>,
user_info: &ComputeUserInfo, user_info: &ComputeUserInfo,
jwt: String, jwt: String,
) -> Result<ComputeCredentials, AuthError> { ) -> Result<ComputeCredentials, AuthError> {
match &self.auth_backend { match auth_backend {
crate::auth::Backend::ControlPlane(console, ()) => { ServerlessBackend::ControlPlane(console) => {
self.config self.config
.authentication_config .authentication_config
.jwks_cache .jwks_cache
@@ -123,7 +124,7 @@ impl PoolingBackend {
ctx, ctx,
user_info.endpoint.clone(), user_info.endpoint.clone(),
&user_info.user, &user_info.user,
&**console, console,
&jwt, &jwt,
) )
.await .await
@@ -134,7 +135,7 @@ impl PoolingBackend {
keys: crate::auth::backend::ComputeCredentialKeys::None, keys: crate::auth::backend::ComputeCredentialKeys::None,
}) })
} }
crate::auth::Backend::Local(_) => { ServerlessBackend::Local(_) => {
let keys = self let keys = self
.config .config
.authentication_config .authentication_config
@@ -164,6 +165,7 @@ impl PoolingBackend {
pub(crate) async fn connect_to_compute( pub(crate) async fn connect_to_compute(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
auth_backend: ServerlessBackend<'static>,
conn_info: ConnInfo, conn_info: ConnInfo,
keys: ComputeCredentials, keys: ComputeCredentials,
force_new: bool, force_new: bool,
@@ -182,7 +184,14 @@ impl PoolingBackend {
let conn_id = uuid::Uuid::new_v4(); let conn_id = uuid::Uuid::new_v4();
tracing::Span::current().record("conn_id", display(conn_id)); tracing::Span::current().record("conn_id", display(conn_id));
info!(%conn_id, "pool: opening a new connection '{conn_info}'"); info!(%conn_id, "pool: opening a new connection '{conn_info}'");
let backend = self.auth_backend.as_ref().map(|()| keys);
let api = match auth_backend {
ServerlessBackend::ControlPlane(cplane) => {
&cplane.attach_to_credentials(keys) as &dyn ComputeConnectBackend
}
ServerlessBackend::Local(local_proxy) => local_proxy as &dyn ComputeConnectBackend,
};
crate::proxy::connect_compute::connect_to_compute( crate::proxy::connect_compute::connect_to_compute(
ctx, ctx,
&TokioMechanism { &TokioMechanism {
@@ -191,7 +200,7 @@ impl PoolingBackend {
pool: self.pool.clone(), pool: self.pool.clone(),
locks: &self.config.connect_compute_locks, locks: &self.config.connect_compute_locks,
}, },
&backend, api,
false, // do not allow self signed compute for http flow false, // do not allow self signed compute for http flow
self.config.wake_compute_retry_config, self.config.wake_compute_retry_config,
self.config.connect_to_compute_retry_config, self.config.connect_to_compute_retry_config,
@@ -204,6 +213,7 @@ impl PoolingBackend {
pub(crate) async fn connect_to_local_proxy( pub(crate) async fn connect_to_local_proxy(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
auth_backend: &'static ControlPlaneBackend,
conn_info: ConnInfo, conn_info: ConnInfo,
) -> Result<http_conn_pool::Client<Send>, HttpConnError> { ) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
info!("pool: looking for an existing connection"); info!("pool: looking for an existing connection");
@@ -214,7 +224,8 @@ impl PoolingBackend {
let conn_id = uuid::Uuid::new_v4(); let conn_id = uuid::Uuid::new_v4();
tracing::Span::current().record("conn_id", display(conn_id)); tracing::Span::current().record("conn_id", display(conn_id));
info!(%conn_id, "pool: opening a new connection '{conn_info}'"); info!(%conn_id, "pool: opening a new connection '{conn_info}'");
let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials {
let backend = auth_backend.attach_to_credentials(ComputeCredentials {
info: ComputeUserInfo { info: ComputeUserInfo {
user: conn_info.user_info.user.clone(), user: conn_info.user_info.user.clone(),
endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)), endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)),
@@ -249,26 +260,20 @@ impl PoolingBackend {
pub(crate) async fn connect_to_local_postgres( pub(crate) async fn connect_to_local_postgres(
&self, &self,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
auth_backend: &LocalBackend,
conn_info: ConnInfo, conn_info: ConnInfo,
) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> { ) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> {
if let Some(client) = self.local_pool.get(ctx, &conn_info)? { if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
return Ok(client); return Ok(client);
} }
let local_backend = match &self.auth_backend {
auth::Backend::ControlPlane(_, ()) => {
unreachable!("only local_proxy can connect to local postgres")
}
auth::Backend::Local(local) => local,
};
if !self.local_pool.initialized(&conn_info) { if !self.local_pool.initialized(&conn_info) {
// only install and grant usage one at a time. // only install and grant usage one at a time.
let _permit = local_backend.initialize.acquire().await.unwrap(); let _permit = auth_backend.initialize.acquire().await.unwrap();
// check again for race // check again for race
if !self.local_pool.initialized(&conn_info) { if !self.local_pool.initialized(&conn_info) {
local_backend auth_backend
.compute_ctl .compute_ctl
.install_extension(&ExtensionInstallRequest { .install_extension(&ExtensionInstallRequest {
extension: EXT_NAME, extension: EXT_NAME,
@@ -277,7 +282,7 @@ impl PoolingBackend {
}) })
.await?; .await?;
local_backend auth_backend
.compute_ctl .compute_ctl
.grant_role(&SetRoleGrantsRequest { .grant_role(&SetRoleGrantsRequest {
schema: EXT_SCHEMA, schema: EXT_SCHEMA,
@@ -295,7 +300,7 @@ impl PoolingBackend {
tracing::Span::current().record("conn_id", display(conn_id)); tracing::Span::current().record("conn_id", display(conn_id));
info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'");
let mut node_info = local_backend.node_info.clone(); let mut node_info = auth_backend.node_info.clone();
let (key, jwk) = create_random_jwk(); let (key, jwk) = create_random_jwk();

View File

@@ -32,7 +32,6 @@ use hyper_util::rt::TokioExecutor;
use hyper_util::server::conn::auto::Builder; use hyper_util::server::conn::auto::Builder;
use rand::rngs::StdRng; use rand::rngs::StdRng;
use rand::SeedableRng; use rand::SeedableRng;
use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID};
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::{TcpListener, TcpStream}; use tokio::net::{TcpListener, TcpStream};
use tokio::time::timeout; use tokio::time::timeout;
@@ -42,6 +41,7 @@ use tokio_util::task::TaskTracker;
use tracing::{info, warn, Instrument}; use tracing::{info, warn, Instrument};
use utils::http::error::ApiError; use utils::http::error::ApiError;
use crate::auth::ServerlessBackend;
use crate::cancellation::CancellationHandlerMain; use crate::cancellation::CancellationHandlerMain;
use crate::config::ProxyConfig; use crate::config::ProxyConfig;
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
@@ -56,7 +56,7 @@ pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api";
pub async fn task_main( pub async fn task_main(
config: &'static ProxyConfig, config: &'static ProxyConfig,
auth_backend: &'static crate::auth::Backend<'static, ()>, auth_backend: ServerlessBackend<'static>,
ws_listener: TcpListener, ws_listener: TcpListener,
cancellation_token: CancellationToken, cancellation_token: CancellationToken,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
@@ -112,7 +112,6 @@ pub async fn task_main(
local_pool, local_pool,
pool: Arc::clone(&conn_pool), pool: Arc::clone(&conn_pool),
config, config,
auth_backend,
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
}); });
let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() { let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
@@ -185,6 +184,7 @@ pub async fn task_main(
Box::pin(connection_handler( Box::pin(connection_handler(
config, config,
auth_backend,
backend, backend,
connections2, connections2,
cancellation_handler, cancellation_handler,
@@ -290,6 +290,7 @@ async fn connection_startup(
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
async fn connection_handler( async fn connection_handler(
config: &'static ProxyConfig, config: &'static ProxyConfig,
auth_backend: ServerlessBackend<'static>,
backend: Arc<PoolingBackend>, backend: Arc<PoolingBackend>,
connections: TaskTracker, connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
@@ -310,18 +311,7 @@ async fn connection_handler(
hyper_util::rt::TokioIo::new(conn), hyper_util::rt::TokioIo::new(conn),
hyper::service::service_fn(move |req: hyper::Request<Incoming>| { hyper::service::service_fn(move |req: hyper::Request<Incoming>| {
// First HTTP request shares the same session ID // First HTTP request shares the same session ID
let mut session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
if matches!(backend.auth_backend, crate::auth::Backend::Local(_)) {
// take session_id from request, if given.
if let Some(id) = req
.headers()
.get(&NEON_REQUEST_ID)
.and_then(|id| uuid::Uuid::try_parse_ascii(id.as_bytes()).ok())
{
session_id = id;
}
}
// Cancel the current inflight HTTP request if the request stream is closed. // Cancel the current inflight HTTP request if the request stream is closed.
// This is slightly different to `_cancel_connection` in that // This is slightly different to `_cancel_connection` in that
@@ -335,6 +325,7 @@ async fn connection_handler(
request_handler( request_handler(
req, req,
config, config,
auth_backend,
backend.clone(), backend.clone(),
connections.clone(), connections.clone(),
cancellation_handler.clone(), cancellation_handler.clone(),
@@ -347,15 +338,8 @@ async fn connection_handler(
.map_ok_or_else(api_error_into_response, |r| r), .map_ok_or_else(api_error_into_response, |r| r),
); );
async move { async move {
let mut res = handler.await; let res = handler.await;
cancel_request.disarm(); cancel_request.disarm();
// add the session ID to the response
if let Ok(resp) = &mut res {
resp.headers_mut()
.append(&NEON_REQUEST_ID, uuid_to_header_value(session_id));
}
res res
} }
}), }),
@@ -381,6 +365,7 @@ async fn connection_handler(
async fn request_handler( async fn request_handler(
mut request: hyper::Request<Incoming>, mut request: hyper::Request<Incoming>,
config: &'static ProxyConfig, config: &'static ProxyConfig,
auth_backend: ServerlessBackend<'static>,
backend: Arc<PoolingBackend>, backend: Arc<PoolingBackend>,
ws_connections: TaskTracker, ws_connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,
@@ -401,6 +386,10 @@ async fn request_handler(
if config.http_config.accept_websockets if config.http_config.accept_websockets
&& framed_websockets::upgrade::is_upgrade_request(&request) && framed_websockets::upgrade::is_upgrade_request(&request)
{ {
let ServerlessBackend::ControlPlane(auth_backend) = auth_backend else {
return json_response(StatusCode::BAD_REQUEST, "query is not supported");
};
let ctx = RequestMonitoring::new( let ctx = RequestMonitoring::new(
session_id, session_id,
peer_addr, peer_addr,
@@ -418,7 +407,7 @@ async fn request_handler(
async move { async move {
if let Err(e) = websocket::serve_websocket( if let Err(e) = websocket::serve_websocket(
config, config,
backend.auth_backend, auth_backend,
ctx, ctx,
websocket, websocket,
cancellation_handler, cancellation_handler,
@@ -444,9 +433,16 @@ async fn request_handler(
); );
let span = ctx.span(); let span = ctx.span();
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) sql_over_http::handle(
.instrument(span) config,
.await ctx,
request,
auth_backend,
backend,
http_cancellation_token,
)
.instrument(span)
.await
} else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS {
Response::builder() Response::builder()
.header("Allow", "OPTIONS, POST") .header("Allow", "OPTIONS, POST")

View File

@@ -23,7 +23,6 @@ use typed_json::json;
use url::Url; use url::Url;
use urlencoding; use urlencoding;
use utils::http::error::ApiError; use utils::http::error::ApiError;
use uuid::Uuid;
use super::backend::{LocalProxyConnError, PoolingBackend}; use super::backend::{LocalProxyConnError, PoolingBackend};
use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool::{AuthData, ConnInfoWithAuth};
@@ -31,10 +30,11 @@ use super::conn_pool_lib::{self, ConnInfo};
use super::http_util::json_response; use super::http_util::json_response;
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
use super::local_conn_pool; use super::local_conn_pool;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::backend::{ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo};
use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError, ServerlessBackend};
use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::control_plane::provider::ControlPlaneBackend;
use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::metrics::{HttpDirection, Metrics}; use crate::metrics::{HttpDirection, Metrics};
use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::proxy::{run_until_cancelled, NeonOptions};
@@ -64,8 +64,6 @@ enum Payload {
Batch(BatchQueryData), Batch(BatchQueryData),
} }
pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id");
static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -243,10 +241,11 @@ pub(crate) async fn handle(
config: &'static ProxyConfig, config: &'static ProxyConfig,
ctx: RequestMonitoring, ctx: RequestMonitoring,
request: Request<Incoming>, request: Request<Incoming>,
auth_backend: ServerlessBackend<'static>,
backend: Arc<PoolingBackend>, backend: Arc<PoolingBackend>,
cancel: CancellationToken, cancel: CancellationToken,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> { ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
let result = handle_inner(cancel, config, &ctx, request, backend).await; let result = handle_inner(cancel, config, &ctx, request, auth_backend, backend).await;
let mut response = match result { let mut response = match result {
Ok(r) => { Ok(r) => {
@@ -501,6 +500,7 @@ async fn handle_inner(
config: &'static ProxyConfig, config: &'static ProxyConfig,
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
request: Request<Incoming>, request: Request<Incoming>,
auth_backend: ServerlessBackend<'static>,
backend: Arc<PoolingBackend>, backend: Arc<PoolingBackend>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> { ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
let _requeset_gauge = Metrics::get() let _requeset_gauge = Metrics::get()
@@ -525,7 +525,11 @@ async fn handle_inner(
match conn_info.auth { match conn_info.auth {
AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => { AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => {
handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, backend).await let ServerlessBackend::ControlPlane(cplane) = auth_backend else {
panic!("auth_broker must be configured with a control-plane auth backend.")
};
handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, cplane, backend).await
} }
auth => { auth => {
handle_db_inner( handle_db_inner(
@@ -535,6 +539,7 @@ async fn handle_inner(
request, request,
conn_info.conn_info, conn_info.conn_info,
auth, auth,
auth_backend,
backend, backend,
) )
.await .await
@@ -542,6 +547,7 @@ async fn handle_inner(
} }
} }
#[allow(clippy::too_many_arguments)]
async fn handle_db_inner( async fn handle_db_inner(
cancel: CancellationToken, cancel: CancellationToken,
config: &'static ProxyConfig, config: &'static ProxyConfig,
@@ -549,6 +555,7 @@ async fn handle_db_inner(
request: Request<Incoming>, request: Request<Incoming>,
conn_info: ConnInfo, conn_info: ConnInfo,
auth: AuthData, auth: AuthData,
auth_backend: ServerlessBackend<'static>,
backend: Arc<PoolingBackend>, backend: Arc<PoolingBackend>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> { ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
// //
@@ -591,45 +598,58 @@ async fn handle_db_inner(
.map_err(SqlOverHttpError::from), .map_err(SqlOverHttpError::from),
); );
let authenticate_and_connect = Box::pin( let authenticate_and_connect = Box::pin(async {
async { let creds = match auth {
let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); AuthData::Password(pw) => {
let ServerlessBackend::ControlPlane(cplane) = auth_backend else {
return Err(SqlOverHttpError::ConnInfo(
ConnInfoError::MissingCredentials(Credentials::BearerJwt),
));
};
let keys = match auth { backend
AuthData::Password(pw) => { .authenticate_with_password(ctx, cplane, &conn_info.user_info, &pw)
backend .await
.authenticate_with_password(ctx, &conn_info.user_info, &pw) .map_err(HttpConnError::from)?
.await? }
} AuthData::Jwt(jwt) => backend
AuthData::Jwt(jwt) => { .authenticate_with_jwt(ctx, auth_backend, &conn_info.user_info, jwt)
backend .await
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .map_err(HttpConnError::from)?,
.await? };
}
};
let client = match keys.keys { let client = match (creds.keys, auth_backend) {
ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { (ComputeCredentialKeys::JwtPayload(payload), ServerlessBackend::Local(local)) => {
let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; let mut client = backend
let (cli_inner, _dsc) = client.client_inner(); .connect_to_local_postgres(ctx, local, conn_info)
cli_inner.set_jwt_session(&payload).await?; .await?;
Client::Local(client) let (cli_inner, _dsc) = client.client_inner();
} cli_inner.set_jwt_session(&payload).await?;
_ => { Client::Local(client)
let client = backend }
.connect_to_compute(ctx, conn_info, keys, !allow_pool) (keys, auth_backend) => {
.await?; let client = backend
Client::Remote(client) .connect_to_compute(
} ctx,
}; auth_backend,
conn_info,
ComputeCredentials {
keys,
info: creds.info,
},
!allow_pool,
)
.await
.map_err(HttpConnError::from)?;
Client::Remote(client)
}
};
// not strictly necessary to mark success here, // not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else // but it's just insurance for if we forget it somewhere else
ctx.success(); ctx.success();
Ok::<_, HttpConnError>(client) Ok::<_, SqlOverHttpError>(client)
} });
.map_err(SqlOverHttpError::from),
);
let (payload, mut client) = match run_until_cancelled( let (payload, mut client) = match run_until_cancelled(
// Run both operations in parallel // Run both operations in parallel
@@ -709,25 +729,27 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[
&TXN_DEFERRABLE, &TXN_DEFERRABLE,
]; ];
pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue {
let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH];
HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..]))
.expect("uuid hyphenated format should be all valid header characters")
}
async fn handle_auth_broker_inner( async fn handle_auth_broker_inner(
ctx: &RequestMonitoring, ctx: &RequestMonitoring,
request: Request<Incoming>, request: Request<Incoming>,
conn_info: ConnInfo, conn_info: ConnInfo,
jwt: String, jwt: String,
auth_backend: &'static ControlPlaneBackend,
backend: Arc<PoolingBackend>, backend: Arc<PoolingBackend>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> { ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
backend backend
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .authenticate_with_jwt(
ctx,
ServerlessBackend::ControlPlane(auth_backend),
&conn_info.user_info,
jwt,
)
.await .await
.map_err(HttpConnError::from)?; .map_err(HttpConnError::from)?;
let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; let mut client = backend
.connect_to_local_proxy(ctx, auth_backend, conn_info)
.await?;
let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql"); let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql");
@@ -741,7 +763,6 @@ async fn handle_auth_broker_inner(
req = req.header(h, hv); req = req.header(h, hv);
} }
} }
req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id()));
let req = req let req = req
.body(body) .body(body)

View File

@@ -15,6 +15,7 @@ use tracing::warn;
use crate::cancellation::CancellationHandlerMain; use crate::cancellation::CancellationHandlerMain;
use crate::config::ProxyConfig; use crate::config::ProxyConfig;
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::control_plane::provider::ControlPlaneBackend;
use crate::error::{io_error, ReportableError}; use crate::error::{io_error, ReportableError};
use crate::metrics::Metrics; use crate::metrics::Metrics;
use crate::proxy::{handle_client, ClientMode, ErrorSource}; use crate::proxy::{handle_client, ClientMode, ErrorSource};
@@ -125,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
pub(crate) async fn serve_websocket( pub(crate) async fn serve_websocket(
config: &'static ProxyConfig, config: &'static ProxyConfig,
auth_backend: &'static crate::auth::Backend<'static, ()>, auth_backend: &'static ControlPlaneBackend,
ctx: RequestMonitoring, ctx: RequestMonitoring,
websocket: OnUpgrade, websocket: OnUpgrade,
cancellation_handler: Arc<CancellationHandlerMain>, cancellation_handler: Arc<CancellationHandlerMain>,

View File

@@ -23,7 +23,7 @@ backoff = "^2.2.1"
pytest-lazy-fixture = "^0.6.3" pytest-lazy-fixture = "^0.6.3"
prometheus-client = "^0.14.1" prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0" pytest-timeout = "^2.1.0"
Werkzeug = "^3.0.6" Werkzeug = "^3.0.3"
pytest-order = "^1.1.0" pytest-order = "^1.1.0"
allure-pytest = "^2.13.2" allure-pytest = "^2.13.2"
pytest-asyncio = "^0.21.0" pytest-asyncio = "^0.21.0"

View File

@@ -193,8 +193,6 @@ struct Args {
/// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
/// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
/// if it weren't for `eviction_min_resident` preventing that. /// if it weren't for `eviction_min_resident` preventing that.
///
/// Also defines interval for eviction retries.
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
eviction_min_resident: Duration, eviction_min_resident: Duration,
} }

View File

@@ -14,10 +14,12 @@ use std::path::Path;
use std::time::Instant; use std::time::Instant;
use crate::control_file_upgrade::downgrade_v9_to_v8; use crate::control_file_upgrade::downgrade_v9_to_v8;
use crate::control_file_upgrade::upgrade_control_file;
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
use crate::state::{EvictionState, TimelinePersistentState}; use crate::state::{EvictionState, TimelinePersistentState};
use utils::bin_ser::LeSer; use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 9; pub const SK_FORMAT_VERSION: u32 = 9;
@@ -52,12 +54,13 @@ pub struct FileStorage {
impl FileStorage { impl FileStorage {
/// Initialize storage by loading state from disk. /// Initialize storage by loading state from disk.
pub fn restore_new(timeline_dir: &Utf8Path, no_sync: bool) -> Result<FileStorage> { pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
let state = Self::load_control_file_from_dir(timeline_dir)?; let timeline_dir = get_timeline_dir(conf, ttid);
let state = Self::load_control_file_from_dir(&timeline_dir)?;
Ok(FileStorage { Ok(FileStorage {
timeline_dir: timeline_dir.to_path_buf(), timeline_dir,
no_sync, no_sync: conf.no_sync,
state, state,
last_persist_at: Instant::now(), last_persist_at: Instant::now(),
}) })
@@ -68,16 +71,16 @@ impl FileStorage {
/// Note: we normally call this in temp directory for atomic init, so /// Note: we normally call this in temp directory for atomic init, so
/// interested in FileStorage as a result only in tests. /// interested in FileStorage as a result only in tests.
pub async fn create_new( pub async fn create_new(
timeline_dir: &Utf8Path, dir: Utf8PathBuf,
conf: &SafeKeeperConf,
state: TimelinePersistentState, state: TimelinePersistentState,
no_sync: bool,
) -> Result<FileStorage> { ) -> Result<FileStorage> {
// we don't support creating new timelines in offloaded state // we don't support creating new timelines in offloaded state
assert!(matches!(state.eviction_state, EvictionState::Present)); assert!(matches!(state.eviction_state, EvictionState::Present));
let mut store = FileStorage { let mut store = FileStorage {
timeline_dir: timeline_dir.to_path_buf(), timeline_dir: dir,
no_sync, no_sync: conf.no_sync,
state: state.clone(), state: state.clone(),
last_persist_at: Instant::now(), last_persist_at: Instant::now(),
}; };
@@ -236,46 +239,89 @@ mod test {
use tokio::fs; use tokio::fs;
use utils::lsn::Lsn; use utils::lsn::Lsn;
const NO_SYNC: bool = true; fn stub_conf() -> SafeKeeperConf {
let workdir = camino_tempfile::tempdir().unwrap().into_path();
SafeKeeperConf {
workdir,
..SafeKeeperConf::dummy()
}
}
#[tokio::test] async fn load_from_control_file(
async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { conf: &SafeKeeperConf,
let tempdir = camino_tempfile::tempdir()?; ttid: &TenantTimelineId,
let mut state = TimelinePersistentState::empty(); ) -> Result<(FileStorage, TimelinePersistentState)> {
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; let timeline_dir = get_timeline_dir(conf, ttid);
fs::create_dir_all(&timeline_dir)
.await
.expect("failed to create timeline dir");
Ok((
FileStorage::restore_new(ttid, conf)?,
FileStorage::load_control_file_from_dir(&timeline_dir)?,
))
}
// Make a change. async fn create(
state.commit_lsn = Lsn(42); conf: &SafeKeeperConf,
storage.persist(&state).await?; ttid: &TenantTimelineId,
) -> Result<(FileStorage, TimelinePersistentState)> {
// Reload the state. It should match the previously persisted state. let timeline_dir = get_timeline_dir(conf, ttid);
let loaded_state = FileStorage::load_control_file_from_dir(tempdir.path())?; fs::create_dir_all(&timeline_dir)
assert_eq!(loaded_state, state); .await
Ok(()) .expect("failed to create timeline dir");
let state = TimelinePersistentState::empty();
let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?;
Ok((storage, state))
} }
#[tokio::test] #[tokio::test]
async fn test_safekeeper_state_checksum_mismatch() -> anyhow::Result<()> { async fn test_read_write_safekeeper_state() {
let tempdir = camino_tempfile::tempdir()?; let conf = stub_conf();
let mut state = TimelinePersistentState::empty(); let ttid = TenantTimelineId::generate();
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; {
let (mut storage, mut state) =
// Make a change. create(&conf, &ttid).await.expect("failed to create state");
state.commit_lsn = Lsn(42); // change something
storage.persist(&state).await?; state.commit_lsn = Lsn(42);
storage
// Change the first byte to fail checksum validation. .persist(&state)
let ctrl_path = tempdir.path().join(CONTROL_FILE_NAME); .await
let mut data = fs::read(&ctrl_path).await?; .expect("failed to persist state");
data[0] += 1; }
fs::write(&ctrl_path, &data).await?;
let (_, state) = load_from_control_file(&conf, &ttid)
// Loading the file should fail checksum validation. .await
if let Err(err) = FileStorage::load_control_file_from_dir(tempdir.path()) { .expect("failed to read state");
assert!(err.to_string().contains("control file checksum mismatch")) assert_eq!(state.commit_lsn, Lsn(42));
} else { }
panic!("expected checksum error")
#[tokio::test]
async fn test_safekeeper_state_checksum_mismatch() {
let conf = stub_conf();
let ttid = TenantTimelineId::generate();
{
let (mut storage, mut state) =
create(&conf, &ttid).await.expect("failed to read state");
// change something
state.commit_lsn = Lsn(42);
storage
.persist(&state)
.await
.expect("failed to persist state");
}
let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).await.unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data)
.await
.expect("failed to write control file");
match load_from_control_file(&conf, &ttid).await {
Err(err) => assert!(err
.to_string()
.contains("safekeeper control file checksum mismatch")),
Ok(_) => panic!("expected error"),
} }
Ok(())
} }
} }

View File

@@ -154,7 +154,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
new_state.peer_horizon_lsn = request.until_lsn; new_state.peer_horizon_lsn = request.until_lsn;
new_state.backup_lsn = new_backup_lsn; new_state.backup_lsn = new_backup_lsn;
FileStorage::create_new(&tli_dir_path, new_state.clone(), conf.no_sync).await?; FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?;
// now we have a ready timeline in a temp directory // now we have a ready timeline in a temp directory
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;

View File

@@ -262,6 +262,14 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
check_permission(&request, Some(ttid.tenant_id))?; check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
// Note: with evicted timelines it should work better than de-evicting them and
// streaming; probably start_snapshot would copy partial s3 file to dest path
// and stream control file, or return WalResidentTimeline if timeline is not
// evicted.
let tli = tli
.wal_residence_guard()
.await
.map_err(ApiError::InternalServerError)?;
// To stream the body use wrap_stream which wants Stream of Result<Bytes>, // To stream the body use wrap_stream which wants Stream of Result<Bytes>,
// so create the chan and write to it in another task. // so create the chan and write to it in another task.

View File

@@ -113,7 +113,6 @@ impl SafeKeeperConf {
impl SafeKeeperConf { impl SafeKeeperConf {
#[cfg(test)] #[cfg(test)]
#[allow(unused)]
fn dummy() -> Self { fn dummy() -> Self {
SafeKeeperConf { SafeKeeperConf {
workdir: Utf8PathBuf::from("./"), workdir: Utf8PathBuf::from("./"),

View File

@@ -8,7 +8,6 @@ use serde::{Deserialize, Serialize};
use std::{ use std::{
cmp::min, cmp::min,
io::{self, ErrorKind}, io::{self, ErrorKind},
sync::Arc,
}; };
use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
use tokio_tar::{Archive, Builder, Header}; use tokio_tar::{Archive, Builder, Header};
@@ -26,8 +25,8 @@ use crate::{
routes::TimelineStatus, routes::TimelineStatus,
}, },
safekeeper::Term, safekeeper::Term,
state::{EvictionState, TimelinePersistentState}, state::TimelinePersistentState,
timeline::{Timeline, WalResidentTimeline}, timeline::WalResidentTimeline,
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
wal_backup, wal_backup,
wal_storage::open_wal_file, wal_storage::open_wal_file,
@@ -44,33 +43,18 @@ use utils::{
/// Stream tar archive of timeline to tx. /// Stream tar archive of timeline to tx.
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
pub async fn stream_snapshot( pub async fn stream_snapshot(
tli: Arc<Timeline>, tli: WalResidentTimeline,
source: NodeId, source: NodeId,
destination: NodeId, destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>, tx: mpsc::Sender<Result<Bytes>>,
) { ) {
match tli.try_wal_residence_guard().await { if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
Err(e) => { // Error type/contents don't matter as they can't reach the client
tx.send(Err(anyhow!("Error checking residence: {:#}", e))) // (hyper likely doesn't do anything with it), but http stream will be
.await // prematurely terminated. It would be nice to try to send the error in
.ok(); // trailers though.
} tx.send(Err(anyhow!("snapshot failed"))).await.ok();
Ok(maybe_resident_tli) => { error!("snapshot failed: {:#}", e);
if let Err(e) = match maybe_resident_tli {
Some(resident_tli) => {
stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
.await
}
None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
} {
// Error type/contents don't matter as they can't reach the client
// (hyper likely doesn't do anything with it), but http stream will be
// prematurely terminated. It would be nice to try to send the error in
// trailers though.
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
error!("snapshot failed: {:#}", e);
}
}
} }
} }
@@ -96,10 +80,12 @@ impl Drop for SnapshotContext {
} }
} }
/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel. pub async fn stream_snapshot_guts(
fn prepare_tar_stream( tli: WalResidentTimeline,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>, tx: mpsc::Sender<Result<Bytes>>,
) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> { ) -> Result<()> {
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>; // tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
// use SinkWriter as a Write impl. That is, // use SinkWriter as a Write impl. That is,
// - create Sink from the tx. It returns PollSendError if chan is closed. // - create Sink from the tx. It returns PollSendError if chan is closed.
@@ -114,38 +100,12 @@ fn prepare_tar_stream(
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
// into CopyToBytes. This is a data copy. // into CopyToBytes. This is a data copy.
let copy_to_bytes = CopyToBytes::new(oksink); let copy_to_bytes = CopyToBytes::new(oksink);
let writer = SinkWriter::new(copy_to_bytes); let mut writer = SinkWriter::new(copy_to_bytes);
let pinned_writer = Box::pin(writer); let pinned_writer = std::pin::pin!(writer);
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
// which is also likely suboptimal. // which is also likely suboptimal.
Builder::new_non_terminated(pinned_writer) let mut ar = Builder::new_non_terminated(pinned_writer);
}
/// Implementation of snapshot for an offloaded timeline, only reads control file
pub(crate) async fn stream_snapshot_offloaded_guts(
tli: Arc<Timeline>,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
let mut ar = prepare_tar_stream(tx);
tli.snapshot_offloaded(&mut ar, source, destination).await?;
ar.finish().await?;
Ok(())
}
/// Implementation of snapshot for a timeline which is resident (includes some segment data)
pub async fn stream_snapshot_resident_guts(
tli: WalResidentTimeline,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
let mut ar = prepare_tar_stream(tx);
let bctx = tli.start_snapshot(&mut ar, source, destination).await?; let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
pausable_failpoint!("sk-snapshot-after-list-pausable"); pausable_failpoint!("sk-snapshot-after-list-pausable");
@@ -178,70 +138,6 @@ pub async fn stream_snapshot_resident_guts(
Ok(()) Ok(())
} }
impl Timeline {
/// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and
/// pass a modified control file into the provided tar stream (nothing with data segments on disk, since
/// we are offloaded and there aren't any)
async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
self: &Arc<Timeline>,
ar: &mut tokio_tar::Builder<W>,
source: NodeId,
destination: NodeId,
) -> Result<()> {
// Take initial copy of control file, then release state lock
let mut control_file = {
let shared_state = self.write_shared_state().await;
let control_file = TimelinePersistentState::clone(shared_state.sk.state());
// Rare race: we got unevicted between entering function and reading control file.
// We error out and let API caller retry.
if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
bail!("Timeline was un-evicted during snapshot, please retry");
}
control_file
};
// Modify the partial segment of the in-memory copy for the control file to
// point to the destination safekeeper.
let replace = control_file
.partial_backup
.replace_uploaded_segment(source, destination)?;
let Some(replace) = replace else {
// In Manager:: ready_for_eviction, we do not permit eviction unless the timeline
// has a partial segment. It is unexpected that
anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
};
tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
// Optimistically try to copy the partial segment to the destination's path: this
// can fail if the timeline was un-evicted and modified in the background.
let remote_timeline_path = &self.remote_path;
wal_backup::copy_partial_segment(
&replace.previous.remote_path(remote_timeline_path),
&replace.current.remote_path(remote_timeline_path),
)
.await?;
// Since the S3 copy succeeded with the path given in our control file snapshot, and
// we are sending that snapshot in our response, we are giving the caller a consistent
// snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
let buf = control_file
.write_to_buf()
.with_context(|| "failed to serialize control store")?;
let mut header = Header::new_gnu();
header.set_size(buf.len().try_into().expect("never breaches u64"));
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
.await
.with_context(|| "failed to append to archive")?;
Ok(())
}
}
impl WalResidentTimeline { impl WalResidentTimeline {
/// Start streaming tar archive with timeline: /// Start streaming tar archive with timeline:
/// 1) stream control file under lock; /// 1) stream control file under lock;

View File

@@ -21,15 +21,18 @@ use postgres_backend::QueryError;
use pq_proto::BeMessage; use pq_proto::BeMessage;
use serde::Deserialize; use serde::Deserialize;
use serde::Serialize; use serde::Serialize;
use std::future;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::sync::Arc; use std::sync::Arc;
use tokio::io::AsyncRead; use tokio::io::AsyncRead;
use tokio::io::AsyncWrite; use tokio::io::AsyncWrite;
use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::sync::mpsc::channel;
use tokio::sync::mpsc::error::TryRecvError;
use tokio::sync::mpsc::Receiver;
use tokio::sync::mpsc::Sender;
use tokio::task; use tokio::task;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tokio::time::{Duration, MissedTickBehavior}; use tokio::time::Duration;
use tokio::time::Instant;
use tracing::*; use tracing::*;
use utils::id::TenantTimelineId; use utils::id::TenantTimelineId;
use utils::lsn::Lsn; use utils::lsn::Lsn;
@@ -441,9 +444,9 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
} }
} }
/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to // Send keepalive messages to walproposer, to make sure it receives updates
/// walproposer, even when it's writing a steady stream of messages. // even when it writes a steady stream of messages.
const FLUSH_INTERVAL: Duration = Duration::from_secs(1); const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// Encapsulates a task which takes messages from msg_rx, processes and pushes /// Encapsulates a task which takes messages from msg_rx, processes and pushes
/// replies to reply_tx. /// replies to reply_tx.
@@ -491,76 +494,67 @@ impl WalAcceptor {
async fn run(&mut self) -> anyhow::Result<()> { async fn run(&mut self) -> anyhow::Result<()> {
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
// Periodically flush the WAL. // After this timestamp we will stop processing AppendRequests and send a response
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL); // to the walproposer. walproposer sends at least one AppendRequest per second,
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); // we will send keepalives by replying to these requests once per second.
flush_ticker.tick().await; // skip the initial, immediate tick let mut next_keepalive = Instant::now();
// Tracks unflushed appends. while let Some(mut next_msg) = self.msg_rx.recv().await {
let mut dirty = false; // Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
}
loop { let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
let reply = tokio::select! { // Loop through AppendRequests while available to write as many WAL records as
// Process inbound message. // possible without fsyncing.
msg = self.msg_rx.recv() => {
// If disconnected, break to flush WAL and return.
let Some(mut msg) = msg else {
break;
};
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &msg {
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
}
// Don't flush the WAL on every append, only periodically via flush_ticker.
// This batches multiple appends per fsync. If the channel is empty after
// sending the reply, we'll schedule an immediate flush.
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
dirty = true;
}
self.tli.process_msg(&msg).await?
}
// While receiving AppendRequests, flush the WAL periodically and respond with an
// AppendResponse to let walproposer know we're still alive.
_ = flush_ticker.tick(), if dirty => {
dirty = false;
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
}
// If there are no pending messages, flush the WAL immediately.
// //
// TODO: this should be done via flush_ticker.reset_immediately(), but that's always // Make sure the WAL is flushed before returning, see:
// delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866. // https://github.com/neondatabase/neon/issues/9259
_ = future::ready(()), if dirty && self.msg_rx.is_empty() => { //
dirty = false; // Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
flush_ticker.reset(); // Otherwise, we might end up in a situation where we read a message, but don't
self.tli // process it.
.process_msg(&ProposerAcceptorMessage::FlushWAL) while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
.await? let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
if self.reply_tx.send(reply).await.is_err() {
break; // disconnected, flush WAL and return on next send/recv
}
}
// get out of this loop if keepalive time is reached
if Instant::now() >= next_keepalive {
break;
}
// continue pulling AppendRequests if available
match self.msg_rx.try_recv() {
Ok(msg) => next_msg = msg,
Err(TryRecvError::Empty) => break,
// on disconnect, flush WAL and return on next send/recv
Err(TryRecvError::Disconnected) => break,
};
} }
// flush all written WAL to the disk
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
} else {
// process message other than AppendRequest
self.tli.process_msg(&next_msg).await?
}; };
// Send reply, if any. if let Some(reply) = reply_msg {
if let Some(reply) = reply {
if self.reply_tx.send(reply).await.is_err() { if self.reply_tx.send(reply).await.is_err() {
break; // disconnected, break to flush WAL and return return Ok(()); // chan closed, streaming terminated
} }
// reset keepalive time
next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
} }
} }
// Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
if dirty {
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?;
}
Ok(()) Ok(())
} }
} }

View File

@@ -143,8 +143,8 @@ impl TimelinePersistentState {
TimelinePersistentState::new( TimelinePersistentState::new(
&TenantTimelineId::empty(), &TenantTimelineId::empty(),
ServerInfo { ServerInfo {
pg_version: 170000, /* Postgres server version (major * 10000) */ pg_version: 17, /* Postgres server version */
system_id: 0, /* Postgres system identifier */ system_id: 0, /* Postgres system identifier */
wal_seg_size: 16 * 1024 * 1024, wal_seg_size: 16 * 1024 * 1024,
}, },
vec![], vec![],

View File

@@ -328,19 +328,15 @@ impl SharedState {
/// Restore SharedState from control file. If file doesn't exist, bails out. /// Restore SharedState from control file. If file doesn't exist, bails out.
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> { fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
let timeline_dir = get_timeline_dir(conf, ttid); let timeline_dir = get_timeline_dir(conf, ttid);
let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?; let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
if control_store.server.wal_seg_size == 0 { if control_store.server.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(*ttid)); bail!(TimelineError::UninitializedWalSegSize(*ttid));
} }
let sk = match control_store.eviction_state { let sk = match control_store.eviction_state {
EvictionState::Present => { EvictionState::Present => {
let wal_store = wal_storage::PhysicalStorage::new( let wal_store =
ttid, wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
&timeline_dir,
&control_store,
conf.no_sync,
)?;
StateSK::Loaded(SafeKeeper::new( StateSK::Loaded(SafeKeeper::new(
TimelineState::new(control_store), TimelineState::new(control_store),
wal_store, wal_store,
@@ -797,17 +793,14 @@ impl Timeline {
state.sk.term_bump(to).await state.sk.term_bump(to).await
} }
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] /// Get the timeline guard for reading/writing WAL files.
async fn do_wal_residence_guard( /// If WAL files are not present on disk (evicted), they will be automatically
self: &Arc<Self>, /// downloaded from remote storage. This is done in the manager task, which is
block: bool, /// responsible for issuing all guards.
) -> Result<Option<WalResidentTimeline>> { ///
let op_label = if block { /// NB: don't use this function from timeline_manager, it will deadlock.
"wal_residence_guard" /// NB: don't use this function while holding shared_state lock.
} else { pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
"try_wal_residence_guard"
};
if self.is_cancelled() { if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid)); bail!(TimelineError::Cancelled(self.ttid));
} }
@@ -819,13 +812,10 @@ impl Timeline {
// Wait 30 seconds for the guard to be acquired. It can time out if someone is // Wait 30 seconds for the guard to be acquired. It can time out if someone is
// holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
// is stuck. // is stuck.
let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async { let res = tokio::time::timeout_at(
if block { started_at + Duration::from_secs(30),
self.manager_ctl.wal_residence_guard().await.map(Some) self.manager_ctl.wal_residence_guard(),
} else { )
self.manager_ctl.try_wal_residence_guard().await
}
})
.await; .await;
let guard = match res { let guard = match res {
@@ -833,14 +823,14 @@ impl Timeline {
let finished_at = Instant::now(); let finished_at = Instant::now();
let elapsed = finished_at - started_at; let elapsed = finished_at - started_at;
MISC_OPERATION_SECONDS MISC_OPERATION_SECONDS
.with_label_values(&[op_label]) .with_label_values(&["wal_residence_guard"])
.observe(elapsed.as_secs_f64()); .observe(elapsed.as_secs_f64());
guard guard
} }
Ok(Err(e)) => { Ok(Err(e)) => {
warn!( warn!(
"error acquiring in {op_label}, statuses {:?} => {:?}", "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
status_before, status_before,
self.mgr_status.get() self.mgr_status.get()
); );
@@ -848,7 +838,7 @@ impl Timeline {
} }
Err(_) => { Err(_) => {
warn!( warn!(
"timeout acquiring in {op_label} guard, statuses {:?} => {:?}", "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
status_before, status_before,
self.mgr_status.get() self.mgr_status.get()
); );
@@ -856,28 +846,7 @@ impl Timeline {
} }
}; };
Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g))) Ok(WalResidentTimeline::new(self.clone(), guard))
}
/// Get the timeline guard for reading/writing WAL files.
/// If WAL files are not present on disk (evicted), they will be automatically
/// downloaded from remote storage. This is done in the manager task, which is
/// responsible for issuing all guards.
///
/// NB: don't use this function from timeline_manager, it will deadlock.
/// NB: don't use this function while holding shared_state lock.
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
self.do_wal_residence_guard(true)
.await
.map(|m| m.expect("Always get Some in block=true mode"))
}
/// Get the timeline guard for reading/writing WAL files if the timeline is resident,
/// else return None
pub(crate) async fn try_wal_residence_guard(
self: &Arc<Self>,
) -> Result<Option<WalResidentTimeline>> {
self.do_wal_residence_guard(false).await
} }
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> { pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
@@ -1077,9 +1046,9 @@ impl ManagerTimeline {
// trying to restore WAL storage // trying to restore WAL storage
let wal_store = wal_storage::PhysicalStorage::new( let wal_store = wal_storage::PhysicalStorage::new(
&self.ttid, &self.ttid,
&self.timeline_dir, self.timeline_dir.clone(),
&conf,
shared.sk.state(), shared.sk.state(),
conf.no_sync,
)?; )?;
// updating control file // updating control file

View File

@@ -56,9 +56,6 @@ impl Manager {
// This also works for the first segment despite last_removed_segno // This also works for the first segment despite last_removed_segno
// being 0 on init because this 0 triggers run of wal_removal_task // being 0 on init because this 0 triggers run of wal_removal_task
// on success of which manager updates the horizon. // on success of which manager updates the horizon.
//
// **Note** pull_timeline functionality assumes that evicted timelines always have
// a partial segment: if we ever change this condition, must also update that code.
&& self && self
.partial_backup_uploaded .partial_backup_uploaded
.as_ref() .as_ref()
@@ -69,15 +66,15 @@ impl Manager {
ready ready
} }
/// Evict the timeline to remote storage. Returns whether the eviction was successful. /// Evict the timeline to remote storage.
#[instrument(name = "evict_timeline", skip_all)] #[instrument(name = "evict_timeline", skip_all)]
pub(crate) async fn evict_timeline(&mut self) -> bool { pub(crate) async fn evict_timeline(&mut self) {
assert!(!self.is_offloaded); assert!(!self.is_offloaded);
let partial_backup_uploaded = match &self.partial_backup_uploaded { let partial_backup_uploaded = match &self.partial_backup_uploaded {
Some(p) => p.clone(), Some(p) => p.clone(),
None => { None => {
warn!("no partial backup uploaded, skipping eviction"); warn!("no partial backup uploaded, skipping eviction");
return false; return;
} }
}; };
@@ -94,12 +91,11 @@ impl Manager {
if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
warn!("failed to evict timeline: {:?}", e); warn!("failed to evict timeline: {:?}", e);
return false; return;
} }
info!("successfully evicted timeline"); info!("successfully evicted timeline");
NUM_EVICTED_TIMELINES.inc(); NUM_EVICTED_TIMELINES.inc();
true
} }
/// Attempt to restore evicted timeline from remote storage; it must be /// Attempt to restore evicted timeline from remote storage; it must be

View File

@@ -100,8 +100,6 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
pub enum ManagerCtlMessage { pub enum ManagerCtlMessage {
/// Request to get a guard for WalResidentTimeline, with WAL files available locally. /// Request to get a guard for WalResidentTimeline, with WAL files available locally.
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>), GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
/// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None
TryGuardRequest(tokio::sync::oneshot::Sender<Option<ResidenceGuard>>),
/// Request to drop the guard. /// Request to drop the guard.
GuardDrop(GuardId), GuardDrop(GuardId),
/// Request to reset uploaded partial backup state. /// Request to reset uploaded partial backup state.
@@ -112,7 +110,6 @@ impl std::fmt::Debug for ManagerCtlMessage {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"),
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
} }
@@ -155,19 +152,6 @@ impl ManagerCtl {
.and_then(std::convert::identity) .and_then(std::convert::identity)
} }
/// Issue a new guard if the timeline is currently not offloaded, else return None
/// Sends a message to the manager and waits for the response.
/// Can be blocked indefinitely if the manager is stuck.
pub async fn try_wal_residence_guard(&self) -> anyhow::Result<Option<ResidenceGuard>> {
let (tx, rx) = tokio::sync::oneshot::channel();
self.manager_tx
.send(ManagerCtlMessage::TryGuardRequest(tx))?;
// wait for the manager to respond with the guard
rx.await
.map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
}
/// Request timeline manager to reset uploaded partial segment state and /// Request timeline manager to reset uploaded partial segment state and
/// wait for the result. /// wait for the result.
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> { pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
@@ -313,12 +297,7 @@ pub async fn main_task(
match mgr.global_rate_limiter.try_acquire_eviction() { match mgr.global_rate_limiter.try_acquire_eviction() {
Some(_permit) => { Some(_permit) => {
mgr.set_status(Status::EvictTimeline); mgr.set_status(Status::EvictTimeline);
if !mgr.evict_timeline().await { mgr.evict_timeline().await;
// eviction failed, try again later
mgr.evict_not_before =
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
update_next_event(&mut next_event, mgr.evict_not_before);
}
} }
None => { None => {
// we can't evict timeline now, will try again later // we can't evict timeline now, will try again later
@@ -690,17 +669,6 @@ impl Manager {
warn!("failed to reply with a guard, receiver dropped"); warn!("failed to reply with a guard, receiver dropped");
} }
} }
Some(ManagerCtlMessage::TryGuardRequest(tx)) => {
let result = if self.is_offloaded {
None
} else {
Some(self.access_service.create_guard())
};
if tx.send(result).is_err() {
warn!("failed to reply with a guard, receiver dropped");
}
}
Some(ManagerCtlMessage::GuardDrop(guard_id)) => { Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
self.access_service.drop_guard(guard_id); self.access_service.drop_guard(guard_id);
} }

View File

@@ -244,7 +244,7 @@ impl GlobalTimelines {
// immediately initialize first WAL segment as well. // immediately initialize first WAL segment as well.
let state = let state =
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?;
let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?; let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
Ok(timeline) Ok(timeline)
} }
@@ -596,7 +596,7 @@ pub async fn validate_temp_timeline(
bail!("wal_seg_size is not set"); bail!("wal_seg_size is not set");
} }
let wal_store = wal_storage::PhysicalStorage::new(&ttid, path, &control_store, conf.no_sync)?; let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
let commit_lsn = control_store.commit_lsn; let commit_lsn = control_store.commit_lsn;
let flush_lsn = wal_store.flush_lsn(); let flush_lsn = wal_store.flush_lsn();

View File

@@ -29,6 +29,7 @@ use crate::metrics::{
}; };
use crate::state::TimelinePersistentState; use crate::state::TimelinePersistentState;
use crate::wal_backup::{read_object, remote_timeline_path}; use crate::wal_backup::{read_object, remote_timeline_path};
use crate::SafeKeeperConf;
use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName; use postgres_ffi::XLogFileName;
use postgres_ffi::XLOG_BLCKSZ; use postgres_ffi::XLOG_BLCKSZ;
@@ -86,9 +87,7 @@ pub trait Storage {
pub struct PhysicalStorage { pub struct PhysicalStorage {
metrics: WalStorageMetrics, metrics: WalStorageMetrics,
timeline_dir: Utf8PathBuf, timeline_dir: Utf8PathBuf,
conf: SafeKeeperConf,
/// Disables fsync if true.
no_sync: bool,
/// Size of WAL segment in bytes. /// Size of WAL segment in bytes.
wal_seg_size: usize, wal_seg_size: usize,
@@ -152,9 +151,9 @@ impl PhysicalStorage {
/// the disk. Otherwise, all LSNs are set to zero. /// the disk. Otherwise, all LSNs are set to zero.
pub fn new( pub fn new(
ttid: &TenantTimelineId, ttid: &TenantTimelineId,
timeline_dir: &Utf8Path, timeline_dir: Utf8PathBuf,
conf: &SafeKeeperConf,
state: &TimelinePersistentState, state: &TimelinePersistentState,
no_sync: bool,
) -> Result<PhysicalStorage> { ) -> Result<PhysicalStorage> {
let wal_seg_size = state.server.wal_seg_size as usize; let wal_seg_size = state.server.wal_seg_size as usize;
@@ -199,8 +198,8 @@ impl PhysicalStorage {
Ok(PhysicalStorage { Ok(PhysicalStorage {
metrics: WalStorageMetrics::default(), metrics: WalStorageMetrics::default(),
timeline_dir: timeline_dir.to_path_buf(), timeline_dir,
no_sync, conf: conf.clone(),
wal_seg_size, wal_seg_size,
pg_version: state.server.pg_version, pg_version: state.server.pg_version,
system_id: state.server.system_id, system_id: state.server.system_id,
@@ -225,7 +224,7 @@ impl PhysicalStorage {
/// Call fdatasync if config requires so. /// Call fdatasync if config requires so.
async fn fdatasync_file(&mut self, file: &File) -> Result<()> { async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
if !self.no_sync { if !self.conf.no_sync {
self.metrics self.metrics
.observe_flush_seconds(time_io_closure(file.sync_data()).await?); .observe_flush_seconds(time_io_closure(file.sync_data()).await?);
} }
@@ -264,7 +263,9 @@ impl PhysicalStorage {
// Note: this doesn't get into observe_flush_seconds metric. But // Note: this doesn't get into observe_flush_seconds metric. But
// segment init should be separate metric, if any. // segment init should be separate metric, if any.
if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await { if let Err(e) =
durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
{
// Probably rename succeeded, but fsync of it failed. Remove // Probably rename succeeded, but fsync of it failed. Remove
// the file then to avoid using it. // the file then to avoid using it.
remove_file(wal_file_partial_path) remove_file(wal_file_partial_path)

View File

@@ -968,28 +968,6 @@ async fn handle_tenant_shard_migrate(
) )
} }
async fn handle_tenant_shard_cancel_reconcile(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
json_response(
StatusCode::OK,
service
.tenant_shard_cancel_reconcile(tenant_shard_id)
.await?,
)
}
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?; check_permissions(&req, Scope::Admin)?;
@@ -1798,16 +1776,6 @@ pub fn make_router(
RequestName("control_v1_tenant_migrate"), RequestName("control_v1_tenant_migrate"),
) )
}) })
.put(
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|r| {
tenant_service_handler(
r,
handle_tenant_shard_cancel_reconcile,
RequestName("control_v1_tenant_cancel_reconcile"),
)
},
)
.put("/control/v1/tenant/:tenant_id/shard_split", |r| { .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler( tenant_service_handler(
r, r,

View File

@@ -450,9 +450,6 @@ impl Reconciler {
} }
} }
/// This function does _not_ mutate any state, so it is cancellation safe.
///
/// This function does not respect [`Self::cancel`], callers should handle that.
async fn await_lsn( async fn await_lsn(
&self, &self,
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
@@ -573,10 +570,8 @@ impl Reconciler {
if let Some(baseline) = baseline_lsns { if let Some(baseline) = baseline_lsns {
tracing::info!("🕑 Waiting for LSN to catch up..."); tracing::info!("🕑 Waiting for LSN to catch up...");
tokio::select! { self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
r = self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) => {r?;} .await?;
_ = self.cancel.cancelled() => {return Err(ReconcileError::Cancel)}
};
} }
tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");

View File

@@ -3130,11 +3130,9 @@ impl Service {
.await?; .await?;
// Propagate the LSN that shard zero picked, if caller didn't provide one // Propagate the LSN that shard zero picked, if caller didn't provide one
match &mut create_req.mode { if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none()
models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => { {
*ancestor_start_lsn = timeline_info.ancestor_lsn; create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
},
_ => {}
} }
// Create timeline on remaining shards with number >0 // Create timeline on remaining shards with number >0
@@ -4834,43 +4832,6 @@ impl Service {
Ok(TenantShardMigrateResponse {}) Ok(TenantShardMigrateResponse {})
} }
/// 'cancel' in this context means cancel any ongoing reconcile
pub(crate) async fn tenant_shard_cancel_reconcile(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), ApiError> {
// Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete
let waiter = {
let locked = self.inner.write().unwrap();
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
let waiter = shard.get_waiter();
match waiter {
None => {
tracing::info!("Shard does not have an ongoing Reconciler");
return Ok(());
}
Some(waiter) => {
tracing::info!("Cancelling Reconciler");
shard.cancel_reconciler();
waiter
}
}
};
// Cancellation should be prompt. If this fails we have still done our job of firing the
// cancellation token, but by returning an ApiError we will indicate to the caller that
// the Reconciler is misbehaving and not respecting the cancellation token
self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT)
.await?;
Ok(())
}
/// This is for debug/support only: we simply drop all state for a tenant, without /// This is for debug/support only: we simply drop all state for a tenant, without
/// detaching or deleting it on pageservers. /// detaching or deleting it on pageservers.
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {

View File

@@ -1317,12 +1317,6 @@ impl TenantShard {
}) })
} }
pub(crate) fn cancel_reconciler(&self) {
if let Some(handle) = self.reconciler.as_ref() {
handle.cancel.cancel()
}
}
/// Get a waiter for any reconciliation in flight, but do not start reconciliation /// Get a waiter for any reconciliation in flight, but do not start reconciliation
/// if it is not already running /// if it is not already running
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> { pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {

View File

@@ -150,7 +150,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
counter("pageserver_tenant_throttling_count_accounted_finish_global"), counter("pageserver_tenant_throttling_count_accounted_finish_global"),
counter("pageserver_tenant_throttling_wait_usecs_sum_global"), counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
counter("pageserver_tenant_throttling_count_global"), counter("pageserver_tenant_throttling_count_global"),
*histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"),
) )
PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (

View File

@@ -40,19 +40,11 @@ from _pytest.fixtures import FixtureRequest
from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import cursor as PgCursor
from psycopg2.extensions import make_dsn, parse_dsn from psycopg2.extensions import make_dsn, parse_dsn
from pytest_httpserver import HTTPServer
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from fixtures import overlayfs from fixtures import overlayfs
from fixtures.auth_tokens import AuthKeys, TokenScope from fixtures.auth_tokens import AuthKeys, TokenScope
from fixtures.common_types import ( from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
Lsn,
NodeId,
TenantId,
TenantShardId,
TimelineArchivalState,
TimelineId,
)
from fixtures.endpoint.http import EndpointHttpClient from fixtures.endpoint.http import EndpointHttpClient
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
@@ -62,11 +54,7 @@ from fixtures.pageserver.allowed_errors import (
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
) )
from fixtures.pageserver.common_types import LayerName, parse_layer_file_name from fixtures.pageserver.common_types import LayerName, parse_layer_file_name
from fixtures.pageserver.http import ( from fixtures.pageserver.http import PageserverHttpClient
HistoricLayerInfo,
PageserverHttpClient,
ScanDisposableKeysResponse,
)
from fixtures.pageserver.utils import ( from fixtures.pageserver.utils import (
wait_for_last_record_lsn, wait_for_last_record_lsn,
) )
@@ -2144,24 +2132,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
response.raise_for_status() response.raise_for_status()
return response.json() return response.json()
def timeline_archival_config(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
state: TimelineArchivalState,
):
config = {"state": state.value}
log.info(
f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
)
res = self.request(
"PUT",
f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
json=config,
headers=self.headers(TokenScope.ADMIN),
)
return res.json()
def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]): def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]):
if isinstance(config_strings, tuple): if isinstance(config_strings, tuple):
pairs = [config_strings] pairs = [config_strings]
@@ -2675,51 +2645,6 @@ class NeonPageserver(PgProtocol, LogUtils):
layers = self.list_layers(tenant_id, timeline_id) layers = self.list_layers(tenant_id, timeline_id)
return layer_name in [parse_layer_file_name(p.name) for p in layers] return layer_name in [parse_layer_file_name(p.name) for p in layers]
def timeline_scan_no_disposable_keys(
self, tenant_shard_id: TenantShardId, timeline_id: TimelineId
) -> TimelineAssertNoDisposableKeysResult:
"""
Scan all keys in all layers of the tenant/timeline for disposable keys.
Disposable keys are keys that are present in a layer referenced by the shard
but are not going to be accessed by the shard.
For example, after shard split, the child shards will reference the parent's layer
files until new data is ingested and/or compaction rewrites the layers.
"""
ps_http = self.http_client()
tally = ScanDisposableKeysResponse(0, 0)
per_layer = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futs = []
shard_layer_map = ps_http.layer_map_info(tenant_shard_id, timeline_id)
for layer in shard_layer_map.historic_layers:
def do_layer(
shard_ps_http: PageserverHttpClient,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
layer: HistoricLayerInfo,
) -> tuple[HistoricLayerInfo, ScanDisposableKeysResponse]:
return (
layer,
shard_ps_http.timeline_layer_scan_disposable_keys(
tenant_shard_id, timeline_id, layer.layer_file_name
),
)
futs.append(executor.submit(do_layer, ps_http, tenant_shard_id, timeline_id, layer))
for fut in futs:
layer, result = fut.result()
tally += result
per_layer.append((layer, result))
return TimelineAssertNoDisposableKeysResult(tally, per_layer)
@dataclass
class TimelineAssertNoDisposableKeysResult:
tally: ScanDisposableKeysResponse
per_layer: list[tuple[HistoricLayerInfo, ScanDisposableKeysResponse]]
class PgBin: class PgBin:
"""A helper class for executing postgres binaries""" """A helper class for executing postgres binaries"""
@@ -3099,6 +3024,10 @@ class NeonProxy(PgProtocol):
class AuthBackend(abc.ABC): class AuthBackend(abc.ABC):
"""All auth backends must inherit from this class""" """All auth backends must inherit from this class"""
@property
def default_conn_url(self) -> Optional[str]:
return None
@abc.abstractmethod @abc.abstractmethod
def extra_args(self) -> list[str]: def extra_args(self) -> list[str]:
pass pass
@@ -3112,7 +3041,7 @@ class NeonProxy(PgProtocol):
*["--allow-self-signed-compute", "true"], *["--allow-self-signed-compute", "true"],
] ]
class ControlPlane(AuthBackend): class Console(AuthBackend):
def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None): def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None):
self.endpoint = endpoint self.endpoint = endpoint
self.fixed_rate_limit = fixed_rate_limit self.fixed_rate_limit = fixed_rate_limit
@@ -3136,6 +3065,21 @@ class NeonProxy(PgProtocol):
] ]
return args return args
@dataclass(frozen=True)
class Postgres(AuthBackend):
pg_conn_url: str
@property
def default_conn_url(self) -> Optional[str]:
return self.pg_conn_url
def extra_args(self) -> list[str]:
return [
# Postgres auth backend params
*["--auth-backend", "postgres"],
*["--auth-endpoint", self.pg_conn_url],
]
def __init__( def __init__(
self, self,
neon_binpath: Path, neon_binpath: Path,
@@ -3150,7 +3094,7 @@ class NeonProxy(PgProtocol):
): ):
host = "127.0.0.1" host = "127.0.0.1"
domain = "proxy.localtest.me" # resolves to 127.0.0.1 domain = "proxy.localtest.me" # resolves to 127.0.0.1
super().__init__(host=domain, port=proxy_port) super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port)
self.domain = domain self.domain = domain
self.host = host self.host = host
@@ -3404,39 +3348,20 @@ def static_proxy(
port_distributor: PortDistributor, port_distributor: PortDistributor,
neon_binpath: Path, neon_binpath: Path,
test_output_dir: Path, test_output_dir: Path,
httpserver: HTTPServer,
) -> Iterator[NeonProxy]: ) -> Iterator[NeonProxy]:
"""Neon proxy that routes directly to vanilla postgres and a mocked cplane HTTP API.""" """Neon proxy that routes directly to vanilla postgres."""
port = vanilla_pg.default_options["port"] port = vanilla_pg.default_options["port"]
host = vanilla_pg.default_options["host"] host = vanilla_pg.default_options["host"]
dbname = vanilla_pg.default_options["dbname"] dbname = vanilla_pg.default_options["dbname"]
auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"
# For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
vanilla_pg.start() vanilla_pg.start()
vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
[(rolpassword,)] = vanilla_pg.safe_psql( vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS neon_control_plane")
"select rolpassword from pg_catalog.pg_authid where rolname = 'proxy'" vanilla_pg.safe_psql(
) "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))"
# return local postgres addr on ProxyWakeCompute.
httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json(
{
"address": f"{host}:{port}",
"aux": {
"endpoint_id": "ep-foo-bar-1234",
"branch_id": "br-foo-bar",
"project_id": "foo-bar",
},
}
)
# return local postgres addr on ProxyWakeCompute.
httpserver.expect_request("/cplane/proxy_get_role_secret").respond_with_json(
{
"role_secret": rolpassword,
"allowed_ips": None,
"project_id": "foo-bar",
}
) )
proxy_port = port_distributor.get_port() proxy_port = port_distributor.get_port()
@@ -3451,12 +3376,8 @@ def static_proxy(
http_port=http_port, http_port=http_port,
mgmt_port=mgmt_port, mgmt_port=mgmt_port,
external_http_port=external_http_port, external_http_port=external_http_port,
auth_backend=NeonProxy.ControlPlane(httpserver.url_for("/cplane")), auth_backend=NeonProxy.Postgres(auth_endpoint),
) as proxy: ) as proxy:
proxy.default_options["user"] = "proxy"
proxy.default_options["password"] = "password"
proxy.default_options["dbname"] = dbname
proxy.start() proxy.start()
yield proxy yield proxy

View File

@@ -129,26 +129,6 @@ class LayerMapInfo:
return set(x.layer_file_name for x in self.historic_layers) return set(x.layer_file_name for x in self.historic_layers)
@dataclass
class ScanDisposableKeysResponse:
disposable_count: int
not_disposable_count: int
def __add__(self, b):
a = self
assert isinstance(a, ScanDisposableKeysResponse)
assert isinstance(b, ScanDisposableKeysResponse)
return ScanDisposableKeysResponse(
a.disposable_count + b.disposable_count, a.not_disposable_count + b.not_disposable_count
)
@classmethod
def from_json(cls, d: dict[str, Any]) -> ScanDisposableKeysResponse:
disposable_count = d["disposable_count"]
not_disposable_count = d["not_disposable_count"]
return ScanDisposableKeysResponse(disposable_count, not_disposable_count)
@dataclass @dataclass
class TenantConfig: class TenantConfig:
tenant_specific_overrides: dict[str, Any] tenant_specific_overrides: dict[str, Any]
@@ -162,19 +142,6 @@ class TenantConfig:
) )
@dataclass
class TimelinesInfoAndOffloaded:
timelines: list[dict[str, Any]]
offloaded: list[dict[str, Any]]
@classmethod
def from_json(cls, d: dict[str, Any]) -> TimelinesInfoAndOffloaded:
return TimelinesInfoAndOffloaded(
timelines=d["timelines"],
offloaded=d["offloaded"],
)
class PageserverHttpClient(requests.Session, MetricsGetter): class PageserverHttpClient(requests.Session, MetricsGetter):
def __init__( def __init__(
self, self,
@@ -497,18 +464,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
assert isinstance(res_json, list) assert isinstance(res_json, list)
return res_json return res_json
def timeline_and_offloaded_list(
self,
tenant_id: Union[TenantId, TenantShardId],
) -> TimelinesInfoAndOffloaded:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded",
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return TimelinesInfoAndOffloaded.from_json(res_json)
def timeline_create( def timeline_create(
self, self,
pg_version: PgVersion, pg_version: PgVersion,
@@ -521,13 +476,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
) -> dict[Any, Any]: ) -> dict[Any, Any]:
body: dict[str, Any] = { body: dict[str, Any] = {
"new_timeline_id": str(new_timeline_id), "new_timeline_id": str(new_timeline_id),
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
"existing_initdb_timeline_id": str(existing_initdb_timeline_id)
if existing_initdb_timeline_id
else None,
} }
if ancestor_timeline_id:
body["ancestor_timeline_id"] = str(ancestor_timeline_id)
if ancestor_start_lsn:
body["ancestor_start_lsn"] = str(ancestor_start_lsn)
if existing_initdb_timeline_id:
body["existing_initdb_timeline_id"] = str(existing_initdb_timeline_id)
if pg_version != PgVersion.NOT_SET: if pg_version != PgVersion.NOT_SET:
body["pg_version"] = int(pg_version) body["pg_version"] = int(pg_version)
@@ -925,16 +879,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
self.verbose_error(res) self.verbose_error(res)
return LayerMapInfo.from_json(res.json()) return LayerMapInfo.from_json(res.json())
def timeline_layer_scan_disposable_keys(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
) -> ScanDisposableKeysResponse:
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys",
)
self.verbose_error(res)
assert res.status_code == 200
return ScanDisposableKeysResponse.from_json(res.json())
def download_layer( def download_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
): ):

View File

@@ -3,13 +3,10 @@
# #
from __future__ import annotations from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, cast from typing import TYPE_CHECKING, cast
import pytest import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import ( from fixtures.neon_fixtures import (
Endpoint, Endpoint,
NeonEnv, NeonEnv,
@@ -327,97 +324,3 @@ def test_sql_regress(
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
post_checks(env, test_output_dir, DBNAME, endpoint) post_checks(env, test_output_dir, DBNAME, endpoint)
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_tx_abort_with_many_relations(
neon_env_builder: NeonEnvBuilder,
):
"""
This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres
behavior when aborting a transaction with lots of relations.
Reproducer for https://github.com/neondatabase/neon/issues/9505
"""
env = neon_env_builder.init_start()
ep = env.endpoints.create_start(
"main",
tenant_id=env.initial_tenant,
config_lines=[
"shared_buffers=1000MB",
"max_locks_per_transaction=16384",
],
)
# How many relations: this number is tuned to be long enough to take tens of seconds
# if the rollback code path is buggy, tripping the test's timeout.
n = 4000
def create():
# Create many relations
log.info(f"Creating {n} relations...")
ep.safe_psql_many(
[
"BEGIN",
f"""DO $$
DECLARE
i INT;
table_name TEXT;
BEGIN
FOR i IN 1..{n} LOOP
table_name := 'table_' || i;
EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)';
END LOOP;
END $$;
""",
"COMMIT",
]
)
def truncate():
# Truncate relations, then roll back the transaction containing the truncations
log.info(f"Truncating {n} relations...")
ep.safe_psql_many(
[
"BEGIN",
f"""DO $$
DECLARE
i INT;
table_name TEXT;
BEGIN
FOR i IN 1..{n} LOOP
table_name := 'table_' || i;
EXECUTE 'TRUNCATE ' || table_name ;
END LOOP;
END $$;
""",
]
)
def rollback_and_wait():
log.info(f"Rolling back after truncating {n} relations...")
ep.safe_psql("ROLLBACK")
# Restart the endpoint: this ensures that we can read back what we just wrote, i.e. pageserver
# ingest has caught up.
ep.stop()
log.info(f"Starting endpoint after truncating {n} relations...")
ep.start()
log.info(f"Started endpoint after truncating {n} relations...")
# Actual create & truncate phases may be slow, these involves lots of WAL records. We do not
# apply a special timeout, they are expected to complete within general test timeout
create()
truncate()
# Run in a thread because the failure case is to take pathologically long time, and we don't want
# to block the test executor on that.
with ThreadPoolExecutor(max_workers=1) as exec:
try:
# Rollback phase should be fast: this is one WAL record that we should process efficiently
fut = exec.submit(rollback_and_wait)
fut.result(timeout=5)
except:
exec.shutdown(wait=False, cancel_futures=True)
raise

View File

@@ -6,27 +6,20 @@ from fixtures.neon_fixtures import (
NeonProxy, NeonProxy,
VanillaPostgres, VanillaPostgres,
) )
from pytest_httpserver import HTTPServer
TABLE_NAME = "neon_control_plane.endpoints" TABLE_NAME = "neon_control_plane.endpoints"
def test_proxy_psql_not_allowed_ips( # Proxy uses the same logic for psql and websockets.
static_proxy: NeonProxy, @pytest.mark.asyncio
vanilla_pg: VanillaPostgres, async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres):
httpserver: HTTPServer,
):
[(rolpassword,)] = vanilla_pg.safe_psql(
"select rolpassword from pg_catalog.pg_authid where rolname = 'proxy'"
)
# Shouldn't be able to connect to this project # Shouldn't be able to connect to this project
httpserver.expect_request("/cplane/proxy_get_role_secret").respond_with_json( vanilla_pg.safe_psql(
{ f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')"
"role_secret": rolpassword, )
"allowed_ips": ["8.8.8.8"], # Should be able to connect to this project
"project_id": "foo-bar", vanilla_pg.safe_psql(
} f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')"
) )
def check_cannot_connect(**kwargs): def check_cannot_connect(**kwargs):
@@ -44,25 +37,6 @@ def test_proxy_psql_not_allowed_ips(
# with SNI # with SNI
check_cannot_connect(query="select 1", host="private-project.localtest.me") check_cannot_connect(query="select 1", host="private-project.localtest.me")
def test_proxy_psql_allowed_ips(
static_proxy: NeonProxy,
vanilla_pg: VanillaPostgres,
httpserver: HTTPServer,
):
[(rolpassword,)] = vanilla_pg.safe_psql(
"select rolpassword from pg_catalog.pg_authid where rolname = 'proxy'"
)
# Should be able to connect to this project
httpserver.expect_request("/cplane/proxy_get_role_secret").respond_with_json(
{
"role_secret": rolpassword,
"allowed_ips": ["::1", "127.0.0.1"],
"project_id": "foo-bar",
}
)
# no SNI, deprecated `options=project` syntax (before we had several endpoint in project) # no SNI, deprecated `options=project` syntax (before we had several endpoint in project)
out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project") out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project")
assert out[0][0] == 1 assert out[0][0] == 1
@@ -76,61 +50,27 @@ def test_proxy_psql_allowed_ips(
assert out[0][0] == 1 assert out[0][0] == 1
def test_proxy_http_not_allowed_ips( @pytest.mark.asyncio
static_proxy: NeonProxy, async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres):
vanilla_pg: VanillaPostgres, static_proxy.safe_psql("create user http_auth with password 'http' superuser")
httpserver: HTTPServer,
):
vanilla_pg.safe_psql("create user http_auth with password 'http' superuser")
[(rolpassword,)] = vanilla_pg.safe_psql( # Shouldn't be able to connect to this project
"select rolpassword from pg_catalog.pg_authid where rolname = 'http_auth'" vanilla_pg.safe_psql(
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')"
) )
httpserver.expect_oneshot_request("/cplane/proxy_get_role_secret").respond_with_json( def query(status: int, query: str, *args):
{
"role_secret": rolpassword,
"allowed_ips": ["8.8.8.8"],
"project_id": "foo-bar",
}
)
with httpserver.wait() as waiting:
static_proxy.http_query( static_proxy.http_query(
"select 1;", query,
[], args,
user="http_auth", user="http_auth",
password="http", password="http",
expected_code=400, expected_code=status,
) )
assert waiting.result
query(400, "select 1;") # ip address is not allowed
def test_proxy_http_allowed_ips( # Should be able to connect to this project
static_proxy: NeonProxy, vanilla_pg.safe_psql(
vanilla_pg: VanillaPostgres, f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'"
httpserver: HTTPServer,
):
vanilla_pg.safe_psql("create user http_auth with password 'http' superuser")
[(rolpassword,)] = vanilla_pg.safe_psql(
"select rolpassword from pg_catalog.pg_authid where rolname = 'http_auth'"
) )
query(200, "select 1;") # should work now
httpserver.expect_oneshot_request("/cplane/proxy_get_role_secret").respond_with_json(
{
"role_secret": rolpassword,
"allowed_ips": ["8.8.8.8", "127.0.0.1"],
"project_id": "foo-bar",
}
)
with httpserver.wait() as waiting:
static_proxy.http_query(
"select 1;",
[],
user="http_auth",
password="http",
expected_code=200,
)
assert waiting.result

View File

@@ -169,24 +169,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
) )
return last_flush_lsn return last_flush_lsn
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str): def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
""" """
Trigger GC manually on all pageservers. Then run an `SELECT` query. Trigger GC manually on all pageservers. Then run an `SELECT` query.
""" """
for shard, ps in tenant_get_shards(env, env.initial_tenant): for shard, ps in tenant_get_shards(env, env.initial_tenant):
client = ps.http_client() client = ps.http_client()
gc_result = client.timeline_gc(shard, env.initial_timeline, 0) gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
# Note: cannot assert on `layers_removed` here because it could be layers
# not guarded by the lease. Rely on successful execution of the query instead.
log.info(f"{gc_result=}") log.info(f"{gc_result=}")
assert (
gc_result["layers_removed"] == 0
), "No layers should be removed, old layers are guarded by leases."
with ep_static.cursor() as cur: with ep_static.cursor() as cur:
# Following query should succeed if pages are properly guarded by leases.
cur.execute("SELECT count(*) FROM t0") cur.execute("SELECT count(*) FROM t0")
assert cur.fetchone() == (ROW_COUNT,) assert cur.fetchone() == (ROW_COUNT,)
log.info(f"`SELECT` query succeed after GC, {ctx=}")
# Insert some records on main branch # Insert some records on main branch
with env.endpoints.create_start("main") as ep_main: with env.endpoints.create_start("main") as ep_main:
with ep_main.cursor() as cur: with ep_main.cursor() as cur:
@@ -211,9 +210,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
# Wait for static compute to renew lease at least once. # Wait for static compute to renew lease at least once.
time.sleep(LSN_LEASE_LENGTH / 2) time.sleep(LSN_LEASE_LENGTH / 2)
generate_updates_on_main(env, ep_main, 3, end=100) generate_updates_on_main(env, ep_main, i, end=100)
trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart") trigger_gc_and_select(env, ep_static)
# Trigger Pageserver restarts # Trigger Pageserver restarts
for ps in env.pageservers: for ps in env.pageservers:
@@ -222,7 +221,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
time.sleep(LSN_LEASE_LENGTH / 2) time.sleep(LSN_LEASE_LENGTH / 2)
ps.start() ps.start()
trigger_gc_and_select(env, ep_static, ctx="After pageservers restart") trigger_gc_and_select(env, ep_static)
# Reconfigure pageservers # Reconfigure pageservers
env.pageservers[0].stop() env.pageservers[0].stop()
@@ -231,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
) )
env.storage_controller.reconcile_until_idle() env.storage_controller.reconcile_until_idle()
trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline") trigger_gc_and_select(env, ep_static)
# Do some update so we can increment latest_gc_cutoff # Do some update so we can increment latest_gc_cutoff
generate_updates_on_main(env, ep_main, i, end=100) generate_updates_on_main(env, ep_main, i, end=100)

View File

@@ -3,11 +3,11 @@ from __future__ import annotations
import os import os
import time import time
from collections import defaultdict from collections import defaultdict
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING
import pytest import pytest
import requests import requests
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.compute_reconfigure import ComputeReconfigure
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.neon_fixtures import ( from fixtures.neon_fixtures import (
@@ -188,9 +188,7 @@ def test_sharding_split_unsharded(
"compact-shard-ancestors-persistent", "compact-shard-ancestors-persistent",
], ],
) )
def test_sharding_split_compaction( def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str
):
""" """
Test that after a split, we clean up parent layer data in the child shards via compaction. Test that after a split, we clean up parent layer data in the child shards via compaction.
""" """
@@ -324,19 +322,9 @@ def test_sharding_split_compaction(
# Physical size should shrink because layers are smaller # Physical size should shrink because layers are smaller
assert detail_after["current_physical_size"] < detail_before["current_physical_size"] assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
# Validate filtering compaction actually happened # Validate size statistics
for shard in shards: for shard in shards:
ps = env.get_tenant_pageserver(shard) ps = env.get_tenant_pageserver(shard)
log.info("scan all layer files for disposable keys, there shouldn't be any")
result = ps.timeline_scan_no_disposable_keys(shard, timeline_id)
tally = result.tally
raw_page_count = tally.not_disposable_count + tally.disposable_count
assert tally.not_disposable_count > (
raw_page_count // 2
), "compaction doesn't rewrite layers that are >=50pct local"
log.info("check sizes")
timeline_info = ps.http_client().timeline_detail(shard, timeline_id) timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
reported_size = timeline_info["current_physical_size"] reported_size = timeline_info["current_physical_size"]
layer_paths = ps.list_layers(shard, timeline_id) layer_paths = ps.list_layers(shard, timeline_id)
@@ -365,145 +353,6 @@ def test_sharding_split_compaction(
workload.validate() workload.validate()
def test_sharding_split_offloading(neon_env_builder: NeonEnvBuilder):
"""
Test that during a split, we don't miss archived and offloaded timelines.
"""
TENANT_CONF = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": 128 * 1024,
"compaction_threshold": 1,
"compaction_target_size": 128 * 1024,
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "3600s",
# disable background compaction, GC and offloading. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# Disable automatic creation of image layers, as we will create them explicitly when we want them
"image_creation_threshold": 9999,
"image_layer_creation_check_threshold": 0,
"lsn_lease_length": "0s",
}
neon_env_builder.storage_controller_config = {
# Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
"max_offline": "30s",
"max_warming_up": "300s",
}
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id_main = env.initial_timeline
# Check that we created with an unsharded TenantShardId: this is the default,
# but check it in case we change the default in future
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None
workload_main = Workload(env, tenant_id, timeline_id_main, branch_name="main")
workload_main.init()
workload_main.write_rows(256)
workload_main.validate()
workload_main.stop()
# Create two timelines, archive one, offload the other
timeline_id_archived = env.create_branch("archived_not_offloaded")
timeline_id_offloaded = env.create_branch("archived_offloaded")
def timeline_id_set_for(list: list[dict[str, Any]]) -> set[TimelineId]:
return set(
map(
lambda t: TimelineId(t["timeline_id"]),
list,
)
)
expected_offloaded_set = {timeline_id_offloaded}
expected_timeline_set = {timeline_id_main, timeline_id_archived}
with env.get_tenant_pageserver(tenant_id).http_client() as http_client:
http_client.timeline_archival_config(
tenant_id, timeline_id_archived, TimelineArchivalState.ARCHIVED
)
http_client.timeline_archival_config(
tenant_id, timeline_id_offloaded, TimelineArchivalState.ARCHIVED
)
http_client.timeline_offload(tenant_id, timeline_id_offloaded)
list = http_client.timeline_and_offloaded_list(tenant_id)
assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
assert timeline_id_set_for(list.timelines) == expected_timeline_set
# Do a full image layer generation before splitting
http_client.timeline_checkpoint(
tenant_id, timeline_id_main, force_image_layer_creation=True, wait_until_uploaded=True
)
# Split one shard into two
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
# Let all shards move into their stable locations, so that during subsequent steps we
# don't have reconciles in progress (simpler to reason about what messages we expect in logs)
env.storage_controller.reconcile_until_idle()
# Check we got the shard IDs we expected
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
workload_main.validate()
workload_main.stop()
env.storage_controller.consistency_check()
# Ensure each shard has the same list of timelines and offloaded timelines
for shard in shards:
ps = env.get_tenant_pageserver(shard)
list = ps.http_client().timeline_and_offloaded_list(shard)
assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
assert timeline_id_set_for(list.timelines) == expected_timeline_set
ps.http_client().timeline_compact(shard, timeline_id_main)
# Check that we can still read all the data
workload_main.validate()
# Force a restart, which requires the state to be persisted.
env.pageserver.stop()
env.pageserver.start()
# Ensure each shard has the same list of timelines and offloaded timelines
for shard in shards:
ps = env.get_tenant_pageserver(shard)
list = ps.http_client().timeline_and_offloaded_list(shard)
assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
assert timeline_id_set_for(list.timelines) == expected_timeline_set
ps.http_client().timeline_compact(shard, timeline_id_main)
# Compaction shouldn't make anything unreadable
workload_main.validate()
# Do sharded unarchival
env.storage_controller.timeline_archival_config(
tenant_id, timeline_id_offloaded, TimelineArchivalState.UNARCHIVED
)
env.storage_controller.timeline_archival_config(
tenant_id, timeline_id_archived, TimelineArchivalState.UNARCHIVED
)
for shard in shards:
ps = env.get_tenant_pageserver(shard)
list = ps.http_client().timeline_and_offloaded_list(shard)
assert timeline_id_set_for(list.offloaded) == set()
assert timeline_id_set_for(list.timelines) == {
timeline_id_main,
timeline_id_archived,
timeline_id_offloaded,
}
def test_sharding_split_smoke( def test_sharding_split_smoke(
neon_env_builder: NeonEnvBuilder, neon_env_builder: NeonEnvBuilder,
): ):

View File

@@ -18,7 +18,6 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import ( from fixtures.neon_fixtures import (
NeonEnv, NeonEnv,
NeonEnvBuilder, NeonEnvBuilder,
NeonPageserver,
PageserverAvailability, PageserverAvailability,
PageserverSchedulingPolicy, PageserverSchedulingPolicy,
PgBin, PgBin,
@@ -299,20 +298,17 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
env.storage_controller.consistency_check() env.storage_controller.consistency_check()
def prepare_onboarding_env( @pytest.mark.parametrize("warm_up", [True, False])
neon_env_builder: NeonEnvBuilder, def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
) -> tuple[NeonEnv, NeonPageserver, TenantId, int]:
""" """
For tests that do onboarding of a tenant to the storage controller, a small dance to We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
set up one pageserver that won't be managed by the storage controller and create which provides the /location_config API. This is similar to creating a tenant,
a tenant there. but imports the generation number.
""" """
# One pageserver to simulate legacy environment, two to be managed by storage controller # One pageserver to simulate legacy environment, two to be managed by storage controller
neon_env_builder.num_pageservers = 3 neon_env_builder.num_pageservers = 3
# Enable tests to use methods that require real S3 API
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
# Start services by hand so that we can skip registration on one of the pageservers # Start services by hand so that we can skip registration on one of the pageservers
env = neon_env_builder.init_configs() env = neon_env_builder.init_configs()
env.broker.start() env.broker.start()
@@ -333,6 +329,7 @@ def prepare_onboarding_env(
# will be attached after onboarding # will be attached after onboarding
env.pageservers[1].start() env.pageservers[1].start()
env.pageservers[2].start() env.pageservers[2].start()
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
for sk in env.safekeepers: for sk in env.safekeepers:
sk.start() sk.start()
@@ -342,23 +339,6 @@ def prepare_onboarding_env(
generation = 123 generation = 123
origin_ps.tenant_create(tenant_id, generation=generation) origin_ps.tenant_create(tenant_id, generation=generation)
origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
return (env, origin_ps, tenant_id, generation)
@pytest.mark.parametrize("warm_up", [True, False])
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
"""
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
which provides the /location_config API. This is similar to creating a tenant,
but imports the generation number.
"""
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
# As if doing a live migration, first configure origin into stale mode # As if doing a live migration, first configure origin into stale mode
r = origin_ps.http_client().tenant_location_conf( r = origin_ps.http_client().tenant_location_conf(
tenant_id, tenant_id,
@@ -495,70 +475,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
env.storage_controller.consistency_check() env.storage_controller.consistency_check()
@run_only_on_default_postgres("this test doesn't start an endpoint")
def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder):
"""
Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver,
and also wasn't ever registered with the storage controller.
It may do this by calling /location_conf in mode Detached and then calling the delete API
as normal.
"""
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
remote_prefix = "/".join(
(
"tenants",
str(tenant_id),
)
)
# Detach it from its original pageserver.
origin_ps.http_client().tenant_location_conf(
tenant_id,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": None,
},
)
# Since we will later assert that remote data is gone, as a control also check it was ever there
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=remote_prefix,
)
# Register with storage controller in Detached state
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
generation += 1
r = virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": generation,
},
)
assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards
# Onboarding in Detached state shouldn't have attached it to any pageserver
for ps in env.pageservers:
assert ps.http_client().tenant_list() == []
# Delete it via the storage controller
virtual_ps_http.tenant_delete(tenant_id)
# Check that we really deleted it
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=remote_prefix,
)
def test_storage_controller_compute_hook( def test_storage_controller_compute_hook(
httpserver: HTTPServer, httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder, neon_env_builder: NeonEnvBuilder,
@@ -956,14 +872,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
assert all(v["may_schedule"] for v in response.json()["nodes"].values()) assert all(v["may_schedule"] for v in response.json()["nodes"].values())
# Reconciler cancel API should be a no-op when nothing is in flight
env.storage_controller.request(
"PUT",
f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile",
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
# Node unclean drop API
response = env.storage_controller.request( response = env.storage_controller.request(
"POST", "POST",
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
@@ -971,7 +879,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
) )
assert len(env.storage_controller.node_list()) == 1 assert len(env.storage_controller.node_list()) == 1
# Tenant unclean drop API
response = env.storage_controller.request( response = env.storage_controller.request(
"POST", "POST",
f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
@@ -985,6 +892,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
headers=env.storage_controller.headers(TokenScope.ADMIN), headers=env.storage_controller.headers(TokenScope.ADMIN),
) )
assert len(response.json()) == 1 assert len(response.json()) == 1
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
env.storage_controller.consistency_check() env.storage_controller.consistency_check()
@@ -1752,11 +1660,6 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
assert "Stop" in storcon_cli(["tenants"])[3] assert "Stop" in storcon_cli(["tenants"])[3]
# Cancel ongoing reconcile on a tenant
storcon_cli(
["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
)
# Change a tenant's placement # Change a tenant's placement
storcon_cli( storcon_cli(
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]

View File

@@ -435,9 +435,7 @@ def test_emergency_relocate_with_branches_slow_replay(
# This fail point will pause the WAL ingestion on the main branch, after the # This fail point will pause the WAL ingestion on the main branch, after the
# the first insert # the first insert
pageserver_http.configure_failpoints( pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
[("pageserver-wal-ingest-logical-message-sleep", "return(5000)")]
)
# Attach and wait a few seconds to give it time to load the tenants, attach to the # Attach and wait a few seconds to give it time to load the tenants, attach to the
# safekeepers, and to stream and ingest the WAL up to the pause-point. # safekeepers, and to stream and ingest the WAL up to the pause-point.
@@ -455,13 +453,11 @@ def test_emergency_relocate_with_branches_slow_replay(
assert cur.fetchall() == [("before pause",), ("after pause",)] assert cur.fetchall() == [("before pause",), ("after pause",)]
# Sanity check that the failpoint was reached # Sanity check that the failpoint was reached
env.pageserver.assert_log_contains( env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
'failpoint "pageserver-wal-ingest-logical-message-sleep": sleep done'
)
assert time.time() - before_attach_time > 5 assert time.time() - before_attach_time > 5
# Clean up # Clean up
pageserver_http.configure_failpoints(("pageserver-wal-ingest-logical-message-sleep", "off")) pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
# Simulate hard crash of pageserver and re-attach a tenant with a branch # Simulate hard crash of pageserver and re-attach a tenant with a branch
@@ -585,9 +581,7 @@ def test_emergency_relocate_with_branches_createdb(
# bug reproduced easily even without this, as there is always some delay between # bug reproduced easily even without this, as there is always some delay between
# loading the timeline and establishing the connection to the safekeeper to stream and # loading the timeline and establishing the connection to the safekeeper to stream and
# ingest the WAL, but let's make this less dependent on accidental timing. # ingest the WAL, but let's make this less dependent on accidental timing.
pageserver_http.configure_failpoints( pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
[("pageserver-wal-ingest-logical-message-sleep", "return(5000)")]
)
before_attach_time = time.time() before_attach_time = time.time()
env.pageserver.tenant_attach(tenant_id) env.pageserver.tenant_attach(tenant_id)
@@ -596,10 +590,8 @@ def test_emergency_relocate_with_branches_createdb(
assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
# Sanity check that the failpoint was reached # Sanity check that the failpoint was reached
env.pageserver.assert_log_contains( env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
'failpoint "pageserver-wal-ingest-logical-message-sleep": sleep done'
)
assert time.time() - before_attach_time > 5 assert time.time() - before_attach_time > 5
# Clean up # Clean up
pageserver_http.configure_failpoints(("pageserver-wal-ingest-logical-message-sleep", "off")) pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))

View File

@@ -1998,109 +1998,6 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
pt_handle.join() pt_handle.join()
def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
"""
Verify that when pull_timeline is used on an evicted timeline, it does not result in
promoting any segments to local disk on the source, and the timeline is correctly instantiated
in evicted state on the destination. This behavior is important to avoid ballooning disk
usage when doing mass migration of timelines.
"""
neon_env_builder.num_safekeepers = 4
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
# Configure safekeepers with ultra-fast eviction policy
neon_env_builder.safekeeper_extra_opts = [
"--enable-offload",
"--partial-backup-timeout",
"50ms",
"--control-file-save-interval",
"1s",
# Safekeepers usually wait a while before evicting something: for this test we want them to
# evict things as soon as they are inactive.
"--eviction-min-resident=100ms",
"--delete-offloaded-wal",
]
initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1])
log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}")
ep = env.endpoints.create("main")
ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id]
log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}")
ep.active_safekeepers = [1, 2, 3] # Exclude dst_sk from set written by compute initially
ep.start()
ep.safe_psql("CREATE TABLE t(i int)")
ep.safe_psql("INSERT INTO t VALUES (0)")
ep.stop()
wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver)
src_http = src_sk.http_client()
dst_http = dst_sk.http_client()
def evicted_on_source():
# Wait for timeline to go into evicted state
assert src_http.get_eviction_state(timeline_id) != "Present"
assert (
src_http.get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
)
or 0 > 0
)
assert src_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0
# Check that on source no segment files are present
assert src_sk.list_segments(tenant_id, timeline_id) == []
wait_until(60, 1, evicted_on_source)
# Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk,
# destination should import the control file only & go into evicted mode immediately
dst_sk.pull_timeline([src_sk], tenant_id, timeline_id)
# Check that on source and destination no segment files are present
assert src_sk.list_segments(tenant_id, timeline_id) == []
assert dst_sk.list_segments(tenant_id, timeline_id) == []
# Check that the timeline on the destination is in the expected evicted state.
evicted_on_source() # It should still be evicted on the source
def evicted_on_destination():
assert dst_http.get_eviction_state(timeline_id) != "Present"
assert dst_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0
# This should be fast, it is a wait_until because eviction state is updated
# in the background wrt pull_timeline.
wait_until(10, 0.1, evicted_on_destination)
# Delete the timeline on the source, to prove that deletion works on an
# evicted timeline _and_ that the final compute test is really not using
# the original location
src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True)
# Check that using the timeline correctly un-evicts it on the new location
ep.active_safekeepers = [2, 3, 4]
ep.start()
ep.safe_psql("INSERT INTO t VALUES (0)")
ep.stop()
def unevicted_on_dest():
assert (
dst_http.get_metric_value(
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
)
or 0 > 0
)
n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted == 0
wait_until(10, 1, unevicted_on_dest)
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
# when compute is active, but there are no writes to the timeline. In that case # when compute is active, but there are no writes to the timeline. In that case
# pageserver should maintain a single connection to safekeeper and don't attempt # pageserver should maintain a single connection to safekeeper and don't attempt

View File

@@ -1,12 +1,11 @@
from __future__ import annotations from __future__ import annotations
import os import time
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from fixtures.common_types import Lsn, TenantId from fixtures.common_types import Lsn, TenantId
from fixtures.log_helper import log from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.utils import wait_until
if TYPE_CHECKING: if TYPE_CHECKING:
from typing import Any from typing import Any
@@ -20,10 +19,6 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start() env = neon_env_builder.init_start()
env.pageserver.http_client() env.pageserver.http_client()
# In this test we force 'Timed out while waiting for WAL record error' while
# fetching basebackup and don't want any retries.
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
tenant_id, timeline_id = env.create_tenant() tenant_id, timeline_id = env.create_tenant()
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
@@ -54,14 +49,11 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
# Trigger WAL wait timeout faster # Trigger WAL wait timeout faster
def customize_pageserver_toml(ps_cfg: dict[str, Any]): def customize_pageserver_toml(ps_cfg: dict[str, Any]):
ps_cfg["wait_lsn_timeout"] = "2s" ps_cfg["wait_lsn_timeout"] = "1s"
tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["walreceiver_connect_timeout"] = "2s" tenant_config["walreceiver_connect_timeout"] = "2s"
tenant_config["lagging_wal_timeout"] = "2s" tenant_config["lagging_wal_timeout"] = "2s"
# In this test we force 'Timed out while waiting for WAL record error' while
# fetching basebackup and don't want any retries.
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
neon_env_builder.pageserver_config_override = customize_pageserver_toml neon_env_builder.pageserver_config_override = customize_pageserver_toml
# Have notable SK ids to ensure we check logs for their presence, not some other random numbers # Have notable SK ids to ensure we check logs for their presence, not some other random numbers
@@ -72,6 +64,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
tenant_id, timeline_id = env.create_tenant() tenant_id, timeline_id = env.create_tenant()
elements_to_insert = 1_000_000
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
# we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
@@ -81,50 +74,45 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
) )
insert_test_elements(env, tenant_id, start=0, count=1) insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)
def all_sks_in_wareceiver_state(): try:
try: trigger_wait_lsn_timeout(env, tenant_id)
trigger_wait_lsn_timeout(env, tenant_id) except Exception as e:
except Exception as e: exception_string = str(e)
exception_string = str(e) assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
for safekeeper in env.safekeepers:
assert ( assert (
expected_timeout_error in exception_string str(safekeeper.id) in exception_string
), "Should time out during waiting for WAL" ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
for safekeeper in env.safekeepers:
assert (
str(safekeeper.id) in exception_string
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
wait_until(60, 0.5, all_sks_in_wareceiver_state)
stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper = env.safekeepers[-1]
stopped_safekeeper_id = stopped_safekeeper.id stopped_safekeeper_id = stopped_safekeeper.id
log.info(f"Stopping safekeeper {stopped_safekeeper.id}") log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
stopped_safekeeper.stop() stopped_safekeeper.stop()
# sleep until stopped safekeeper is removed from candidates
time.sleep(2)
def all_but_stopped_sks_in_wareceiver_state(): # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats.
try: insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert)
trigger_wait_lsn_timeout(env, tenant_id)
except Exception as e:
# Strip out the part before stdout, as it contains full command with the list of all safekeepers
exception_string = str(e).split("stdout", 1)[-1]
assert (
expected_timeout_error in exception_string
), "Should time out during waiting for WAL"
for safekeeper in env.safekeepers: try:
if safekeeper.id == stopped_safekeeper_id: trigger_wait_lsn_timeout(env, tenant_id)
assert ( except Exception as e:
str(safekeeper.id) not in exception_string # Strip out the part before stdout, as it contains full command with the list of all safekeepers
), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" exception_string = str(e).split("stdout", 1)[-1]
else: assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
assert (
str(safekeeper.id) in exception_string
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state) for safekeeper in env.safekeepers:
if safekeeper.id == stopped_safekeeper_id:
assert (
str(safekeeper.id) not in exception_string
), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
else:
assert (
str(safekeeper.id) in exception_string
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):