mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-14 10:50:37 +00:00
Compare commits
5 Commits
remove-pos
...
proxy-simp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b66e545e26 | ||
|
|
c8108a4b84 | ||
|
|
2d34fec39b | ||
|
|
3da4705775 | ||
|
|
80c5576816 |
14
.github/workflows/_build-and-test-locally.yml
vendored
14
.github/workflows/_build-and-test-locally.yml
vendored
@@ -53,6 +53,20 @@ jobs:
|
|||||||
BUILD_TAG: ${{ inputs.build-tag }}
|
BUILD_TAG: ${{ inputs.build-tag }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
- name: Fix git ownership
|
||||||
|
run: |
|
||||||
|
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||||
|
#
|
||||||
|
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||||
|
# Ref https://github.com/actions/checkout/issues/785
|
||||||
|
#
|
||||||
|
git config --global --add safe.directory ${{ github.workspace }}
|
||||||
|
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||||
|
for r in 14 15 16 17; do
|
||||||
|
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||||
|
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||||
|
done
|
||||||
|
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
|||||||
20
.github/workflows/build_and_test.yml
vendored
20
.github/workflows/build_and_test.yml
vendored
@@ -839,7 +839,6 @@ jobs:
|
|||||||
- name: Build vm image
|
- name: Build vm image
|
||||||
run: |
|
run: |
|
||||||
./vm-builder \
|
./vm-builder \
|
||||||
-size=2G \
|
|
||||||
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
|
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
|
||||||
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
||||||
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
||||||
@@ -1079,6 +1078,20 @@ jobs:
|
|||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, small ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||||
steps:
|
steps:
|
||||||
|
- name: Fix git ownership
|
||||||
|
run: |
|
||||||
|
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||||
|
#
|
||||||
|
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||||
|
# Ref https://github.com/actions/checkout/issues/785
|
||||||
|
#
|
||||||
|
git config --global --add safe.directory ${{ github.workspace }}
|
||||||
|
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||||
|
for r in 14 15 16 17; do
|
||||||
|
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||||
|
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||||
|
done
|
||||||
|
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Trigger deploy workflow
|
- name: Trigger deploy workflow
|
||||||
@@ -1117,10 +1130,7 @@ jobs:
|
|||||||
|
|
||||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||||
-f deployPgSniRouter=true \
|
-f deployPgSniRouter=true \
|
||||||
-f deployProxyLink=true \
|
-f deployProxy=true \
|
||||||
-f deployPrivatelinkProxy=true \
|
|
||||||
-f deployProxyScram=true \
|
|
||||||
-f deployProxyAuthBroker=true \
|
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||||
else
|
else
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -6,8 +6,6 @@ __pycache__/
|
|||||||
test_output/
|
test_output/
|
||||||
.vscode
|
.vscode
|
||||||
.idea
|
.idea
|
||||||
*.swp
|
|
||||||
tags
|
|
||||||
neon.iml
|
neon.iml
|
||||||
/.neon
|
/.neon
|
||||||
/integration_tests/.neon
|
/integration_tests/.neon
|
||||||
|
|||||||
4
Cargo.lock
generated
4
Cargo.lock
generated
@@ -6272,7 +6272,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-epoll-uring"
|
name = "tokio-epoll-uring"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"nix 0.26.4",
|
"nix 0.26.4",
|
||||||
@@ -6788,7 +6788,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "uring-common"
|
name = "uring-common"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"io-uring",
|
"io-uring",
|
||||||
|
|||||||
@@ -666,7 +666,7 @@ RUN apt-get update && \
|
|||||||
#
|
#
|
||||||
# Use new version only for v17
|
# Use new version only for v17
|
||||||
# because Release_2024_09_1 has some backward incompatible changes
|
# because Release_2024_09_1 has some backward incompatible changes
|
||||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||||
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||||
RUN case "${PG_VERSION}" in \
|
RUN case "${PG_VERSION}" in \
|
||||||
"v17") \
|
"v17") \
|
||||||
@@ -860,14 +860,13 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
|||||||
USER nonroot
|
USER nonroot
|
||||||
WORKDIR /home/nonroot
|
WORKDIR /home/nonroot
|
||||||
|
|
||||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
RUN case "${PG_VERSION}" in "v17") \
|
||||||
|
echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
|
||||||
|
esac && \
|
||||||
|
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||||
chmod +x rustup-init && \
|
chmod +x rustup-init && \
|
||||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||||
rm rustup-init && \
|
rm rustup-init && \
|
||||||
case "${PG_VERSION}" in \
|
|
||||||
'v17') \
|
|
||||||
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
|
|
||||||
esac && \
|
|
||||||
cargo install --locked --version 0.11.3 cargo-pgrx && \
|
cargo install --locked --version 0.11.3 cargo-pgrx && \
|
||||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||||
|
|
||||||
@@ -1042,31 +1041,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
|
|||||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||||
|
|
||||||
#########################################################################################
|
|
||||||
#
|
|
||||||
# Layer "pg_mooncake"
|
|
||||||
# compile pg_mooncake extension
|
|
||||||
#
|
|
||||||
#########################################################################################
|
|
||||||
FROM rust-extensions-build AS pg-mooncake-build
|
|
||||||
ARG PG_VERSION
|
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
|
|
||||||
ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1
|
|
||||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
|
||||||
|
|
||||||
RUN case "${PG_VERSION}" in \
|
|
||||||
'v14') \
|
|
||||||
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
|
|
||||||
esac && \
|
|
||||||
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
|
|
||||||
cd pg_mooncake-src && \
|
|
||||||
git checkout "${PG_MOONCAKE_VERSION}" && \
|
|
||||||
git submodule update --init --depth 1 --recursive && \
|
|
||||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
|
|
||||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
|
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
|
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
#
|
#
|
||||||
# Layer "neon-pg-ext-build"
|
# Layer "neon-pg-ext-build"
|
||||||
@@ -1110,7 +1084,6 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
|||||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
|
|
||||||
COPY pgxn/ pgxn/
|
COPY pgxn/ pgxn/
|
||||||
|
|
||||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ commands:
|
|||||||
- name: pgbouncer
|
- name: pgbouncer
|
||||||
user: postgres
|
user: postgres
|
||||||
sysvInitAction: respawn
|
sysvInitAction: respawn
|
||||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
|
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||||
- name: local_proxy
|
- name: local_proxy
|
||||||
user: postgres
|
user: postgres
|
||||||
sysvInitAction: respawn
|
sysvInitAction: respawn
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ commands:
|
|||||||
- name: pgbouncer
|
- name: pgbouncer
|
||||||
user: postgres
|
user: postgres
|
||||||
sysvInitAction: respawn
|
sysvInitAction: respawn
|
||||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
|
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||||
- name: local_proxy
|
- name: local_proxy
|
||||||
user: postgres
|
user: postgres
|
||||||
sysvInitAction: respawn
|
sysvInitAction: respawn
|
||||||
|
|||||||
@@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
|
|||||||
tenant_id,
|
tenant_id,
|
||||||
TimelineCreateRequest {
|
TimelineCreateRequest {
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
|
ancestor_timeline_id: None,
|
||||||
existing_initdb_timeline_id: None,
|
ancestor_start_lsn: None,
|
||||||
pg_version: Some(args.pg_version),
|
existing_initdb_timeline_id: None,
|
||||||
},
|
pg_version: Some(args.pg_version),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
|
|||||||
let storage_controller = StorageController::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let create_req = TimelineCreateRequest {
|
let create_req = TimelineCreateRequest {
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
|
ancestor_timeline_id: None,
|
||||||
existing_initdb_timeline_id: None,
|
existing_initdb_timeline_id: None,
|
||||||
pg_version: Some(args.pg_version),
|
ancestor_start_lsn: None,
|
||||||
},
|
pg_version: Some(args.pg_version),
|
||||||
};
|
};
|
||||||
let timeline_info = storage_controller
|
let timeline_info = storage_controller
|
||||||
.tenant_timeline_create(tenant_id, create_req)
|
.tenant_timeline_create(tenant_id, create_req)
|
||||||
@@ -1189,11 +1189,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
|
|||||||
let storage_controller = StorageController::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let create_req = TimelineCreateRequest {
|
let create_req = TimelineCreateRequest {
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
|
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||||
ancestor_timeline_id,
|
existing_initdb_timeline_id: None,
|
||||||
ancestor_start_lsn: start_lsn,
|
ancestor_start_lsn: start_lsn,
|
||||||
pg_version: None,
|
pg_version: None,
|
||||||
},
|
|
||||||
};
|
};
|
||||||
let timeline_info = storage_controller
|
let timeline_info = storage_controller
|
||||||
.tenant_timeline_create(tenant_id, create_req)
|
.tenant_timeline_create(tenant_id, create_req)
|
||||||
|
|||||||
@@ -529,6 +529,28 @@ impl PageServerNode {
|
|||||||
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn timeline_create(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
new_timeline_id: TimelineId,
|
||||||
|
ancestor_start_lsn: Option<Lsn>,
|
||||||
|
ancestor_timeline_id: Option<TimelineId>,
|
||||||
|
pg_version: Option<u32>,
|
||||||
|
existing_initdb_timeline_id: Option<TimelineId>,
|
||||||
|
) -> anyhow::Result<TimelineInfo> {
|
||||||
|
let req = models::TimelineCreateRequest {
|
||||||
|
new_timeline_id,
|
||||||
|
ancestor_start_lsn,
|
||||||
|
ancestor_timeline_id,
|
||||||
|
pg_version,
|
||||||
|
existing_initdb_timeline_id,
|
||||||
|
};
|
||||||
|
Ok(self
|
||||||
|
.http_client
|
||||||
|
.timeline_create(tenant_shard_id, &req)
|
||||||
|
.await?)
|
||||||
|
}
|
||||||
|
|
||||||
/// Import a basebackup prepared using either:
|
/// Import a basebackup prepared using either:
|
||||||
/// a) `pg_basebackup -F tar`, or
|
/// a) `pg_basebackup -F tar`, or
|
||||||
/// b) The `fullbackup` pageserver endpoint
|
/// b) The `fullbackup` pageserver endpoint
|
||||||
|
|||||||
@@ -111,11 +111,6 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
node: NodeId,
|
node: NodeId,
|
||||||
},
|
},
|
||||||
/// Cancel any ongoing reconciliation for this shard
|
|
||||||
TenantShardCancelReconcile {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
},
|
|
||||||
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
||||||
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
||||||
TenantConfig {
|
TenantConfig {
|
||||||
@@ -540,15 +535,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
Command::TenantShardCancelReconcile { tenant_shard_id } => {
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<(), ()>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::TenantConfig { tenant_id, config } => {
|
Command::TenantConfig { tenant_id, config } => {
|
||||||
let tenant_conf = serde_json::from_str(&config)?;
|
let tenant_conf = serde_json::from_str(&config)?;
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ use once_cell::sync::Lazy;
|
|||||||
use prometheus::core::{
|
use prometheus::core::{
|
||||||
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
|
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
|
||||||
};
|
};
|
||||||
pub use prometheus::local::LocalHistogram;
|
|
||||||
pub use prometheus::opts;
|
pub use prometheus::opts;
|
||||||
pub use prometheus::register;
|
pub use prometheus::register;
|
||||||
pub use prometheus::Error;
|
pub use prometheus::Error;
|
||||||
|
|||||||
@@ -211,30 +211,13 @@ pub enum TimelineState {
|
|||||||
#[derive(Serialize, Deserialize, Clone)]
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
pub struct TimelineCreateRequest {
|
pub struct TimelineCreateRequest {
|
||||||
pub new_timeline_id: TimelineId,
|
pub new_timeline_id: TimelineId,
|
||||||
#[serde(flatten)]
|
#[serde(default)]
|
||||||
pub mode: TimelineCreateRequestMode,
|
pub ancestor_timeline_id: Option<TimelineId>,
|
||||||
}
|
#[serde(default)]
|
||||||
|
pub existing_initdb_timeline_id: Option<TimelineId>,
|
||||||
#[derive(Serialize, Deserialize, Clone)]
|
#[serde(default)]
|
||||||
#[serde(untagged)]
|
pub ancestor_start_lsn: Option<Lsn>,
|
||||||
pub enum TimelineCreateRequestMode {
|
pub pg_version: Option<u32>,
|
||||||
Branch {
|
|
||||||
ancestor_timeline_id: TimelineId,
|
|
||||||
#[serde(default)]
|
|
||||||
ancestor_start_lsn: Option<Lsn>,
|
|
||||||
// TODO: cplane sets this, but, the branching code always
|
|
||||||
// inherits the ancestor's pg_version. Earlier code wasn't
|
|
||||||
// using a flattened enum, so, it was an accepted field, and
|
|
||||||
// we continue to accept it by having it here.
|
|
||||||
pg_version: Option<u32>,
|
|
||||||
},
|
|
||||||
// NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
|
|
||||||
// (serde picks the first matching enum variant, in declaration order).
|
|
||||||
Bootstrap {
|
|
||||||
#[serde(default)]
|
|
||||||
existing_initdb_timeline_id: Option<TimelineId>,
|
|
||||||
pg_version: Option<u32>,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone)]
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
@@ -1068,12 +1051,6 @@ pub mod virtual_file {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct ScanDisposableKeysResponse {
|
|
||||||
pub disposable_count: usize,
|
|
||||||
pub not_disposable_count: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wrapped in libpq CopyData
|
// Wrapped in libpq CopyData
|
||||||
#[derive(PartialEq, Eq, Debug)]
|
#[derive(PartialEq, Eq, Debug)]
|
||||||
pub enum PagestreamFeMessage {
|
pub enum PagestreamFeMessage {
|
||||||
|
|||||||
@@ -357,20 +357,22 @@ impl RemoteStorage for LocalFs {
|
|||||||
.list_recursive(prefix)
|
.list_recursive(prefix)
|
||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
let mut objects = Vec::with_capacity(keys.len());
|
let objects = keys
|
||||||
for key in keys {
|
.into_iter()
|
||||||
let path = key.with_base(&self.storage_root);
|
.filter_map(|k| {
|
||||||
let metadata = file_metadata(&path).await?;
|
let path = k.with_base(&self.storage_root);
|
||||||
if metadata.is_dir() {
|
if path.is_dir() {
|
||||||
continue;
|
None
|
||||||
}
|
} else {
|
||||||
objects.push(ListingObject {
|
Some(ListingObject {
|
||||||
key: key.clone(),
|
key: k.clone(),
|
||||||
last_modified: metadata.modified()?,
|
// LocalFs is just for testing, so just specify a dummy time
|
||||||
size: metadata.len(),
|
last_modified: SystemTime::now(),
|
||||||
});
|
size: 0,
|
||||||
}
|
})
|
||||||
let objects = objects;
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
if let ListingMode::NoDelimiter = mode {
|
if let ListingMode::NoDelimiter = mode {
|
||||||
result.keys = objects;
|
result.keys = objects;
|
||||||
@@ -408,8 +410,9 @@ impl RemoteStorage for LocalFs {
|
|||||||
} else {
|
} else {
|
||||||
result.keys.push(ListingObject {
|
result.keys.push(ListingObject {
|
||||||
key: RemotePath::from_string(&relative_key).unwrap(),
|
key: RemotePath::from_string(&relative_key).unwrap(),
|
||||||
last_modified: object.last_modified,
|
// LocalFs is just for testing
|
||||||
size: object.size,
|
last_modified: SystemTime::now(),
|
||||||
|
size: 0,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -597,10 +597,6 @@ paths:
|
|||||||
Create a timeline. Returns new timeline id on success.
|
Create a timeline. Returns new timeline id on success.
|
||||||
Recreating the same timeline will succeed if the parameters match the existing timeline.
|
Recreating the same timeline will succeed if the parameters match the existing timeline.
|
||||||
If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
|
If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
|
||||||
|
|
||||||
To ensure durability, the caller must retry the creation until success.
|
|
||||||
Just because the timeline is visible via other endpoints does not mean it is durable.
|
|
||||||
Future versions may stop showing timelines that are not yet durable.
|
|
||||||
requestBody:
|
requestBody:
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
|
|||||||
@@ -38,7 +38,6 @@ use pageserver_api::models::TenantShardSplitRequest;
|
|||||||
use pageserver_api::models::TenantShardSplitResponse;
|
use pageserver_api::models::TenantShardSplitResponse;
|
||||||
use pageserver_api::models::TenantSorting;
|
use pageserver_api::models::TenantSorting;
|
||||||
use pageserver_api::models::TimelineArchivalConfigRequest;
|
use pageserver_api::models::TimelineArchivalConfigRequest;
|
||||||
use pageserver_api::models::TimelineCreateRequestMode;
|
|
||||||
use pageserver_api::models::TimelinesInfoAndOffloaded;
|
use pageserver_api::models::TimelinesInfoAndOffloaded;
|
||||||
use pageserver_api::models::TopTenantShardItem;
|
use pageserver_api::models::TopTenantShardItem;
|
||||||
use pageserver_api::models::TopTenantShardsRequest;
|
use pageserver_api::models::TopTenantShardsRequest;
|
||||||
@@ -86,7 +85,6 @@ use crate::tenant::timeline::Timeline;
|
|||||||
use crate::tenant::GetTimelineError;
|
use crate::tenant::GetTimelineError;
|
||||||
use crate::tenant::OffloadedTimeline;
|
use crate::tenant::OffloadedTimeline;
|
||||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||||
use crate::DEFAULT_PG_VERSION;
|
|
||||||
use crate::{disk_usage_eviction_task, tenant};
|
use crate::{disk_usage_eviction_task, tenant};
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
||||||
@@ -549,26 +547,6 @@ async fn timeline_create_handler(
|
|||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let new_timeline_id = request_data.new_timeline_id;
|
let new_timeline_id = request_data.new_timeline_id;
|
||||||
// fill in the default pg_version if not provided & convert request into domain model
|
|
||||||
let params: tenant::CreateTimelineParams = match request_data.mode {
|
|
||||||
TimelineCreateRequestMode::Bootstrap {
|
|
||||||
existing_initdb_timeline_id,
|
|
||||||
pg_version,
|
|
||||||
} => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap {
|
|
||||||
new_timeline_id,
|
|
||||||
existing_initdb_timeline_id,
|
|
||||||
pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION),
|
|
||||||
}),
|
|
||||||
TimelineCreateRequestMode::Branch {
|
|
||||||
ancestor_timeline_id,
|
|
||||||
ancestor_start_lsn,
|
|
||||||
pg_version: _,
|
|
||||||
} => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
|
|
||||||
new_timeline_id,
|
|
||||||
ancestor_timeline_id,
|
|
||||||
ancestor_start_lsn,
|
|
||||||
}),
|
|
||||||
};
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
|
||||||
|
|
||||||
@@ -581,12 +559,22 @@ async fn timeline_create_handler(
|
|||||||
|
|
||||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
// earlier versions of the code had pg_version and ancestor_lsn in the span
|
if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
|
||||||
// => continue to provide that information, but, through a log message that doesn't require us to destructure
|
tracing::info!(%ancestor_id, "starting to branch");
|
||||||
tracing::info!(?params, "creating timeline");
|
} else {
|
||||||
|
tracing::info!("bootstrapping");
|
||||||
|
}
|
||||||
|
|
||||||
match tenant
|
match tenant
|
||||||
.create_timeline(params, state.broker_client.clone(), &ctx)
|
.create_timeline(
|
||||||
|
new_timeline_id,
|
||||||
|
request_data.ancestor_timeline_id,
|
||||||
|
request_data.ancestor_start_lsn,
|
||||||
|
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||||
|
request_data.existing_initdb_timeline_id,
|
||||||
|
state.broker_client.clone(),
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(new_timeline) => {
|
Ok(new_timeline) => {
|
||||||
@@ -637,6 +625,8 @@ async fn timeline_create_handler(
|
|||||||
tenant_id = %tenant_shard_id.tenant_id,
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
shard_id = %tenant_shard_id.shard_slug(),
|
shard_id = %tenant_shard_id.shard_slug(),
|
||||||
timeline_id = %new_timeline_id,
|
timeline_id = %new_timeline_id,
|
||||||
|
lsn=?request_data.ancestor_start_lsn,
|
||||||
|
pg_version=?request_data.pg_version
|
||||||
))
|
))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -1293,99 +1283,6 @@ async fn layer_map_info_handler(
|
|||||||
json_response(StatusCode::OK, layer_map_info)
|
json_response(StatusCode::OK, layer_map_info)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))]
|
|
||||||
async fn timeline_layer_scan_disposable_keys(
|
|
||||||
request: Request<Body>,
|
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
|
||||||
let layer_name: LayerName = parse_request_param(&request, "layer_name")?;
|
|
||||||
|
|
||||||
tracing::Span::current().record(
|
|
||||||
"tenant_id",
|
|
||||||
tracing::field::display(&tenant_shard_id.tenant_id),
|
|
||||||
);
|
|
||||||
tracing::Span::current().record(
|
|
||||||
"shard_id",
|
|
||||||
tracing::field::display(tenant_shard_id.shard_slug()),
|
|
||||||
);
|
|
||||||
tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id));
|
|
||||||
tracing::Span::current().record("layer_name", tracing::field::display(&layer_name));
|
|
||||||
|
|
||||||
let state = get_state(&request);
|
|
||||||
|
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
|
||||||
|
|
||||||
// technically the timeline need not be active for this scan to complete
|
|
||||||
let timeline =
|
|
||||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
|
||||||
|
|
||||||
let guard = timeline.layers.read().await;
|
|
||||||
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
|
|
||||||
return Err(ApiError::NotFound(
|
|
||||||
anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
|
|
||||||
));
|
|
||||||
};
|
|
||||||
|
|
||||||
let resident_layer = layer
|
|
||||||
.download_and_keep_resident()
|
|
||||||
.await
|
|
||||||
.map_err(|err| match err {
|
|
||||||
tenant::storage_layer::layer::DownloadError::TimelineShutdown
|
|
||||||
| tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
|
|
||||||
ApiError::ShuttingDown
|
|
||||||
}
|
|
||||||
tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
|
|
||||||
| tenant::storage_layer::layer::DownloadError::DownloadRequired
|
|
||||||
| tenant::storage_layer::layer::DownloadError::NotFile(_)
|
|
||||||
| tenant::storage_layer::layer::DownloadError::DownloadFailed
|
|
||||||
| tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => {
|
|
||||||
ApiError::InternalServerError(err.into())
|
|
||||||
}
|
|
||||||
#[cfg(test)]
|
|
||||||
tenant::storage_layer::layer::DownloadError::Failpoint(_) => {
|
|
||||||
ApiError::InternalServerError(err.into())
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let keys = resident_layer
|
|
||||||
.load_keys(&ctx)
|
|
||||||
.await
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
let shard_identity = timeline.get_shard_identity();
|
|
||||||
|
|
||||||
let mut disposable_count = 0;
|
|
||||||
let mut not_disposable_count = 0;
|
|
||||||
let cancel = cancel.clone();
|
|
||||||
for (i, key) in keys.into_iter().enumerate() {
|
|
||||||
if shard_identity.is_key_disposable(&key) {
|
|
||||||
disposable_count += 1;
|
|
||||||
tracing::debug!(key = %key, key.dbg=?key, "disposable key");
|
|
||||||
} else {
|
|
||||||
not_disposable_count += 1;
|
|
||||||
}
|
|
||||||
#[allow(clippy::collapsible_if)]
|
|
||||||
if i % 10000 == 0 {
|
|
||||||
if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() {
|
|
||||||
return Err(ApiError::ShuttingDown);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
json_response(
|
|
||||||
StatusCode::OK,
|
|
||||||
pageserver_api::models::ScanDisposableKeysResponse {
|
|
||||||
disposable_count,
|
|
||||||
not_disposable_count,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn layer_download_handler(
|
async fn layer_download_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
@@ -3248,10 +3145,6 @@ pub fn make_router(
|
|||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||||
|r| api_handler(r, evict_timeline_layer_handler),
|
|r| api_handler(r, evict_timeline_layer_handler),
|
||||||
)
|
)
|
||||||
.post(
|
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys",
|
|
||||||
|r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys),
|
|
||||||
)
|
|
||||||
.post(
|
.post(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|
||||||
|r| api_handler(r, timeline_gc_blocking_handler),
|
|r| api_handler(r, timeline_gc_blocking_handler),
|
||||||
|
|||||||
@@ -3040,111 +3040,13 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub mod tokio_epoll_uring {
|
pub mod tokio_epoll_uring {
|
||||||
use std::{
|
use metrics::{register_int_counter, UIntGauge};
|
||||||
collections::HashMap,
|
|
||||||
sync::{Arc, Mutex},
|
|
||||||
};
|
|
||||||
|
|
||||||
use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
|
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
/// Shared storage for tokio-epoll-uring thread local metrics.
|
|
||||||
pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
|
|
||||||
Lazy::new(|| {
|
|
||||||
let slots_submission_queue_depth = register_histogram!(
|
|
||||||
"pageserver_tokio_epoll_uring_slots_submission_queue_depth",
|
|
||||||
"The slots waiters queue depth of each tokio_epoll_uring system",
|
|
||||||
vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric");
|
|
||||||
ThreadLocalMetricsStorage {
|
|
||||||
observers: Mutex::new(HashMap::new()),
|
|
||||||
slots_submission_queue_depth,
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
pub struct ThreadLocalMetricsStorage {
|
|
||||||
/// List of thread local metrics observers.
|
|
||||||
observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
|
|
||||||
/// A histogram shared between all thread local systems
|
|
||||||
/// for collecting slots submission queue depth.
|
|
||||||
slots_submission_queue_depth: Histogram,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
|
|
||||||
/// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
|
|
||||||
///
|
|
||||||
/// The System makes observations into [`Self`] and periodically, the collector
|
|
||||||
/// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
|
|
||||||
///
|
|
||||||
/// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
|
|
||||||
/// But except for the periodic flush, the lock is uncontended so there's no waiting
|
|
||||||
/// for cache coherence protocol to get an exclusive cache line.
|
|
||||||
pub struct ThreadLocalMetrics {
|
|
||||||
/// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
|
|
||||||
slots_submission_queue_depth: Mutex<LocalHistogram>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ThreadLocalMetricsStorage {
|
|
||||||
/// Registers a new thread local system. Returns a thread local metrics observer.
|
|
||||||
pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
|
|
||||||
let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
|
|
||||||
self.slots_submission_queue_depth.local(),
|
|
||||||
));
|
|
||||||
let mut g = self.observers.lock().unwrap();
|
|
||||||
g.insert(id, Arc::clone(&per_system_metrics));
|
|
||||||
per_system_metrics
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Removes metrics observer for a thread local system.
|
|
||||||
/// This should be called before dropping a thread local system.
|
|
||||||
pub fn remove_system(&self, id: u64) {
|
|
||||||
let mut g = self.observers.lock().unwrap();
|
|
||||||
g.remove(&id);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Flush all thread local metrics to the shared storage.
|
|
||||||
pub fn flush_thread_local_metrics(&self) {
|
|
||||||
let g = self.observers.lock().unwrap();
|
|
||||||
g.values().for_each(|local| {
|
|
||||||
local.flush();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ThreadLocalMetrics {
|
|
||||||
pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
|
|
||||||
ThreadLocalMetrics {
|
|
||||||
slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Flushes the thread local metrics to shared aggregator.
|
|
||||||
pub fn flush(&self) {
|
|
||||||
let Self {
|
|
||||||
slots_submission_queue_depth,
|
|
||||||
} = self;
|
|
||||||
slots_submission_queue_depth.lock().unwrap().flush();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
|
|
||||||
fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
|
|
||||||
let Self {
|
|
||||||
slots_submission_queue_depth,
|
|
||||||
} = self;
|
|
||||||
slots_submission_queue_depth
|
|
||||||
.lock()
|
|
||||||
.unwrap()
|
|
||||||
.observe(queue_depth as f64);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Collector {
|
pub struct Collector {
|
||||||
descs: Vec<metrics::core::Desc>,
|
descs: Vec<metrics::core::Desc>,
|
||||||
systems_created: UIntGauge,
|
systems_created: UIntGauge,
|
||||||
systems_destroyed: UIntGauge,
|
systems_destroyed: UIntGauge,
|
||||||
thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl metrics::core::Collector for Collector {
|
impl metrics::core::Collector for Collector {
|
||||||
@@ -3154,7 +3056,7 @@ pub mod tokio_epoll_uring {
|
|||||||
|
|
||||||
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||||
let mut mfs = Vec::with_capacity(Self::NMETRICS);
|
let mut mfs = Vec::with_capacity(Self::NMETRICS);
|
||||||
let tokio_epoll_uring::metrics::GlobalMetrics {
|
let tokio_epoll_uring::metrics::Metrics {
|
||||||
systems_created,
|
systems_created,
|
||||||
systems_destroyed,
|
systems_destroyed,
|
||||||
} = tokio_epoll_uring::metrics::global();
|
} = tokio_epoll_uring::metrics::global();
|
||||||
@@ -3162,21 +3064,12 @@ pub mod tokio_epoll_uring {
|
|||||||
mfs.extend(self.systems_created.collect());
|
mfs.extend(self.systems_created.collect());
|
||||||
self.systems_destroyed.set(systems_destroyed);
|
self.systems_destroyed.set(systems_destroyed);
|
||||||
mfs.extend(self.systems_destroyed.collect());
|
mfs.extend(self.systems_destroyed.collect());
|
||||||
|
|
||||||
self.thread_local_metrics_storage
|
|
||||||
.flush_thread_local_metrics();
|
|
||||||
|
|
||||||
mfs.extend(
|
|
||||||
self.thread_local_metrics_storage
|
|
||||||
.slots_submission_queue_depth
|
|
||||||
.collect(),
|
|
||||||
);
|
|
||||||
mfs
|
mfs
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Collector {
|
impl Collector {
|
||||||
const NMETRICS: usize = 3;
|
const NMETRICS: usize = 2;
|
||||||
|
|
||||||
#[allow(clippy::new_without_default)]
|
#[allow(clippy::new_without_default)]
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
@@ -3208,7 +3101,6 @@ pub mod tokio_epoll_uring {
|
|||||||
descs,
|
descs,
|
||||||
systems_created,
|
systems_created,
|
||||||
systems_destroyed,
|
systems_destroyed,
|
||||||
thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3568,7 +3460,6 @@ pub fn preinitialize_metrics() {
|
|||||||
Lazy::force(&RECONSTRUCT_TIME);
|
Lazy::force(&RECONSTRUCT_TIME);
|
||||||
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
||||||
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
|
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
|
||||||
Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);
|
|
||||||
|
|
||||||
tenant_throttling::preinitialize_global_metrics();
|
tenant_throttling::preinitialize_global_metrics();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1506,42 +1506,35 @@ impl<'a> DatadirModification<'a> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Drop some relations
|
/// Drop a relation.
|
||||||
pub(crate) async fn put_rel_drops(
|
pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||||
&mut self,
|
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
|
||||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
|
||||||
let buf = self.get(dir_key, ctx).await?;
|
|
||||||
let mut dir = RelDirectory::des(&buf)?;
|
|
||||||
|
|
||||||
let mut dirty = false;
|
// Remove it from the directory entry
|
||||||
for rel_tag in rel_tags {
|
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||||
if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
|
let buf = self.get(dir_key, ctx).await?;
|
||||||
dirty = true;
|
let mut dir = RelDirectory::des(&buf)?;
|
||||||
|
|
||||||
// update logical size
|
self.pending_directory_entries
|
||||||
let size_key = rel_size_to_key(rel_tag);
|
.push((DirectoryKind::Rel, dir.rels.len()));
|
||||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
|
||||||
self.pending_nblocks -= old_size as i64;
|
|
||||||
|
|
||||||
// Remove entry from relation size cache
|
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||||
self.tline.remove_cached_rel_size(&rel_tag);
|
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
||||||
|
} else {
|
||||||
// Delete size entry, as well as all blocks
|
warn!("dropped rel {} did not exist in rel directory", rel);
|
||||||
self.delete(rel_key_range(rel_tag));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if dirty {
|
|
||||||
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
|
||||||
self.pending_directory_entries
|
|
||||||
.push((DirectoryKind::Rel, dir.rels.len()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update logical size
|
||||||
|
let size_key = rel_size_to_key(rel);
|
||||||
|
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||||
|
self.pending_nblocks -= old_size as i64;
|
||||||
|
|
||||||
|
// Remove enty from relation size cache
|
||||||
|
self.tline.remove_cached_rel_size(&rel);
|
||||||
|
|
||||||
|
// Delete size entry, as well as all blocks
|
||||||
|
self.delete(rel_key_range(rel));
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -294,11 +294,11 @@ pub struct Tenant {
|
|||||||
|
|
||||||
/// During timeline creation, we first insert the TimelineId to the
|
/// During timeline creation, we first insert the TimelineId to the
|
||||||
/// creating map, then `timelines`, then remove it from the creating map.
|
/// creating map, then `timelines`, then remove it from the creating map.
|
||||||
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
|
/// **Lock order**: if acquiring both, acquire`timelines` before `timelines_creating`
|
||||||
timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
|
timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
|
||||||
|
|
||||||
/// Possibly offloaded and archived timelines
|
/// Possibly offloaded and archived timelines
|
||||||
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
|
/// **Lock order**: if acquiring both, acquire`timelines` before `timelines_offloaded`
|
||||||
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
|
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
|
||||||
|
|
||||||
// This mutex prevents creation of new timelines during GC.
|
// This mutex prevents creation of new timelines during GC.
|
||||||
@@ -584,40 +584,30 @@ impl OffloadedTimeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for OffloadedTimeline {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
write!(f, "OffloadedTimeline<{}>", self.timeline_id)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
|
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
|
||||||
pub enum MaybeOffloaded {
|
pub enum MaybeOffloaded {
|
||||||
Yes,
|
Yes,
|
||||||
No,
|
No,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone)]
|
||||||
pub enum TimelineOrOffloaded {
|
pub enum TimelineOrOffloaded {
|
||||||
Timeline(Arc<Timeline>),
|
Timeline(Arc<Timeline>),
|
||||||
Offloaded(Arc<OffloadedTimeline>),
|
Offloaded(Arc<OffloadedTimeline>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TimelineOrOffloaded {
|
impl TimelineOrOffloaded {
|
||||||
pub fn arc_ref(&self) -> TimelineOrOffloadedArcRef<'_> {
|
pub fn tenant_shard_id(&self) -> TenantShardId {
|
||||||
match self {
|
match self {
|
||||||
TimelineOrOffloaded::Timeline(timeline) => {
|
TimelineOrOffloaded::Timeline(timeline) => timeline.tenant_shard_id,
|
||||||
TimelineOrOffloadedArcRef::Timeline(timeline)
|
TimelineOrOffloaded::Offloaded(offloaded) => offloaded.tenant_shard_id,
|
||||||
}
|
|
||||||
TimelineOrOffloaded::Offloaded(offloaded) => {
|
|
||||||
TimelineOrOffloadedArcRef::Offloaded(offloaded)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn tenant_shard_id(&self) -> TenantShardId {
|
|
||||||
self.arc_ref().tenant_shard_id()
|
|
||||||
}
|
|
||||||
pub fn timeline_id(&self) -> TimelineId {
|
pub fn timeline_id(&self) -> TimelineId {
|
||||||
self.arc_ref().timeline_id()
|
match self {
|
||||||
|
TimelineOrOffloaded::Timeline(timeline) => timeline.timeline_id,
|
||||||
|
TimelineOrOffloaded::Offloaded(offloaded) => offloaded.timeline_id,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
pub fn delete_progress(&self) -> &Arc<tokio::sync::Mutex<DeleteTimelineFlow>> {
|
pub fn delete_progress(&self) -> &Arc<tokio::sync::Mutex<DeleteTimelineFlow>> {
|
||||||
match self {
|
match self {
|
||||||
@@ -625,7 +615,7 @@ impl TimelineOrOffloaded {
|
|||||||
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
|
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
|
pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
|
||||||
match self {
|
match self {
|
||||||
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
|
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
|
||||||
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
|
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
|
||||||
@@ -642,38 +632,6 @@ impl TimelineOrOffloaded {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum TimelineOrOffloadedArcRef<'a> {
|
|
||||||
Timeline(&'a Arc<Timeline>),
|
|
||||||
Offloaded(&'a Arc<OffloadedTimeline>),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TimelineOrOffloadedArcRef<'_> {
|
|
||||||
pub fn tenant_shard_id(&self) -> TenantShardId {
|
|
||||||
match self {
|
|
||||||
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id,
|
|
||||||
TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn timeline_id(&self) -> TimelineId {
|
|
||||||
match self {
|
|
||||||
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id,
|
|
||||||
TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<&'a Arc<Timeline>> for TimelineOrOffloadedArcRef<'a> {
|
|
||||||
fn from(timeline: &'a Arc<Timeline>) -> Self {
|
|
||||||
Self::Timeline(timeline)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<&'a Arc<OffloadedTimeline>> for TimelineOrOffloadedArcRef<'a> {
|
|
||||||
fn from(timeline: &'a Arc<OffloadedTimeline>) -> Self {
|
|
||||||
Self::Offloaded(timeline)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||||
pub enum GetTimelineError {
|
pub enum GetTimelineError {
|
||||||
#[error("Timeline is shutting down")]
|
#[error("Timeline is shutting down")]
|
||||||
@@ -779,99 +737,6 @@ impl Debug for SetStoppingError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Arguments to [`Tenant::create_timeline`].
|
|
||||||
///
|
|
||||||
/// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
|
|
||||||
/// is `None`, the result of the timeline create call is not deterministic.
|
|
||||||
///
|
|
||||||
/// See [`CreateTimelineIdempotency`] for an idempotency key.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) enum CreateTimelineParams {
|
|
||||||
Bootstrap(CreateTimelineParamsBootstrap),
|
|
||||||
Branch(CreateTimelineParamsBranch),
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) struct CreateTimelineParamsBootstrap {
|
|
||||||
pub(crate) new_timeline_id: TimelineId,
|
|
||||||
pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
|
|
||||||
pub(crate) pg_version: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) struct CreateTimelineParamsBranch {
|
|
||||||
pub(crate) new_timeline_id: TimelineId,
|
|
||||||
pub(crate) ancestor_timeline_id: TimelineId,
|
|
||||||
pub(crate) ancestor_start_lsn: Option<Lsn>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
|
|
||||||
///
|
|
||||||
/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
|
|
||||||
///
|
|
||||||
/// We lower timeline creation requests to [`Self`], and then use [`PartialEq::eq`] to compare [`Timeline::create_idempotency`] with the request.
|
|
||||||
/// If they are equal, we return a reference to the existing timeline, otherwise it's an idempotency conflict.
|
|
||||||
///
|
|
||||||
/// There is special treatment for [`Self::FailWithConflict`] to always return an idempotency conflict.
|
|
||||||
/// It would be nice to have more advanced derive macros to make that special treatment declarative.
|
|
||||||
///
|
|
||||||
/// Notes:
|
|
||||||
/// - Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN.
|
|
||||||
/// - We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`]
|
|
||||||
/// is not considered for idempotency. We can improve on this over time if we deem it necessary.
|
|
||||||
///
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
||||||
pub(crate) enum CreateTimelineIdempotency {
|
|
||||||
/// NB: special treatment, see comment in [`Self`].
|
|
||||||
FailWithConflict,
|
|
||||||
Bootstrap {
|
|
||||||
pg_version: u32,
|
|
||||||
},
|
|
||||||
/// NB: branches always have the same `pg_version` as their ancestor.
|
|
||||||
/// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
|
|
||||||
/// exists as a field, and is set by cplane, it has always been ignored by pageserver when
|
|
||||||
/// determining the child branch pg_version.
|
|
||||||
Branch {
|
|
||||||
ancestor_timeline_id: TimelineId,
|
|
||||||
ancestor_start_lsn: Lsn,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
/// What is returned by [`Tenant::start_creating_timeline`].
|
|
||||||
#[must_use]
|
|
||||||
enum StartCreatingTimelineResult<'t> {
|
|
||||||
CreateGuard(TimelineCreateGuard<'t>),
|
|
||||||
Idempotent(Arc<Timeline>),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// What is returned by [`Tenant::create_timeline`].
|
|
||||||
enum CreateTimelineResult {
|
|
||||||
Created(Arc<Timeline>),
|
|
||||||
Idempotent(Arc<Timeline>),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CreateTimelineResult {
|
|
||||||
fn discriminant(&self) -> &'static str {
|
|
||||||
match self {
|
|
||||||
Self::Created(_) => "Created",
|
|
||||||
Self::Idempotent(_) => "Idempotent",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn timeline(&self) -> &Arc<Timeline> {
|
|
||||||
match self {
|
|
||||||
Self::Created(t) | Self::Idempotent(t) => t,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Unit test timelines aren't activated, test has to do it if it needs to.
|
|
||||||
#[cfg(test)]
|
|
||||||
fn into_timeline_for_test(self) -> Arc<Timeline> {
|
|
||||||
match self {
|
|
||||||
Self::Created(t) | Self::Idempotent(t) => t,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
#[derive(thiserror::Error, Debug)]
|
||||||
pub enum CreateTimelineError {
|
pub enum CreateTimelineError {
|
||||||
#[error("creation of timeline with the given ID is in progress")]
|
#[error("creation of timeline with the given ID is in progress")]
|
||||||
@@ -1011,24 +876,12 @@ impl Tenant {
|
|||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let tenant_id = self.tenant_shard_id;
|
let tenant_id = self.tenant_shard_id;
|
||||||
|
|
||||||
let idempotency = if metadata.ancestor_timeline().is_none() {
|
|
||||||
CreateTimelineIdempotency::Bootstrap {
|
|
||||||
pg_version: metadata.pg_version(),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
CreateTimelineIdempotency::Branch {
|
|
||||||
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
|
|
||||||
ancestor_start_lsn: metadata.ancestor_lsn(),
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let timeline = self.create_timeline_struct(
|
let timeline = self.create_timeline_struct(
|
||||||
timeline_id,
|
timeline_id,
|
||||||
&metadata,
|
&metadata,
|
||||||
ancestor.clone(),
|
ancestor.clone(),
|
||||||
resources,
|
resources,
|
||||||
CreateTimelineCause::Load,
|
CreateTimelineCause::Load,
|
||||||
idempotency.clone(),
|
|
||||||
)?;
|
)?;
|
||||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
@@ -1821,8 +1674,6 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline
|
/// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline
|
||||||
///
|
|
||||||
/// Counterpart to [`offload_timeline`].
|
|
||||||
async fn unoffload_timeline(
|
async fn unoffload_timeline(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
@@ -1831,24 +1682,6 @@ impl Tenant {
|
|||||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||||
info!("unoffloading timeline");
|
info!("unoffloading timeline");
|
||||||
let cancel = self.cancel.clone();
|
let cancel = self.cancel.clone();
|
||||||
|
|
||||||
// Protect against concurrent attempts to use this TimelineId
|
|
||||||
// We don't care much about idempotency, as it's ensured a layer above.
|
|
||||||
let allow_offloaded = true;
|
|
||||||
let _create_guard = self
|
|
||||||
.create_timeline_create_guard(
|
|
||||||
timeline_id,
|
|
||||||
CreateTimelineIdempotency::FailWithConflict,
|
|
||||||
allow_offloaded,
|
|
||||||
)
|
|
||||||
.map_err(|err| match err {
|
|
||||||
TimelineExclusionError::AlreadyCreating => TimelineArchivalError::AlreadyInProgress,
|
|
||||||
TimelineExclusionError::AlreadyExists { .. } => {
|
|
||||||
TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
|
|
||||||
}
|
|
||||||
TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let timeline_preload = self
|
let timeline_preload = self
|
||||||
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
|
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
|
||||||
.await;
|
.await;
|
||||||
@@ -2115,17 +1948,16 @@ impl Tenant {
|
|||||||
self.timelines.lock().unwrap().keys().cloned().collect()
|
self.timelines.lock().unwrap().keys().cloned().collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is used by tests & import-from-basebackup.
|
/// This is used to create the initial 'main' timeline during bootstrapping,
|
||||||
|
/// or when importing a new base backup. The caller is expected to load an
|
||||||
|
/// initial image of the datadir to the new timeline after this.
|
||||||
///
|
///
|
||||||
/// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
|
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
|
||||||
/// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
|
/// and the timeline will fail to load at a restart.
|
||||||
///
|
///
|
||||||
/// The caller is responsible for getting the timeline into a state that will be accepted
|
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||||
/// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
|
/// minimum amount of keys required to get a writable timeline.
|
||||||
/// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
|
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||||
/// to the [`Tenant::timelines`].
|
|
||||||
///
|
|
||||||
/// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
|
|
||||||
pub(crate) async fn create_empty_timeline(
|
pub(crate) async fn create_empty_timeline(
|
||||||
&self,
|
&self,
|
||||||
new_timeline_id: TimelineId,
|
new_timeline_id: TimelineId,
|
||||||
@@ -2139,15 +1971,7 @@ impl Tenant {
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Protect against concurrent attempts to use this TimelineId
|
// Protect against concurrent attempts to use this TimelineId
|
||||||
let create_guard = match self
|
let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
|
||||||
.start_creating_timeline(new_timeline_id, CreateTimelineIdempotency::FailWithConflict)
|
|
||||||
.await?
|
|
||||||
{
|
|
||||||
StartCreatingTimelineResult::CreateGuard(guard) => guard,
|
|
||||||
StartCreatingTimelineResult::Idempotent(_) => {
|
|
||||||
unreachable!("FailWithConflict implies we get an error instead")
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let new_metadata = TimelineMetadata::new(
|
let new_metadata = TimelineMetadata::new(
|
||||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||||
@@ -2266,7 +2090,11 @@ impl Tenant {
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) async fn create_timeline(
|
pub(crate) async fn create_timeline(
|
||||||
self: &Arc<Tenant>,
|
self: &Arc<Tenant>,
|
||||||
params: CreateTimelineParams,
|
new_timeline_id: TimelineId,
|
||||||
|
ancestor_timeline_id: Option<TimelineId>,
|
||||||
|
mut ancestor_start_lsn: Option<Lsn>,
|
||||||
|
pg_version: u32,
|
||||||
|
load_existing_initdb: Option<TimelineId>,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
@@ -2285,25 +2113,54 @@ impl Tenant {
|
|||||||
.enter()
|
.enter()
|
||||||
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||||
|
|
||||||
let result: CreateTimelineResult = match params {
|
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
|
||||||
CreateTimelineParams::Bootstrap(CreateTimelineParamsBootstrap {
|
// and that no other creation attempts will be allowed in while we are working.
|
||||||
new_timeline_id,
|
let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
|
||||||
existing_initdb_timeline_id,
|
Ok(m) => m,
|
||||||
pg_version,
|
Err(TimelineExclusionError::AlreadyCreating) => {
|
||||||
}) => {
|
// Creation is in progress, we cannot create it again, and we cannot
|
||||||
self.bootstrap_timeline(
|
// check if this request matches the existing one, so caller must try
|
||||||
new_timeline_id,
|
// again later.
|
||||||
pg_version,
|
return Err(CreateTimelineError::AlreadyCreating);
|
||||||
existing_initdb_timeline_id,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?
|
|
||||||
}
|
}
|
||||||
CreateTimelineParams::Branch(CreateTimelineParamsBranch {
|
Err(TimelineExclusionError::Other(e)) => {
|
||||||
new_timeline_id,
|
return Err(CreateTimelineError::Other(e));
|
||||||
ancestor_timeline_id,
|
}
|
||||||
mut ancestor_start_lsn,
|
Err(TimelineExclusionError::AlreadyExists(existing)) => {
|
||||||
}) => {
|
debug!("timeline {new_timeline_id} already exists");
|
||||||
|
|
||||||
|
// Idempotency: creating the same timeline twice is not an error, unless
|
||||||
|
// the second creation has different parameters.
|
||||||
|
if existing.get_ancestor_timeline_id() != ancestor_timeline_id
|
||||||
|
|| existing.pg_version != pg_version
|
||||||
|
|| (ancestor_start_lsn.is_some()
|
||||||
|
&& ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
|
||||||
|
{
|
||||||
|
return Err(CreateTimelineError::Conflict);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for uploads to complete, so that when we return Ok, the timeline
|
||||||
|
// is known to be durable on remote storage. Just like we do at the end of
|
||||||
|
// this function, after we have created the timeline ourselves.
|
||||||
|
//
|
||||||
|
// We only really care that the initial version of `index_part.json` has
|
||||||
|
// been uploaded. That's enough to remember that the timeline
|
||||||
|
// exists. However, there is no function to wait specifically for that so
|
||||||
|
// we just wait for all in-progress uploads to finish.
|
||||||
|
existing
|
||||||
|
.remote_client
|
||||||
|
.wait_completion()
|
||||||
|
.await
|
||||||
|
.context("wait for timeline uploads to complete")?;
|
||||||
|
|
||||||
|
return Ok(existing);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
pausable_failpoint!("timeline-creation-after-uninit");
|
||||||
|
|
||||||
|
let loaded_timeline = match ancestor_timeline_id {
|
||||||
|
Some(ancestor_timeline_id) => {
|
||||||
let ancestor_timeline = self
|
let ancestor_timeline = self
|
||||||
.get_timeline(ancestor_timeline_id, false)
|
.get_timeline(ancestor_timeline_id, false)
|
||||||
.context("Cannot branch off the timeline that's not present in pageserver")?;
|
.context("Cannot branch off the timeline that's not present in pageserver")?;
|
||||||
@@ -2350,48 +2207,43 @@ impl Tenant {
|
|||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
|
self.branch_timeline(
|
||||||
.await?
|
&ancestor_timeline,
|
||||||
|
new_timeline_id,
|
||||||
|
ancestor_start_lsn,
|
||||||
|
create_guard,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
self.bootstrap_timeline(
|
||||||
|
new_timeline_id,
|
||||||
|
pg_version,
|
||||||
|
load_existing_initdb,
|
||||||
|
create_guard,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// At this point we have dropped our guard on [`Self::timelines_creating`], and
|
// At this point we have dropped our guard on [`Self::timelines_creating`], and
|
||||||
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
|
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
|
||||||
// not send a success to the caller until it is. The same applies to idempotent retries.
|
// not send a success to the caller until it is. The same applies to handling retries,
|
||||||
//
|
// see the handling of [`TimelineExclusionError::AlreadyExists`] above.
|
||||||
// TODO: the timeline is already visible in [`Self::timelines`]; a caller could incorrectly
|
let kind = ancestor_timeline_id
|
||||||
// assume that, because they can see the timeline via API, that the creation is done and
|
.map(|_| "branched")
|
||||||
// that it is durable. Ideally, we would keep the timeline hidden (in [`Self::timelines_creating`])
|
.unwrap_or("bootstrapped");
|
||||||
// until it is durable, e.g., by extending the time we hold the creation guard. This also
|
loaded_timeline
|
||||||
// interacts with UninitializedTimeline and is generally a bit tricky.
|
|
||||||
//
|
|
||||||
// To re-emphasize: the only correct way to create a timeline is to repeat calling the
|
|
||||||
// creation API until it returns success. Only then is durability guaranteed.
|
|
||||||
info!(creation_result=%result.discriminant(), "waiting for timeline to be durable");
|
|
||||||
result
|
|
||||||
.timeline()
|
|
||||||
.remote_client
|
.remote_client
|
||||||
.wait_completion()
|
.wait_completion()
|
||||||
.await
|
.await
|
||||||
.context("wait for timeline initial uploads to complete")?;
|
.with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
|
||||||
|
|
||||||
// The creating task is responsible for activating the timeline.
|
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
|
||||||
// We do this after `wait_completion()` so that we don't spin up tasks that start
|
|
||||||
// doing stuff before the IndexPart is durable in S3, which is done by the previous section.
|
|
||||||
let activated_timeline = match result {
|
|
||||||
CreateTimelineResult::Created(timeline) => {
|
|
||||||
timeline.activate(self.clone(), broker_client, None, ctx);
|
|
||||||
timeline
|
|
||||||
}
|
|
||||||
CreateTimelineResult::Idempotent(timeline) => {
|
|
||||||
info!(
|
|
||||||
"request was deemed idempotent, activation will be done by the creating task"
|
|
||||||
);
|
|
||||||
timeline
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(activated_timeline)
|
Ok(loaded_timeline)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn delete_timeline(
|
pub(crate) async fn delete_timeline(
|
||||||
@@ -3048,58 +2900,33 @@ impl Tenant {
|
|||||||
&self,
|
&self,
|
||||||
child_shards: &Vec<TenantShardId>,
|
child_shards: &Vec<TenantShardId>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let (timelines, offloaded) = {
|
let timelines = self.timelines.lock().unwrap().clone();
|
||||||
let timelines = self.timelines.lock().unwrap();
|
for timeline in timelines.values() {
|
||||||
let offloaded = self.timelines_offloaded.lock().unwrap();
|
|
||||||
(timelines.clone(), offloaded.clone())
|
|
||||||
};
|
|
||||||
let timelines_iter = timelines
|
|
||||||
.values()
|
|
||||||
.map(TimelineOrOffloadedArcRef::<'_>::from)
|
|
||||||
.chain(
|
|
||||||
offloaded
|
|
||||||
.values()
|
|
||||||
.map(TimelineOrOffloadedArcRef::<'_>::from),
|
|
||||||
);
|
|
||||||
for timeline in timelines_iter {
|
|
||||||
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
|
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
|
||||||
// to ensure that they do not start a split if currently in the process of doing these.
|
// to ensure that they do not start a split if currently in the process of doing these.
|
||||||
|
|
||||||
let timeline_id = timeline.timeline_id();
|
// Upload an index from the parent: this is partly to provide freshness for the
|
||||||
|
// child tenants that will copy it, and partly for general ease-of-debugging: there will
|
||||||
if let TimelineOrOffloadedArcRef::Timeline(timeline) = timeline {
|
// always be a parent shard index in the same generation as we wrote the child shard index.
|
||||||
// Upload an index from the parent: this is partly to provide freshness for the
|
tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index");
|
||||||
// child tenants that will copy it, and partly for general ease-of-debugging: there will
|
timeline
|
||||||
// always be a parent shard index in the same generation as we wrote the child shard index.
|
.remote_client
|
||||||
tracing::info!(%timeline_id, "Uploading index");
|
.schedule_index_upload_for_file_changes()?;
|
||||||
timeline
|
timeline.remote_client.wait_completion().await?;
|
||||||
.remote_client
|
|
||||||
.schedule_index_upload_for_file_changes()?;
|
|
||||||
timeline.remote_client.wait_completion().await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let remote_client = match timeline {
|
|
||||||
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.remote_client.clone(),
|
|
||||||
TimelineOrOffloadedArcRef::Offloaded(offloaded) => {
|
|
||||||
let remote_client = self
|
|
||||||
.build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
|
|
||||||
Arc::new(remote_client)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Shut down the timeline's remote client: this means that the indices we write
|
// Shut down the timeline's remote client: this means that the indices we write
|
||||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||||
tracing::info!(%timeline_id, "Shutting down remote storage client");
|
tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
|
||||||
remote_client.shutdown().await;
|
timeline.remote_client.shutdown().await;
|
||||||
|
|
||||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||||
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||||
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
|
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
|
||||||
// we use here really is the remotely persistent one).
|
// we use here really is the remotely persistent one).
|
||||||
tracing::info!(%timeline_id, "Downloading index_part from parent");
|
tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
|
||||||
let result = remote_client
|
let result = timeline.remote_client
|
||||||
.download_index_file(&self.cancel)
|
.download_index_file(&self.cancel)
|
||||||
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))
|
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
let index_part = match result {
|
let index_part = match result {
|
||||||
MaybeDeletedIndexPart::Deleted(_) => {
|
MaybeDeletedIndexPart::Deleted(_) => {
|
||||||
@@ -3109,11 +2936,11 @@ impl Tenant {
|
|||||||
};
|
};
|
||||||
|
|
||||||
for child_shard in child_shards {
|
for child_shard in child_shards {
|
||||||
tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index());
|
tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
|
||||||
upload_index_part(
|
upload_index_part(
|
||||||
&self.remote_storage,
|
&self.remote_storage,
|
||||||
child_shard,
|
child_shard,
|
||||||
&timeline_id,
|
&timeline.timeline_id,
|
||||||
self.generation,
|
self.generation,
|
||||||
&index_part,
|
&index_part,
|
||||||
&self.cancel,
|
&self.cancel,
|
||||||
@@ -3122,6 +2949,8 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: also copy index files of offloaded timelines
|
||||||
|
|
||||||
let tenant_manifest = self.tenant_manifest();
|
let tenant_manifest = self.tenant_manifest();
|
||||||
// TODO: generation support
|
// TODO: generation support
|
||||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||||
@@ -3404,7 +3233,6 @@ impl Tenant {
|
|||||||
ancestor: Option<Arc<Timeline>>,
|
ancestor: Option<Arc<Timeline>>,
|
||||||
resources: TimelineResources,
|
resources: TimelineResources,
|
||||||
cause: CreateTimelineCause,
|
cause: CreateTimelineCause,
|
||||||
create_idempotency: CreateTimelineIdempotency,
|
|
||||||
) -> anyhow::Result<Arc<Timeline>> {
|
) -> anyhow::Result<Arc<Timeline>> {
|
||||||
let state = match cause {
|
let state = match cause {
|
||||||
CreateTimelineCause::Load => {
|
CreateTimelineCause::Load => {
|
||||||
@@ -3434,7 +3262,6 @@ impl Tenant {
|
|||||||
pg_version,
|
pg_version,
|
||||||
state,
|
state,
|
||||||
self.attach_wal_lag_cooldown.clone(),
|
self.attach_wal_lag_cooldown.clone(),
|
||||||
create_idempotency,
|
|
||||||
self.cancel.child_token(),
|
self.cancel.child_token(),
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -3920,16 +3747,16 @@ impl Tenant {
|
|||||||
/// timeline background tasks are launched, except the flush loop.
|
/// timeline background tasks are launched, except the flush loop.
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
async fn branch_timeline_test(
|
async fn branch_timeline_test(
|
||||||
self: &Arc<Self>,
|
&self,
|
||||||
src_timeline: &Arc<Timeline>,
|
src_timeline: &Arc<Timeline>,
|
||||||
dst_id: TimelineId,
|
dst_id: TimelineId,
|
||||||
ancestor_lsn: Option<Lsn>,
|
ancestor_lsn: Option<Lsn>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
|
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
|
||||||
let tl = self
|
let tl = self
|
||||||
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, ctx)
|
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
|
||||||
.await?
|
.await?;
|
||||||
.into_timeline_for_test();
|
|
||||||
tl.set_state(TimelineState::Active);
|
tl.set_state(TimelineState::Active);
|
||||||
Ok(tl)
|
Ok(tl)
|
||||||
}
|
}
|
||||||
@@ -3938,7 +3765,7 @@ impl Tenant {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub async fn branch_timeline_test_with_layers(
|
pub async fn branch_timeline_test_with_layers(
|
||||||
self: &Arc<Self>,
|
&self,
|
||||||
src_timeline: &Arc<Timeline>,
|
src_timeline: &Arc<Timeline>,
|
||||||
dst_id: TimelineId,
|
dst_id: TimelineId,
|
||||||
ancestor_lsn: Option<Lsn>,
|
ancestor_lsn: Option<Lsn>,
|
||||||
@@ -3986,24 +3813,28 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Branch an existing timeline.
|
/// Branch an existing timeline.
|
||||||
|
///
|
||||||
|
/// The caller is responsible for activating the returned timeline.
|
||||||
async fn branch_timeline(
|
async fn branch_timeline(
|
||||||
self: &Arc<Self>,
|
&self,
|
||||||
src_timeline: &Arc<Timeline>,
|
src_timeline: &Arc<Timeline>,
|
||||||
dst_id: TimelineId,
|
dst_id: TimelineId,
|
||||||
start_lsn: Option<Lsn>,
|
start_lsn: Option<Lsn>,
|
||||||
|
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<CreateTimelineResult, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
|
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn branch_timeline_impl(
|
async fn branch_timeline_impl(
|
||||||
self: &Arc<Self>,
|
&self,
|
||||||
src_timeline: &Arc<Timeline>,
|
src_timeline: &Arc<Timeline>,
|
||||||
dst_id: TimelineId,
|
dst_id: TimelineId,
|
||||||
start_lsn: Option<Lsn>,
|
start_lsn: Option<Lsn>,
|
||||||
|
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||||
_ctx: &RequestContext,
|
_ctx: &RequestContext,
|
||||||
) -> Result<CreateTimelineResult, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
let src_id = src_timeline.timeline_id;
|
let src_id = src_timeline.timeline_id;
|
||||||
|
|
||||||
// We will validate our ancestor LSN in this function. Acquire the GC lock so that
|
// We will validate our ancestor LSN in this function. Acquire the GC lock so that
|
||||||
@@ -4018,23 +3849,6 @@ impl Tenant {
|
|||||||
lsn
|
lsn
|
||||||
});
|
});
|
||||||
|
|
||||||
// we finally have determined the ancestor_start_lsn, so we can get claim exclusivity now
|
|
||||||
let timeline_create_guard = match self
|
|
||||||
.start_creating_timeline(
|
|
||||||
dst_id,
|
|
||||||
CreateTimelineIdempotency::Branch {
|
|
||||||
ancestor_timeline_id: src_timeline.timeline_id,
|
|
||||||
ancestor_start_lsn: start_lsn,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await?
|
|
||||||
{
|
|
||||||
StartCreatingTimelineResult::CreateGuard(guard) => guard,
|
|
||||||
StartCreatingTimelineResult::Idempotent(timeline) => {
|
|
||||||
return Ok(CreateTimelineResult::Idempotent(timeline));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
|
// Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
|
||||||
// horizon on the source timeline
|
// horizon on the source timeline
|
||||||
//
|
//
|
||||||
@@ -4120,92 +3934,28 @@ impl Tenant {
|
|||||||
.schedule_index_upload_for_full_metadata_update(&metadata)
|
.schedule_index_upload_for_full_metadata_update(&metadata)
|
||||||
.context("branch initial metadata upload")?;
|
.context("branch initial metadata upload")?;
|
||||||
|
|
||||||
// Callers are responsible to wait for uploads to complete and for activating the timeline.
|
Ok(new_timeline)
|
||||||
|
|
||||||
Ok(CreateTimelineResult::Created(new_timeline))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// For unit tests, make this visible so that other modules can directly create timelines
|
/// For unit tests, make this visible so that other modules can directly create timelines
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||||
pub(crate) async fn bootstrap_timeline_test(
|
pub(crate) async fn bootstrap_timeline_test(
|
||||||
self: &Arc<Self>,
|
&self,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
load_existing_initdb: Option<TimelineId>,
|
load_existing_initdb: Option<TimelineId>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Arc<Timeline>> {
|
) -> anyhow::Result<Arc<Timeline>> {
|
||||||
self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, ctx)
|
let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
|
||||||
.await
|
self.bootstrap_timeline(
|
||||||
.map_err(anyhow::Error::new)
|
timeline_id,
|
||||||
.map(|r| r.into_timeline_for_test())
|
pg_version,
|
||||||
}
|
load_existing_initdb,
|
||||||
|
create_guard,
|
||||||
/// Get exclusive access to the timeline ID for creation.
|
ctx,
|
||||||
///
|
)
|
||||||
/// Timeline-creating code paths must use this function before making changes
|
.await
|
||||||
/// to in-memory or persistent state.
|
|
||||||
///
|
|
||||||
/// The `state` parameter is a description of the timeline creation operation
|
|
||||||
/// we intend to perform.
|
|
||||||
/// If the timeline was already created in the meantime, we check whether this
|
|
||||||
/// request conflicts or is idempotent , based on `state`.
|
|
||||||
async fn start_creating_timeline(
|
|
||||||
&self,
|
|
||||||
new_timeline_id: TimelineId,
|
|
||||||
idempotency: CreateTimelineIdempotency,
|
|
||||||
) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
|
|
||||||
let allow_offloaded = false;
|
|
||||||
match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
|
|
||||||
Ok(create_guard) => {
|
|
||||||
pausable_failpoint!("timeline-creation-after-uninit");
|
|
||||||
Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
|
|
||||||
}
|
|
||||||
Err(TimelineExclusionError::AlreadyCreating) => {
|
|
||||||
// Creation is in progress, we cannot create it again, and we cannot
|
|
||||||
// check if this request matches the existing one, so caller must try
|
|
||||||
// again later.
|
|
||||||
Err(CreateTimelineError::AlreadyCreating)
|
|
||||||
}
|
|
||||||
Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)),
|
|
||||||
Err(TimelineExclusionError::AlreadyExists {
|
|
||||||
existing: TimelineOrOffloaded::Offloaded(_existing),
|
|
||||||
..
|
|
||||||
}) => {
|
|
||||||
info!("timeline already exists but is offloaded");
|
|
||||||
Err(CreateTimelineError::Conflict)
|
|
||||||
}
|
|
||||||
Err(TimelineExclusionError::AlreadyExists {
|
|
||||||
existing: TimelineOrOffloaded::Timeline(existing),
|
|
||||||
arg,
|
|
||||||
}) => {
|
|
||||||
{
|
|
||||||
let existing = &existing.create_idempotency;
|
|
||||||
let _span = info_span!("idempotency_check", ?existing, ?arg).entered();
|
|
||||||
debug!("timeline already exists");
|
|
||||||
|
|
||||||
match (existing, &arg) {
|
|
||||||
// FailWithConflict => no idempotency check
|
|
||||||
(CreateTimelineIdempotency::FailWithConflict, _)
|
|
||||||
| (_, CreateTimelineIdempotency::FailWithConflict) => {
|
|
||||||
warn!("timeline already exists, failing request");
|
|
||||||
return Err(CreateTimelineError::Conflict);
|
|
||||||
}
|
|
||||||
// Idempotent <=> CreateTimelineIdempotency is identical
|
|
||||||
(x, y) if x == y => {
|
|
||||||
info!("timeline already exists and idempotency matches, succeeding request");
|
|
||||||
// fallthrough
|
|
||||||
}
|
|
||||||
(_, _) => {
|
|
||||||
warn!("idempotency conflict, failing request");
|
|
||||||
return Err(CreateTimelineError::Conflict);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(StartCreatingTimelineResult::Idempotent(existing))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn upload_initdb(
|
async fn upload_initdb(
|
||||||
@@ -4259,26 +4009,16 @@ impl Tenant {
|
|||||||
|
|
||||||
/// - run initdb to init temporary instance and get bootstrap data
|
/// - run initdb to init temporary instance and get bootstrap data
|
||||||
/// - after initialization completes, tar up the temp dir and upload it to S3.
|
/// - after initialization completes, tar up the temp dir and upload it to S3.
|
||||||
|
///
|
||||||
|
/// The caller is responsible for activating the returned timeline.
|
||||||
async fn bootstrap_timeline(
|
async fn bootstrap_timeline(
|
||||||
self: &Arc<Self>,
|
&self,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
load_existing_initdb: Option<TimelineId>,
|
load_existing_initdb: Option<TimelineId>,
|
||||||
|
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<CreateTimelineResult, CreateTimelineError> {
|
) -> anyhow::Result<Arc<Timeline>> {
|
||||||
let timeline_create_guard = match self
|
|
||||||
.start_creating_timeline(
|
|
||||||
timeline_id,
|
|
||||||
CreateTimelineIdempotency::Bootstrap { pg_version },
|
|
||||||
)
|
|
||||||
.await?
|
|
||||||
{
|
|
||||||
StartCreatingTimelineResult::CreateGuard(guard) => guard,
|
|
||||||
StartCreatingTimelineResult::Idempotent(timeline) => {
|
|
||||||
return Ok(CreateTimelineResult::Idempotent(timeline))
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||||
// temporary directory for basebackup files for the given timeline.
|
// temporary directory for basebackup files for the given timeline.
|
||||||
|
|
||||||
@@ -4342,9 +4082,7 @@ impl Tenant {
|
|||||||
.context("extract initdb tar")?;
|
.context("extract initdb tar")?;
|
||||||
} else {
|
} else {
|
||||||
// Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
|
// Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
|
||||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel)
|
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||||
.await
|
|
||||||
.context("run initdb")?;
|
|
||||||
|
|
||||||
// Upload the created data dir to S3
|
// Upload the created data dir to S3
|
||||||
if self.tenant_shard_id().is_shard_zero() {
|
if self.tenant_shard_id().is_shard_zero() {
|
||||||
@@ -4398,9 +4136,7 @@ impl Tenant {
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||||
Err(CreateTimelineError::Other(anyhow::anyhow!(
|
anyhow::bail!("failpoint before-checkpoint-new-timeline");
|
||||||
"failpoint before-checkpoint-new-timeline"
|
|
||||||
)))
|
|
||||||
});
|
});
|
||||||
|
|
||||||
unfinished_timeline
|
unfinished_timeline
|
||||||
@@ -4415,9 +4151,7 @@ impl Tenant {
|
|||||||
// All done!
|
// All done!
|
||||||
let timeline = raw_timeline.finish_creation()?;
|
let timeline = raw_timeline.finish_creation()?;
|
||||||
|
|
||||||
// Callers are responsible to wait for uploads to complete and for activating the timeline.
|
Ok(timeline)
|
||||||
|
|
||||||
Ok(CreateTimelineResult::Created(timeline))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
|
fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
|
||||||
@@ -4467,7 +4201,6 @@ impl Tenant {
|
|||||||
ancestor,
|
ancestor,
|
||||||
resources,
|
resources,
|
||||||
CreateTimelineCause::Load,
|
CreateTimelineCause::Load,
|
||||||
create_guard.idempotency.clone(),
|
|
||||||
)
|
)
|
||||||
.context("Failed to create timeline data structure")?;
|
.context("Failed to create timeline data structure")?;
|
||||||
|
|
||||||
@@ -4505,26 +4238,15 @@ impl Tenant {
|
|||||||
|
|
||||||
/// Get a guard that provides exclusive access to the timeline directory, preventing
|
/// Get a guard that provides exclusive access to the timeline directory, preventing
|
||||||
/// concurrent attempts to create the same timeline.
|
/// concurrent attempts to create the same timeline.
|
||||||
///
|
|
||||||
/// The `allow_offloaded` parameter controls whether to tolerate the existence of
|
|
||||||
/// offloaded timelines or not.
|
|
||||||
fn create_timeline_create_guard(
|
fn create_timeline_create_guard(
|
||||||
&self,
|
&self,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
idempotency: CreateTimelineIdempotency,
|
|
||||||
allow_offloaded: bool,
|
|
||||||
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
|
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
|
||||||
let tenant_shard_id = self.tenant_shard_id;
|
let tenant_shard_id = self.tenant_shard_id;
|
||||||
|
|
||||||
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||||
|
|
||||||
let create_guard = TimelineCreateGuard::new(
|
let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
|
||||||
self,
|
|
||||||
timeline_id,
|
|
||||||
timeline_path.clone(),
|
|
||||||
idempotency,
|
|
||||||
allow_offloaded,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
||||||
// for creation.
|
// for creation.
|
||||||
@@ -5160,10 +4882,7 @@ mod tests {
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(_) => panic!("duplicate timeline creation should fail"),
|
Ok(_) => panic!("duplicate timeline creation should fail"),
|
||||||
Err(e) => assert_eq!(
|
Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
|
||||||
e.to_string(),
|
|
||||||
"timeline already exists with different parameters".to_string()
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -1278,14 +1278,10 @@ impl RemoteTimelineClient {
|
|||||||
let fut = {
|
let fut = {
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
let mut guard = self.upload_queue.lock().unwrap();
|
||||||
let upload_queue = match &mut *guard {
|
let upload_queue = match &mut *guard {
|
||||||
UploadQueue::Stopped(_) => {
|
UploadQueue::Stopped(_) => return,
|
||||||
scopeguard::ScopeGuard::into_inner(sg);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
UploadQueue::Uninitialized => {
|
UploadQueue::Uninitialized => {
|
||||||
// transition into Stopped state
|
// transition into Stopped state
|
||||||
self.stop_impl(&mut guard);
|
self.stop_impl(&mut guard);
|
||||||
scopeguard::ScopeGuard::into_inner(sg);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
UploadQueue::Initialized(ref mut init) => init,
|
UploadQueue::Initialized(ref mut init) => init,
|
||||||
|
|||||||
@@ -187,8 +187,6 @@ pub(super) async fn gather_inputs(
|
|||||||
// but it is unlikely to cause any issues. In the worst case,
|
// but it is unlikely to cause any issues. In the worst case,
|
||||||
// the calculation will error out.
|
// the calculation will error out.
|
||||||
timelines.retain(|t| t.is_active());
|
timelines.retain(|t| t.is_active());
|
||||||
// Also filter out archived timelines.
|
|
||||||
timelines.retain(|t| t.is_archived() != Some(true));
|
|
||||||
|
|
||||||
// Build a map of branch points.
|
// Build a map of branch points.
|
||||||
let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
|
let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
//! Common traits and structs for layers
|
//! Common traits and structs for layers
|
||||||
|
|
||||||
pub mod batch_split_writer;
|
|
||||||
pub mod delta_layer;
|
pub mod delta_layer;
|
||||||
pub mod filter_iterator;
|
pub mod filter_iterator;
|
||||||
pub mod image_layer;
|
pub mod image_layer;
|
||||||
@@ -9,6 +8,7 @@ pub(crate) mod layer;
|
|||||||
mod layer_desc;
|
mod layer_desc;
|
||||||
mod layer_name;
|
mod layer_name;
|
||||||
pub mod merge_iterator;
|
pub mod merge_iterator;
|
||||||
|
pub mod split_writer;
|
||||||
|
|
||||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||||
use crate::repository::Value;
|
use crate::repository::Value;
|
||||||
|
|||||||
@@ -1084,7 +1084,7 @@ impl DeltaLayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn index_entries<'a>(
|
pub(super) async fn load_keys<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Vec<DeltaEntry<'a>>> {
|
) -> Result<Vec<DeltaEntry<'a>>> {
|
||||||
@@ -1346,7 +1346,7 @@ impl DeltaLayerInner {
|
|||||||
|
|
||||||
tree_reader.dump().await?;
|
tree_reader.dump().await?;
|
||||||
|
|
||||||
let keys = self.index_entries(ctx).await?;
|
let keys = self.load_keys(ctx).await?;
|
||||||
|
|
||||||
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||||
let buf = val.load_raw(ctx).await?;
|
let buf = val.load_raw(ctx).await?;
|
||||||
@@ -1453,16 +1453,6 @@ impl DeltaLayerInner {
|
|||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// NB: not super efficient, but not terrible either. Should prob be an iterator.
|
|
||||||
//
|
|
||||||
// We're reusing the index traversal logical in plan_reads; would be nice to
|
|
||||||
// factor that out.
|
|
||||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
|
|
||||||
self.index_entries(ctx)
|
|
||||||
.await
|
|
||||||
.map(|entries| entries.into_iter().map(|entry| entry.key).collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A set of data associated with a delta layer key and its value
|
/// A set of data associated with a delta layer key and its value
|
||||||
|
|||||||
@@ -673,21 +673,6 @@ impl ImageLayerInner {
|
|||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// NB: not super efficient, but not terrible either. Should prob be an iterator.
|
|
||||||
//
|
|
||||||
// We're reusing the index traversal logical in plan_reads; would be nice to
|
|
||||||
// factor that out.
|
|
||||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
|
|
||||||
let plan = self
|
|
||||||
.plan_reads(KeySpace::single(self.key_range.clone()), None, ctx)
|
|
||||||
.await?;
|
|
||||||
Ok(plan
|
|
||||||
.into_iter()
|
|
||||||
.flat_map(|read| read.blobs_at)
|
|
||||||
.map(|(_, blob_meta)| blob_meta.key)
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A builder object for constructing a new image layer.
|
/// A builder object for constructing a new image layer.
|
||||||
@@ -1024,7 +1009,7 @@ impl ImageLayerWriter {
|
|||||||
self.inner.take().unwrap().finish(ctx, None).await
|
self.inner.take().unwrap().finish(ctx, None).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||||
pub(super) async fn finish_with_end_key(
|
pub(super) async fn finish_with_end_key(
|
||||||
mut self,
|
mut self,
|
||||||
end_key: Key,
|
end_key: Key,
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind;
|
|||||||
use crate::tenant::timeline::{CompactionError, GetVectoredError};
|
use crate::tenant::timeline::{CompactionError, GetVectoredError};
|
||||||
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
|
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
|
||||||
|
|
||||||
use super::delta_layer::{self};
|
use super::delta_layer::{self, DeltaEntry};
|
||||||
use super::image_layer::{self};
|
use super::image_layer::{self};
|
||||||
use super::{
|
use super::{
|
||||||
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
||||||
@@ -1841,22 +1841,23 @@ impl ResidentLayer {
|
|||||||
pub(crate) async fn load_keys<'a>(
|
pub(crate) async fn load_keys<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
|
) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
|
||||||
use LayerKind::*;
|
use LayerKind::*;
|
||||||
|
|
||||||
let owner = &self.owner.0;
|
let owner = &self.owner.0;
|
||||||
let inner = self.downloaded.get(owner, ctx).await?;
|
match self.downloaded.get(owner, ctx).await? {
|
||||||
|
Delta(ref d) => {
|
||||||
|
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||||
|
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||||
|
// while it's being held.
|
||||||
|
self.owner.record_access(ctx);
|
||||||
|
|
||||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
.await
|
||||||
// while it's being held.
|
.with_context(|| format!("Layer index is corrupted for {self}"))
|
||||||
self.owner.record_access(ctx);
|
}
|
||||||
|
Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")),
|
||||||
let res = match inner {
|
}
|
||||||
Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
|
|
||||||
Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
|
|
||||||
};
|
|
||||||
res.with_context(|| format!("Layer index is corrupted for {self}"))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read all they keys in this layer which match the ShardIdentity, and write them all to
|
/// Read all they keys in this layer which match the ShardIdentity, and write them all to
|
||||||
|
|||||||
@@ -57,34 +57,6 @@ impl std::fmt::Display for PersistentLayerKey {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<ImageLayerName> for PersistentLayerKey {
|
|
||||||
fn from(image_layer_name: ImageLayerName) -> Self {
|
|
||||||
Self {
|
|
||||||
key_range: image_layer_name.key_range,
|
|
||||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
|
|
||||||
is_delta: false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<DeltaLayerName> for PersistentLayerKey {
|
|
||||||
fn from(delta_layer_name: DeltaLayerName) -> Self {
|
|
||||||
Self {
|
|
||||||
key_range: delta_layer_name.key_range,
|
|
||||||
lsn_range: delta_layer_name.lsn_range,
|
|
||||||
is_delta: true,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<LayerName> for PersistentLayerKey {
|
|
||||||
fn from(layer_name: LayerName) -> Self {
|
|
||||||
match layer_name {
|
|
||||||
LayerName::Image(i) => i.into(),
|
|
||||||
LayerName::Delta(d) => d.into(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl PersistentLayerDesc {
|
impl PersistentLayerDesc {
|
||||||
pub fn key(&self) -> PersistentLayerKey {
|
pub fn key(&self) -> PersistentLayerKey {
|
||||||
PersistentLayerKey {
|
PersistentLayerKey {
|
||||||
|
|||||||
@@ -12,154 +12,41 @@ use super::{
|
|||||||
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
|
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub(crate) enum BatchWriterResult {
|
pub(crate) enum SplitWriterResult {
|
||||||
Produced(ResidentLayer),
|
Produced(ResidentLayer),
|
||||||
Discarded(PersistentLayerKey),
|
Discarded(PersistentLayerKey),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
impl BatchWriterResult {
|
impl SplitWriterResult {
|
||||||
fn into_resident_layer(self) -> ResidentLayer {
|
fn into_resident_layer(self) -> ResidentLayer {
|
||||||
match self {
|
match self {
|
||||||
BatchWriterResult::Produced(layer) => layer,
|
SplitWriterResult::Produced(layer) => layer,
|
||||||
BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
|
SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn into_discarded_layer(self) -> PersistentLayerKey {
|
fn into_discarded_layer(self) -> PersistentLayerKey {
|
||||||
match self {
|
match self {
|
||||||
BatchWriterResult::Produced(_) => panic!("unexpected produced layer"),
|
SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
|
||||||
BatchWriterResult::Discarded(layer) => layer,
|
SplitWriterResult::Discarded(layer) => layer,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
enum LayerWriterWrapper {
|
|
||||||
Image(ImageLayerWriter),
|
|
||||||
Delta(DeltaLayerWriter),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An layer writer that takes unfinished layers and finish them atomically.
|
|
||||||
#[must_use]
|
|
||||||
pub struct BatchLayerWriter {
|
|
||||||
generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>,
|
|
||||||
conf: &'static PageServerConf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BatchLayerWriter {
|
|
||||||
pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
|
|
||||||
Ok(Self {
|
|
||||||
generated_layer_writers: Vec::new(),
|
|
||||||
conf,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_unfinished_image_writer(
|
|
||||||
&mut self,
|
|
||||||
writer: ImageLayerWriter,
|
|
||||||
key_range: Range<Key>,
|
|
||||||
lsn: Lsn,
|
|
||||||
) {
|
|
||||||
self.generated_layer_writers.push((
|
|
||||||
LayerWriterWrapper::Image(writer),
|
|
||||||
PersistentLayerKey {
|
|
||||||
key_range,
|
|
||||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
|
||||||
is_delta: false,
|
|
||||||
},
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_unfinished_delta_writer(
|
|
||||||
&mut self,
|
|
||||||
writer: DeltaLayerWriter,
|
|
||||||
key_range: Range<Key>,
|
|
||||||
lsn_range: Range<Lsn>,
|
|
||||||
) {
|
|
||||||
self.generated_layer_writers.push((
|
|
||||||
LayerWriterWrapper::Delta(writer),
|
|
||||||
PersistentLayerKey {
|
|
||||||
key_range,
|
|
||||||
lsn_range,
|
|
||||||
is_delta: true,
|
|
||||||
},
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
|
||||||
self,
|
|
||||||
tline: &Arc<Timeline>,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
discard_fn: D,
|
|
||||||
) -> anyhow::Result<Vec<BatchWriterResult>>
|
|
||||||
where
|
|
||||||
D: Fn(&PersistentLayerKey) -> F,
|
|
||||||
F: Future<Output = bool>,
|
|
||||||
{
|
|
||||||
let Self {
|
|
||||||
generated_layer_writers,
|
|
||||||
..
|
|
||||||
} = self;
|
|
||||||
let clean_up_layers = |generated_layers: Vec<BatchWriterResult>| {
|
|
||||||
for produced_layer in generated_layers {
|
|
||||||
if let BatchWriterResult::Produced(resident_layer) = produced_layer {
|
|
||||||
let layer: Layer = resident_layer.into();
|
|
||||||
layer.delete_on_drop();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// BEGIN: catch every error and do the recovery in the below section
|
|
||||||
let mut generated_layers: Vec<BatchWriterResult> = Vec::new();
|
|
||||||
for (inner, layer_key) in generated_layer_writers {
|
|
||||||
if discard_fn(&layer_key).await {
|
|
||||||
generated_layers.push(BatchWriterResult::Discarded(layer_key));
|
|
||||||
} else {
|
|
||||||
let res = match inner {
|
|
||||||
LayerWriterWrapper::Delta(writer) => {
|
|
||||||
writer.finish(layer_key.key_range.end, ctx).await
|
|
||||||
}
|
|
||||||
LayerWriterWrapper::Image(writer) => {
|
|
||||||
writer
|
|
||||||
.finish_with_end_key(layer_key.key_range.end, ctx)
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let layer = match res {
|
|
||||||
Ok((desc, path)) => {
|
|
||||||
match Layer::finish_creating(self.conf, tline, desc, &path) {
|
|
||||||
Ok(layer) => layer,
|
|
||||||
Err(e) => {
|
|
||||||
tokio::fs::remove_file(&path).await.ok();
|
|
||||||
clean_up_layers(generated_layers);
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
// Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
|
|
||||||
// so we don't need to remove the layer we just failed to create by ourselves.
|
|
||||||
clean_up_layers(generated_layers);
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
generated_layers.push(BatchWriterResult::Produced(layer));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// END: catch every error and do the recovery in the above section
|
|
||||||
Ok(generated_layers)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// An image writer that takes images and produces multiple image layers.
|
/// An image writer that takes images and produces multiple image layers.
|
||||||
|
///
|
||||||
|
/// The interface does not guarantee atomicity (i.e., if the image layer generation
|
||||||
|
/// fails, there might be leftover files to be cleaned up)
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub struct SplitImageLayerWriter {
|
pub struct SplitImageLayerWriter {
|
||||||
inner: ImageLayerWriter,
|
inner: ImageLayerWriter,
|
||||||
target_layer_size: u64,
|
target_layer_size: u64,
|
||||||
lsn: Lsn,
|
generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
batches: BatchLayerWriter,
|
lsn: Lsn,
|
||||||
start_key: Key,
|
start_key: Key,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,10 +71,10 @@ impl SplitImageLayerWriter {
|
|||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?,
|
.await?,
|
||||||
|
generated_layer_writers: Vec::new(),
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
batches: BatchLayerWriter::new(conf).await?,
|
|
||||||
lsn,
|
lsn,
|
||||||
start_key,
|
start_key,
|
||||||
})
|
})
|
||||||
@@ -215,13 +102,16 @@ impl SplitImageLayerWriter {
|
|||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
let layer_key = PersistentLayerKey {
|
||||||
|
key_range: self.start_key..key,
|
||||||
|
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
|
||||||
|
is_delta: false,
|
||||||
|
};
|
||||||
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||||
self.batches.add_unfinished_image_writer(
|
|
||||||
prev_image_writer,
|
|
||||||
self.start_key..key,
|
|
||||||
self.lsn,
|
|
||||||
);
|
|
||||||
self.start_key = key;
|
self.start_key = key;
|
||||||
|
|
||||||
|
self.generated_layer_writers
|
||||||
|
.push((prev_image_writer, layer_key));
|
||||||
}
|
}
|
||||||
self.inner.put_image(key, img, ctx).await
|
self.inner.put_image(key, img, ctx).await
|
||||||
}
|
}
|
||||||
@@ -232,18 +122,64 @@ impl SplitImageLayerWriter {
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
end_key: Key,
|
end_key: Key,
|
||||||
discard_fn: D,
|
discard_fn: D,
|
||||||
) -> anyhow::Result<Vec<BatchWriterResult>>
|
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||||
where
|
where
|
||||||
D: Fn(&PersistentLayerKey) -> F,
|
D: Fn(&PersistentLayerKey) -> F,
|
||||||
F: Future<Output = bool>,
|
F: Future<Output = bool>,
|
||||||
{
|
{
|
||||||
let Self {
|
let Self {
|
||||||
mut batches, inner, ..
|
mut generated_layer_writers,
|
||||||
|
inner,
|
||||||
|
..
|
||||||
} = self;
|
} = self;
|
||||||
if inner.num_keys() != 0 {
|
if inner.num_keys() != 0 {
|
||||||
batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
|
let layer_key = PersistentLayerKey {
|
||||||
|
key_range: self.start_key..end_key,
|
||||||
|
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
|
||||||
|
is_delta: false,
|
||||||
|
};
|
||||||
|
generated_layer_writers.push((inner, layer_key));
|
||||||
}
|
}
|
||||||
batches.finish_with_discard_fn(tline, ctx, discard_fn).await
|
let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
|
||||||
|
for produced_layer in generated_layers {
|
||||||
|
if let SplitWriterResult::Produced(image_layer) = produced_layer {
|
||||||
|
let layer: Layer = image_layer.into();
|
||||||
|
layer.delete_on_drop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// BEGIN: catch every error and do the recovery in the below section
|
||||||
|
let mut generated_layers = Vec::new();
|
||||||
|
for (inner, layer_key) in generated_layer_writers {
|
||||||
|
if discard_fn(&layer_key).await {
|
||||||
|
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||||
|
} else {
|
||||||
|
let layer = match inner
|
||||||
|
.finish_with_end_key(layer_key.key_range.end, ctx)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok((desc, path)) => {
|
||||||
|
match Layer::finish_creating(self.conf, tline, desc, &path) {
|
||||||
|
Ok(layer) => layer,
|
||||||
|
Err(e) => {
|
||||||
|
tokio::fs::remove_file(&path).await.ok();
|
||||||
|
clean_up_layers(generated_layers);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong,
|
||||||
|
// so we don't need to remove the layer we just failed to create by ourselves.
|
||||||
|
clean_up_layers(generated_layers);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
generated_layers.push(SplitWriterResult::Produced(layer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// END: catch every error and do the recovery in the above section
|
||||||
|
Ok(generated_layers)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -252,7 +188,7 @@ impl SplitImageLayerWriter {
|
|||||||
tline: &Arc<Timeline>,
|
tline: &Arc<Timeline>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
end_key: Key,
|
end_key: Key,
|
||||||
) -> anyhow::Result<Vec<BatchWriterResult>> {
|
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||||
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -260,6 +196,9 @@ impl SplitImageLayerWriter {
|
|||||||
|
|
||||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers.
|
/// A delta writer that takes key-lsn-values and produces multiple delta layers.
|
||||||
///
|
///
|
||||||
|
/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
|
||||||
|
/// there might be leftover files to be cleaned up).
|
||||||
|
///
|
||||||
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
|
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
|
||||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||||
/// will split them into multiple files based on size.
|
/// will split them into multiple files based on size.
|
||||||
@@ -267,12 +206,12 @@ impl SplitImageLayerWriter {
|
|||||||
pub struct SplitDeltaLayerWriter {
|
pub struct SplitDeltaLayerWriter {
|
||||||
inner: Option<(Key, DeltaLayerWriter)>,
|
inner: Option<(Key, DeltaLayerWriter)>,
|
||||||
target_layer_size: u64,
|
target_layer_size: u64,
|
||||||
|
generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
last_key_written: Key,
|
last_key_written: Key,
|
||||||
batches: BatchLayerWriter,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SplitDeltaLayerWriter {
|
impl SplitDeltaLayerWriter {
|
||||||
@@ -286,12 +225,12 @@ impl SplitDeltaLayerWriter {
|
|||||||
Ok(Self {
|
Ok(Self {
|
||||||
target_layer_size,
|
target_layer_size,
|
||||||
inner: None,
|
inner: None,
|
||||||
|
generated_layer_writers: Vec::new(),
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
lsn_range,
|
lsn_range,
|
||||||
last_key_written: Key::MIN,
|
last_key_written: Key::MIN,
|
||||||
batches: BatchLayerWriter::new(conf).await?,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -340,11 +279,13 @@ impl SplitDeltaLayerWriter {
|
|||||||
.await?;
|
.await?;
|
||||||
let (start_key, prev_delta_writer) =
|
let (start_key, prev_delta_writer) =
|
||||||
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
|
std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
|
||||||
self.batches.add_unfinished_delta_writer(
|
let layer_key = PersistentLayerKey {
|
||||||
prev_delta_writer,
|
key_range: start_key..key,
|
||||||
start_key..key,
|
lsn_range: self.lsn_range.clone(),
|
||||||
self.lsn_range.clone(),
|
is_delta: true,
|
||||||
);
|
};
|
||||||
|
self.generated_layer_writers
|
||||||
|
.push((prev_delta_writer, layer_key));
|
||||||
} else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
} else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
||||||
// We have to produce a very large file b/c a key is updated too often.
|
// We have to produce a very large file b/c a key is updated too often.
|
||||||
anyhow::bail!(
|
anyhow::bail!(
|
||||||
@@ -364,25 +305,64 @@ impl SplitDeltaLayerWriter {
|
|||||||
tline: &Arc<Timeline>,
|
tline: &Arc<Timeline>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
discard_fn: D,
|
discard_fn: D,
|
||||||
) -> anyhow::Result<Vec<BatchWriterResult>>
|
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||||
where
|
where
|
||||||
D: Fn(&PersistentLayerKey) -> F,
|
D: Fn(&PersistentLayerKey) -> F,
|
||||||
F: Future<Output = bool>,
|
F: Future<Output = bool>,
|
||||||
{
|
{
|
||||||
let Self {
|
let Self {
|
||||||
mut batches, inner, ..
|
mut generated_layer_writers,
|
||||||
|
inner,
|
||||||
|
..
|
||||||
} = self;
|
} = self;
|
||||||
if let Some((start_key, writer)) = inner {
|
if let Some((start_key, writer)) = inner {
|
||||||
if writer.num_keys() != 0 {
|
if writer.num_keys() != 0 {
|
||||||
let end_key = self.last_key_written.next();
|
let end_key = self.last_key_written.next();
|
||||||
batches.add_unfinished_delta_writer(
|
let layer_key = PersistentLayerKey {
|
||||||
writer,
|
key_range: start_key..end_key,
|
||||||
start_key..end_key,
|
lsn_range: self.lsn_range.clone(),
|
||||||
self.lsn_range.clone(),
|
is_delta: true,
|
||||||
);
|
};
|
||||||
|
generated_layer_writers.push((writer, layer_key));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
batches.finish_with_discard_fn(tline, ctx, discard_fn).await
|
let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
|
||||||
|
for produced_layer in generated_layers {
|
||||||
|
if let SplitWriterResult::Produced(delta_layer) = produced_layer {
|
||||||
|
let layer: Layer = delta_layer.into();
|
||||||
|
layer.delete_on_drop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// BEGIN: catch every error and do the recovery in the below section
|
||||||
|
let mut generated_layers = Vec::new();
|
||||||
|
for (inner, layer_key) in generated_layer_writers {
|
||||||
|
if discard_fn(&layer_key).await {
|
||||||
|
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||||
|
} else {
|
||||||
|
let layer = match inner.finish(layer_key.key_range.end, ctx).await {
|
||||||
|
Ok((desc, path)) => {
|
||||||
|
match Layer::finish_creating(self.conf, tline, desc, &path) {
|
||||||
|
Ok(layer) => layer,
|
||||||
|
Err(e) => {
|
||||||
|
tokio::fs::remove_file(&path).await.ok();
|
||||||
|
clean_up_layers(generated_layers);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
|
||||||
|
// so we don't need to remove the layer we just failed to create by ourselves.
|
||||||
|
clean_up_layers(generated_layers);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
generated_layers.push(SplitWriterResult::Produced(layer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// END: catch every error and do the recovery in the above section
|
||||||
|
Ok(generated_layers)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -390,7 +370,7 @@ impl SplitDeltaLayerWriter {
|
|||||||
self,
|
self,
|
||||||
tline: &Arc<Timeline>,
|
tline: &Arc<Timeline>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Vec<BatchWriterResult>> {
|
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||||
self.finish_with_discard_fn(tline, ctx, |_| async { false })
|
self.finish_with_discard_fn(tline, ctx, |_| async { false })
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -424,9 +424,6 @@ pub struct Timeline {
|
|||||||
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||||
|
|
||||||
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||||
|
|
||||||
/// Cf. [`crate::tenant::CreateTimelineIdempotency`].
|
|
||||||
pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
|
pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
|
||||||
@@ -2139,7 +2136,6 @@ impl Timeline {
|
|||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
state: TimelineState,
|
state: TimelineState,
|
||||||
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||||
create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Arc<Self> {
|
) -> Arc<Self> {
|
||||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||||
@@ -2278,8 +2274,6 @@ impl Timeline {
|
|||||||
handles: Default::default(),
|
handles: Default::default(),
|
||||||
|
|
||||||
attach_wal_lag_cooldown,
|
attach_wal_lag_cooldown,
|
||||||
|
|
||||||
create_idempotency,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
|
|||||||
@@ -32,11 +32,11 @@ use crate::page_cache;
|
|||||||
use crate::statvfs::Statvfs;
|
use crate::statvfs::Statvfs;
|
||||||
use crate::tenant::checks::check_valid_layermap;
|
use crate::tenant::checks::check_valid_layermap;
|
||||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||||
use crate::tenant::storage_layer::batch_split_writer::{
|
|
||||||
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
|
|
||||||
};
|
|
||||||
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
|
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
|
||||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||||
|
use crate::tenant::storage_layer::split_writer::{
|
||||||
|
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||||
|
};
|
||||||
use crate::tenant::storage_layer::{
|
use crate::tenant::storage_layer::{
|
||||||
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
|
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
|
||||||
};
|
};
|
||||||
@@ -834,12 +834,7 @@ impl Timeline {
|
|||||||
if self.cancel.is_cancelled() {
|
if self.cancel.is_cancelled() {
|
||||||
return Err(CompactionError::ShuttingDown);
|
return Err(CompactionError::ShuttingDown);
|
||||||
}
|
}
|
||||||
let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
|
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
|
||||||
let keys = delta
|
|
||||||
.index_entries(ctx)
|
|
||||||
.await
|
|
||||||
.map_err(CompactionError::Other)?;
|
|
||||||
all_keys.extend(keys);
|
|
||||||
}
|
}
|
||||||
// The current stdlib sorting implementation is designed in a way where it is
|
// The current stdlib sorting implementation is designed in a way where it is
|
||||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||||
@@ -2043,11 +2038,11 @@ impl Timeline {
|
|||||||
let produced_image_layers_len = produced_image_layers.len();
|
let produced_image_layers_len = produced_image_layers.len();
|
||||||
for action in produced_delta_layers {
|
for action in produced_delta_layers {
|
||||||
match action {
|
match action {
|
||||||
BatchWriterResult::Produced(layer) => {
|
SplitWriterResult::Produced(layer) => {
|
||||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||||
compact_to.push(layer);
|
compact_to.push(layer);
|
||||||
}
|
}
|
||||||
BatchWriterResult::Discarded(l) => {
|
SplitWriterResult::Discarded(l) => {
|
||||||
keep_layers.insert(l);
|
keep_layers.insert(l);
|
||||||
stat.discard_delta_layer();
|
stat.discard_delta_layer();
|
||||||
}
|
}
|
||||||
@@ -2055,11 +2050,11 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
for action in produced_image_layers {
|
for action in produced_image_layers {
|
||||||
match action {
|
match action {
|
||||||
BatchWriterResult::Produced(layer) => {
|
SplitWriterResult::Produced(layer) => {
|
||||||
stat.produce_image_layer(layer.layer_desc().file_size());
|
stat.produce_image_layer(layer.layer_desc().file_size());
|
||||||
compact_to.push(layer);
|
compact_to.push(layer);
|
||||||
}
|
}
|
||||||
BatchWriterResult::Discarded(l) => {
|
SplitWriterResult::Discarded(l) => {
|
||||||
keep_layers.insert(l);
|
keep_layers.insert(l);
|
||||||
stat.discard_image_layer();
|
stat.discard_image_layer();
|
||||||
}
|
}
|
||||||
@@ -2443,7 +2438,7 @@ impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
|
|||||||
type DeltaEntry<'a> = DeltaEntry<'a>;
|
type DeltaEntry<'a> = DeltaEntry<'a>;
|
||||||
|
|
||||||
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
|
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
|
||||||
self.0.get_as_delta(ctx).await?.index_entries(ctx).await
|
self.0.load_keys(ctx).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -313,7 +313,6 @@ impl DeleteTimelineFlow {
|
|||||||
// Important. We dont pass ancestor above because it can be missing.
|
// Important. We dont pass ancestor above because it can be missing.
|
||||||
// Thus we need to skip the validation here.
|
// Thus we need to skip the validation here.
|
||||||
CreateTimelineCause::Delete,
|
CreateTimelineCause::Delete,
|
||||||
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
|
|
||||||
)
|
)
|
||||||
.context("create_timeline_struct")?;
|
.context("create_timeline_struct")?;
|
||||||
|
|
||||||
|
|||||||
@@ -45,16 +45,13 @@ impl LayerManager {
|
|||||||
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
|
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
|
||||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||||
self.try_get_from_key(key)
|
self.layers()
|
||||||
|
.get(key)
|
||||||
.with_context(|| format!("get layer from key: {key}"))
|
.with_context(|| format!("get layer from key: {key}"))
|
||||||
.expect("not found")
|
.expect("not found")
|
||||||
.clone()
|
.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> {
|
|
||||||
self.layers().get(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||||
self.get_from_key(&desc.key())
|
self.get_from_key(&desc.key())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,11 +5,7 @@ use camino::Utf8PathBuf;
|
|||||||
use tracing::{error, info, info_span};
|
use tracing::{error, info, info_span};
|
||||||
use utils::{fs_ext, id::TimelineId, lsn::Lsn};
|
use utils::{fs_ext, id::TimelineId, lsn::Lsn};
|
||||||
|
|
||||||
use crate::{
|
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
|
||||||
context::RequestContext,
|
|
||||||
import_datadir,
|
|
||||||
tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::Timeline;
|
use super::Timeline;
|
||||||
|
|
||||||
@@ -169,17 +165,13 @@ pub(crate) struct TimelineCreateGuard<'t> {
|
|||||||
owning_tenant: &'t Tenant,
|
owning_tenant: &'t Tenant,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
pub(crate) timeline_path: Utf8PathBuf,
|
pub(crate) timeline_path: Utf8PathBuf,
|
||||||
pub(crate) idempotency: CreateTimelineIdempotency,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Errors when acquiring exclusive access to a timeline ID for creation
|
/// Errors when acquiring exclusive access to a timeline ID for creation
|
||||||
#[derive(thiserror::Error, Debug)]
|
#[derive(thiserror::Error, Debug)]
|
||||||
pub(crate) enum TimelineExclusionError {
|
pub(crate) enum TimelineExclusionError {
|
||||||
#[error("Already exists")]
|
#[error("Already exists")]
|
||||||
AlreadyExists {
|
AlreadyExists(Arc<Timeline>),
|
||||||
existing: TimelineOrOffloaded,
|
|
||||||
arg: CreateTimelineIdempotency,
|
|
||||||
},
|
|
||||||
#[error("Already creating")]
|
#[error("Already creating")]
|
||||||
AlreadyCreating,
|
AlreadyCreating,
|
||||||
|
|
||||||
@@ -193,42 +185,27 @@ impl<'t> TimelineCreateGuard<'t> {
|
|||||||
owning_tenant: &'t Tenant,
|
owning_tenant: &'t Tenant,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
timeline_path: Utf8PathBuf,
|
timeline_path: Utf8PathBuf,
|
||||||
idempotency: CreateTimelineIdempotency,
|
|
||||||
allow_offloaded: bool,
|
|
||||||
) -> Result<Self, TimelineExclusionError> {
|
) -> Result<Self, TimelineExclusionError> {
|
||||||
// Lock order: this is the only place we take both locks. During drop() we only
|
// Lock order: this is the only place we take both locks. During drop() we only
|
||||||
// lock creating_timelines
|
// lock creating_timelines
|
||||||
let timelines = owning_tenant.timelines.lock().unwrap();
|
let timelines = owning_tenant.timelines.lock().unwrap();
|
||||||
let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
|
|
||||||
let mut creating_timelines: std::sync::MutexGuard<
|
let mut creating_timelines: std::sync::MutexGuard<
|
||||||
'_,
|
'_,
|
||||||
std::collections::HashSet<TimelineId>,
|
std::collections::HashSet<TimelineId>,
|
||||||
> = owning_tenant.timelines_creating.lock().unwrap();
|
> = owning_tenant.timelines_creating.lock().unwrap();
|
||||||
|
|
||||||
if let Some(existing) = timelines.get(&timeline_id) {
|
if let Some(existing) = timelines.get(&timeline_id) {
|
||||||
return Err(TimelineExclusionError::AlreadyExists {
|
Err(TimelineExclusionError::AlreadyExists(existing.clone()))
|
||||||
existing: TimelineOrOffloaded::Timeline(existing.clone()),
|
} else if creating_timelines.contains(&timeline_id) {
|
||||||
arg: idempotency,
|
Err(TimelineExclusionError::AlreadyCreating)
|
||||||
});
|
} else {
|
||||||
|
creating_timelines.insert(timeline_id);
|
||||||
|
Ok(Self {
|
||||||
|
owning_tenant,
|
||||||
|
timeline_id,
|
||||||
|
timeline_path,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
if !allow_offloaded {
|
|
||||||
if let Some(existing) = timelines_offloaded.get(&timeline_id) {
|
|
||||||
return Err(TimelineExclusionError::AlreadyExists {
|
|
||||||
existing: TimelineOrOffloaded::Offloaded(existing.clone()),
|
|
||||||
arg: idempotency,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if creating_timelines.contains(&timeline_id) {
|
|
||||||
return Err(TimelineExclusionError::AlreadyCreating);
|
|
||||||
}
|
|
||||||
creating_timelines.insert(timeline_id);
|
|
||||||
Ok(Self {
|
|
||||||
owning_tenant,
|
|
||||||
timeline_id,
|
|
||||||
timeline_path,
|
|
||||||
idempotency,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -16,24 +16,18 @@ use tokio_epoll_uring::{System, SystemHandle};
|
|||||||
|
|
||||||
use crate::virtual_file::on_fatal_io_error;
|
use crate::virtual_file::on_fatal_io_error;
|
||||||
|
|
||||||
use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE};
|
use crate::metrics::tokio_epoll_uring as metrics;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct ThreadLocalState(Arc<ThreadLocalStateInner>);
|
struct ThreadLocalState(Arc<ThreadLocalStateInner>);
|
||||||
|
|
||||||
struct ThreadLocalStateInner {
|
struct ThreadLocalStateInner {
|
||||||
cell: tokio::sync::OnceCell<SystemHandle<metrics::ThreadLocalMetrics>>,
|
cell: tokio::sync::OnceCell<SystemHandle>,
|
||||||
launch_attempts: AtomicU32,
|
launch_attempts: AtomicU32,
|
||||||
/// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
|
/// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
|
||||||
thread_local_state_id: u64,
|
thread_local_state_id: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for ThreadLocalStateInner {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ThreadLocalState {
|
impl ThreadLocalState {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self(Arc::new(ThreadLocalStateInner {
|
Self(Arc::new(ThreadLocalStateInner {
|
||||||
@@ -77,8 +71,7 @@ pub async fn thread_local_system() -> Handle {
|
|||||||
&fake_cancel,
|
&fake_cancel,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
|
let res = System::launch()
|
||||||
let res = System::launch_with_metrics(per_system_metrics)
|
|
||||||
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it
|
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it
|
||||||
.await;
|
.await;
|
||||||
match res {
|
match res {
|
||||||
@@ -93,7 +86,6 @@ pub async fn thread_local_system() -> Handle {
|
|||||||
emit_launch_failure_process_stats();
|
emit_launch_failure_process_stats();
|
||||||
});
|
});
|
||||||
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
|
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
|
||||||
metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
|
|
||||||
Err(())
|
Err(())
|
||||||
}
|
}
|
||||||
// abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
|
// abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
|
||||||
@@ -123,7 +115,7 @@ fn emit_launch_failure_process_stats() {
|
|||||||
// number of threads
|
// number of threads
|
||||||
// rss / system memory usage generally
|
// rss / system memory usage generally
|
||||||
|
|
||||||
let tokio_epoll_uring::metrics::GlobalMetrics {
|
let tokio_epoll_uring::metrics::Metrics {
|
||||||
systems_created,
|
systems_created,
|
||||||
systems_destroyed,
|
systems_destroyed,
|
||||||
} = tokio_epoll_uring::metrics::global();
|
} = tokio_epoll_uring::metrics::global();
|
||||||
@@ -190,7 +182,7 @@ fn emit_launch_failure_process_stats() {
|
|||||||
pub struct Handle(ThreadLocalState);
|
pub struct Handle(ThreadLocalState);
|
||||||
|
|
||||||
impl std::ops::Deref for Handle {
|
impl std::ops::Deref for Handle {
|
||||||
type Target = SystemHandle<metrics::ThreadLocalMetrics>;
|
type Target = SystemHandle;
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
fn deref(&self) -> &Self::Target {
|
||||||
self.0
|
self.0
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,6 @@ OBJS = \
|
|||||||
file_cache.o \
|
file_cache.o \
|
||||||
hll.o \
|
hll.o \
|
||||||
libpagestore.o \
|
libpagestore.o \
|
||||||
logical_replication_monitor.o \
|
|
||||||
neon.o \
|
neon.o \
|
||||||
neon_pgversioncompat.o \
|
neon_pgversioncompat.o \
|
||||||
neon_perf_counters.o \
|
neon_perf_counters.o \
|
||||||
|
|||||||
@@ -1,253 +0,0 @@
|
|||||||
#include <limits.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <dirent.h>
|
|
||||||
#include <signal.h>
|
|
||||||
|
|
||||||
#include "postgres.h"
|
|
||||||
|
|
||||||
#include "miscadmin.h"
|
|
||||||
#include "postmaster/bgworker.h"
|
|
||||||
#include "postmaster/interrupt.h"
|
|
||||||
#include "replication/slot.h"
|
|
||||||
#include "storage/fd.h"
|
|
||||||
#include "storage/procsignal.h"
|
|
||||||
#include "tcop/tcopprot.h"
|
|
||||||
#include "utils/guc.h"
|
|
||||||
#include "utils/wait_event.h"
|
|
||||||
|
|
||||||
#include "logical_replication_monitor.h"
|
|
||||||
|
|
||||||
#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
|
|
||||||
|
|
||||||
static int logical_replication_max_snap_files = 300;
|
|
||||||
|
|
||||||
PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
|
|
||||||
|
|
||||||
static int
|
|
||||||
LsnDescComparator(const void *a, const void *b)
|
|
||||||
{
|
|
||||||
XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
|
|
||||||
XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
|
|
||||||
|
|
||||||
if (lsn1 < lsn2)
|
|
||||||
return 1;
|
|
||||||
else if (lsn1 == lsn2)
|
|
||||||
return 0;
|
|
||||||
else
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Look at .snap files and calculate minimum allowed restart_lsn of slot so that
|
|
||||||
* next gc would leave not more than logical_replication_max_snap_files; all
|
|
||||||
* slots having lower restart_lsn should be dropped.
|
|
||||||
*/
|
|
||||||
static XLogRecPtr
|
|
||||||
get_num_snap_files_lsn_threshold(void)
|
|
||||||
{
|
|
||||||
DIR *dirdesc;
|
|
||||||
struct dirent *de;
|
|
||||||
char *snap_path = "pg_logical/snapshots/";
|
|
||||||
int lsns_allocated = 1024;
|
|
||||||
int lsns_num = 0;
|
|
||||||
XLogRecPtr *lsns;
|
|
||||||
XLogRecPtr cutoff;
|
|
||||||
|
|
||||||
if (logical_replication_max_snap_files < 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
|
|
||||||
|
|
||||||
/* find all .snap files and get their lsns */
|
|
||||||
dirdesc = AllocateDir(snap_path);
|
|
||||||
while ((de = ReadDir(dirdesc, snap_path)) != NULL)
|
|
||||||
{
|
|
||||||
XLogRecPtr lsn;
|
|
||||||
uint32 hi;
|
|
||||||
uint32 lo;
|
|
||||||
|
|
||||||
if (strcmp(de->d_name, ".") == 0 ||
|
|
||||||
strcmp(de->d_name, "..") == 0)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
|
|
||||||
{
|
|
||||||
ereport(LOG,
|
|
||||||
(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
lsn = ((uint64) hi) << 32 | lo;
|
|
||||||
elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
|
|
||||||
if (lsns_allocated == lsns_num)
|
|
||||||
{
|
|
||||||
lsns_allocated *= 2;
|
|
||||||
lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
|
|
||||||
}
|
|
||||||
lsns[lsns_num++] = lsn;
|
|
||||||
}
|
|
||||||
/* sort by lsn desc */
|
|
||||||
qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
|
|
||||||
/* and take cutoff at logical_replication_max_snap_files */
|
|
||||||
if (logical_replication_max_snap_files > lsns_num)
|
|
||||||
cutoff = 0;
|
|
||||||
/* have less files than cutoff */
|
|
||||||
else
|
|
||||||
{
|
|
||||||
cutoff = lsns[logical_replication_max_snap_files - 1];
|
|
||||||
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
|
|
||||||
LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
|
|
||||||
}
|
|
||||||
pfree(lsns);
|
|
||||||
FreeDir(dirdesc);
|
|
||||||
return cutoff;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
InitLogicalReplicationMonitor(void)
|
|
||||||
{
|
|
||||||
BackgroundWorker bgw;
|
|
||||||
|
|
||||||
DefineCustomIntVariable(
|
|
||||||
"neon.logical_replication_max_snap_files",
|
|
||||||
"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
|
|
||||||
NULL,
|
|
||||||
&logical_replication_max_snap_files,
|
|
||||||
300, -1, INT_MAX,
|
|
||||||
PGC_SIGHUP,
|
|
||||||
0,
|
|
||||||
NULL, NULL, NULL);
|
|
||||||
|
|
||||||
memset(&bgw, 0, sizeof(bgw));
|
|
||||||
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
|
||||||
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
|
||||||
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
|
|
||||||
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
|
|
||||||
snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
|
|
||||||
snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
|
|
||||||
bgw.bgw_restart_time = 5;
|
|
||||||
bgw.bgw_notify_pid = 0;
|
|
||||||
bgw.bgw_main_arg = (Datum) 0;
|
|
||||||
|
|
||||||
RegisterBackgroundWorker(&bgw);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Unused logical replication slots pins WAL and prevents deletion of snapshots.
|
|
||||||
* WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
|
|
||||||
* need too many .snap files.
|
|
||||||
*/
|
|
||||||
void
|
|
||||||
LogicalSlotsMonitorMain(Datum main_arg)
|
|
||||||
{
|
|
||||||
/* Establish signal handlers. */
|
|
||||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
|
||||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
|
||||||
pqsignal(SIGTERM, die);
|
|
||||||
|
|
||||||
BackgroundWorkerUnblockSignals();
|
|
||||||
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
XLogRecPtr cutoff_lsn;
|
|
||||||
|
|
||||||
/* In case of a SIGHUP, just reload the configuration. */
|
|
||||||
if (ConfigReloadPending)
|
|
||||||
{
|
|
||||||
ConfigReloadPending = false;
|
|
||||||
ProcessConfigFile(PGC_SIGHUP);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If there are too many .snap files, just drop all logical slots to
|
|
||||||
* prevent aux files bloat.
|
|
||||||
*/
|
|
||||||
cutoff_lsn = get_num_snap_files_lsn_threshold();
|
|
||||||
if (cutoff_lsn > 0)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < max_replication_slots; i++)
|
|
||||||
{
|
|
||||||
char slot_name[NAMEDATALEN];
|
|
||||||
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
|
|
||||||
XLogRecPtr restart_lsn;
|
|
||||||
|
|
||||||
/* find the name */
|
|
||||||
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
|
|
||||||
/* Consider only logical repliction slots */
|
|
||||||
if (!s->in_use || !SlotIsLogical(s))
|
|
||||||
{
|
|
||||||
LWLockRelease(ReplicationSlotControlLock);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* do we need to drop it? */
|
|
||||||
SpinLockAcquire(&s->mutex);
|
|
||||||
restart_lsn = s->data.restart_lsn;
|
|
||||||
SpinLockRelease(&s->mutex);
|
|
||||||
if (restart_lsn >= cutoff_lsn)
|
|
||||||
{
|
|
||||||
LWLockRelease(ReplicationSlotControlLock);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
|
|
||||||
elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
|
|
||||||
slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
|
|
||||||
LWLockRelease(ReplicationSlotControlLock);
|
|
||||||
|
|
||||||
/* now try to drop it, killing owner before if any */
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
pid_t active_pid;
|
|
||||||
|
|
||||||
SpinLockAcquire(&s->mutex);
|
|
||||||
active_pid = s->active_pid;
|
|
||||||
SpinLockRelease(&s->mutex);
|
|
||||||
|
|
||||||
if (active_pid == 0)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Slot is releasted, try to drop it. Though of course
|
|
||||||
* it could have been reacquired, so drop can ERROR
|
|
||||||
* out. Similarly it could have been dropped in the
|
|
||||||
* meanwhile.
|
|
||||||
*
|
|
||||||
* In principle we could remove pg_try/pg_catch, that
|
|
||||||
* would restart the whole bgworker.
|
|
||||||
*/
|
|
||||||
ConditionVariableCancelSleep();
|
|
||||||
PG_TRY();
|
|
||||||
{
|
|
||||||
ReplicationSlotDrop(slot_name, true);
|
|
||||||
elog(LOG, "ls_monitor: slot %s dropped", slot_name);
|
|
||||||
}
|
|
||||||
PG_CATCH();
|
|
||||||
{
|
|
||||||
/* log ERROR and reset elog stack */
|
|
||||||
EmitErrorReport();
|
|
||||||
FlushErrorState();
|
|
||||||
elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
|
|
||||||
}
|
|
||||||
PG_END_TRY();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* kill the owner and wait for release */
|
|
||||||
elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
|
|
||||||
(void) kill(active_pid, SIGTERM);
|
|
||||||
/* We shouldn't get stuck, but to be safe add timeout. */
|
|
||||||
ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
(void) WaitLatch(MyLatch,
|
|
||||||
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
|
|
||||||
LS_MONITOR_CHECK_INTERVAL,
|
|
||||||
PG_WAIT_EXTENSION);
|
|
||||||
ResetLatch(MyLatch);
|
|
||||||
CHECK_FOR_INTERRUPTS();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
#ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__
|
|
||||||
#define __NEON_LOGICAL_REPLICATION_MONITOR_H__
|
|
||||||
|
|
||||||
void InitLogicalReplicationMonitor(void);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
245
pgxn/neon/neon.c
245
pgxn/neon/neon.c
@@ -14,22 +14,32 @@
|
|||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#include "access/subtrans.h"
|
#include "access/subtrans.h"
|
||||||
#include "access/twophase.h"
|
#include "access/twophase.h"
|
||||||
|
#include "access/xact.h"
|
||||||
#include "access/xlog.h"
|
#include "access/xlog.h"
|
||||||
|
#include "storage/buf_internals.h"
|
||||||
|
#include "storage/bufmgr.h"
|
||||||
|
#include "catalog/pg_type.h"
|
||||||
|
#include "postmaster/bgworker.h"
|
||||||
|
#include "postmaster/interrupt.h"
|
||||||
#include "replication/logical.h"
|
#include "replication/logical.h"
|
||||||
#include "replication/slot.h"
|
#include "replication/slot.h"
|
||||||
#include "replication/walsender.h"
|
#include "replication/walsender.h"
|
||||||
#include "storage/proc.h"
|
#include "storage/proc.h"
|
||||||
|
#include "storage/procsignal.h"
|
||||||
|
#include "tcop/tcopprot.h"
|
||||||
#include "funcapi.h"
|
#include "funcapi.h"
|
||||||
#include "access/htup_details.h"
|
#include "access/htup_details.h"
|
||||||
#include "utils/builtins.h"
|
#include "utils/builtins.h"
|
||||||
#include "utils/pg_lsn.h"
|
#include "utils/pg_lsn.h"
|
||||||
#include "utils/guc.h"
|
#include "utils/guc.h"
|
||||||
#include "utils/guc_tables.h"
|
#include "utils/guc_tables.h"
|
||||||
|
#include "utils/wait_event.h"
|
||||||
|
|
||||||
#include "extension_server.h"
|
#include "extension_server.h"
|
||||||
#include "neon.h"
|
#include "neon.h"
|
||||||
|
#include "walproposer.h"
|
||||||
|
#include "pagestore_client.h"
|
||||||
#include "control_plane_connector.h"
|
#include "control_plane_connector.h"
|
||||||
#include "logical_replication_monitor.h"
|
|
||||||
#include "walsender_hooks.h"
|
#include "walsender_hooks.h"
|
||||||
#if PG_MAJORVERSION_NUM >= 16
|
#if PG_MAJORVERSION_NUM >= 16
|
||||||
#include "storage/ipc.h"
|
#include "storage/ipc.h"
|
||||||
@@ -38,6 +48,7 @@
|
|||||||
PG_MODULE_MAGIC;
|
PG_MODULE_MAGIC;
|
||||||
void _PG_init(void);
|
void _PG_init(void);
|
||||||
|
|
||||||
|
static int logical_replication_max_snap_files = 300;
|
||||||
|
|
||||||
static int running_xacts_overflow_policy;
|
static int running_xacts_overflow_policy;
|
||||||
|
|
||||||
@@ -71,6 +82,237 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
|
|||||||
{NULL, 0, false}
|
{NULL, 0, false}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
InitLogicalReplicationMonitor(void)
|
||||||
|
{
|
||||||
|
BackgroundWorker bgw;
|
||||||
|
|
||||||
|
DefineCustomIntVariable(
|
||||||
|
"neon.logical_replication_max_snap_files",
|
||||||
|
"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
|
||||||
|
NULL,
|
||||||
|
&logical_replication_max_snap_files,
|
||||||
|
300, -1, INT_MAX,
|
||||||
|
PGC_SIGHUP,
|
||||||
|
0,
|
||||||
|
NULL, NULL, NULL);
|
||||||
|
|
||||||
|
memset(&bgw, 0, sizeof(bgw));
|
||||||
|
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||||
|
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
||||||
|
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
|
||||||
|
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
|
||||||
|
snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
|
||||||
|
snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
|
||||||
|
bgw.bgw_restart_time = 5;
|
||||||
|
bgw.bgw_notify_pid = 0;
|
||||||
|
bgw.bgw_main_arg = (Datum) 0;
|
||||||
|
|
||||||
|
RegisterBackgroundWorker(&bgw);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
LsnDescComparator(const void *a, const void *b)
|
||||||
|
{
|
||||||
|
XLogRecPtr lsn1 = *((const XLogRecPtr *) a);
|
||||||
|
XLogRecPtr lsn2 = *((const XLogRecPtr *) b);
|
||||||
|
|
||||||
|
if (lsn1 < lsn2)
|
||||||
|
return 1;
|
||||||
|
else if (lsn1 == lsn2)
|
||||||
|
return 0;
|
||||||
|
else
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Look at .snap files and calculate minimum allowed restart_lsn of slot so that
|
||||||
|
* next gc would leave not more than logical_replication_max_snap_files; all
|
||||||
|
* slots having lower restart_lsn should be dropped.
|
||||||
|
*/
|
||||||
|
static XLogRecPtr
|
||||||
|
get_num_snap_files_lsn_threshold(void)
|
||||||
|
{
|
||||||
|
DIR *dirdesc;
|
||||||
|
struct dirent *de;
|
||||||
|
char *snap_path = "pg_logical/snapshots/";
|
||||||
|
int lsns_allocated = 1024;
|
||||||
|
int lsns_num = 0;
|
||||||
|
XLogRecPtr *lsns;
|
||||||
|
XLogRecPtr cutoff;
|
||||||
|
|
||||||
|
if (logical_replication_max_snap_files < 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
|
||||||
|
|
||||||
|
/* find all .snap files and get their lsns */
|
||||||
|
dirdesc = AllocateDir(snap_path);
|
||||||
|
while ((de = ReadDir(dirdesc, snap_path)) != NULL)
|
||||||
|
{
|
||||||
|
XLogRecPtr lsn;
|
||||||
|
uint32 hi;
|
||||||
|
uint32 lo;
|
||||||
|
|
||||||
|
if (strcmp(de->d_name, ".") == 0 ||
|
||||||
|
strcmp(de->d_name, "..") == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
|
||||||
|
{
|
||||||
|
ereport(LOG,
|
||||||
|
(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
lsn = ((uint64) hi) << 32 | lo;
|
||||||
|
elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
|
||||||
|
if (lsns_allocated == lsns_num)
|
||||||
|
{
|
||||||
|
lsns_allocated *= 2;
|
||||||
|
lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
|
||||||
|
}
|
||||||
|
lsns[lsns_num++] = lsn;
|
||||||
|
}
|
||||||
|
/* sort by lsn desc */
|
||||||
|
qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
|
||||||
|
/* and take cutoff at logical_replication_max_snap_files */
|
||||||
|
if (logical_replication_max_snap_files > lsns_num)
|
||||||
|
cutoff = 0;
|
||||||
|
/* have less files than cutoff */
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cutoff = lsns[logical_replication_max_snap_files - 1];
|
||||||
|
elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
|
||||||
|
LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
|
||||||
|
}
|
||||||
|
pfree(lsns);
|
||||||
|
FreeDir(dirdesc);
|
||||||
|
return cutoff;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unused logical replication slots pins WAL and prevents deletion of snapshots.
|
||||||
|
* WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
|
||||||
|
* need too many .snap files.
|
||||||
|
*/
|
||||||
|
PGDLLEXPORT void
|
||||||
|
LogicalSlotsMonitorMain(Datum main_arg)
|
||||||
|
{
|
||||||
|
/* Establish signal handlers. */
|
||||||
|
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||||
|
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||||
|
pqsignal(SIGTERM, die);
|
||||||
|
|
||||||
|
BackgroundWorkerUnblockSignals();
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
XLogRecPtr cutoff_lsn;
|
||||||
|
|
||||||
|
/* In case of a SIGHUP, just reload the configuration. */
|
||||||
|
if (ConfigReloadPending)
|
||||||
|
{
|
||||||
|
ConfigReloadPending = false;
|
||||||
|
ProcessConfigFile(PGC_SIGHUP);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If there are too many .snap files, just drop all logical slots to
|
||||||
|
* prevent aux files bloat.
|
||||||
|
*/
|
||||||
|
cutoff_lsn = get_num_snap_files_lsn_threshold();
|
||||||
|
if (cutoff_lsn > 0)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < max_replication_slots; i++)
|
||||||
|
{
|
||||||
|
char slot_name[NAMEDATALEN];
|
||||||
|
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
|
||||||
|
XLogRecPtr restart_lsn;
|
||||||
|
|
||||||
|
/* find the name */
|
||||||
|
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
|
||||||
|
/* Consider only logical repliction slots */
|
||||||
|
if (!s->in_use || !SlotIsLogical(s))
|
||||||
|
{
|
||||||
|
LWLockRelease(ReplicationSlotControlLock);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* do we need to drop it? */
|
||||||
|
SpinLockAcquire(&s->mutex);
|
||||||
|
restart_lsn = s->data.restart_lsn;
|
||||||
|
SpinLockRelease(&s->mutex);
|
||||||
|
if (restart_lsn >= cutoff_lsn)
|
||||||
|
{
|
||||||
|
LWLockRelease(ReplicationSlotControlLock);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
|
||||||
|
elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
|
||||||
|
slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
|
||||||
|
LWLockRelease(ReplicationSlotControlLock);
|
||||||
|
|
||||||
|
/* now try to drop it, killing owner before if any */
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
pid_t active_pid;
|
||||||
|
|
||||||
|
SpinLockAcquire(&s->mutex);
|
||||||
|
active_pid = s->active_pid;
|
||||||
|
SpinLockRelease(&s->mutex);
|
||||||
|
|
||||||
|
if (active_pid == 0)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Slot is releasted, try to drop it. Though of course
|
||||||
|
* it could have been reacquired, so drop can ERROR
|
||||||
|
* out. Similarly it could have been dropped in the
|
||||||
|
* meanwhile.
|
||||||
|
*
|
||||||
|
* In principle we could remove pg_try/pg_catch, that
|
||||||
|
* would restart the whole bgworker.
|
||||||
|
*/
|
||||||
|
ConditionVariableCancelSleep();
|
||||||
|
PG_TRY();
|
||||||
|
{
|
||||||
|
ReplicationSlotDrop(slot_name, true);
|
||||||
|
elog(LOG, "ls_monitor: slot %s dropped", slot_name);
|
||||||
|
}
|
||||||
|
PG_CATCH();
|
||||||
|
{
|
||||||
|
/* log ERROR and reset elog stack */
|
||||||
|
EmitErrorReport();
|
||||||
|
FlushErrorState();
|
||||||
|
elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
|
||||||
|
}
|
||||||
|
PG_END_TRY();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* kill the owner and wait for release */
|
||||||
|
elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
|
||||||
|
(void) kill(active_pid, SIGTERM);
|
||||||
|
/* We shouldn't get stuck, but to be safe add timeout. */
|
||||||
|
ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(void) WaitLatch(MyLatch,
|
||||||
|
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
|
||||||
|
LS_MONITOR_CHECK_INTERVAL,
|
||||||
|
PG_WAIT_EXTENSION);
|
||||||
|
ResetLatch(MyLatch);
|
||||||
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX: These private to procarray.c, but we need them here.
|
* XXX: These private to procarray.c, but we need them here.
|
||||||
*/
|
*/
|
||||||
@@ -425,6 +667,7 @@ _PG_init(void)
|
|||||||
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||||
|
|
||||||
InitLogicalReplicationMonitor();
|
InitLogicalReplicationMonitor();
|
||||||
|
|
||||||
InitControlPlaneConnector();
|
InitControlPlaneConnector();
|
||||||
|
|
||||||
pg_init_extension_server();
|
pg_init_extension_server();
|
||||||
|
|||||||
10
poetry.lock
generated
10
poetry.lock
generated
@@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aiohappyeyeballs"
|
name = "aiohappyeyeballs"
|
||||||
@@ -3118,13 +3118,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "werkzeug"
|
name = "werkzeug"
|
||||||
version = "3.0.6"
|
version = "3.0.3"
|
||||||
description = "The comprehensive WSGI web application library."
|
description = "The comprehensive WSGI web application library."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"},
|
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
|
||||||
{file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"},
|
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@@ -3406,4 +3406,4 @@ cffi = ["cffi (>=1.11)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf"
|
content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
use tracing::{debug, info};
|
use tracing::{info, warn};
|
||||||
|
|
||||||
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
|
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
|
||||||
use crate::auth::{self, AuthFlow};
|
use crate::auth::{self, AuthFlow};
|
||||||
@@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext(
|
|||||||
secret: AuthSecret,
|
secret: AuthSecret,
|
||||||
config: &'static AuthenticationConfig,
|
config: &'static AuthenticationConfig,
|
||||||
) -> auth::Result<ComputeCredentials> {
|
) -> auth::Result<ComputeCredentials> {
|
||||||
debug!("cleartext auth flow override is enabled, proceeding");
|
warn!("cleartext auth flow override is enabled, proceeding");
|
||||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||||
|
|
||||||
// pause the timer while we communicate with the client
|
// pause the timer while we communicate with the client
|
||||||
@@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
|
|||||||
info: ComputeUserInfoNoEndpoint,
|
info: ComputeUserInfoNoEndpoint,
|
||||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
|
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
|
||||||
debug!("project not specified, resorting to the password hack auth flow");
|
warn!("project not specified, resorting to the password hack auth flow");
|
||||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||||
|
|
||||||
// pause the timer while we communicate with the client
|
// pause the timer while we communicate with the client
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ use std::time::{Duration, SystemTime};
|
|||||||
use arc_swap::ArcSwapOption;
|
use arc_swap::ArcSwapOption;
|
||||||
use dashmap::DashMap;
|
use dashmap::DashMap;
|
||||||
use jose_jwk::crypto::KeyInfo;
|
use jose_jwk::crypto::KeyInfo;
|
||||||
use reqwest::{redirect, Client};
|
|
||||||
use serde::de::Visitor;
|
use serde::de::Visitor;
|
||||||
use serde::{Deserialize, Deserializer};
|
use serde::{Deserialize, Deserializer};
|
||||||
use signature::Verifier;
|
use signature::Verifier;
|
||||||
@@ -25,7 +24,6 @@ const MIN_RENEW: Duration = Duration::from_secs(30);
|
|||||||
const AUTO_RENEW: Duration = Duration::from_secs(300);
|
const AUTO_RENEW: Duration = Duration::from_secs(300);
|
||||||
const MAX_RENEW: Duration = Duration::from_secs(3600);
|
const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||||
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||||
const JWKS_USER_AGENT: &str = "neon-proxy";
|
|
||||||
|
|
||||||
/// How to get the JWT auth rules
|
/// How to get the JWT auth rules
|
||||||
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
|
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||||
@@ -52,6 +50,7 @@ pub(crate) struct AuthRule {
|
|||||||
pub(crate) role_names: Vec<RoleNameInt>,
|
pub(crate) role_names: Vec<RoleNameInt>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
pub struct JwkCache {
|
pub struct JwkCache {
|
||||||
client: reqwest::Client,
|
client: reqwest::Client,
|
||||||
|
|
||||||
@@ -358,20 +357,6 @@ impl JwkCache {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for JwkCache {
|
|
||||||
fn default() -> Self {
|
|
||||||
let client = Client::builder()
|
|
||||||
.user_agent(JWKS_USER_AGENT)
|
|
||||||
.redirect(redirect::Policy::none())
|
|
||||||
.build()
|
|
||||||
.expect("using &str and standard redirect::Policy");
|
|
||||||
JwkCache {
|
|
||||||
client,
|
|
||||||
map: DashMap::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> {
|
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> {
|
||||||
use ecdsa::Signature;
|
use ecdsa::Signature;
|
||||||
use signature::Verifier;
|
use signature::Verifier;
|
||||||
|
|||||||
@@ -21,10 +21,7 @@ use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserIn
|
|||||||
use crate::cache::Cached;
|
use crate::cache::Cached;
|
||||||
use crate::config::AuthenticationConfig;
|
use crate::config::AuthenticationConfig;
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
use crate::control_plane::errors::GetAuthInfoError;
|
use crate::control_plane::provider::{CachedNodeInfo, ControlPlaneBackend};
|
||||||
use crate::control_plane::provider::{
|
|
||||||
CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend,
|
|
||||||
};
|
|
||||||
use crate::control_plane::{self, Api, AuthSecret};
|
use crate::control_plane::{self, Api, AuthSecret};
|
||||||
use crate::intern::EndpointIdInt;
|
use crate::intern::EndpointIdInt;
|
||||||
use crate::metrics::Metrics;
|
use crate::metrics::Metrics;
|
||||||
@@ -35,38 +32,19 @@ use crate::stream::Stream;
|
|||||||
use crate::types::{EndpointCacheKey, EndpointId, RoleName};
|
use crate::types::{EndpointCacheKey, EndpointId, RoleName};
|
||||||
use crate::{scram, stream};
|
use crate::{scram, stream};
|
||||||
|
|
||||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
/// The [crate::serverless] module can authenticate either using control-plane
|
||||||
pub enum MaybeOwned<'a, T> {
|
/// to get authentication state, or by using JWKs stored in the filesystem.
|
||||||
Owned(T),
|
#[derive(Clone, Copy)]
|
||||||
Borrowed(&'a T),
|
pub enum ServerlessBackend<'a> {
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> std::ops::Deref for MaybeOwned<'_, T> {
|
|
||||||
type Target = T;
|
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
|
||||||
match self {
|
|
||||||
MaybeOwned::Owned(t) => t,
|
|
||||||
MaybeOwned::Borrowed(t) => t,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This type serves two purposes:
|
|
||||||
///
|
|
||||||
/// * When `T` is `()`, it's just a regular auth backend selector
|
|
||||||
/// which we use in [`crate::config::ProxyConfig`].
|
|
||||||
///
|
|
||||||
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
|
|
||||||
/// this helps us provide the credentials only to those auth
|
|
||||||
/// backends which require them for the authentication process.
|
|
||||||
pub enum Backend<'a, T> {
|
|
||||||
/// Cloud API (V2).
|
/// Cloud API (V2).
|
||||||
ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T),
|
ControlPlane(&'a ControlPlaneBackend),
|
||||||
/// Local proxy uses configured auth credentials and does not wake compute
|
/// Local proxy uses configured auth credentials and does not wake compute
|
||||||
Local(MaybeOwned<'a, LocalBackend>),
|
Local(&'a LocalBackend),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret};
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) trait TestBackend: Send + Sync + 'static {
|
pub(crate) trait TestBackend: Send + Sync + 'static {
|
||||||
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
|
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
|
||||||
@@ -83,56 +61,20 @@ impl Clone for Box<dyn TestBackend> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for Backend<'_, ()> {
|
impl std::fmt::Display for ControlPlaneBackend {
|
||||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::ControlPlane(api, ()) => match &**api {
|
ControlPlaneBackend::Management(endpoint) => fmt
|
||||||
ControlPlaneBackend::Management(endpoint) => fmt
|
.debug_tuple("ControlPlane::Management")
|
||||||
.debug_tuple("ControlPlane::Management")
|
.field(&endpoint.url())
|
||||||
.field(&endpoint.url())
|
.finish(),
|
||||||
.finish(),
|
#[cfg(any(test, feature = "testing"))]
|
||||||
#[cfg(any(test, feature = "testing"))]
|
ControlPlaneBackend::PostgresMock(endpoint) => fmt
|
||||||
ControlPlaneBackend::PostgresMock(endpoint) => fmt
|
.debug_tuple("ControlPlane::PostgresMock")
|
||||||
.debug_tuple("ControlPlane::PostgresMock")
|
.field(&endpoint.url())
|
||||||
.field(&endpoint.url())
|
.finish(),
|
||||||
.finish(),
|
#[cfg(test)]
|
||||||
#[cfg(test)]
|
ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
|
||||||
ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
|
|
||||||
},
|
|
||||||
Self::Local(_) => fmt.debug_tuple("Local").finish(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Backend<'_, T> {
|
|
||||||
/// Very similar to [`std::option::Option::as_ref`].
|
|
||||||
/// This helps us pass structured config to async tasks.
|
|
||||||
pub(crate) fn as_ref(&self) -> Backend<'_, &T> {
|
|
||||||
match self {
|
|
||||||
Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x),
|
|
||||||
Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, T> Backend<'a, T> {
|
|
||||||
/// Very similar to [`std::option::Option::map`].
|
|
||||||
/// Maps [`Backend<T>`] to [`Backend<R>`] by applying
|
|
||||||
/// a function to a contained value.
|
|
||||||
pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> {
|
|
||||||
match self {
|
|
||||||
Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)),
|
|
||||||
Self::Local(l) => Backend::Local(l),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl<'a, T, E> Backend<'a, Result<T, E>> {
|
|
||||||
/// Very similar to [`std::option::Option::transpose`].
|
|
||||||
/// This is most useful for error handling.
|
|
||||||
pub(crate) fn transpose(self) -> Result<Backend<'a, T>, E> {
|
|
||||||
match self {
|
|
||||||
Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)),
|
|
||||||
Self::Local(l) => Ok(Backend::Local(l)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -399,96 +341,79 @@ async fn authenticate_with_secret(
|
|||||||
classic::authenticate(ctx, info, client, config, secret).await
|
classic::authenticate(ctx, info, client, config, secret).await
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
|
impl ControlPlaneBackend {
|
||||||
/// Get username from the credentials.
|
|
||||||
pub(crate) fn get_user(&self) -> &str {
|
|
||||||
match self {
|
|
||||||
Self::ControlPlane(_, user_info) => &user_info.user,
|
|
||||||
Self::Local(_) => "local",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
|
||||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||||
pub(crate) async fn authenticate(
|
pub(crate) async fn authenticate(
|
||||||
self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
|
user_info: ComputeUserInfoMaybeEndpoint,
|
||||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
allow_cleartext: bool,
|
allow_cleartext: bool,
|
||||||
config: &'static AuthenticationConfig,
|
config: &'static AuthenticationConfig,
|
||||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||||
) -> auth::Result<Backend<'a, ComputeCredentials>> {
|
) -> auth::Result<ControlPlaneComputeBackend> {
|
||||||
let res = match self {
|
info!(
|
||||||
Self::ControlPlane(api, user_info) => {
|
user = &*user_info.user,
|
||||||
info!(
|
project = user_info.endpoint(),
|
||||||
user = &*user_info.user,
|
"performing authentication using the console"
|
||||||
project = user_info.endpoint(),
|
);
|
||||||
"performing authentication using the console"
|
|
||||||
);
|
|
||||||
|
|
||||||
let credentials = auth_quirks(
|
let credentials = auth_quirks(
|
||||||
ctx,
|
ctx,
|
||||||
&*api,
|
self,
|
||||||
user_info,
|
user_info,
|
||||||
client,
|
client,
|
||||||
allow_cleartext,
|
allow_cleartext,
|
||||||
config,
|
config,
|
||||||
endpoint_rate_limiter,
|
endpoint_rate_limiter,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
Backend::ControlPlane(api, credentials)
|
|
||||||
}
|
|
||||||
Self::Local(_) => {
|
|
||||||
return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
info!("user successfully authenticated");
|
info!("user successfully authenticated");
|
||||||
Ok(res)
|
Ok(ControlPlaneComputeBackend {
|
||||||
|
api: self,
|
||||||
|
creds: credentials,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn attach_to_credentials(
|
||||||
|
&self,
|
||||||
|
creds: ComputeCredentials,
|
||||||
|
) -> ControlPlaneComputeBackend {
|
||||||
|
ControlPlaneComputeBackend { api: self, creds }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Backend<'_, ComputeUserInfo> {
|
pub struct ControlPlaneComputeBackend<'a> {
|
||||||
pub(crate) async fn get_role_secret(
|
api: &'a ControlPlaneBackend,
|
||||||
&self,
|
creds: ComputeCredentials,
|
||||||
ctx: &RequestMonitoring,
|
|
||||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
|
||||||
match self {
|
|
||||||
Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
|
||||||
Self::Local(_) => Ok(Cached::new_uncached(None)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) async fn get_allowed_ips_and_secret(
|
|
||||||
&self,
|
|
||||||
ctx: &RequestMonitoring,
|
|
||||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
|
||||||
match self {
|
|
||||||
Self::ControlPlane(api, user_info) => {
|
|
||||||
api.get_allowed_ips_and_secret(ctx, user_info).await
|
|
||||||
}
|
|
||||||
Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
|
impl ComputeConnectBackend for ControlPlaneComputeBackend<'static> {
|
||||||
async fn wake_compute(
|
async fn wake_compute(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
|
) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
|
||||||
match self {
|
self.api.wake_compute(ctx, &self.creds.info).await
|
||||||
Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
|
||||||
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_keys(&self) -> &ComputeCredentialKeys {
|
fn get_keys(&self) -> &ComputeCredentialKeys {
|
||||||
match self {
|
&self.creds.keys
|
||||||
Self::ControlPlane(_, creds) => &creds.keys,
|
}
|
||||||
Self::Local(_) => &ComputeCredentialKeys::None,
|
}
|
||||||
}
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ComputeConnectBackend for LocalBackend {
|
||||||
|
async fn wake_compute(
|
||||||
|
&self,
|
||||||
|
_ctx: &RequestMonitoring,
|
||||||
|
) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
|
||||||
|
Ok(Cached::new_uncached(self.node_info.clone()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_keys(&self) -> &ComputeCredentialKeys {
|
||||||
|
&ComputeCredentialKeys::None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
//! Client authentication mechanisms.
|
//! Client authentication mechanisms.
|
||||||
|
|
||||||
pub mod backend;
|
pub mod backend;
|
||||||
pub use backend::Backend;
|
pub use backend::ServerlessBackend;
|
||||||
|
|
||||||
mod credentials;
|
mod credentials;
|
||||||
pub(crate) use credentials::{
|
pub(crate) use credentials::{
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
let task = serverless::task_main(
|
let task = serverless::task_main(
|
||||||
config,
|
config,
|
||||||
auth_backend,
|
auth::ServerlessBackend::Local(auth_backend),
|
||||||
http_listener,
|
http_listener,
|
||||||
shutdown.clone(),
|
shutdown.clone(),
|
||||||
Arc::new(CancellationHandlerMain::new(
|
Arc::new(CancellationHandlerMain::new(
|
||||||
@@ -295,12 +295,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// auth::Backend is created at proxy startup, and lives forever.
|
/// auth::Backend is created at proxy startup, and lives forever.
|
||||||
fn build_auth_backend(
|
fn build_auth_backend(args: &LocalProxyCliArgs) -> anyhow::Result<&'static LocalBackend> {
|
||||||
args: &LocalProxyCliArgs,
|
let auth_backend = LocalBackend::new(args.postgres, args.compute_ctl.clone());
|
||||||
) -> anyhow::Result<&'static auth::Backend<'static, ()>> {
|
|
||||||
let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
|
|
||||||
LocalBackend::new(args.postgres, args.compute_ctl.clone()),
|
|
||||||
));
|
|
||||||
|
|
||||||
Ok(Box::leak(Box::new(auth_backend)))
|
Ok(Box::leak(Box::new(auth_backend)))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,13 +13,14 @@ use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
|
|||||||
use aws_config::Region;
|
use aws_config::Region;
|
||||||
use futures::future::Either;
|
use futures::future::Either;
|
||||||
use proxy::auth::backend::jwt::JwkCache;
|
use proxy::auth::backend::jwt::JwkCache;
|
||||||
use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
|
use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend};
|
||||||
use proxy::cancellation::{CancelMap, CancellationHandler};
|
use proxy::cancellation::{CancelMap, CancellationHandler};
|
||||||
use proxy::config::{
|
use proxy::config::{
|
||||||
self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig,
|
self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig,
|
||||||
ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2,
|
ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2,
|
||||||
};
|
};
|
||||||
use proxy::context::parquet::ParquetUploadArgs;
|
use proxy::context::parquet::ParquetUploadArgs;
|
||||||
|
use proxy::control_plane::provider::ControlPlaneBackend;
|
||||||
use proxy::http::health_server::AppMetrics;
|
use proxy::http::health_server::AppMetrics;
|
||||||
use proxy::metrics::Metrics;
|
use proxy::metrics::Metrics;
|
||||||
use proxy::rate_limiter::{
|
use proxy::rate_limiter::{
|
||||||
@@ -137,6 +138,9 @@ struct ProxyCliArgs {
|
|||||||
/// size of the threadpool for password hashing
|
/// size of the threadpool for password hashing
|
||||||
#[clap(long, default_value_t = 4)]
|
#[clap(long, default_value_t = 4)]
|
||||||
scram_thread_pool_size: u8,
|
scram_thread_pool_size: u8,
|
||||||
|
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
|
||||||
|
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||||
|
disable_dynamic_rate_limiter: bool,
|
||||||
/// Endpoint rate limiter max number of requests per second.
|
/// Endpoint rate limiter max number of requests per second.
|
||||||
///
|
///
|
||||||
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
||||||
@@ -464,7 +468,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
if let Some(serverless_listener) = serverless_listener {
|
if let Some(serverless_listener) = serverless_listener {
|
||||||
client_tasks.spawn(serverless::task_main(
|
client_tasks.spawn(serverless::task_main(
|
||||||
config,
|
config,
|
||||||
auth_backend,
|
auth::ServerlessBackend::ControlPlane(auth_backend),
|
||||||
serverless_listener,
|
serverless_listener,
|
||||||
cancellation_token.clone(),
|
cancellation_token.clone(),
|
||||||
cancellation_handler.clone(),
|
cancellation_handler.clone(),
|
||||||
@@ -512,40 +516,38 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
|
if let Either::Left(ControlPlaneBackend::Management(api)) = &auth_backend {
|
||||||
if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api {
|
match (redis_notifications_client, regional_redis_client.clone()) {
|
||||||
match (redis_notifications_client, regional_redis_client.clone()) {
|
(None, None) => {}
|
||||||
(None, None) => {}
|
(client1, client2) => {
|
||||||
(client1, client2) => {
|
let cache = api.caches.project_info.clone();
|
||||||
let cache = api.caches.project_info.clone();
|
if let Some(client) = client1 {
|
||||||
if let Some(client) = client1 {
|
maintenance_tasks.spawn(notifications::task_main(
|
||||||
maintenance_tasks.spawn(notifications::task_main(
|
client,
|
||||||
client,
|
cache.clone(),
|
||||||
cache.clone(),
|
cancel_map.clone(),
|
||||||
cancel_map.clone(),
|
args.region.clone(),
|
||||||
args.region.clone(),
|
));
|
||||||
));
|
|
||||||
}
|
|
||||||
if let Some(client) = client2 {
|
|
||||||
maintenance_tasks.spawn(notifications::task_main(
|
|
||||||
client,
|
|
||||||
cache.clone(),
|
|
||||||
cancel_map.clone(),
|
|
||||||
args.region.clone(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
|
||||||
}
|
}
|
||||||
|
if let Some(client) = client2 {
|
||||||
|
maintenance_tasks.spawn(notifications::task_main(
|
||||||
|
client,
|
||||||
|
cache.clone(),
|
||||||
|
cancel_map.clone(),
|
||||||
|
args.region.clone(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||||
}
|
}
|
||||||
if let Some(regional_redis_client) = regional_redis_client {
|
}
|
||||||
let cache = api.caches.endpoints_cache.clone();
|
if let Some(regional_redis_client) = regional_redis_client {
|
||||||
let con = regional_redis_client;
|
let cache = api.caches.endpoints_cache.clone();
|
||||||
let span = tracing::info_span!("endpoints_cache");
|
let con = regional_redis_client;
|
||||||
maintenance_tasks.spawn(
|
let span = tracing::info_span!("endpoints_cache");
|
||||||
async move { cache.do_read(con, cancellation_token.clone()).await }
|
maintenance_tasks.spawn(
|
||||||
.instrument(span),
|
async move { cache.do_read(con, cancellation_token.clone()).await }
|
||||||
);
|
.instrument(span),
|
||||||
}
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -612,6 +614,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
|||||||
and metric-collection-interval must be specified"
|
and metric-collection-interval must be specified"
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
if !args.disable_dynamic_rate_limiter {
|
||||||
|
bail!("dynamic rate limiter should be disabled");
|
||||||
|
}
|
||||||
|
|
||||||
let config::ConcurrencyLockOptions {
|
let config::ConcurrencyLockOptions {
|
||||||
shards,
|
shards,
|
||||||
@@ -688,7 +693,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
|||||||
/// auth::Backend is created at proxy startup, and lives forever.
|
/// auth::Backend is created at proxy startup, and lives forever.
|
||||||
fn build_auth_backend(
|
fn build_auth_backend(
|
||||||
args: &ProxyCliArgs,
|
args: &ProxyCliArgs,
|
||||||
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
|
) -> anyhow::Result<Either<&'static ControlPlaneBackend, &'static ConsoleRedirectBackend>> {
|
||||||
match &args.auth_backend {
|
match &args.auth_backend {
|
||||||
AuthBackendType::Console => {
|
AuthBackendType::Console => {
|
||||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||||
@@ -738,8 +743,7 @@ fn build_auth_backend(
|
|||||||
locks,
|
locks,
|
||||||
wake_compute_endpoint_rate_limiter,
|
wake_compute_endpoint_rate_limiter,
|
||||||
);
|
);
|
||||||
let api = control_plane::provider::ControlPlaneBackend::Management(api);
|
let auth_backend = control_plane::provider::ControlPlaneBackend::Management(api);
|
||||||
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
|
|
||||||
|
|
||||||
let config = Box::leak(Box::new(auth_backend));
|
let config = Box::leak(Box::new(auth_backend));
|
||||||
|
|
||||||
@@ -750,9 +754,7 @@ fn build_auth_backend(
|
|||||||
AuthBackendType::Postgres => {
|
AuthBackendType::Postgres => {
|
||||||
let url = args.auth_endpoint.parse()?;
|
let url = args.auth_endpoint.parse()?;
|
||||||
let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
|
let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
|
||||||
let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
|
let auth_backend = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
|
||||||
|
|
||||||
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
|
|
||||||
|
|
||||||
let config = Box::leak(Box::new(auth_backend));
|
let config = Box::leak(Box::new(auth_backend));
|
||||||
|
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ pub(crate) trait ConnectMechanism {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub(crate) trait ComputeConnectBackend {
|
pub(crate) trait ComputeConnectBackend: Send + Sync + 'static {
|
||||||
async fn wake_compute(
|
async fn wake_compute(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
@@ -98,10 +98,10 @@ impl ConnectMechanism for TcpMechanism<'_> {
|
|||||||
|
|
||||||
/// Try to connect to the compute node, retrying if necessary.
|
/// Try to connect to the compute node, retrying if necessary.
|
||||||
#[tracing::instrument(skip_all)]
|
#[tracing::instrument(skip_all)]
|
||||||
pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
pub(crate) async fn connect_to_compute<M: ConnectMechanism>(
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
mechanism: &M,
|
mechanism: &M,
|
||||||
user_info: &B,
|
user_info: &dyn ComputeConnectBackend,
|
||||||
allow_self_signed_compute: bool,
|
allow_self_signed_compute: bool,
|
||||||
wake_compute_retry_config: RetryConfig,
|
wake_compute_retry_config: RetryConfig,
|
||||||
connect_to_compute_retry_config: RetryConfig,
|
connect_to_compute_retry_config: RetryConfig,
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ use self::passthrough::ProxyPassthrough;
|
|||||||
use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal};
|
use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal};
|
||||||
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
|
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
|
use crate::control_plane::provider::ControlPlaneBackend;
|
||||||
use crate::error::ReportableError;
|
use crate::error::ReportableError;
|
||||||
use crate::metrics::{Metrics, NumClientConnectionsGuard};
|
use crate::metrics::{Metrics, NumClientConnectionsGuard};
|
||||||
use crate::protocol2::read_proxy_protocol;
|
use crate::protocol2::read_proxy_protocol;
|
||||||
@@ -54,7 +55,7 @@ pub async fn run_until_cancelled<F: std::future::Future>(
|
|||||||
|
|
||||||
pub async fn task_main(
|
pub async fn task_main(
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
auth_backend: &'static auth::Backend<'static, ()>,
|
auth_backend: &'static ControlPlaneBackend,
|
||||||
listener: tokio::net::TcpListener,
|
listener: tokio::net::TcpListener,
|
||||||
cancellation_token: CancellationToken,
|
cancellation_token: CancellationToken,
|
||||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||||
@@ -241,7 +242,7 @@ impl ReportableError for ClientRequestError {
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
auth_backend: &'static auth::Backend<'static, ()>,
|
auth_backend: &'static ControlPlaneBackend,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||||
stream: S,
|
stream: S,
|
||||||
@@ -282,20 +283,17 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
|||||||
let common_names = tls.map(|tls| &tls.common_names);
|
let common_names = tls.map(|tls| &tls.common_names);
|
||||||
|
|
||||||
// Extract credentials which we're going to use for auth.
|
// Extract credentials which we're going to use for auth.
|
||||||
let result = auth_backend
|
let result = auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names);
|
||||||
.as_ref()
|
|
||||||
.map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
|
|
||||||
.transpose();
|
|
||||||
|
|
||||||
let user_info = match result {
|
let user_info = match result {
|
||||||
Ok(user_info) => user_info,
|
Ok(user_info) => user_info,
|
||||||
Err(e) => stream.throw_error(e).await?,
|
Err(e) => stream.throw_error(e).await?,
|
||||||
};
|
};
|
||||||
|
|
||||||
let user = user_info.get_user().to_owned();
|
let user = user_info.user.clone();
|
||||||
let user_info = match user_info
|
let user_info = match auth_backend
|
||||||
.authenticate(
|
.authenticate(
|
||||||
ctx,
|
ctx,
|
||||||
|
user_info,
|
||||||
&mut stream,
|
&mut stream,
|
||||||
mode.allow_cleartext(),
|
mode.allow_cleartext(),
|
||||||
&config.authentication_config,
|
&config.authentication_config,
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use auth::backend::ControlPlaneComputeBackend;
|
||||||
use http::StatusCode;
|
use http::StatusCode;
|
||||||
use retry::{retry_after, ShouldRetryWakeCompute};
|
use retry::{retry_after, ShouldRetryWakeCompute};
|
||||||
use rstest::rstest;
|
use rstest::rstest;
|
||||||
@@ -19,7 +20,7 @@ use super::connect_compute::ConnectMechanism;
|
|||||||
use super::retry::CouldRetry;
|
use super::retry::CouldRetry;
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::auth::backend::{
|
use crate::auth::backend::{
|
||||||
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
|
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, TestBackend,
|
||||||
};
|
};
|
||||||
use crate::config::{CertResolver, RetryConfig};
|
use crate::config::{CertResolver, RetryConfig};
|
||||||
use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status};
|
use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status};
|
||||||
@@ -566,19 +567,21 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
|
|||||||
|
|
||||||
fn helper_create_connect_info(
|
fn helper_create_connect_info(
|
||||||
mechanism: &TestConnectMechanism,
|
mechanism: &TestConnectMechanism,
|
||||||
) -> auth::Backend<'static, ComputeCredentials> {
|
) -> ControlPlaneComputeBackend<'static> {
|
||||||
let user_info = auth::Backend::ControlPlane(
|
let api = Box::leak(Box::new(ControlPlaneBackend::Test(Box::new(
|
||||||
MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))),
|
mechanism.clone(),
|
||||||
ComputeCredentials {
|
))));
|
||||||
info: ComputeUserInfo {
|
|
||||||
endpoint: "endpoint".into(),
|
let creds = ComputeCredentials {
|
||||||
user: "user".into(),
|
info: ComputeUserInfo {
|
||||||
options: NeonOptions::parse_options_raw(""),
|
endpoint: "endpoint".into(),
|
||||||
},
|
user: "user".into(),
|
||||||
keys: ComputeCredentialKeys::Password("password".into()),
|
options: NeonOptions::parse_options_raw(""),
|
||||||
},
|
},
|
||||||
);
|
keys: ComputeCredentialKeys::Password("password".into()),
|
||||||
user_info
|
};
|
||||||
|
|
||||||
|
api.attach_to_credentials(creds)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ use crate::metrics::{
|
|||||||
};
|
};
|
||||||
use crate::proxy::retry::{retry_after, should_retry};
|
use crate::proxy::retry::{retry_after, should_retry};
|
||||||
|
|
||||||
pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
|
pub(crate) async fn wake_compute(
|
||||||
num_retries: &mut u32,
|
num_retries: &mut u32,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
api: &B,
|
api: &dyn ComputeConnectBackend,
|
||||||
config: RetryConfig,
|
config: RetryConfig,
|
||||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||||
let retry_type = RetryType::WakeCompute;
|
let retry_type = RetryType::WakeCompute;
|
||||||
|
|||||||
@@ -15,9 +15,9 @@ use super::conn_pool::poll_client;
|
|||||||
use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
|
use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
|
||||||
use super::http_conn_pool::{self, poll_http2_client, Send};
|
use super::http_conn_pool::{self, poll_http2_client, Send};
|
||||||
use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
|
use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
|
||||||
use crate::auth::backend::local::StaticAuthRules;
|
use crate::auth::backend::local::{LocalBackend, StaticAuthRules};
|
||||||
use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
|
use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
|
||||||
use crate::auth::{self, check_peer_addr_is_in_list, AuthError};
|
use crate::auth::{check_peer_addr_is_in_list, AuthError, ServerlessBackend};
|
||||||
use crate::compute;
|
use crate::compute;
|
||||||
use crate::compute_ctl::{
|
use crate::compute_ctl::{
|
||||||
ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
|
ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
|
||||||
@@ -26,11 +26,11 @@ use crate::config::ProxyConfig;
|
|||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
|
use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
|
||||||
use crate::control_plane::locks::ApiLocks;
|
use crate::control_plane::locks::ApiLocks;
|
||||||
use crate::control_plane::provider::ApiLockError;
|
use crate::control_plane::provider::{ApiLockError, ControlPlaneBackend};
|
||||||
use crate::control_plane::CachedNodeInfo;
|
use crate::control_plane::{Api, CachedNodeInfo};
|
||||||
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
||||||
use crate::intern::EndpointIdInt;
|
use crate::intern::EndpointIdInt;
|
||||||
use crate::proxy::connect_compute::ConnectMechanism;
|
use crate::proxy::connect_compute::{ComputeConnectBackend, ConnectMechanism};
|
||||||
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
|
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
|
||||||
use crate::rate_limiter::EndpointRateLimiter;
|
use crate::rate_limiter::EndpointRateLimiter;
|
||||||
use crate::types::{EndpointId, Host};
|
use crate::types::{EndpointId, Host};
|
||||||
@@ -41,7 +41,6 @@ pub(crate) struct PoolingBackend {
|
|||||||
pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
|
pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
|
||||||
|
|
||||||
pub(crate) config: &'static ProxyConfig,
|
pub(crate) config: &'static ProxyConfig,
|
||||||
pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
|
|
||||||
pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -49,12 +48,13 @@ impl PoolingBackend {
|
|||||||
pub(crate) async fn authenticate_with_password(
|
pub(crate) async fn authenticate_with_password(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
|
auth_backend: &ControlPlaneBackend,
|
||||||
user_info: &ComputeUserInfo,
|
user_info: &ComputeUserInfo,
|
||||||
password: &[u8],
|
password: &[u8],
|
||||||
) -> Result<ComputeCredentials, AuthError> {
|
) -> Result<ComputeCredentials, AuthError> {
|
||||||
let user_info = user_info.clone();
|
let (allowed_ips, maybe_secret) = auth_backend
|
||||||
let backend = self.auth_backend.as_ref().map(|()| user_info.clone());
|
.get_allowed_ips_and_secret(ctx, user_info)
|
||||||
let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
|
.await?;
|
||||||
if self.config.authentication_config.ip_allowlist_check_enabled
|
if self.config.authentication_config.ip_allowlist_check_enabled
|
||||||
&& !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
|
&& !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
|
||||||
{
|
{
|
||||||
@@ -68,7 +68,7 @@ impl PoolingBackend {
|
|||||||
}
|
}
|
||||||
let cached_secret = match maybe_secret {
|
let cached_secret = match maybe_secret {
|
||||||
Some(secret) => secret,
|
Some(secret) => secret,
|
||||||
None => backend.get_role_secret(ctx).await?,
|
None => auth_backend.get_role_secret(ctx, user_info).await?,
|
||||||
};
|
};
|
||||||
|
|
||||||
let secret = match cached_secret.value.clone() {
|
let secret = match cached_secret.value.clone() {
|
||||||
@@ -103,7 +103,7 @@ impl PoolingBackend {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
res.map(|key| ComputeCredentials {
|
res.map(|key| ComputeCredentials {
|
||||||
info: user_info,
|
info: user_info.clone(),
|
||||||
keys: key,
|
keys: key,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -111,11 +111,12 @@ impl PoolingBackend {
|
|||||||
pub(crate) async fn authenticate_with_jwt(
|
pub(crate) async fn authenticate_with_jwt(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
user_info: &ComputeUserInfo,
|
user_info: &ComputeUserInfo,
|
||||||
jwt: String,
|
jwt: String,
|
||||||
) -> Result<ComputeCredentials, AuthError> {
|
) -> Result<ComputeCredentials, AuthError> {
|
||||||
match &self.auth_backend {
|
match auth_backend {
|
||||||
crate::auth::Backend::ControlPlane(console, ()) => {
|
ServerlessBackend::ControlPlane(console) => {
|
||||||
self.config
|
self.config
|
||||||
.authentication_config
|
.authentication_config
|
||||||
.jwks_cache
|
.jwks_cache
|
||||||
@@ -123,7 +124,7 @@ impl PoolingBackend {
|
|||||||
ctx,
|
ctx,
|
||||||
user_info.endpoint.clone(),
|
user_info.endpoint.clone(),
|
||||||
&user_info.user,
|
&user_info.user,
|
||||||
&**console,
|
console,
|
||||||
&jwt,
|
&jwt,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -134,7 +135,7 @@ impl PoolingBackend {
|
|||||||
keys: crate::auth::backend::ComputeCredentialKeys::None,
|
keys: crate::auth::backend::ComputeCredentialKeys::None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
crate::auth::Backend::Local(_) => {
|
ServerlessBackend::Local(_) => {
|
||||||
let keys = self
|
let keys = self
|
||||||
.config
|
.config
|
||||||
.authentication_config
|
.authentication_config
|
||||||
@@ -164,6 +165,7 @@ impl PoolingBackend {
|
|||||||
pub(crate) async fn connect_to_compute(
|
pub(crate) async fn connect_to_compute(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
conn_info: ConnInfo,
|
conn_info: ConnInfo,
|
||||||
keys: ComputeCredentials,
|
keys: ComputeCredentials,
|
||||||
force_new: bool,
|
force_new: bool,
|
||||||
@@ -182,7 +184,14 @@ impl PoolingBackend {
|
|||||||
let conn_id = uuid::Uuid::new_v4();
|
let conn_id = uuid::Uuid::new_v4();
|
||||||
tracing::Span::current().record("conn_id", display(conn_id));
|
tracing::Span::current().record("conn_id", display(conn_id));
|
||||||
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
||||||
let backend = self.auth_backend.as_ref().map(|()| keys);
|
|
||||||
|
let api = match auth_backend {
|
||||||
|
ServerlessBackend::ControlPlane(cplane) => {
|
||||||
|
&cplane.attach_to_credentials(keys) as &dyn ComputeConnectBackend
|
||||||
|
}
|
||||||
|
ServerlessBackend::Local(local_proxy) => local_proxy as &dyn ComputeConnectBackend,
|
||||||
|
};
|
||||||
|
|
||||||
crate::proxy::connect_compute::connect_to_compute(
|
crate::proxy::connect_compute::connect_to_compute(
|
||||||
ctx,
|
ctx,
|
||||||
&TokioMechanism {
|
&TokioMechanism {
|
||||||
@@ -191,7 +200,7 @@ impl PoolingBackend {
|
|||||||
pool: self.pool.clone(),
|
pool: self.pool.clone(),
|
||||||
locks: &self.config.connect_compute_locks,
|
locks: &self.config.connect_compute_locks,
|
||||||
},
|
},
|
||||||
&backend,
|
api,
|
||||||
false, // do not allow self signed compute for http flow
|
false, // do not allow self signed compute for http flow
|
||||||
self.config.wake_compute_retry_config,
|
self.config.wake_compute_retry_config,
|
||||||
self.config.connect_to_compute_retry_config,
|
self.config.connect_to_compute_retry_config,
|
||||||
@@ -204,6 +213,7 @@ impl PoolingBackend {
|
|||||||
pub(crate) async fn connect_to_local_proxy(
|
pub(crate) async fn connect_to_local_proxy(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
|
auth_backend: &'static ControlPlaneBackend,
|
||||||
conn_info: ConnInfo,
|
conn_info: ConnInfo,
|
||||||
) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
|
) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
|
||||||
info!("pool: looking for an existing connection");
|
info!("pool: looking for an existing connection");
|
||||||
@@ -214,7 +224,8 @@ impl PoolingBackend {
|
|||||||
let conn_id = uuid::Uuid::new_v4();
|
let conn_id = uuid::Uuid::new_v4();
|
||||||
tracing::Span::current().record("conn_id", display(conn_id));
|
tracing::Span::current().record("conn_id", display(conn_id));
|
||||||
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
||||||
let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials {
|
|
||||||
|
let backend = auth_backend.attach_to_credentials(ComputeCredentials {
|
||||||
info: ComputeUserInfo {
|
info: ComputeUserInfo {
|
||||||
user: conn_info.user_info.user.clone(),
|
user: conn_info.user_info.user.clone(),
|
||||||
endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)),
|
endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)),
|
||||||
@@ -249,26 +260,20 @@ impl PoolingBackend {
|
|||||||
pub(crate) async fn connect_to_local_postgres(
|
pub(crate) async fn connect_to_local_postgres(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
|
auth_backend: &LocalBackend,
|
||||||
conn_info: ConnInfo,
|
conn_info: ConnInfo,
|
||||||
) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> {
|
) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> {
|
||||||
if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
|
if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
|
||||||
return Ok(client);
|
return Ok(client);
|
||||||
}
|
}
|
||||||
|
|
||||||
let local_backend = match &self.auth_backend {
|
|
||||||
auth::Backend::ControlPlane(_, ()) => {
|
|
||||||
unreachable!("only local_proxy can connect to local postgres")
|
|
||||||
}
|
|
||||||
auth::Backend::Local(local) => local,
|
|
||||||
};
|
|
||||||
|
|
||||||
if !self.local_pool.initialized(&conn_info) {
|
if !self.local_pool.initialized(&conn_info) {
|
||||||
// only install and grant usage one at a time.
|
// only install and grant usage one at a time.
|
||||||
let _permit = local_backend.initialize.acquire().await.unwrap();
|
let _permit = auth_backend.initialize.acquire().await.unwrap();
|
||||||
|
|
||||||
// check again for race
|
// check again for race
|
||||||
if !self.local_pool.initialized(&conn_info) {
|
if !self.local_pool.initialized(&conn_info) {
|
||||||
local_backend
|
auth_backend
|
||||||
.compute_ctl
|
.compute_ctl
|
||||||
.install_extension(&ExtensionInstallRequest {
|
.install_extension(&ExtensionInstallRequest {
|
||||||
extension: EXT_NAME,
|
extension: EXT_NAME,
|
||||||
@@ -277,7 +282,7 @@ impl PoolingBackend {
|
|||||||
})
|
})
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
local_backend
|
auth_backend
|
||||||
.compute_ctl
|
.compute_ctl
|
||||||
.grant_role(&SetRoleGrantsRequest {
|
.grant_role(&SetRoleGrantsRequest {
|
||||||
schema: EXT_SCHEMA,
|
schema: EXT_SCHEMA,
|
||||||
@@ -295,7 +300,7 @@ impl PoolingBackend {
|
|||||||
tracing::Span::current().record("conn_id", display(conn_id));
|
tracing::Span::current().record("conn_id", display(conn_id));
|
||||||
info!(%conn_id, "local_pool: opening a new connection '{conn_info}'");
|
info!(%conn_id, "local_pool: opening a new connection '{conn_info}'");
|
||||||
|
|
||||||
let mut node_info = local_backend.node_info.clone();
|
let mut node_info = auth_backend.node_info.clone();
|
||||||
|
|
||||||
let (key, jwk) = create_random_jwk();
|
let (key, jwk) = create_random_jwk();
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ use hyper_util::rt::TokioExecutor;
|
|||||||
use hyper_util::server::conn::auto::Builder;
|
use hyper_util::server::conn::auto::Builder;
|
||||||
use rand::rngs::StdRng;
|
use rand::rngs::StdRng;
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID};
|
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
use tokio::net::{TcpListener, TcpStream};
|
use tokio::net::{TcpListener, TcpStream};
|
||||||
use tokio::time::timeout;
|
use tokio::time::timeout;
|
||||||
@@ -42,6 +41,7 @@ use tokio_util::task::TaskTracker;
|
|||||||
use tracing::{info, warn, Instrument};
|
use tracing::{info, warn, Instrument};
|
||||||
use utils::http::error::ApiError;
|
use utils::http::error::ApiError;
|
||||||
|
|
||||||
|
use crate::auth::ServerlessBackend;
|
||||||
use crate::cancellation::CancellationHandlerMain;
|
use crate::cancellation::CancellationHandlerMain;
|
||||||
use crate::config::ProxyConfig;
|
use crate::config::ProxyConfig;
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
@@ -56,7 +56,7 @@ pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api";
|
|||||||
|
|
||||||
pub async fn task_main(
|
pub async fn task_main(
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
auth_backend: &'static crate::auth::Backend<'static, ()>,
|
auth_backend: ServerlessBackend<'static>,
|
||||||
ws_listener: TcpListener,
|
ws_listener: TcpListener,
|
||||||
cancellation_token: CancellationToken,
|
cancellation_token: CancellationToken,
|
||||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||||
@@ -112,7 +112,6 @@ pub async fn task_main(
|
|||||||
local_pool,
|
local_pool,
|
||||||
pool: Arc::clone(&conn_pool),
|
pool: Arc::clone(&conn_pool),
|
||||||
config,
|
config,
|
||||||
auth_backend,
|
|
||||||
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
|
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
|
||||||
});
|
});
|
||||||
let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
|
let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
|
||||||
@@ -185,6 +184,7 @@ pub async fn task_main(
|
|||||||
|
|
||||||
Box::pin(connection_handler(
|
Box::pin(connection_handler(
|
||||||
config,
|
config,
|
||||||
|
auth_backend,
|
||||||
backend,
|
backend,
|
||||||
connections2,
|
connections2,
|
||||||
cancellation_handler,
|
cancellation_handler,
|
||||||
@@ -290,6 +290,7 @@ async fn connection_startup(
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
async fn connection_handler(
|
async fn connection_handler(
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
connections: TaskTracker,
|
connections: TaskTracker,
|
||||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||||
@@ -310,18 +311,7 @@ async fn connection_handler(
|
|||||||
hyper_util::rt::TokioIo::new(conn),
|
hyper_util::rt::TokioIo::new(conn),
|
||||||
hyper::service::service_fn(move |req: hyper::Request<Incoming>| {
|
hyper::service::service_fn(move |req: hyper::Request<Incoming>| {
|
||||||
// First HTTP request shares the same session ID
|
// First HTTP request shares the same session ID
|
||||||
let mut session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
|
let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);
|
||||||
|
|
||||||
if matches!(backend.auth_backend, crate::auth::Backend::Local(_)) {
|
|
||||||
// take session_id from request, if given.
|
|
||||||
if let Some(id) = req
|
|
||||||
.headers()
|
|
||||||
.get(&NEON_REQUEST_ID)
|
|
||||||
.and_then(|id| uuid::Uuid::try_parse_ascii(id.as_bytes()).ok())
|
|
||||||
{
|
|
||||||
session_id = id;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cancel the current inflight HTTP request if the requets stream is closed.
|
// Cancel the current inflight HTTP request if the requets stream is closed.
|
||||||
// This is slightly different to `_cancel_connection` in that
|
// This is slightly different to `_cancel_connection` in that
|
||||||
@@ -335,6 +325,7 @@ async fn connection_handler(
|
|||||||
request_handler(
|
request_handler(
|
||||||
req,
|
req,
|
||||||
config,
|
config,
|
||||||
|
auth_backend,
|
||||||
backend.clone(),
|
backend.clone(),
|
||||||
connections.clone(),
|
connections.clone(),
|
||||||
cancellation_handler.clone(),
|
cancellation_handler.clone(),
|
||||||
@@ -347,15 +338,8 @@ async fn connection_handler(
|
|||||||
.map_ok_or_else(api_error_into_response, |r| r),
|
.map_ok_or_else(api_error_into_response, |r| r),
|
||||||
);
|
);
|
||||||
async move {
|
async move {
|
||||||
let mut res = handler.await;
|
let res = handler.await;
|
||||||
cancel_request.disarm();
|
cancel_request.disarm();
|
||||||
|
|
||||||
// add the session ID to the response
|
|
||||||
if let Ok(resp) = &mut res {
|
|
||||||
resp.headers_mut()
|
|
||||||
.append(&NEON_REQUEST_ID, uuid_to_header_value(session_id));
|
|
||||||
}
|
|
||||||
|
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
}),
|
}),
|
||||||
@@ -381,6 +365,7 @@ async fn connection_handler(
|
|||||||
async fn request_handler(
|
async fn request_handler(
|
||||||
mut request: hyper::Request<Incoming>,
|
mut request: hyper::Request<Incoming>,
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
ws_connections: TaskTracker,
|
ws_connections: TaskTracker,
|
||||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||||
@@ -401,6 +386,10 @@ async fn request_handler(
|
|||||||
if config.http_config.accept_websockets
|
if config.http_config.accept_websockets
|
||||||
&& framed_websockets::upgrade::is_upgrade_request(&request)
|
&& framed_websockets::upgrade::is_upgrade_request(&request)
|
||||||
{
|
{
|
||||||
|
let ServerlessBackend::ControlPlane(auth_backend) = auth_backend else {
|
||||||
|
return json_response(StatusCode::BAD_REQUEST, "query is not supported");
|
||||||
|
};
|
||||||
|
|
||||||
let ctx = RequestMonitoring::new(
|
let ctx = RequestMonitoring::new(
|
||||||
session_id,
|
session_id,
|
||||||
peer_addr,
|
peer_addr,
|
||||||
@@ -418,7 +407,7 @@ async fn request_handler(
|
|||||||
async move {
|
async move {
|
||||||
if let Err(e) = websocket::serve_websocket(
|
if let Err(e) = websocket::serve_websocket(
|
||||||
config,
|
config,
|
||||||
backend.auth_backend,
|
auth_backend,
|
||||||
ctx,
|
ctx,
|
||||||
websocket,
|
websocket,
|
||||||
cancellation_handler,
|
cancellation_handler,
|
||||||
@@ -444,9 +433,16 @@ async fn request_handler(
|
|||||||
);
|
);
|
||||||
let span = ctx.span();
|
let span = ctx.span();
|
||||||
|
|
||||||
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
|
sql_over_http::handle(
|
||||||
.instrument(span)
|
config,
|
||||||
.await
|
ctx,
|
||||||
|
request,
|
||||||
|
auth_backend,
|
||||||
|
backend,
|
||||||
|
http_cancellation_token,
|
||||||
|
)
|
||||||
|
.instrument(span)
|
||||||
|
.await
|
||||||
} else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS {
|
} else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS {
|
||||||
Response::builder()
|
Response::builder()
|
||||||
.header("Allow", "OPTIONS, POST")
|
.header("Allow", "OPTIONS, POST")
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ use typed_json::json;
|
|||||||
use url::Url;
|
use url::Url;
|
||||||
use urlencoding;
|
use urlencoding;
|
||||||
use utils::http::error::ApiError;
|
use utils::http::error::ApiError;
|
||||||
use uuid::Uuid;
|
|
||||||
|
|
||||||
use super::backend::{LocalProxyConnError, PoolingBackend};
|
use super::backend::{LocalProxyConnError, PoolingBackend};
|
||||||
use super::conn_pool::{AuthData, ConnInfoWithAuth};
|
use super::conn_pool::{AuthData, ConnInfoWithAuth};
|
||||||
@@ -31,10 +30,11 @@ use super::conn_pool_lib::{self, ConnInfo};
|
|||||||
use super::http_util::json_response;
|
use super::http_util::json_response;
|
||||||
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
|
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
|
||||||
use super::local_conn_pool;
|
use super::local_conn_pool;
|
||||||
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
|
use crate::auth::backend::{ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo};
|
||||||
use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
|
use crate::auth::{endpoint_sni, ComputeUserInfoParseError, ServerlessBackend};
|
||||||
use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
|
use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
|
use crate::control_plane::provider::ControlPlaneBackend;
|
||||||
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
use crate::error::{ErrorKind, ReportableError, UserFacingError};
|
||||||
use crate::metrics::{HttpDirection, Metrics};
|
use crate::metrics::{HttpDirection, Metrics};
|
||||||
use crate::proxy::{run_until_cancelled, NeonOptions};
|
use crate::proxy::{run_until_cancelled, NeonOptions};
|
||||||
@@ -64,8 +64,6 @@ enum Payload {
|
|||||||
Batch(BatchQueryData),
|
Batch(BatchQueryData),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id");
|
|
||||||
|
|
||||||
static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
|
static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
|
||||||
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
|
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
|
||||||
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
|
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
|
||||||
@@ -243,10 +241,11 @@ pub(crate) async fn handle(
|
|||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
ctx: RequestMonitoring,
|
ctx: RequestMonitoring,
|
||||||
request: Request<Incoming>,
|
request: Request<Incoming>,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
|
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
|
||||||
let result = handle_inner(cancel, config, &ctx, request, backend).await;
|
let result = handle_inner(cancel, config, &ctx, request, auth_backend, backend).await;
|
||||||
|
|
||||||
let mut response = match result {
|
let mut response = match result {
|
||||||
Ok(r) => {
|
Ok(r) => {
|
||||||
@@ -501,6 +500,7 @@ async fn handle_inner(
|
|||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
request: Request<Incoming>,
|
request: Request<Incoming>,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
|
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
|
||||||
let _requeset_gauge = Metrics::get()
|
let _requeset_gauge = Metrics::get()
|
||||||
@@ -525,7 +525,11 @@ async fn handle_inner(
|
|||||||
|
|
||||||
match conn_info.auth {
|
match conn_info.auth {
|
||||||
AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => {
|
AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => {
|
||||||
handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, backend).await
|
let ServerlessBackend::ControlPlane(cplane) = auth_backend else {
|
||||||
|
panic!("auth_broker must be configured with a control-plane auth backend.")
|
||||||
|
};
|
||||||
|
|
||||||
|
handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, cplane, backend).await
|
||||||
}
|
}
|
||||||
auth => {
|
auth => {
|
||||||
handle_db_inner(
|
handle_db_inner(
|
||||||
@@ -535,6 +539,7 @@ async fn handle_inner(
|
|||||||
request,
|
request,
|
||||||
conn_info.conn_info,
|
conn_info.conn_info,
|
||||||
auth,
|
auth,
|
||||||
|
auth_backend,
|
||||||
backend,
|
backend,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -542,6 +547,7 @@ async fn handle_inner(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
async fn handle_db_inner(
|
async fn handle_db_inner(
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
@@ -549,6 +555,7 @@ async fn handle_db_inner(
|
|||||||
request: Request<Incoming>,
|
request: Request<Incoming>,
|
||||||
conn_info: ConnInfo,
|
conn_info: ConnInfo,
|
||||||
auth: AuthData,
|
auth: AuthData,
|
||||||
|
auth_backend: ServerlessBackend<'static>,
|
||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
|
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
|
||||||
//
|
//
|
||||||
@@ -591,45 +598,58 @@ async fn handle_db_inner(
|
|||||||
.map_err(SqlOverHttpError::from),
|
.map_err(SqlOverHttpError::from),
|
||||||
);
|
);
|
||||||
|
|
||||||
let authenticate_and_connect = Box::pin(
|
let authenticate_and_connect = Box::pin(async {
|
||||||
async {
|
let creds = match auth {
|
||||||
let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_));
|
AuthData::Password(pw) => {
|
||||||
|
let ServerlessBackend::ControlPlane(cplane) = auth_backend else {
|
||||||
|
return Err(SqlOverHttpError::ConnInfo(
|
||||||
|
ConnInfoError::MissingCredentials(Credentials::BearerJwt),
|
||||||
|
));
|
||||||
|
};
|
||||||
|
|
||||||
let keys = match auth {
|
backend
|
||||||
AuthData::Password(pw) => {
|
.authenticate_with_password(ctx, cplane, &conn_info.user_info, &pw)
|
||||||
backend
|
.await
|
||||||
.authenticate_with_password(ctx, &conn_info.user_info, &pw)
|
.map_err(HttpConnError::from)?
|
||||||
.await?
|
}
|
||||||
}
|
AuthData::Jwt(jwt) => backend
|
||||||
AuthData::Jwt(jwt) => {
|
.authenticate_with_jwt(ctx, auth_backend, &conn_info.user_info, jwt)
|
||||||
backend
|
.await
|
||||||
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
|
.map_err(HttpConnError::from)?,
|
||||||
.await?
|
};
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let client = match keys.keys {
|
let client = match (creds.keys, auth_backend) {
|
||||||
ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => {
|
(ComputeCredentialKeys::JwtPayload(payload), ServerlessBackend::Local(local)) => {
|
||||||
let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?;
|
let mut client = backend
|
||||||
let (cli_inner, _dsc) = client.client_inner();
|
.connect_to_local_postgres(ctx, local, conn_info)
|
||||||
cli_inner.set_jwt_session(&payload).await?;
|
.await?;
|
||||||
Client::Local(client)
|
let (cli_inner, _dsc) = client.client_inner();
|
||||||
}
|
cli_inner.set_jwt_session(&payload).await?;
|
||||||
_ => {
|
Client::Local(client)
|
||||||
let client = backend
|
}
|
||||||
.connect_to_compute(ctx, conn_info, keys, !allow_pool)
|
(keys, auth_backend) => {
|
||||||
.await?;
|
let client = backend
|
||||||
Client::Remote(client)
|
.connect_to_compute(
|
||||||
}
|
ctx,
|
||||||
};
|
auth_backend,
|
||||||
|
conn_info,
|
||||||
|
ComputeCredentials {
|
||||||
|
keys,
|
||||||
|
info: creds.info,
|
||||||
|
},
|
||||||
|
!allow_pool,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(HttpConnError::from)?;
|
||||||
|
Client::Remote(client)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// not strictly necessary to mark success here,
|
// not strictly necessary to mark success here,
|
||||||
// but it's just insurance for if we forget it somewhere else
|
// but it's just insurance for if we forget it somewhere else
|
||||||
ctx.success();
|
ctx.success();
|
||||||
Ok::<_, HttpConnError>(client)
|
Ok::<_, SqlOverHttpError>(client)
|
||||||
}
|
});
|
||||||
.map_err(SqlOverHttpError::from),
|
|
||||||
);
|
|
||||||
|
|
||||||
let (payload, mut client) = match run_until_cancelled(
|
let (payload, mut client) = match run_until_cancelled(
|
||||||
// Run both operations in parallel
|
// Run both operations in parallel
|
||||||
@@ -709,25 +729,27 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[
|
|||||||
&TXN_DEFERRABLE,
|
&TXN_DEFERRABLE,
|
||||||
];
|
];
|
||||||
|
|
||||||
pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue {
|
|
||||||
let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH];
|
|
||||||
HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..]))
|
|
||||||
.expect("uuid hyphenated format should be all valid header characters")
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_auth_broker_inner(
|
async fn handle_auth_broker_inner(
|
||||||
ctx: &RequestMonitoring,
|
ctx: &RequestMonitoring,
|
||||||
request: Request<Incoming>,
|
request: Request<Incoming>,
|
||||||
conn_info: ConnInfo,
|
conn_info: ConnInfo,
|
||||||
jwt: String,
|
jwt: String,
|
||||||
|
auth_backend: &'static ControlPlaneBackend,
|
||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
|
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
|
||||||
backend
|
backend
|
||||||
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
|
.authenticate_with_jwt(
|
||||||
|
ctx,
|
||||||
|
ServerlessBackend::ControlPlane(auth_backend),
|
||||||
|
&conn_info.user_info,
|
||||||
|
jwt,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
.map_err(HttpConnError::from)?;
|
.map_err(HttpConnError::from)?;
|
||||||
|
|
||||||
let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
|
let mut client = backend
|
||||||
|
.connect_to_local_proxy(ctx, auth_backend, conn_info)
|
||||||
|
.await?;
|
||||||
|
|
||||||
let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql");
|
let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql");
|
||||||
|
|
||||||
@@ -741,7 +763,6 @@ async fn handle_auth_broker_inner(
|
|||||||
req = req.header(h, hv);
|
req = req.header(h, hv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id()));
|
|
||||||
|
|
||||||
let req = req
|
let req = req
|
||||||
.body(body)
|
.body(body)
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ use tracing::warn;
|
|||||||
use crate::cancellation::CancellationHandlerMain;
|
use crate::cancellation::CancellationHandlerMain;
|
||||||
use crate::config::ProxyConfig;
|
use crate::config::ProxyConfig;
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
|
use crate::control_plane::provider::ControlPlaneBackend;
|
||||||
use crate::error::{io_error, ReportableError};
|
use crate::error::{io_error, ReportableError};
|
||||||
use crate::metrics::Metrics;
|
use crate::metrics::Metrics;
|
||||||
use crate::proxy::{handle_client, ClientMode, ErrorSource};
|
use crate::proxy::{handle_client, ClientMode, ErrorSource};
|
||||||
@@ -125,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
|
|||||||
|
|
||||||
pub(crate) async fn serve_websocket(
|
pub(crate) async fn serve_websocket(
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
auth_backend: &'static crate::auth::Backend<'static, ()>,
|
auth_backend: &'static ControlPlaneBackend,
|
||||||
ctx: RequestMonitoring,
|
ctx: RequestMonitoring,
|
||||||
websocket: OnUpgrade,
|
websocket: OnUpgrade,
|
||||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ backoff = "^2.2.1"
|
|||||||
pytest-lazy-fixture = "^0.6.3"
|
pytest-lazy-fixture = "^0.6.3"
|
||||||
prometheus-client = "^0.14.1"
|
prometheus-client = "^0.14.1"
|
||||||
pytest-timeout = "^2.1.0"
|
pytest-timeout = "^2.1.0"
|
||||||
Werkzeug = "^3.0.6"
|
Werkzeug = "^3.0.3"
|
||||||
pytest-order = "^1.1.0"
|
pytest-order = "^1.1.0"
|
||||||
allure-pytest = "^2.13.2"
|
allure-pytest = "^2.13.2"
|
||||||
pytest-asyncio = "^0.21.0"
|
pytest-asyncio = "^0.21.0"
|
||||||
|
|||||||
@@ -193,8 +193,6 @@ struct Args {
|
|||||||
/// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
|
/// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
|
||||||
/// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
|
/// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
|
||||||
/// if it weren't for `eviction_min_resident` preventing that.
|
/// if it weren't for `eviction_min_resident` preventing that.
|
||||||
///
|
|
||||||
/// Also defines interval for eviction retries.
|
|
||||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
|
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
|
||||||
eviction_min_resident: Duration,
|
eviction_min_resident: Duration,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,10 +14,12 @@ use std::path::Path;
|
|||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use crate::control_file_upgrade::downgrade_v9_to_v8;
|
use crate::control_file_upgrade::downgrade_v9_to_v8;
|
||||||
use crate::control_file_upgrade::upgrade_control_file;
|
|
||||||
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
||||||
use crate::state::{EvictionState, TimelinePersistentState};
|
use crate::state::{EvictionState, TimelinePersistentState};
|
||||||
use utils::bin_ser::LeSer;
|
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
|
||||||
|
use utils::{bin_ser::LeSer, id::TenantTimelineId};
|
||||||
|
|
||||||
|
use crate::SafeKeeperConf;
|
||||||
|
|
||||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||||
pub const SK_FORMAT_VERSION: u32 = 9;
|
pub const SK_FORMAT_VERSION: u32 = 9;
|
||||||
@@ -52,12 +54,13 @@ pub struct FileStorage {
|
|||||||
|
|
||||||
impl FileStorage {
|
impl FileStorage {
|
||||||
/// Initialize storage by loading state from disk.
|
/// Initialize storage by loading state from disk.
|
||||||
pub fn restore_new(timeline_dir: &Utf8Path, no_sync: bool) -> Result<FileStorage> {
|
pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
|
||||||
let state = Self::load_control_file_from_dir(timeline_dir)?;
|
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||||
|
let state = Self::load_control_file_from_dir(&timeline_dir)?;
|
||||||
|
|
||||||
Ok(FileStorage {
|
Ok(FileStorage {
|
||||||
timeline_dir: timeline_dir.to_path_buf(),
|
timeline_dir,
|
||||||
no_sync,
|
no_sync: conf.no_sync,
|
||||||
state,
|
state,
|
||||||
last_persist_at: Instant::now(),
|
last_persist_at: Instant::now(),
|
||||||
})
|
})
|
||||||
@@ -68,16 +71,16 @@ impl FileStorage {
|
|||||||
/// Note: we normally call this in temp directory for atomic init, so
|
/// Note: we normally call this in temp directory for atomic init, so
|
||||||
/// interested in FileStorage as a result only in tests.
|
/// interested in FileStorage as a result only in tests.
|
||||||
pub async fn create_new(
|
pub async fn create_new(
|
||||||
timeline_dir: &Utf8Path,
|
dir: Utf8PathBuf,
|
||||||
|
conf: &SafeKeeperConf,
|
||||||
state: TimelinePersistentState,
|
state: TimelinePersistentState,
|
||||||
no_sync: bool,
|
|
||||||
) -> Result<FileStorage> {
|
) -> Result<FileStorage> {
|
||||||
// we don't support creating new timelines in offloaded state
|
// we don't support creating new timelines in offloaded state
|
||||||
assert!(matches!(state.eviction_state, EvictionState::Present));
|
assert!(matches!(state.eviction_state, EvictionState::Present));
|
||||||
|
|
||||||
let mut store = FileStorage {
|
let mut store = FileStorage {
|
||||||
timeline_dir: timeline_dir.to_path_buf(),
|
timeline_dir: dir,
|
||||||
no_sync,
|
no_sync: conf.no_sync,
|
||||||
state: state.clone(),
|
state: state.clone(),
|
||||||
last_persist_at: Instant::now(),
|
last_persist_at: Instant::now(),
|
||||||
};
|
};
|
||||||
@@ -236,46 +239,89 @@ mod test {
|
|||||||
use tokio::fs;
|
use tokio::fs;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
const NO_SYNC: bool = true;
|
fn stub_conf() -> SafeKeeperConf {
|
||||||
|
let workdir = camino_tempfile::tempdir().unwrap().into_path();
|
||||||
|
SafeKeeperConf {
|
||||||
|
workdir,
|
||||||
|
..SafeKeeperConf::dummy()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
async fn load_from_control_file(
|
||||||
async fn test_read_write_safekeeper_state() -> anyhow::Result<()> {
|
conf: &SafeKeeperConf,
|
||||||
let tempdir = camino_tempfile::tempdir()?;
|
ttid: &TenantTimelineId,
|
||||||
let mut state = TimelinePersistentState::empty();
|
) -> Result<(FileStorage, TimelinePersistentState)> {
|
||||||
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;
|
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||||
|
fs::create_dir_all(&timeline_dir)
|
||||||
|
.await
|
||||||
|
.expect("failed to create timeline dir");
|
||||||
|
Ok((
|
||||||
|
FileStorage::restore_new(ttid, conf)?,
|
||||||
|
FileStorage::load_control_file_from_dir(&timeline_dir)?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
// Make a change.
|
async fn create(
|
||||||
state.commit_lsn = Lsn(42);
|
conf: &SafeKeeperConf,
|
||||||
storage.persist(&state).await?;
|
ttid: &TenantTimelineId,
|
||||||
|
) -> Result<(FileStorage, TimelinePersistentState)> {
|
||||||
// Reload the state. It should match the previously persisted state.
|
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||||
let loaded_state = FileStorage::load_control_file_from_dir(tempdir.path())?;
|
fs::create_dir_all(&timeline_dir)
|
||||||
assert_eq!(loaded_state, state);
|
.await
|
||||||
Ok(())
|
.expect("failed to create timeline dir");
|
||||||
|
let state = TimelinePersistentState::empty();
|
||||||
|
let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?;
|
||||||
|
Ok((storage, state))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_safekeeper_state_checksum_mismatch() -> anyhow::Result<()> {
|
async fn test_read_write_safekeeper_state() {
|
||||||
let tempdir = camino_tempfile::tempdir()?;
|
let conf = stub_conf();
|
||||||
let mut state = TimelinePersistentState::empty();
|
let ttid = TenantTimelineId::generate();
|
||||||
let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?;
|
{
|
||||||
|
let (mut storage, mut state) =
|
||||||
// Make a change.
|
create(&conf, &ttid).await.expect("failed to create state");
|
||||||
state.commit_lsn = Lsn(42);
|
// change something
|
||||||
storage.persist(&state).await?;
|
state.commit_lsn = Lsn(42);
|
||||||
|
storage
|
||||||
// Change the first byte to fail checksum validation.
|
.persist(&state)
|
||||||
let ctrl_path = tempdir.path().join(CONTROL_FILE_NAME);
|
.await
|
||||||
let mut data = fs::read(&ctrl_path).await?;
|
.expect("failed to persist state");
|
||||||
data[0] += 1;
|
}
|
||||||
fs::write(&ctrl_path, &data).await?;
|
|
||||||
|
let (_, state) = load_from_control_file(&conf, &ttid)
|
||||||
// Loading the file should fail checksum validation.
|
.await
|
||||||
if let Err(err) = FileStorage::load_control_file_from_dir(tempdir.path()) {
|
.expect("failed to read state");
|
||||||
assert!(err.to_string().contains("control file checksum mismatch"))
|
assert_eq!(state.commit_lsn, Lsn(42));
|
||||||
} else {
|
}
|
||||||
panic!("expected checksum error")
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_safekeeper_state_checksum_mismatch() {
|
||||||
|
let conf = stub_conf();
|
||||||
|
let ttid = TenantTimelineId::generate();
|
||||||
|
{
|
||||||
|
let (mut storage, mut state) =
|
||||||
|
create(&conf, &ttid).await.expect("failed to read state");
|
||||||
|
|
||||||
|
// change something
|
||||||
|
state.commit_lsn = Lsn(42);
|
||||||
|
storage
|
||||||
|
.persist(&state)
|
||||||
|
.await
|
||||||
|
.expect("failed to persist state");
|
||||||
|
}
|
||||||
|
let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
|
||||||
|
let mut data = fs::read(&control_path).await.unwrap();
|
||||||
|
data[0] += 1; // change the first byte of the file to fail checksum validation
|
||||||
|
fs::write(&control_path, &data)
|
||||||
|
.await
|
||||||
|
.expect("failed to write control file");
|
||||||
|
|
||||||
|
match load_from_control_file(&conf, &ttid).await {
|
||||||
|
Err(err) => assert!(err
|
||||||
|
.to_string()
|
||||||
|
.contains("safekeeper control file checksum mismatch")),
|
||||||
|
Ok(_) => panic!("expected error"),
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -154,7 +154,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
|||||||
new_state.peer_horizon_lsn = request.until_lsn;
|
new_state.peer_horizon_lsn = request.until_lsn;
|
||||||
new_state.backup_lsn = new_backup_lsn;
|
new_state.backup_lsn = new_backup_lsn;
|
||||||
|
|
||||||
FileStorage::create_new(&tli_dir_path, new_state.clone(), conf.no_sync).await?;
|
FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?;
|
||||||
|
|
||||||
// now we have a ready timeline in a temp directory
|
// now we have a ready timeline in a temp directory
|
||||||
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
|
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
|
||||||
|
|||||||
@@ -262,6 +262,14 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
|||||||
check_permission(&request, Some(ttid.tenant_id))?;
|
check_permission(&request, Some(ttid.tenant_id))?;
|
||||||
|
|
||||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||||
|
// Note: with evicted timelines it should work better then de-evict them and
|
||||||
|
// stream; probably start_snapshot would copy partial s3 file to dest path
|
||||||
|
// and stream control file, or return WalResidentTimeline if timeline is not
|
||||||
|
// evicted.
|
||||||
|
let tli = tli
|
||||||
|
.wal_residence_guard()
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
|
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
|
||||||
// so create the chan and write to it in another task.
|
// so create the chan and write to it in another task.
|
||||||
|
|||||||
@@ -113,7 +113,6 @@ impl SafeKeeperConf {
|
|||||||
|
|
||||||
impl SafeKeeperConf {
|
impl SafeKeeperConf {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[allow(unused)]
|
|
||||||
fn dummy() -> Self {
|
fn dummy() -> Self {
|
||||||
SafeKeeperConf {
|
SafeKeeperConf {
|
||||||
workdir: Utf8PathBuf::from("./"),
|
workdir: Utf8PathBuf::from("./"),
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::{
|
use std::{
|
||||||
cmp::min,
|
cmp::min,
|
||||||
io::{self, ErrorKind},
|
io::{self, ErrorKind},
|
||||||
sync::Arc,
|
|
||||||
};
|
};
|
||||||
use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
|
use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
|
||||||
use tokio_tar::{Archive, Builder, Header};
|
use tokio_tar::{Archive, Builder, Header};
|
||||||
@@ -26,8 +25,8 @@ use crate::{
|
|||||||
routes::TimelineStatus,
|
routes::TimelineStatus,
|
||||||
},
|
},
|
||||||
safekeeper::Term,
|
safekeeper::Term,
|
||||||
state::{EvictionState, TimelinePersistentState},
|
state::TimelinePersistentState,
|
||||||
timeline::{Timeline, WalResidentTimeline},
|
timeline::WalResidentTimeline,
|
||||||
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
|
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
|
||||||
wal_backup,
|
wal_backup,
|
||||||
wal_storage::open_wal_file,
|
wal_storage::open_wal_file,
|
||||||
@@ -44,33 +43,18 @@ use utils::{
|
|||||||
/// Stream tar archive of timeline to tx.
|
/// Stream tar archive of timeline to tx.
|
||||||
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
|
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
|
||||||
pub async fn stream_snapshot(
|
pub async fn stream_snapshot(
|
||||||
tli: Arc<Timeline>,
|
tli: WalResidentTimeline,
|
||||||
source: NodeId,
|
source: NodeId,
|
||||||
destination: NodeId,
|
destination: NodeId,
|
||||||
tx: mpsc::Sender<Result<Bytes>>,
|
tx: mpsc::Sender<Result<Bytes>>,
|
||||||
) {
|
) {
|
||||||
match tli.try_wal_residence_guard().await {
|
if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
|
||||||
Err(e) => {
|
// Error type/contents don't matter as they won't can't reach the client
|
||||||
tx.send(Err(anyhow!("Error checking residence: {:#}", e)))
|
// (hyper likely doesn't do anything with it), but http stream will be
|
||||||
.await
|
// prematurely terminated. It would be nice to try to send the error in
|
||||||
.ok();
|
// trailers though.
|
||||||
}
|
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
|
||||||
Ok(maybe_resident_tli) => {
|
error!("snapshot failed: {:#}", e);
|
||||||
if let Err(e) = match maybe_resident_tli {
|
|
||||||
Some(resident_tli) => {
|
|
||||||
stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
|
|
||||||
} {
|
|
||||||
// Error type/contents don't matter as they won't can't reach the client
|
|
||||||
// (hyper likely doesn't do anything with it), but http stream will be
|
|
||||||
// prematurely terminated. It would be nice to try to send the error in
|
|
||||||
// trailers though.
|
|
||||||
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
|
|
||||||
error!("snapshot failed: {:#}", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -96,10 +80,12 @@ impl Drop for SnapshotContext {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel.
|
pub async fn stream_snapshot_guts(
|
||||||
fn prepare_tar_stream(
|
tli: WalResidentTimeline,
|
||||||
|
source: NodeId,
|
||||||
|
destination: NodeId,
|
||||||
tx: mpsc::Sender<Result<Bytes>>,
|
tx: mpsc::Sender<Result<Bytes>>,
|
||||||
) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> {
|
) -> Result<()> {
|
||||||
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
|
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
|
||||||
// use SinkWriter as a Write impl. That is,
|
// use SinkWriter as a Write impl. That is,
|
||||||
// - create Sink from the tx. It returns PollSendError if chan is closed.
|
// - create Sink from the tx. It returns PollSendError if chan is closed.
|
||||||
@@ -114,38 +100,12 @@ fn prepare_tar_stream(
|
|||||||
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
|
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
|
||||||
// into CopyToBytes. This is a data copy.
|
// into CopyToBytes. This is a data copy.
|
||||||
let copy_to_bytes = CopyToBytes::new(oksink);
|
let copy_to_bytes = CopyToBytes::new(oksink);
|
||||||
let writer = SinkWriter::new(copy_to_bytes);
|
let mut writer = SinkWriter::new(copy_to_bytes);
|
||||||
let pinned_writer = Box::pin(writer);
|
let pinned_writer = std::pin::pin!(writer);
|
||||||
|
|
||||||
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
|
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
|
||||||
// which is also likely suboptimal.
|
// which is also likely suboptimal.
|
||||||
Builder::new_non_terminated(pinned_writer)
|
let mut ar = Builder::new_non_terminated(pinned_writer);
|
||||||
}
|
|
||||||
|
|
||||||
/// Implementation of snapshot for an offloaded timeline, only reads control file
|
|
||||||
pub(crate) async fn stream_snapshot_offloaded_guts(
|
|
||||||
tli: Arc<Timeline>,
|
|
||||||
source: NodeId,
|
|
||||||
destination: NodeId,
|
|
||||||
tx: mpsc::Sender<Result<Bytes>>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut ar = prepare_tar_stream(tx);
|
|
||||||
|
|
||||||
tli.snapshot_offloaded(&mut ar, source, destination).await?;
|
|
||||||
|
|
||||||
ar.finish().await?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Implementation of snapshot for a timeline which is resident (includes some segment data)
|
|
||||||
pub async fn stream_snapshot_resident_guts(
|
|
||||||
tli: WalResidentTimeline,
|
|
||||||
source: NodeId,
|
|
||||||
destination: NodeId,
|
|
||||||
tx: mpsc::Sender<Result<Bytes>>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut ar = prepare_tar_stream(tx);
|
|
||||||
|
|
||||||
let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
|
let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
|
||||||
pausable_failpoint!("sk-snapshot-after-list-pausable");
|
pausable_failpoint!("sk-snapshot-after-list-pausable");
|
||||||
@@ -178,70 +138,6 @@ pub async fn stream_snapshot_resident_guts(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Timeline {
|
|
||||||
/// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and
|
|
||||||
/// pass a modified control file into the provided tar stream (nothing with data segments on disk, since
|
|
||||||
/// we are offloaded and there aren't any)
|
|
||||||
async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
|
|
||||||
self: &Arc<Timeline>,
|
|
||||||
ar: &mut tokio_tar::Builder<W>,
|
|
||||||
source: NodeId,
|
|
||||||
destination: NodeId,
|
|
||||||
) -> Result<()> {
|
|
||||||
// Take initial copy of control file, then release state lock
|
|
||||||
let mut control_file = {
|
|
||||||
let shared_state = self.write_shared_state().await;
|
|
||||||
|
|
||||||
let control_file = TimelinePersistentState::clone(shared_state.sk.state());
|
|
||||||
|
|
||||||
// Rare race: we got unevicted between entering function and reading control file.
|
|
||||||
// We error out and let API caller retry.
|
|
||||||
if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
|
|
||||||
bail!("Timeline was un-evicted during snapshot, please retry");
|
|
||||||
}
|
|
||||||
|
|
||||||
control_file
|
|
||||||
};
|
|
||||||
|
|
||||||
// Modify the partial segment of the in-memory copy for the control file to
|
|
||||||
// point to the destination safekeeper.
|
|
||||||
let replace = control_file
|
|
||||||
.partial_backup
|
|
||||||
.replace_uploaded_segment(source, destination)?;
|
|
||||||
|
|
||||||
let Some(replace) = replace else {
|
|
||||||
// In Manager:: ready_for_eviction, we do not permit eviction unless the timeline
|
|
||||||
// has a partial segment. It is unexpected that
|
|
||||||
anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
|
|
||||||
};
|
|
||||||
|
|
||||||
tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
|
|
||||||
|
|
||||||
// Optimistically try to copy the partial segment to the destination's path: this
|
|
||||||
// can fail if the timeline was un-evicted and modified in the background.
|
|
||||||
let remote_timeline_path = &self.remote_path;
|
|
||||||
wal_backup::copy_partial_segment(
|
|
||||||
&replace.previous.remote_path(remote_timeline_path),
|
|
||||||
&replace.current.remote_path(remote_timeline_path),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
// Since the S3 copy succeeded with the path given in our control file snapshot, and
|
|
||||||
// we are sending that snapshot in our response, we are giving the caller a consistent
|
|
||||||
// snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
|
|
||||||
let buf = control_file
|
|
||||||
.write_to_buf()
|
|
||||||
.with_context(|| "failed to serialize control store")?;
|
|
||||||
let mut header = Header::new_gnu();
|
|
||||||
header.set_size(buf.len().try_into().expect("never breaches u64"));
|
|
||||||
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
|
|
||||||
.await
|
|
||||||
.with_context(|| "failed to append to archive")?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WalResidentTimeline {
|
impl WalResidentTimeline {
|
||||||
/// Start streaming tar archive with timeline:
|
/// Start streaming tar archive with timeline:
|
||||||
/// 1) stream control file under lock;
|
/// 1) stream control file under lock;
|
||||||
|
|||||||
@@ -21,15 +21,18 @@ use postgres_backend::QueryError;
|
|||||||
use pq_proto::BeMessage;
|
use pq_proto::BeMessage;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::future;
|
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::io::AsyncRead;
|
use tokio::io::AsyncRead;
|
||||||
use tokio::io::AsyncWrite;
|
use tokio::io::AsyncWrite;
|
||||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
use tokio::sync::mpsc::channel;
|
||||||
|
use tokio::sync::mpsc::error::TryRecvError;
|
||||||
|
use tokio::sync::mpsc::Receiver;
|
||||||
|
use tokio::sync::mpsc::Sender;
|
||||||
use tokio::task;
|
use tokio::task;
|
||||||
use tokio::task::JoinHandle;
|
use tokio::task::JoinHandle;
|
||||||
use tokio::time::{Duration, MissedTickBehavior};
|
use tokio::time::Duration;
|
||||||
|
use tokio::time::Instant;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
@@ -441,9 +444,9 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to
|
// Send keepalive messages to walproposer, to make sure it receives updates
|
||||||
/// walproposer, even when it's writing a steady stream of messages.
|
// even when it writes a steady stream of messages.
|
||||||
const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
|
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||||
|
|
||||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||||
/// replies to reply_tx.
|
/// replies to reply_tx.
|
||||||
@@ -491,76 +494,67 @@ impl WalAcceptor {
|
|||||||
async fn run(&mut self) -> anyhow::Result<()> {
|
async fn run(&mut self) -> anyhow::Result<()> {
|
||||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||||
|
|
||||||
// Periodically flush the WAL.
|
// After this timestamp we will stop processing AppendRequests and send a response
|
||||||
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
|
// to the walproposer. walproposer sends at least one AppendRequest per second,
|
||||||
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
// we will send keepalives by replying to these requests once per second.
|
||||||
flush_ticker.tick().await; // skip the initial, immediate tick
|
let mut next_keepalive = Instant::now();
|
||||||
|
|
||||||
// Tracks unflushed appends.
|
while let Some(mut next_msg) = self.msg_rx.recv().await {
|
||||||
let mut dirty = false;
|
// Update walreceiver state in shmem for reporting.
|
||||||
|
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||||
|
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||||
|
}
|
||||||
|
|
||||||
loop {
|
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||||
let reply = tokio::select! {
|
// Loop through AppendRequests while available to write as many WAL records as
|
||||||
// Process inbound message.
|
// possible without fsyncing.
|
||||||
msg = self.msg_rx.recv() => {
|
|
||||||
// If disconnected, break to flush WAL and return.
|
|
||||||
let Some(mut msg) = msg else {
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Update walreceiver state in shmem for reporting.
|
|
||||||
if let ProposerAcceptorMessage::Elected(_) = &msg {
|
|
||||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Don't flush the WAL on every append, only periodically via flush_ticker.
|
|
||||||
// This batches multiple appends per fsync. If the channel is empty after
|
|
||||||
// sending the reply, we'll schedule an immediate flush.
|
|
||||||
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
|
|
||||||
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
|
||||||
dirty = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.tli.process_msg(&msg).await?
|
|
||||||
}
|
|
||||||
|
|
||||||
// While receiving AppendRequests, flush the WAL periodically and respond with an
|
|
||||||
// AppendResponse to let walproposer know we're still alive.
|
|
||||||
_ = flush_ticker.tick(), if dirty => {
|
|
||||||
dirty = false;
|
|
||||||
self.tli
|
|
||||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
|
||||||
.await?
|
|
||||||
}
|
|
||||||
|
|
||||||
// If there are no pending messages, flush the WAL immediately.
|
|
||||||
//
|
//
|
||||||
// TODO: this should be done via flush_ticker.reset_immediately(), but that's always
|
// Make sure the WAL is flushed before returning, see:
|
||||||
// delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866.
|
// https://github.com/neondatabase/neon/issues/9259
|
||||||
_ = future::ready(()), if dirty && self.msg_rx.is_empty() => {
|
//
|
||||||
dirty = false;
|
// Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
|
||||||
flush_ticker.reset();
|
// Otherwise, we might end up in a situation where we read a message, but don't
|
||||||
self.tli
|
// process it.
|
||||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
|
||||||
.await?
|
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||||
|
|
||||||
|
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
|
||||||
|
if self.reply_tx.send(reply).await.is_err() {
|
||||||
|
break; // disconnected, flush WAL and return on next send/recv
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get out of this loop if keepalive time is reached
|
||||||
|
if Instant::now() >= next_keepalive {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// continue pulling AppendRequests if available
|
||||||
|
match self.msg_rx.try_recv() {
|
||||||
|
Ok(msg) => next_msg = msg,
|
||||||
|
Err(TryRecvError::Empty) => break,
|
||||||
|
// on disconnect, flush WAL and return on next send/recv
|
||||||
|
Err(TryRecvError::Disconnected) => break,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// flush all written WAL to the disk
|
||||||
|
self.tli
|
||||||
|
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||||
|
.await?
|
||||||
|
} else {
|
||||||
|
// process message other than AppendRequest
|
||||||
|
self.tli.process_msg(&next_msg).await?
|
||||||
};
|
};
|
||||||
|
|
||||||
// Send reply, if any.
|
if let Some(reply) = reply_msg {
|
||||||
if let Some(reply) = reply {
|
|
||||||
if self.reply_tx.send(reply).await.is_err() {
|
if self.reply_tx.send(reply).await.is_err() {
|
||||||
break; // disconnected, break to flush WAL and return
|
return Ok(()); // chan closed, streaming terminated
|
||||||
}
|
}
|
||||||
|
// reset keepalive time
|
||||||
|
next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
|
|
||||||
if dirty {
|
|
||||||
self.tli
|
|
||||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -143,8 +143,8 @@ impl TimelinePersistentState {
|
|||||||
TimelinePersistentState::new(
|
TimelinePersistentState::new(
|
||||||
&TenantTimelineId::empty(),
|
&TenantTimelineId::empty(),
|
||||||
ServerInfo {
|
ServerInfo {
|
||||||
pg_version: 170000, /* Postgres server version (major * 10000) */
|
pg_version: 17, /* Postgres server version */
|
||||||
system_id: 0, /* Postgres system identifier */
|
system_id: 0, /* Postgres system identifier */
|
||||||
wal_seg_size: 16 * 1024 * 1024,
|
wal_seg_size: 16 * 1024 * 1024,
|
||||||
},
|
},
|
||||||
vec![],
|
vec![],
|
||||||
|
|||||||
@@ -328,19 +328,15 @@ impl SharedState {
|
|||||||
/// Restore SharedState from control file. If file doesn't exist, bails out.
|
/// Restore SharedState from control file. If file doesn't exist, bails out.
|
||||||
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
|
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
|
||||||
let timeline_dir = get_timeline_dir(conf, ttid);
|
let timeline_dir = get_timeline_dir(conf, ttid);
|
||||||
let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?;
|
let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
|
||||||
if control_store.server.wal_seg_size == 0 {
|
if control_store.server.wal_seg_size == 0 {
|
||||||
bail!(TimelineError::UninitializedWalSegSize(*ttid));
|
bail!(TimelineError::UninitializedWalSegSize(*ttid));
|
||||||
}
|
}
|
||||||
|
|
||||||
let sk = match control_store.eviction_state {
|
let sk = match control_store.eviction_state {
|
||||||
EvictionState::Present => {
|
EvictionState::Present => {
|
||||||
let wal_store = wal_storage::PhysicalStorage::new(
|
let wal_store =
|
||||||
ttid,
|
wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
|
||||||
&timeline_dir,
|
|
||||||
&control_store,
|
|
||||||
conf.no_sync,
|
|
||||||
)?;
|
|
||||||
StateSK::Loaded(SafeKeeper::new(
|
StateSK::Loaded(SafeKeeper::new(
|
||||||
TimelineState::new(control_store),
|
TimelineState::new(control_store),
|
||||||
wal_store,
|
wal_store,
|
||||||
@@ -797,17 +793,14 @@ impl Timeline {
|
|||||||
state.sk.term_bump(to).await
|
state.sk.term_bump(to).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
|
/// Get the timeline guard for reading/writing WAL files.
|
||||||
async fn do_wal_residence_guard(
|
/// If WAL files are not present on disk (evicted), they will be automatically
|
||||||
self: &Arc<Self>,
|
/// downloaded from remote storage. This is done in the manager task, which is
|
||||||
block: bool,
|
/// responsible for issuing all guards.
|
||||||
) -> Result<Option<WalResidentTimeline>> {
|
///
|
||||||
let op_label = if block {
|
/// NB: don't use this function from timeline_manager, it will deadlock.
|
||||||
"wal_residence_guard"
|
/// NB: don't use this function while holding shared_state lock.
|
||||||
} else {
|
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
|
||||||
"try_wal_residence_guard"
|
|
||||||
};
|
|
||||||
|
|
||||||
if self.is_cancelled() {
|
if self.is_cancelled() {
|
||||||
bail!(TimelineError::Cancelled(self.ttid));
|
bail!(TimelineError::Cancelled(self.ttid));
|
||||||
}
|
}
|
||||||
@@ -819,13 +812,10 @@ impl Timeline {
|
|||||||
// Wait 30 seconds for the guard to be acquired. It can time out if someone is
|
// Wait 30 seconds for the guard to be acquired. It can time out if someone is
|
||||||
// holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
|
// holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
|
||||||
// is stuck.
|
// is stuck.
|
||||||
let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async {
|
let res = tokio::time::timeout_at(
|
||||||
if block {
|
started_at + Duration::from_secs(30),
|
||||||
self.manager_ctl.wal_residence_guard().await.map(Some)
|
self.manager_ctl.wal_residence_guard(),
|
||||||
} else {
|
)
|
||||||
self.manager_ctl.try_wal_residence_guard().await
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
let guard = match res {
|
let guard = match res {
|
||||||
@@ -833,14 +823,14 @@ impl Timeline {
|
|||||||
let finished_at = Instant::now();
|
let finished_at = Instant::now();
|
||||||
let elapsed = finished_at - started_at;
|
let elapsed = finished_at - started_at;
|
||||||
MISC_OPERATION_SECONDS
|
MISC_OPERATION_SECONDS
|
||||||
.with_label_values(&[op_label])
|
.with_label_values(&["wal_residence_guard"])
|
||||||
.observe(elapsed.as_secs_f64());
|
.observe(elapsed.as_secs_f64());
|
||||||
|
|
||||||
guard
|
guard
|
||||||
}
|
}
|
||||||
Ok(Err(e)) => {
|
Ok(Err(e)) => {
|
||||||
warn!(
|
warn!(
|
||||||
"error acquiring in {op_label}, statuses {:?} => {:?}",
|
"error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
|
||||||
status_before,
|
status_before,
|
||||||
self.mgr_status.get()
|
self.mgr_status.get()
|
||||||
);
|
);
|
||||||
@@ -848,7 +838,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
warn!(
|
warn!(
|
||||||
"timeout acquiring in {op_label} guard, statuses {:?} => {:?}",
|
"timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
|
||||||
status_before,
|
status_before,
|
||||||
self.mgr_status.get()
|
self.mgr_status.get()
|
||||||
);
|
);
|
||||||
@@ -856,28 +846,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g)))
|
Ok(WalResidentTimeline::new(self.clone(), guard))
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the timeline guard for reading/writing WAL files.
|
|
||||||
/// If WAL files are not present on disk (evicted), they will be automatically
|
|
||||||
/// downloaded from remote storage. This is done in the manager task, which is
|
|
||||||
/// responsible for issuing all guards.
|
|
||||||
///
|
|
||||||
/// NB: don't use this function from timeline_manager, it will deadlock.
|
|
||||||
/// NB: don't use this function while holding shared_state lock.
|
|
||||||
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
|
|
||||||
self.do_wal_residence_guard(true)
|
|
||||||
.await
|
|
||||||
.map(|m| m.expect("Always get Some in block=true mode"))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the timeline guard for reading/writing WAL files if the timeline is resident,
|
|
||||||
/// else return None
|
|
||||||
pub(crate) async fn try_wal_residence_guard(
|
|
||||||
self: &Arc<Self>,
|
|
||||||
) -> Result<Option<WalResidentTimeline>> {
|
|
||||||
self.do_wal_residence_guard(false).await
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
|
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
|
||||||
@@ -1077,9 +1046,9 @@ impl ManagerTimeline {
|
|||||||
// trying to restore WAL storage
|
// trying to restore WAL storage
|
||||||
let wal_store = wal_storage::PhysicalStorage::new(
|
let wal_store = wal_storage::PhysicalStorage::new(
|
||||||
&self.ttid,
|
&self.ttid,
|
||||||
&self.timeline_dir,
|
self.timeline_dir.clone(),
|
||||||
|
&conf,
|
||||||
shared.sk.state(),
|
shared.sk.state(),
|
||||||
conf.no_sync,
|
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// updating control file
|
// updating control file
|
||||||
|
|||||||
@@ -56,9 +56,6 @@ impl Manager {
|
|||||||
// This also works for the first segment despite last_removed_segno
|
// This also works for the first segment despite last_removed_segno
|
||||||
// being 0 on init because this 0 triggers run of wal_removal_task
|
// being 0 on init because this 0 triggers run of wal_removal_task
|
||||||
// on success of which manager updates the horizon.
|
// on success of which manager updates the horizon.
|
||||||
//
|
|
||||||
// **Note** pull_timeline functionality assumes that evicted timelines always have
|
|
||||||
// a partial segment: if we ever change this condition, must also update that code.
|
|
||||||
&& self
|
&& self
|
||||||
.partial_backup_uploaded
|
.partial_backup_uploaded
|
||||||
.as_ref()
|
.as_ref()
|
||||||
@@ -69,15 +66,15 @@ impl Manager {
|
|||||||
ready
|
ready
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Evict the timeline to remote storage. Returns whether the eviction was successful.
|
/// Evict the timeline to remote storage.
|
||||||
#[instrument(name = "evict_timeline", skip_all)]
|
#[instrument(name = "evict_timeline", skip_all)]
|
||||||
pub(crate) async fn evict_timeline(&mut self) -> bool {
|
pub(crate) async fn evict_timeline(&mut self) {
|
||||||
assert!(!self.is_offloaded);
|
assert!(!self.is_offloaded);
|
||||||
let partial_backup_uploaded = match &self.partial_backup_uploaded {
|
let partial_backup_uploaded = match &self.partial_backup_uploaded {
|
||||||
Some(p) => p.clone(),
|
Some(p) => p.clone(),
|
||||||
None => {
|
None => {
|
||||||
warn!("no partial backup uploaded, skipping eviction");
|
warn!("no partial backup uploaded, skipping eviction");
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -94,12 +91,11 @@ impl Manager {
|
|||||||
|
|
||||||
if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
|
if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
|
||||||
warn!("failed to evict timeline: {:?}", e);
|
warn!("failed to evict timeline: {:?}", e);
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
info!("successfully evicted timeline");
|
info!("successfully evicted timeline");
|
||||||
NUM_EVICTED_TIMELINES.inc();
|
NUM_EVICTED_TIMELINES.inc();
|
||||||
true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Attempt to restore evicted timeline from remote storage; it must be
|
/// Attempt to restore evicted timeline from remote storage; it must be
|
||||||
|
|||||||
@@ -100,8 +100,6 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
|
|||||||
pub enum ManagerCtlMessage {
|
pub enum ManagerCtlMessage {
|
||||||
/// Request to get a guard for WalResidentTimeline, with WAL files available locally.
|
/// Request to get a guard for WalResidentTimeline, with WAL files available locally.
|
||||||
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
|
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
|
||||||
/// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None
|
|
||||||
TryGuardRequest(tokio::sync::oneshot::Sender<Option<ResidenceGuard>>),
|
|
||||||
/// Request to drop the guard.
|
/// Request to drop the guard.
|
||||||
GuardDrop(GuardId),
|
GuardDrop(GuardId),
|
||||||
/// Request to reset uploaded partial backup state.
|
/// Request to reset uploaded partial backup state.
|
||||||
@@ -112,7 +110,6 @@ impl std::fmt::Debug for ManagerCtlMessage {
|
|||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
|
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
|
||||||
ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"),
|
|
||||||
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
|
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
|
||||||
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
|
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
|
||||||
}
|
}
|
||||||
@@ -155,19 +152,6 @@ impl ManagerCtl {
|
|||||||
.and_then(std::convert::identity)
|
.and_then(std::convert::identity)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Issue a new guard if the timeline is currently not offloaded, else return None
|
|
||||||
/// Sends a message to the manager and waits for the response.
|
|
||||||
/// Can be blocked indefinitely if the manager is stuck.
|
|
||||||
pub async fn try_wal_residence_guard(&self) -> anyhow::Result<Option<ResidenceGuard>> {
|
|
||||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
|
||||||
self.manager_tx
|
|
||||||
.send(ManagerCtlMessage::TryGuardRequest(tx))?;
|
|
||||||
|
|
||||||
// wait for the manager to respond with the guard
|
|
||||||
rx.await
|
|
||||||
.map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Request timeline manager to reset uploaded partial segment state and
|
/// Request timeline manager to reset uploaded partial segment state and
|
||||||
/// wait for the result.
|
/// wait for the result.
|
||||||
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
|
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
|
||||||
@@ -313,12 +297,7 @@ pub async fn main_task(
|
|||||||
match mgr.global_rate_limiter.try_acquire_eviction() {
|
match mgr.global_rate_limiter.try_acquire_eviction() {
|
||||||
Some(_permit) => {
|
Some(_permit) => {
|
||||||
mgr.set_status(Status::EvictTimeline);
|
mgr.set_status(Status::EvictTimeline);
|
||||||
if !mgr.evict_timeline().await {
|
mgr.evict_timeline().await;
|
||||||
// eviction failed, try again later
|
|
||||||
mgr.evict_not_before =
|
|
||||||
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
|
|
||||||
update_next_event(&mut next_event, mgr.evict_not_before);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
// we can't evict timeline now, will try again later
|
// we can't evict timeline now, will try again later
|
||||||
@@ -690,17 +669,6 @@ impl Manager {
|
|||||||
warn!("failed to reply with a guard, receiver dropped");
|
warn!("failed to reply with a guard, receiver dropped");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some(ManagerCtlMessage::TryGuardRequest(tx)) => {
|
|
||||||
let result = if self.is_offloaded {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some(self.access_service.create_guard())
|
|
||||||
};
|
|
||||||
|
|
||||||
if tx.send(result).is_err() {
|
|
||||||
warn!("failed to reply with a guard, receiver dropped");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
|
Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
|
||||||
self.access_service.drop_guard(guard_id);
|
self.access_service.drop_guard(guard_id);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -244,7 +244,7 @@ impl GlobalTimelines {
|
|||||||
// immediately initialize first WAL segment as well.
|
// immediately initialize first WAL segment as well.
|
||||||
let state =
|
let state =
|
||||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||||
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
|
control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?;
|
||||||
let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||||
Ok(timeline)
|
Ok(timeline)
|
||||||
}
|
}
|
||||||
@@ -596,7 +596,7 @@ pub async fn validate_temp_timeline(
|
|||||||
bail!("wal_seg_size is not set");
|
bail!("wal_seg_size is not set");
|
||||||
}
|
}
|
||||||
|
|
||||||
let wal_store = wal_storage::PhysicalStorage::new(&ttid, path, &control_store, conf.no_sync)?;
|
let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
|
||||||
|
|
||||||
let commit_lsn = control_store.commit_lsn;
|
let commit_lsn = control_store.commit_lsn;
|
||||||
let flush_lsn = wal_store.flush_lsn();
|
let flush_lsn = wal_store.flush_lsn();
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ use crate::metrics::{
|
|||||||
};
|
};
|
||||||
use crate::state::TimelinePersistentState;
|
use crate::state::TimelinePersistentState;
|
||||||
use crate::wal_backup::{read_object, remote_timeline_path};
|
use crate::wal_backup::{read_object, remote_timeline_path};
|
||||||
|
use crate::SafeKeeperConf;
|
||||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||||
use postgres_ffi::XLogFileName;
|
use postgres_ffi::XLogFileName;
|
||||||
use postgres_ffi::XLOG_BLCKSZ;
|
use postgres_ffi::XLOG_BLCKSZ;
|
||||||
@@ -86,9 +87,7 @@ pub trait Storage {
|
|||||||
pub struct PhysicalStorage {
|
pub struct PhysicalStorage {
|
||||||
metrics: WalStorageMetrics,
|
metrics: WalStorageMetrics,
|
||||||
timeline_dir: Utf8PathBuf,
|
timeline_dir: Utf8PathBuf,
|
||||||
|
conf: SafeKeeperConf,
|
||||||
/// Disables fsync if true.
|
|
||||||
no_sync: bool,
|
|
||||||
|
|
||||||
/// Size of WAL segment in bytes.
|
/// Size of WAL segment in bytes.
|
||||||
wal_seg_size: usize,
|
wal_seg_size: usize,
|
||||||
@@ -152,9 +151,9 @@ impl PhysicalStorage {
|
|||||||
/// the disk. Otherwise, all LSNs are set to zero.
|
/// the disk. Otherwise, all LSNs are set to zero.
|
||||||
pub fn new(
|
pub fn new(
|
||||||
ttid: &TenantTimelineId,
|
ttid: &TenantTimelineId,
|
||||||
timeline_dir: &Utf8Path,
|
timeline_dir: Utf8PathBuf,
|
||||||
|
conf: &SafeKeeperConf,
|
||||||
state: &TimelinePersistentState,
|
state: &TimelinePersistentState,
|
||||||
no_sync: bool,
|
|
||||||
) -> Result<PhysicalStorage> {
|
) -> Result<PhysicalStorage> {
|
||||||
let wal_seg_size = state.server.wal_seg_size as usize;
|
let wal_seg_size = state.server.wal_seg_size as usize;
|
||||||
|
|
||||||
@@ -199,8 +198,8 @@ impl PhysicalStorage {
|
|||||||
|
|
||||||
Ok(PhysicalStorage {
|
Ok(PhysicalStorage {
|
||||||
metrics: WalStorageMetrics::default(),
|
metrics: WalStorageMetrics::default(),
|
||||||
timeline_dir: timeline_dir.to_path_buf(),
|
timeline_dir,
|
||||||
no_sync,
|
conf: conf.clone(),
|
||||||
wal_seg_size,
|
wal_seg_size,
|
||||||
pg_version: state.server.pg_version,
|
pg_version: state.server.pg_version,
|
||||||
system_id: state.server.system_id,
|
system_id: state.server.system_id,
|
||||||
@@ -225,7 +224,7 @@ impl PhysicalStorage {
|
|||||||
|
|
||||||
/// Call fdatasync if config requires so.
|
/// Call fdatasync if config requires so.
|
||||||
async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
|
async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
|
||||||
if !self.no_sync {
|
if !self.conf.no_sync {
|
||||||
self.metrics
|
self.metrics
|
||||||
.observe_flush_seconds(time_io_closure(file.sync_data()).await?);
|
.observe_flush_seconds(time_io_closure(file.sync_data()).await?);
|
||||||
}
|
}
|
||||||
@@ -264,7 +263,9 @@ impl PhysicalStorage {
|
|||||||
|
|
||||||
// Note: this doesn't get into observe_flush_seconds metric. But
|
// Note: this doesn't get into observe_flush_seconds metric. But
|
||||||
// segment init should be separate metric, if any.
|
// segment init should be separate metric, if any.
|
||||||
if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await {
|
if let Err(e) =
|
||||||
|
durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
|
||||||
|
{
|
||||||
// Probably rename succeeded, but fsync of it failed. Remove
|
// Probably rename succeeded, but fsync of it failed. Remove
|
||||||
// the file then to avoid using it.
|
// the file then to avoid using it.
|
||||||
remove_file(wal_file_partial_path)
|
remove_file(wal_file_partial_path)
|
||||||
|
|||||||
@@ -968,28 +968,6 @@ async fn handle_tenant_shard_migrate(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_tenant_shard_cancel_reconcile(
|
|
||||||
service: Arc<Service>,
|
|
||||||
req: Request<Body>,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
|
|
||||||
let req = match maybe_forward(req).await {
|
|
||||||
ForwardOutcome::Forwarded(res) => {
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
ForwardOutcome::NotForwarded(req) => req,
|
|
||||||
};
|
|
||||||
|
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
|
||||||
json_response(
|
|
||||||
StatusCode::OK,
|
|
||||||
service
|
|
||||||
.tenant_shard_cancel_reconcile(tenant_shard_id)
|
|
||||||
.await?,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
check_permissions(&req, Scope::Admin)?;
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
@@ -1798,16 +1776,6 @@ pub fn make_router(
|
|||||||
RequestName("control_v1_tenant_migrate"),
|
RequestName("control_v1_tenant_migrate"),
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.put(
|
|
||||||
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|
|
||||||
|r| {
|
|
||||||
tenant_service_handler(
|
|
||||||
r,
|
|
||||||
handle_tenant_shard_cancel_reconcile,
|
|
||||||
RequestName("control_v1_tenant_cancel_reconcile"),
|
|
||||||
)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(
|
||||||
r,
|
r,
|
||||||
|
|||||||
@@ -450,9 +450,6 @@ impl Reconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This function does _not_ mutate any state, so it is cancellation safe.
|
|
||||||
///
|
|
||||||
/// This function does not respect [`Self::cancel`], callers should handle that.
|
|
||||||
async fn await_lsn(
|
async fn await_lsn(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
@@ -573,10 +570,8 @@ impl Reconciler {
|
|||||||
|
|
||||||
if let Some(baseline) = baseline_lsns {
|
if let Some(baseline) = baseline_lsns {
|
||||||
tracing::info!("🕑 Waiting for LSN to catch up...");
|
tracing::info!("🕑 Waiting for LSN to catch up...");
|
||||||
tokio::select! {
|
self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
|
||||||
r = self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) => {r?;}
|
.await?;
|
||||||
_ = self.cancel.cancelled() => {return Err(ReconcileError::Cancel)}
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
|
tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
|
||||||
|
|||||||
@@ -3130,11 +3130,9 @@ impl Service {
|
|||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// Propagate the LSN that shard zero picked, if caller didn't provide one
|
// Propagate the LSN that shard zero picked, if caller didn't provide one
|
||||||
match &mut create_req.mode {
|
if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none()
|
||||||
models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => {
|
{
|
||||||
*ancestor_start_lsn = timeline_info.ancestor_lsn;
|
create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
|
||||||
},
|
|
||||||
_ => {}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create timeline on remaining shards with number >0
|
// Create timeline on remaining shards with number >0
|
||||||
@@ -4834,43 +4832,6 @@ impl Service {
|
|||||||
Ok(TenantShardMigrateResponse {})
|
Ok(TenantShardMigrateResponse {})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// 'cancel' in this context means cancel any ongoing reconcile
|
|
||||||
pub(crate) async fn tenant_shard_cancel_reconcile(
|
|
||||||
&self,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
) -> Result<(), ApiError> {
|
|
||||||
// Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete
|
|
||||||
let waiter = {
|
|
||||||
let locked = self.inner.write().unwrap();
|
|
||||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
|
||||||
return Err(ApiError::NotFound(
|
|
||||||
anyhow::anyhow!("Tenant shard not found").into(),
|
|
||||||
));
|
|
||||||
};
|
|
||||||
|
|
||||||
let waiter = shard.get_waiter();
|
|
||||||
match waiter {
|
|
||||||
None => {
|
|
||||||
tracing::info!("Shard does not have an ongoing Reconciler");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
Some(waiter) => {
|
|
||||||
tracing::info!("Cancelling Reconciler");
|
|
||||||
shard.cancel_reconciler();
|
|
||||||
waiter
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Cancellation should be prompt. If this fails we have still done our job of firing the
|
|
||||||
// cancellation token, but by returning an ApiError we will indicate to the caller that
|
|
||||||
// the Reconciler is misbehaving and not respecting the cancellation token
|
|
||||||
self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This is for debug/support only: we simply drop all state for a tenant, without
|
/// This is for debug/support only: we simply drop all state for a tenant, without
|
||||||
/// detaching or deleting it on pageservers.
|
/// detaching or deleting it on pageservers.
|
||||||
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
||||||
|
|||||||
@@ -1317,12 +1317,6 @@ impl TenantShard {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn cancel_reconciler(&self) {
|
|
||||||
if let Some(handle) = self.reconciler.as_ref() {
|
|
||||||
handle.cancel.cancel()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
||||||
/// if it is not already running
|
/// if it is not already running
|
||||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
||||||
|
|||||||
@@ -150,7 +150,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
|
|||||||
counter("pageserver_tenant_throttling_count_accounted_finish_global"),
|
counter("pageserver_tenant_throttling_count_accounted_finish_global"),
|
||||||
counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
|
counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
|
||||||
counter("pageserver_tenant_throttling_count_global"),
|
counter("pageserver_tenant_throttling_count_global"),
|
||||||
*histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||||
|
|||||||
@@ -40,19 +40,11 @@ from _pytest.fixtures import FixtureRequest
|
|||||||
from psycopg2.extensions import connection as PgConnection
|
from psycopg2.extensions import connection as PgConnection
|
||||||
from psycopg2.extensions import cursor as PgCursor
|
from psycopg2.extensions import cursor as PgCursor
|
||||||
from psycopg2.extensions import make_dsn, parse_dsn
|
from psycopg2.extensions import make_dsn, parse_dsn
|
||||||
from pytest_httpserver import HTTPServer
|
|
||||||
from urllib3.util.retry import Retry
|
from urllib3.util.retry import Retry
|
||||||
|
|
||||||
from fixtures import overlayfs
|
from fixtures import overlayfs
|
||||||
from fixtures.auth_tokens import AuthKeys, TokenScope
|
from fixtures.auth_tokens import AuthKeys, TokenScope
|
||||||
from fixtures.common_types import (
|
from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
|
||||||
Lsn,
|
|
||||||
NodeId,
|
|
||||||
TenantId,
|
|
||||||
TenantShardId,
|
|
||||||
TimelineArchivalState,
|
|
||||||
TimelineId,
|
|
||||||
)
|
|
||||||
from fixtures.endpoint.http import EndpointHttpClient
|
from fixtures.endpoint.http import EndpointHttpClient
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||||
@@ -62,11 +54,7 @@ from fixtures.pageserver.allowed_errors import (
|
|||||||
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
|
DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
|
||||||
)
|
)
|
||||||
from fixtures.pageserver.common_types import LayerName, parse_layer_file_name
|
from fixtures.pageserver.common_types import LayerName, parse_layer_file_name
|
||||||
from fixtures.pageserver.http import (
|
from fixtures.pageserver.http import PageserverHttpClient
|
||||||
HistoricLayerInfo,
|
|
||||||
PageserverHttpClient,
|
|
||||||
ScanDisposableKeysResponse,
|
|
||||||
)
|
|
||||||
from fixtures.pageserver.utils import (
|
from fixtures.pageserver.utils import (
|
||||||
wait_for_last_record_lsn,
|
wait_for_last_record_lsn,
|
||||||
)
|
)
|
||||||
@@ -2144,24 +2132,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
def timeline_archival_config(
|
|
||||||
self,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
state: TimelineArchivalState,
|
|
||||||
):
|
|
||||||
config = {"state": state.value}
|
|
||||||
log.info(
|
|
||||||
f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
|
|
||||||
)
|
|
||||||
res = self.request(
|
|
||||||
"PUT",
|
|
||||||
f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
|
|
||||||
json=config,
|
|
||||||
headers=self.headers(TokenScope.ADMIN),
|
|
||||||
)
|
|
||||||
return res.json()
|
|
||||||
|
|
||||||
def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]):
|
def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]):
|
||||||
if isinstance(config_strings, tuple):
|
if isinstance(config_strings, tuple):
|
||||||
pairs = [config_strings]
|
pairs = [config_strings]
|
||||||
@@ -2675,51 +2645,6 @@ class NeonPageserver(PgProtocol, LogUtils):
|
|||||||
layers = self.list_layers(tenant_id, timeline_id)
|
layers = self.list_layers(tenant_id, timeline_id)
|
||||||
return layer_name in [parse_layer_file_name(p.name) for p in layers]
|
return layer_name in [parse_layer_file_name(p.name) for p in layers]
|
||||||
|
|
||||||
def timeline_scan_no_disposable_keys(
|
|
||||||
self, tenant_shard_id: TenantShardId, timeline_id: TimelineId
|
|
||||||
) -> TimelineAssertNoDisposableKeysResult:
|
|
||||||
"""
|
|
||||||
Scan all keys in all layers of the tenant/timeline for disposable keys.
|
|
||||||
Disposable keys are keys that are present in a layer referenced by the shard
|
|
||||||
but are not going to be accessed by the shard.
|
|
||||||
For example, after shard split, the child shards will reference the parent's layer
|
|
||||||
files until new data is ingested and/or compaction rewrites the layers.
|
|
||||||
"""
|
|
||||||
|
|
||||||
ps_http = self.http_client()
|
|
||||||
tally = ScanDisposableKeysResponse(0, 0)
|
|
||||||
per_layer = []
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
|
||||||
futs = []
|
|
||||||
shard_layer_map = ps_http.layer_map_info(tenant_shard_id, timeline_id)
|
|
||||||
for layer in shard_layer_map.historic_layers:
|
|
||||||
|
|
||||||
def do_layer(
|
|
||||||
shard_ps_http: PageserverHttpClient,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
layer: HistoricLayerInfo,
|
|
||||||
) -> tuple[HistoricLayerInfo, ScanDisposableKeysResponse]:
|
|
||||||
return (
|
|
||||||
layer,
|
|
||||||
shard_ps_http.timeline_layer_scan_disposable_keys(
|
|
||||||
tenant_shard_id, timeline_id, layer.layer_file_name
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
futs.append(executor.submit(do_layer, ps_http, tenant_shard_id, timeline_id, layer))
|
|
||||||
for fut in futs:
|
|
||||||
layer, result = fut.result()
|
|
||||||
tally += result
|
|
||||||
per_layer.append((layer, result))
|
|
||||||
return TimelineAssertNoDisposableKeysResult(tally, per_layer)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class TimelineAssertNoDisposableKeysResult:
|
|
||||||
tally: ScanDisposableKeysResponse
|
|
||||||
per_layer: list[tuple[HistoricLayerInfo, ScanDisposableKeysResponse]]
|
|
||||||
|
|
||||||
|
|
||||||
class PgBin:
|
class PgBin:
|
||||||
"""A helper class for executing postgres binaries"""
|
"""A helper class for executing postgres binaries"""
|
||||||
@@ -3099,6 +3024,10 @@ class NeonProxy(PgProtocol):
|
|||||||
class AuthBackend(abc.ABC):
|
class AuthBackend(abc.ABC):
|
||||||
"""All auth backends must inherit from this class"""
|
"""All auth backends must inherit from this class"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_conn_url(self) -> Optional[str]:
|
||||||
|
return None
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def extra_args(self) -> list[str]:
|
def extra_args(self) -> list[str]:
|
||||||
pass
|
pass
|
||||||
@@ -3112,7 +3041,7 @@ class NeonProxy(PgProtocol):
|
|||||||
*["--allow-self-signed-compute", "true"],
|
*["--allow-self-signed-compute", "true"],
|
||||||
]
|
]
|
||||||
|
|
||||||
class ControlPlane(AuthBackend):
|
class Console(AuthBackend):
|
||||||
def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None):
|
def __init__(self, endpoint: str, fixed_rate_limit: Optional[int] = None):
|
||||||
self.endpoint = endpoint
|
self.endpoint = endpoint
|
||||||
self.fixed_rate_limit = fixed_rate_limit
|
self.fixed_rate_limit = fixed_rate_limit
|
||||||
@@ -3136,6 +3065,21 @@ class NeonProxy(PgProtocol):
|
|||||||
]
|
]
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class Postgres(AuthBackend):
|
||||||
|
pg_conn_url: str
|
||||||
|
|
||||||
|
@property
|
||||||
|
def default_conn_url(self) -> Optional[str]:
|
||||||
|
return self.pg_conn_url
|
||||||
|
|
||||||
|
def extra_args(self) -> list[str]:
|
||||||
|
return [
|
||||||
|
# Postgres auth backend params
|
||||||
|
*["--auth-backend", "postgres"],
|
||||||
|
*["--auth-endpoint", self.pg_conn_url],
|
||||||
|
]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
neon_binpath: Path,
|
neon_binpath: Path,
|
||||||
@@ -3150,7 +3094,7 @@ class NeonProxy(PgProtocol):
|
|||||||
):
|
):
|
||||||
host = "127.0.0.1"
|
host = "127.0.0.1"
|
||||||
domain = "proxy.localtest.me" # resolves to 127.0.0.1
|
domain = "proxy.localtest.me" # resolves to 127.0.0.1
|
||||||
super().__init__(host=domain, port=proxy_port)
|
super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port)
|
||||||
|
|
||||||
self.domain = domain
|
self.domain = domain
|
||||||
self.host = host
|
self.host = host
|
||||||
@@ -3404,39 +3348,20 @@ def static_proxy(
|
|||||||
port_distributor: PortDistributor,
|
port_distributor: PortDistributor,
|
||||||
neon_binpath: Path,
|
neon_binpath: Path,
|
||||||
test_output_dir: Path,
|
test_output_dir: Path,
|
||||||
httpserver: HTTPServer,
|
|
||||||
) -> Iterator[NeonProxy]:
|
) -> Iterator[NeonProxy]:
|
||||||
"""Neon proxy that routes directly to vanilla postgres and a mocked cplane HTTP API."""
|
"""Neon proxy that routes directly to vanilla postgres."""
|
||||||
|
|
||||||
port = vanilla_pg.default_options["port"]
|
port = vanilla_pg.default_options["port"]
|
||||||
host = vanilla_pg.default_options["host"]
|
host = vanilla_pg.default_options["host"]
|
||||||
dbname = vanilla_pg.default_options["dbname"]
|
dbname = vanilla_pg.default_options["dbname"]
|
||||||
|
auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"
|
||||||
|
|
||||||
|
# For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
|
||||||
vanilla_pg.start()
|
vanilla_pg.start()
|
||||||
vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
|
vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
|
||||||
[(rolpassword,)] = vanilla_pg.safe_psql(
|
vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS neon_control_plane")
|
||||||
"select rolpassword from pg_catalog.pg_authid where rolname = 'proxy'"
|
vanilla_pg.safe_psql(
|
||||||
)
|
"CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))"
|
||||||
|
|
||||||
# return local postgres addr on ProxyWakeCompute.
|
|
||||||
httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json(
|
|
||||||
{
|
|
||||||
"address": f"{host}:{port}",
|
|
||||||
"aux": {
|
|
||||||
"endpoint_id": "ep-foo-bar-1234",
|
|
||||||
"branch_id": "br-foo-bar",
|
|
||||||
"project_id": "foo-bar",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# return local postgres addr on ProxyWakeCompute.
|
|
||||||
httpserver.expect_request("/cplane/proxy_get_role_secret").respond_with_json(
|
|
||||||
{
|
|
||||||
"role_secret": rolpassword,
|
|
||||||
"allowed_ips": None,
|
|
||||||
"project_id": "foo-bar",
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
proxy_port = port_distributor.get_port()
|
proxy_port = port_distributor.get_port()
|
||||||
@@ -3451,12 +3376,8 @@ def static_proxy(
|
|||||||
http_port=http_port,
|
http_port=http_port,
|
||||||
mgmt_port=mgmt_port,
|
mgmt_port=mgmt_port,
|
||||||
external_http_port=external_http_port,
|
external_http_port=external_http_port,
|
||||||
auth_backend=NeonProxy.ControlPlane(httpserver.url_for("/cplane")),
|
auth_backend=NeonProxy.Postgres(auth_endpoint),
|
||||||
) as proxy:
|
) as proxy:
|
||||||
proxy.default_options["user"] = "proxy"
|
|
||||||
proxy.default_options["password"] = "password"
|
|
||||||
proxy.default_options["dbname"] = dbname
|
|
||||||
|
|
||||||
proxy.start()
|
proxy.start()
|
||||||
yield proxy
|
yield proxy
|
||||||
|
|
||||||
|
|||||||
@@ -129,26 +129,6 @@ class LayerMapInfo:
|
|||||||
return set(x.layer_file_name for x in self.historic_layers)
|
return set(x.layer_file_name for x in self.historic_layers)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ScanDisposableKeysResponse:
|
|
||||||
disposable_count: int
|
|
||||||
not_disposable_count: int
|
|
||||||
|
|
||||||
def __add__(self, b):
|
|
||||||
a = self
|
|
||||||
assert isinstance(a, ScanDisposableKeysResponse)
|
|
||||||
assert isinstance(b, ScanDisposableKeysResponse)
|
|
||||||
return ScanDisposableKeysResponse(
|
|
||||||
a.disposable_count + b.disposable_count, a.not_disposable_count + b.not_disposable_count
|
|
||||||
)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_json(cls, d: dict[str, Any]) -> ScanDisposableKeysResponse:
|
|
||||||
disposable_count = d["disposable_count"]
|
|
||||||
not_disposable_count = d["not_disposable_count"]
|
|
||||||
return ScanDisposableKeysResponse(disposable_count, not_disposable_count)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TenantConfig:
|
class TenantConfig:
|
||||||
tenant_specific_overrides: dict[str, Any]
|
tenant_specific_overrides: dict[str, Any]
|
||||||
@@ -162,19 +142,6 @@ class TenantConfig:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class TimelinesInfoAndOffloaded:
|
|
||||||
timelines: list[dict[str, Any]]
|
|
||||||
offloaded: list[dict[str, Any]]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_json(cls, d: dict[str, Any]) -> TimelinesInfoAndOffloaded:
|
|
||||||
return TimelinesInfoAndOffloaded(
|
|
||||||
timelines=d["timelines"],
|
|
||||||
offloaded=d["offloaded"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PageserverHttpClient(requests.Session, MetricsGetter):
|
class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -497,18 +464,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
|||||||
assert isinstance(res_json, list)
|
assert isinstance(res_json, list)
|
||||||
return res_json
|
return res_json
|
||||||
|
|
||||||
def timeline_and_offloaded_list(
|
|
||||||
self,
|
|
||||||
tenant_id: Union[TenantId, TenantShardId],
|
|
||||||
) -> TimelinesInfoAndOffloaded:
|
|
||||||
res = self.get(
|
|
||||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded",
|
|
||||||
)
|
|
||||||
self.verbose_error(res)
|
|
||||||
res_json = res.json()
|
|
||||||
assert isinstance(res_json, dict)
|
|
||||||
return TimelinesInfoAndOffloaded.from_json(res_json)
|
|
||||||
|
|
||||||
def timeline_create(
|
def timeline_create(
|
||||||
self,
|
self,
|
||||||
pg_version: PgVersion,
|
pg_version: PgVersion,
|
||||||
@@ -521,13 +476,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
|||||||
) -> dict[Any, Any]:
|
) -> dict[Any, Any]:
|
||||||
body: dict[str, Any] = {
|
body: dict[str, Any] = {
|
||||||
"new_timeline_id": str(new_timeline_id),
|
"new_timeline_id": str(new_timeline_id),
|
||||||
|
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
|
||||||
|
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
|
||||||
|
"existing_initdb_timeline_id": str(existing_initdb_timeline_id)
|
||||||
|
if existing_initdb_timeline_id
|
||||||
|
else None,
|
||||||
}
|
}
|
||||||
if ancestor_timeline_id:
|
|
||||||
body["ancestor_timeline_id"] = str(ancestor_timeline_id)
|
|
||||||
if ancestor_start_lsn:
|
|
||||||
body["ancestor_start_lsn"] = str(ancestor_start_lsn)
|
|
||||||
if existing_initdb_timeline_id:
|
|
||||||
body["existing_initdb_timeline_id"] = str(existing_initdb_timeline_id)
|
|
||||||
if pg_version != PgVersion.NOT_SET:
|
if pg_version != PgVersion.NOT_SET:
|
||||||
body["pg_version"] = int(pg_version)
|
body["pg_version"] = int(pg_version)
|
||||||
|
|
||||||
@@ -925,16 +879,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
|||||||
self.verbose_error(res)
|
self.verbose_error(res)
|
||||||
return LayerMapInfo.from_json(res.json())
|
return LayerMapInfo.from_json(res.json())
|
||||||
|
|
||||||
def timeline_layer_scan_disposable_keys(
|
|
||||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
|
|
||||||
) -> ScanDisposableKeysResponse:
|
|
||||||
res = self.post(
|
|
||||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys",
|
|
||||||
)
|
|
||||||
self.verbose_error(res)
|
|
||||||
assert res.status_code == 200
|
|
||||||
return ScanDisposableKeysResponse.from_json(res.json())
|
|
||||||
|
|
||||||
def download_layer(
|
def download_layer(
|
||||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
|
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -3,13 +3,10 @@
|
|||||||
#
|
#
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, cast
|
from typing import TYPE_CHECKING, cast
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from fixtures.log_helper import log
|
|
||||||
from fixtures.neon_fixtures import (
|
from fixtures.neon_fixtures import (
|
||||||
Endpoint,
|
Endpoint,
|
||||||
NeonEnv,
|
NeonEnv,
|
||||||
@@ -327,97 +324,3 @@ def test_sql_regress(
|
|||||||
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
|
pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
|
||||||
|
|
||||||
post_checks(env, test_output_dir, DBNAME, endpoint)
|
post_checks(env, test_output_dir, DBNAME, endpoint)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
|
|
||||||
def test_tx_abort_with_many_relations(
|
|
||||||
neon_env_builder: NeonEnvBuilder,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres
|
|
||||||
behavior when aborting a transaction with lots of relations.
|
|
||||||
|
|
||||||
Reproducer for https://github.com/neondatabase/neon/issues/9505
|
|
||||||
"""
|
|
||||||
|
|
||||||
env = neon_env_builder.init_start()
|
|
||||||
ep = env.endpoints.create_start(
|
|
||||||
"main",
|
|
||||||
tenant_id=env.initial_tenant,
|
|
||||||
config_lines=[
|
|
||||||
"shared_buffers=1000MB",
|
|
||||||
"max_locks_per_transaction=16384",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
# How many relations: this number is tuned to be long enough to take tens of seconds
|
|
||||||
# if the rollback code path is buggy, tripping the test's timeout.
|
|
||||||
n = 4000
|
|
||||||
|
|
||||||
def create():
|
|
||||||
# Create many relations
|
|
||||||
log.info(f"Creating {n} relations...")
|
|
||||||
ep.safe_psql_many(
|
|
||||||
[
|
|
||||||
"BEGIN",
|
|
||||||
f"""DO $$
|
|
||||||
DECLARE
|
|
||||||
i INT;
|
|
||||||
table_name TEXT;
|
|
||||||
BEGIN
|
|
||||||
FOR i IN 1..{n} LOOP
|
|
||||||
table_name := 'table_' || i;
|
|
||||||
EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)';
|
|
||||||
END LOOP;
|
|
||||||
END $$;
|
|
||||||
""",
|
|
||||||
"COMMIT",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
def truncate():
|
|
||||||
# Truncate relations, then roll back the transaction containing the truncations
|
|
||||||
log.info(f"Truncating {n} relations...")
|
|
||||||
ep.safe_psql_many(
|
|
||||||
[
|
|
||||||
"BEGIN",
|
|
||||||
f"""DO $$
|
|
||||||
DECLARE
|
|
||||||
i INT;
|
|
||||||
table_name TEXT;
|
|
||||||
BEGIN
|
|
||||||
FOR i IN 1..{n} LOOP
|
|
||||||
table_name := 'table_' || i;
|
|
||||||
EXECUTE 'TRUNCATE ' || table_name ;
|
|
||||||
END LOOP;
|
|
||||||
END $$;
|
|
||||||
""",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
def rollback_and_wait():
|
|
||||||
log.info(f"Rolling back after truncating {n} relations...")
|
|
||||||
ep.safe_psql("ROLLBACK")
|
|
||||||
|
|
||||||
# Restart the endpoint: this ensures that we can read back what we just wrote, i.e. pageserver
|
|
||||||
# ingest has caught up.
|
|
||||||
ep.stop()
|
|
||||||
log.info(f"Starting endpoint after truncating {n} relations...")
|
|
||||||
ep.start()
|
|
||||||
log.info(f"Started endpoint after truncating {n} relations...")
|
|
||||||
|
|
||||||
# Actual create & truncate phases may be slow, these involves lots of WAL records. We do not
|
|
||||||
# apply a special timeout, they are expected to complete within general test timeout
|
|
||||||
create()
|
|
||||||
truncate()
|
|
||||||
|
|
||||||
# Run in a thread because the failure case is to take pathologically long time, and we don't want
|
|
||||||
# to block the test executor on that.
|
|
||||||
with ThreadPoolExecutor(max_workers=1) as exec:
|
|
||||||
try:
|
|
||||||
# Rollback phase should be fast: this is one WAL record that we should process efficiently
|
|
||||||
fut = exec.submit(rollback_and_wait)
|
|
||||||
fut.result(timeout=5)
|
|
||||||
except:
|
|
||||||
exec.shutdown(wait=False, cancel_futures=True)
|
|
||||||
raise
|
|
||||||
|
|||||||
@@ -6,27 +6,20 @@ from fixtures.neon_fixtures import (
|
|||||||
NeonProxy,
|
NeonProxy,
|
||||||
VanillaPostgres,
|
VanillaPostgres,
|
||||||
)
|
)
|
||||||
from pytest_httpserver import HTTPServer
|
|
||||||
|
|
||||||
TABLE_NAME = "neon_control_plane.endpoints"
|
TABLE_NAME = "neon_control_plane.endpoints"
|
||||||
|
|
||||||
|
|
||||||
def test_proxy_psql_not_allowed_ips(
|
# Proxy uses the same logic for psql and websockets.
|
||||||
static_proxy: NeonProxy,
|
@pytest.mark.asyncio
|
||||||
vanilla_pg: VanillaPostgres,
|
async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres):
|
||||||
httpserver: HTTPServer,
|
|
||||||
):
|
|
||||||
[(rolpassword,)] = vanilla_pg.safe_psql(
|
|
||||||
"select rolpassword from pg_catalog.pg_authid where rolname = 'proxy'"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Shouldn't be able to connect to this project
|
# Shouldn't be able to connect to this project
|
||||||
httpserver.expect_request("/cplane/proxy_get_role_secret").respond_with_json(
|
vanilla_pg.safe_psql(
|
||||||
{
|
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')"
|
||||||
"role_secret": rolpassword,
|
)
|
||||||
"allowed_ips": ["8.8.8.8"],
|
# Should be able to connect to this project
|
||||||
"project_id": "foo-bar",
|
vanilla_pg.safe_psql(
|
||||||
}
|
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')"
|
||||||
)
|
)
|
||||||
|
|
||||||
def check_cannot_connect(**kwargs):
|
def check_cannot_connect(**kwargs):
|
||||||
@@ -44,25 +37,6 @@ def test_proxy_psql_not_allowed_ips(
|
|||||||
# with SNI
|
# with SNI
|
||||||
check_cannot_connect(query="select 1", host="private-project.localtest.me")
|
check_cannot_connect(query="select 1", host="private-project.localtest.me")
|
||||||
|
|
||||||
|
|
||||||
def test_proxy_psql_allowed_ips(
|
|
||||||
static_proxy: NeonProxy,
|
|
||||||
vanilla_pg: VanillaPostgres,
|
|
||||||
httpserver: HTTPServer,
|
|
||||||
):
|
|
||||||
[(rolpassword,)] = vanilla_pg.safe_psql(
|
|
||||||
"select rolpassword from pg_catalog.pg_authid where rolname = 'proxy'"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Should be able to connect to this project
|
|
||||||
httpserver.expect_request("/cplane/proxy_get_role_secret").respond_with_json(
|
|
||||||
{
|
|
||||||
"role_secret": rolpassword,
|
|
||||||
"allowed_ips": ["::1", "127.0.0.1"],
|
|
||||||
"project_id": "foo-bar",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# no SNI, deprecated `options=project` syntax (before we had several endpoint in project)
|
# no SNI, deprecated `options=project` syntax (before we had several endpoint in project)
|
||||||
out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project")
|
out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project")
|
||||||
assert out[0][0] == 1
|
assert out[0][0] == 1
|
||||||
@@ -76,61 +50,27 @@ def test_proxy_psql_allowed_ips(
|
|||||||
assert out[0][0] == 1
|
assert out[0][0] == 1
|
||||||
|
|
||||||
|
|
||||||
def test_proxy_http_not_allowed_ips(
|
@pytest.mark.asyncio
|
||||||
static_proxy: NeonProxy,
|
async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres):
|
||||||
vanilla_pg: VanillaPostgres,
|
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
|
||||||
httpserver: HTTPServer,
|
|
||||||
):
|
|
||||||
vanilla_pg.safe_psql("create user http_auth with password 'http' superuser")
|
|
||||||
|
|
||||||
[(rolpassword,)] = vanilla_pg.safe_psql(
|
# Shouldn't be able to connect to this project
|
||||||
"select rolpassword from pg_catalog.pg_authid where rolname = 'http_auth'"
|
vanilla_pg.safe_psql(
|
||||||
|
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')"
|
||||||
)
|
)
|
||||||
|
|
||||||
httpserver.expect_oneshot_request("/cplane/proxy_get_role_secret").respond_with_json(
|
def query(status: int, query: str, *args):
|
||||||
{
|
|
||||||
"role_secret": rolpassword,
|
|
||||||
"allowed_ips": ["8.8.8.8"],
|
|
||||||
"project_id": "foo-bar",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
with httpserver.wait() as waiting:
|
|
||||||
static_proxy.http_query(
|
static_proxy.http_query(
|
||||||
"select 1;",
|
query,
|
||||||
[],
|
args,
|
||||||
user="http_auth",
|
user="http_auth",
|
||||||
password="http",
|
password="http",
|
||||||
expected_code=400,
|
expected_code=status,
|
||||||
)
|
)
|
||||||
assert waiting.result
|
|
||||||
|
|
||||||
|
query(400, "select 1;") # ip address is not allowed
|
||||||
def test_proxy_http_allowed_ips(
|
# Should be able to connect to this project
|
||||||
static_proxy: NeonProxy,
|
vanilla_pg.safe_psql(
|
||||||
vanilla_pg: VanillaPostgres,
|
f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'"
|
||||||
httpserver: HTTPServer,
|
|
||||||
):
|
|
||||||
vanilla_pg.safe_psql("create user http_auth with password 'http' superuser")
|
|
||||||
|
|
||||||
[(rolpassword,)] = vanilla_pg.safe_psql(
|
|
||||||
"select rolpassword from pg_catalog.pg_authid where rolname = 'http_auth'"
|
|
||||||
)
|
)
|
||||||
|
query(200, "select 1;") # should work now
|
||||||
httpserver.expect_oneshot_request("/cplane/proxy_get_role_secret").respond_with_json(
|
|
||||||
{
|
|
||||||
"role_secret": rolpassword,
|
|
||||||
"allowed_ips": ["8.8.8.8", "127.0.0.1"],
|
|
||||||
"project_id": "foo-bar",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
with httpserver.wait() as waiting:
|
|
||||||
static_proxy.http_query(
|
|
||||||
"select 1;",
|
|
||||||
[],
|
|
||||||
user="http_auth",
|
|
||||||
password="http",
|
|
||||||
expected_code=200,
|
|
||||||
)
|
|
||||||
assert waiting.result
|
|
||||||
|
|||||||
@@ -169,24 +169,23 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
|||||||
)
|
)
|
||||||
return last_flush_lsn
|
return last_flush_lsn
|
||||||
|
|
||||||
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str):
|
def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint):
|
||||||
"""
|
"""
|
||||||
Trigger GC manually on all pageservers. Then run an `SELECT` query.
|
Trigger GC manually on all pageservers. Then run an `SELECT` query.
|
||||||
"""
|
"""
|
||||||
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
for shard, ps in tenant_get_shards(env, env.initial_tenant):
|
||||||
client = ps.http_client()
|
client = ps.http_client()
|
||||||
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
|
||||||
# Note: cannot assert on `layers_removed` here because it could be layers
|
|
||||||
# not guarded by the lease. Rely on successful execution of the query instead.
|
|
||||||
log.info(f"{gc_result=}")
|
log.info(f"{gc_result=}")
|
||||||
|
|
||||||
|
assert (
|
||||||
|
gc_result["layers_removed"] == 0
|
||||||
|
), "No layers should be removed, old layers are guarded by leases."
|
||||||
|
|
||||||
with ep_static.cursor() as cur:
|
with ep_static.cursor() as cur:
|
||||||
# Following query should succeed if pages are properly guarded by leases.
|
|
||||||
cur.execute("SELECT count(*) FROM t0")
|
cur.execute("SELECT count(*) FROM t0")
|
||||||
assert cur.fetchone() == (ROW_COUNT,)
|
assert cur.fetchone() == (ROW_COUNT,)
|
||||||
|
|
||||||
log.info(f"`SELECT` query succeed after GC, {ctx=}")
|
|
||||||
|
|
||||||
# Insert some records on main branch
|
# Insert some records on main branch
|
||||||
with env.endpoints.create_start("main") as ep_main:
|
with env.endpoints.create_start("main") as ep_main:
|
||||||
with ep_main.cursor() as cur:
|
with ep_main.cursor() as cur:
|
||||||
@@ -211,9 +210,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
|||||||
# Wait for static compute to renew lease at least once.
|
# Wait for static compute to renew lease at least once.
|
||||||
time.sleep(LSN_LEASE_LENGTH / 2)
|
time.sleep(LSN_LEASE_LENGTH / 2)
|
||||||
|
|
||||||
generate_updates_on_main(env, ep_main, 3, end=100)
|
generate_updates_on_main(env, ep_main, i, end=100)
|
||||||
|
|
||||||
trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart")
|
trigger_gc_and_select(env, ep_static)
|
||||||
|
|
||||||
# Trigger Pageserver restarts
|
# Trigger Pageserver restarts
|
||||||
for ps in env.pageservers:
|
for ps in env.pageservers:
|
||||||
@@ -222,7 +221,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
|||||||
time.sleep(LSN_LEASE_LENGTH / 2)
|
time.sleep(LSN_LEASE_LENGTH / 2)
|
||||||
ps.start()
|
ps.start()
|
||||||
|
|
||||||
trigger_gc_and_select(env, ep_static, ctx="After pageservers restart")
|
trigger_gc_and_select(env, ep_static)
|
||||||
|
|
||||||
# Reconfigure pageservers
|
# Reconfigure pageservers
|
||||||
env.pageservers[0].stop()
|
env.pageservers[0].stop()
|
||||||
@@ -231,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
|
|||||||
)
|
)
|
||||||
env.storage_controller.reconcile_until_idle()
|
env.storage_controller.reconcile_until_idle()
|
||||||
|
|
||||||
trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline")
|
trigger_gc_and_select(env, ep_static)
|
||||||
|
|
||||||
# Do some update so we can increment latest_gc_cutoff
|
# Do some update so we can increment latest_gc_cutoff
|
||||||
generate_updates_on_main(env, ep_main, i, end=100)
|
generate_updates_on_main(env, ep_main, i, end=100)
|
||||||
|
|||||||
@@ -3,11 +3,11 @@ from __future__ import annotations
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId
|
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
|
||||||
from fixtures.compute_reconfigure import ComputeReconfigure
|
from fixtures.compute_reconfigure import ComputeReconfigure
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import (
|
from fixtures.neon_fixtures import (
|
||||||
@@ -188,9 +188,7 @@ def test_sharding_split_unsharded(
|
|||||||
"compact-shard-ancestors-persistent",
|
"compact-shard-ancestors-persistent",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_sharding_split_compaction(
|
def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
|
||||||
neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Test that after a split, we clean up parent layer data in the child shards via compaction.
|
Test that after a split, we clean up parent layer data in the child shards via compaction.
|
||||||
"""
|
"""
|
||||||
@@ -324,19 +322,9 @@ def test_sharding_split_compaction(
|
|||||||
# Physical size should shrink because layers are smaller
|
# Physical size should shrink because layers are smaller
|
||||||
assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
|
assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
|
||||||
|
|
||||||
# Validate filtering compaction actually happened
|
# Validate size statistics
|
||||||
for shard in shards:
|
for shard in shards:
|
||||||
ps = env.get_tenant_pageserver(shard)
|
ps = env.get_tenant_pageserver(shard)
|
||||||
|
|
||||||
log.info("scan all layer files for disposable keys, there shouldn't be any")
|
|
||||||
result = ps.timeline_scan_no_disposable_keys(shard, timeline_id)
|
|
||||||
tally = result.tally
|
|
||||||
raw_page_count = tally.not_disposable_count + tally.disposable_count
|
|
||||||
assert tally.not_disposable_count > (
|
|
||||||
raw_page_count // 2
|
|
||||||
), "compaction doesn't rewrite layers that are >=50pct local"
|
|
||||||
|
|
||||||
log.info("check sizes")
|
|
||||||
timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
|
timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
|
||||||
reported_size = timeline_info["current_physical_size"]
|
reported_size = timeline_info["current_physical_size"]
|
||||||
layer_paths = ps.list_layers(shard, timeline_id)
|
layer_paths = ps.list_layers(shard, timeline_id)
|
||||||
@@ -365,145 +353,6 @@ def test_sharding_split_compaction(
|
|||||||
workload.validate()
|
workload.validate()
|
||||||
|
|
||||||
|
|
||||||
def test_sharding_split_offloading(neon_env_builder: NeonEnvBuilder):
|
|
||||||
"""
|
|
||||||
Test that during a split, we don't miss archived and offloaded timelines.
|
|
||||||
"""
|
|
||||||
|
|
||||||
TENANT_CONF = {
|
|
||||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
|
||||||
"checkpoint_distance": 128 * 1024,
|
|
||||||
"compaction_threshold": 1,
|
|
||||||
"compaction_target_size": 128 * 1024,
|
|
||||||
# no PITR horizon, we specify the horizon when we request on-demand GC
|
|
||||||
"pitr_interval": "3600s",
|
|
||||||
# disable background compaction, GC and offloading. We invoke it manually when we want it to happen.
|
|
||||||
"gc_period": "0s",
|
|
||||||
"compaction_period": "0s",
|
|
||||||
# Disable automatic creation of image layers, as we will create them explicitly when we want them
|
|
||||||
"image_creation_threshold": 9999,
|
|
||||||
"image_layer_creation_check_threshold": 0,
|
|
||||||
"lsn_lease_length": "0s",
|
|
||||||
}
|
|
||||||
|
|
||||||
neon_env_builder.storage_controller_config = {
|
|
||||||
# Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
|
|
||||||
"max_offline": "30s",
|
|
||||||
"max_warming_up": "300s",
|
|
||||||
}
|
|
||||||
|
|
||||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
|
||||||
tenant_id = env.initial_tenant
|
|
||||||
timeline_id_main = env.initial_timeline
|
|
||||||
|
|
||||||
# Check that we created with an unsharded TenantShardId: this is the default,
|
|
||||||
# but check it in case we change the default in future
|
|
||||||
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None
|
|
||||||
|
|
||||||
workload_main = Workload(env, tenant_id, timeline_id_main, branch_name="main")
|
|
||||||
workload_main.init()
|
|
||||||
workload_main.write_rows(256)
|
|
||||||
workload_main.validate()
|
|
||||||
workload_main.stop()
|
|
||||||
|
|
||||||
# Create two timelines, archive one, offload the other
|
|
||||||
timeline_id_archived = env.create_branch("archived_not_offloaded")
|
|
||||||
timeline_id_offloaded = env.create_branch("archived_offloaded")
|
|
||||||
|
|
||||||
def timeline_id_set_for(list: list[dict[str, Any]]) -> set[TimelineId]:
|
|
||||||
return set(
|
|
||||||
map(
|
|
||||||
lambda t: TimelineId(t["timeline_id"]),
|
|
||||||
list,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
expected_offloaded_set = {timeline_id_offloaded}
|
|
||||||
expected_timeline_set = {timeline_id_main, timeline_id_archived}
|
|
||||||
|
|
||||||
with env.get_tenant_pageserver(tenant_id).http_client() as http_client:
|
|
||||||
http_client.timeline_archival_config(
|
|
||||||
tenant_id, timeline_id_archived, TimelineArchivalState.ARCHIVED
|
|
||||||
)
|
|
||||||
http_client.timeline_archival_config(
|
|
||||||
tenant_id, timeline_id_offloaded, TimelineArchivalState.ARCHIVED
|
|
||||||
)
|
|
||||||
http_client.timeline_offload(tenant_id, timeline_id_offloaded)
|
|
||||||
list = http_client.timeline_and_offloaded_list(tenant_id)
|
|
||||||
assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
|
|
||||||
assert timeline_id_set_for(list.timelines) == expected_timeline_set
|
|
||||||
|
|
||||||
# Do a full image layer generation before splitting
|
|
||||||
http_client.timeline_checkpoint(
|
|
||||||
tenant_id, timeline_id_main, force_image_layer_creation=True, wait_until_uploaded=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Split one shard into two
|
|
||||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)
|
|
||||||
|
|
||||||
# Let all shards move into their stable locations, so that during subsequent steps we
|
|
||||||
# don't have reconciles in progress (simpler to reason about what messages we expect in logs)
|
|
||||||
env.storage_controller.reconcile_until_idle()
|
|
||||||
|
|
||||||
# Check we got the shard IDs we expected
|
|
||||||
assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
|
|
||||||
assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
|
|
||||||
|
|
||||||
workload_main.validate()
|
|
||||||
workload_main.stop()
|
|
||||||
|
|
||||||
env.storage_controller.consistency_check()
|
|
||||||
|
|
||||||
# Ensure each shard has the same list of timelines and offloaded timelines
|
|
||||||
for shard in shards:
|
|
||||||
ps = env.get_tenant_pageserver(shard)
|
|
||||||
|
|
||||||
list = ps.http_client().timeline_and_offloaded_list(shard)
|
|
||||||
assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
|
|
||||||
assert timeline_id_set_for(list.timelines) == expected_timeline_set
|
|
||||||
|
|
||||||
ps.http_client().timeline_compact(shard, timeline_id_main)
|
|
||||||
|
|
||||||
# Check that we can still read all the data
|
|
||||||
workload_main.validate()
|
|
||||||
|
|
||||||
# Force a restart, which requires the state to be persisted.
|
|
||||||
env.pageserver.stop()
|
|
||||||
env.pageserver.start()
|
|
||||||
|
|
||||||
# Ensure each shard has the same list of timelines and offloaded timelines
|
|
||||||
for shard in shards:
|
|
||||||
ps = env.get_tenant_pageserver(shard)
|
|
||||||
|
|
||||||
list = ps.http_client().timeline_and_offloaded_list(shard)
|
|
||||||
assert timeline_id_set_for(list.offloaded) == expected_offloaded_set
|
|
||||||
assert timeline_id_set_for(list.timelines) == expected_timeline_set
|
|
||||||
|
|
||||||
ps.http_client().timeline_compact(shard, timeline_id_main)
|
|
||||||
|
|
||||||
# Compaction shouldn't make anything unreadable
|
|
||||||
workload_main.validate()
|
|
||||||
|
|
||||||
# Do sharded unarchival
|
|
||||||
env.storage_controller.timeline_archival_config(
|
|
||||||
tenant_id, timeline_id_offloaded, TimelineArchivalState.UNARCHIVED
|
|
||||||
)
|
|
||||||
env.storage_controller.timeline_archival_config(
|
|
||||||
tenant_id, timeline_id_archived, TimelineArchivalState.UNARCHIVED
|
|
||||||
)
|
|
||||||
|
|
||||||
for shard in shards:
|
|
||||||
ps = env.get_tenant_pageserver(shard)
|
|
||||||
|
|
||||||
list = ps.http_client().timeline_and_offloaded_list(shard)
|
|
||||||
assert timeline_id_set_for(list.offloaded) == set()
|
|
||||||
assert timeline_id_set_for(list.timelines) == {
|
|
||||||
timeline_id_main,
|
|
||||||
timeline_id_archived,
|
|
||||||
timeline_id_offloaded,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def test_sharding_split_smoke(
|
def test_sharding_split_smoke(
|
||||||
neon_env_builder: NeonEnvBuilder,
|
neon_env_builder: NeonEnvBuilder,
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ from fixtures.log_helper import log
|
|||||||
from fixtures.neon_fixtures import (
|
from fixtures.neon_fixtures import (
|
||||||
NeonEnv,
|
NeonEnv,
|
||||||
NeonEnvBuilder,
|
NeonEnvBuilder,
|
||||||
NeonPageserver,
|
|
||||||
PageserverAvailability,
|
PageserverAvailability,
|
||||||
PageserverSchedulingPolicy,
|
PageserverSchedulingPolicy,
|
||||||
PgBin,
|
PgBin,
|
||||||
@@ -299,20 +298,17 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
|
|||||||
env.storage_controller.consistency_check()
|
env.storage_controller.consistency_check()
|
||||||
|
|
||||||
|
|
||||||
def prepare_onboarding_env(
|
@pytest.mark.parametrize("warm_up", [True, False])
|
||||||
neon_env_builder: NeonEnvBuilder,
|
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
|
||||||
) -> tuple[NeonEnv, NeonPageserver, TenantId, int]:
|
|
||||||
"""
|
"""
|
||||||
For tests that do onboarding of a tenant to the storage controller, a small dance to
|
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
|
||||||
set up one pageserver that won't be managed by the storage controller and create
|
which provides the /location_config API. This is similar to creating a tenant,
|
||||||
a tenant there.
|
but imports the generation number.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# One pageserver to simulate legacy environment, two to be managed by storage controller
|
# One pageserver to simulate legacy environment, two to be managed by storage controller
|
||||||
neon_env_builder.num_pageservers = 3
|
neon_env_builder.num_pageservers = 3
|
||||||
|
|
||||||
# Enable tests to use methods that require real S3 API
|
|
||||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
|
||||||
|
|
||||||
# Start services by hand so that we can skip registration on one of the pageservers
|
# Start services by hand so that we can skip registration on one of the pageservers
|
||||||
env = neon_env_builder.init_configs()
|
env = neon_env_builder.init_configs()
|
||||||
env.broker.start()
|
env.broker.start()
|
||||||
@@ -333,6 +329,7 @@ def prepare_onboarding_env(
|
|||||||
# will be attached after onboarding
|
# will be attached after onboarding
|
||||||
env.pageservers[1].start()
|
env.pageservers[1].start()
|
||||||
env.pageservers[2].start()
|
env.pageservers[2].start()
|
||||||
|
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||||
|
|
||||||
for sk in env.safekeepers:
|
for sk in env.safekeepers:
|
||||||
sk.start()
|
sk.start()
|
||||||
@@ -342,23 +339,6 @@ def prepare_onboarding_env(
|
|||||||
generation = 123
|
generation = 123
|
||||||
origin_ps.tenant_create(tenant_id, generation=generation)
|
origin_ps.tenant_create(tenant_id, generation=generation)
|
||||||
|
|
||||||
origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
|
|
||||||
|
|
||||||
return (env, origin_ps, tenant_id, generation)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("warm_up", [True, False])
|
|
||||||
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
|
|
||||||
"""
|
|
||||||
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
|
|
||||||
which provides the /location_config API. This is similar to creating a tenant,
|
|
||||||
but imports the generation number.
|
|
||||||
"""
|
|
||||||
|
|
||||||
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
|
|
||||||
|
|
||||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
|
||||||
|
|
||||||
# As if doing a live migration, first configure origin into stale mode
|
# As if doing a live migration, first configure origin into stale mode
|
||||||
r = origin_ps.http_client().tenant_location_conf(
|
r = origin_ps.http_client().tenant_location_conf(
|
||||||
tenant_id,
|
tenant_id,
|
||||||
@@ -495,70 +475,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
|||||||
env.storage_controller.consistency_check()
|
env.storage_controller.consistency_check()
|
||||||
|
|
||||||
|
|
||||||
@run_only_on_default_postgres("this test doesn't start an endpoint")
|
|
||||||
def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder):
|
|
||||||
"""
|
|
||||||
Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver,
|
|
||||||
and also wasn't ever registered with the storage controller.
|
|
||||||
|
|
||||||
It may do this by calling /location_conf in mode Detached and then calling the delete API
|
|
||||||
as normal.
|
|
||||||
"""
|
|
||||||
|
|
||||||
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
|
|
||||||
|
|
||||||
remote_prefix = "/".join(
|
|
||||||
(
|
|
||||||
"tenants",
|
|
||||||
str(tenant_id),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Detach it from its original pageserver.
|
|
||||||
origin_ps.http_client().tenant_location_conf(
|
|
||||||
tenant_id,
|
|
||||||
{
|
|
||||||
"mode": "Detached",
|
|
||||||
"secondary_conf": None,
|
|
||||||
"tenant_conf": {},
|
|
||||||
"generation": None,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Since we will later assert that remote data is gone, as a control also check it was ever there
|
|
||||||
assert_prefix_not_empty(
|
|
||||||
neon_env_builder.pageserver_remote_storage,
|
|
||||||
prefix=remote_prefix,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Register with storage controller in Detached state
|
|
||||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
|
||||||
generation += 1
|
|
||||||
r = virtual_ps_http.tenant_location_conf(
|
|
||||||
tenant_id,
|
|
||||||
{
|
|
||||||
"mode": "Detached",
|
|
||||||
"secondary_conf": None,
|
|
||||||
"tenant_conf": {},
|
|
||||||
"generation": generation,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards
|
|
||||||
|
|
||||||
# Onboarding in Detached state shouldn't have attached it to any pageserver
|
|
||||||
for ps in env.pageservers:
|
|
||||||
assert ps.http_client().tenant_list() == []
|
|
||||||
|
|
||||||
# Delete it via the storage controller
|
|
||||||
virtual_ps_http.tenant_delete(tenant_id)
|
|
||||||
|
|
||||||
# Check that we really deleted it
|
|
||||||
assert_prefix_empty(
|
|
||||||
neon_env_builder.pageserver_remote_storage,
|
|
||||||
prefix=remote_prefix,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_storage_controller_compute_hook(
|
def test_storage_controller_compute_hook(
|
||||||
httpserver: HTTPServer,
|
httpserver: HTTPServer,
|
||||||
neon_env_builder: NeonEnvBuilder,
|
neon_env_builder: NeonEnvBuilder,
|
||||||
@@ -956,14 +872,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
|
|||||||
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
|
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
|
||||||
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
|
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
|
||||||
|
|
||||||
# Reconciler cancel API should be a no-op when nothing is in flight
|
|
||||||
env.storage_controller.request(
|
|
||||||
"PUT",
|
|
||||||
f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile",
|
|
||||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Node unclean drop API
|
|
||||||
response = env.storage_controller.request(
|
response = env.storage_controller.request(
|
||||||
"POST",
|
"POST",
|
||||||
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
|
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
|
||||||
@@ -971,7 +879,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
|
|||||||
)
|
)
|
||||||
assert len(env.storage_controller.node_list()) == 1
|
assert len(env.storage_controller.node_list()) == 1
|
||||||
|
|
||||||
# Tenant unclean drop API
|
|
||||||
response = env.storage_controller.request(
|
response = env.storage_controller.request(
|
||||||
"POST",
|
"POST",
|
||||||
f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
|
f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
|
||||||
@@ -985,6 +892,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
|
|||||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||||
)
|
)
|
||||||
assert len(response.json()) == 1
|
assert len(response.json()) == 1
|
||||||
|
|
||||||
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
|
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
|
||||||
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
|
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
|
||||||
env.storage_controller.consistency_check()
|
env.storage_controller.consistency_check()
|
||||||
@@ -1752,11 +1660,6 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
|
|||||||
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
|
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
|
||||||
assert "Stop" in storcon_cli(["tenants"])[3]
|
assert "Stop" in storcon_cli(["tenants"])[3]
|
||||||
|
|
||||||
# Cancel ongoing reconcile on a tenant
|
|
||||||
storcon_cli(
|
|
||||||
["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Change a tenant's placement
|
# Change a tenant's placement
|
||||||
storcon_cli(
|
storcon_cli(
|
||||||
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
|
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
|
||||||
|
|||||||
@@ -435,9 +435,7 @@ def test_emergency_relocate_with_branches_slow_replay(
|
|||||||
|
|
||||||
# This fail point will pause the WAL ingestion on the main branch, after the
|
# This fail point will pause the WAL ingestion on the main branch, after the
|
||||||
# the first insert
|
# the first insert
|
||||||
pageserver_http.configure_failpoints(
|
pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
|
||||||
[("pageserver-wal-ingest-logical-message-sleep", "return(5000)")]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Attach and wait a few seconds to give it time to load the tenants, attach to the
|
# Attach and wait a few seconds to give it time to load the tenants, attach to the
|
||||||
# safekeepers, and to stream and ingest the WAL up to the pause-point.
|
# safekeepers, and to stream and ingest the WAL up to the pause-point.
|
||||||
@@ -455,13 +453,11 @@ def test_emergency_relocate_with_branches_slow_replay(
|
|||||||
assert cur.fetchall() == [("before pause",), ("after pause",)]
|
assert cur.fetchall() == [("before pause",), ("after pause",)]
|
||||||
|
|
||||||
# Sanity check that the failpoint was reached
|
# Sanity check that the failpoint was reached
|
||||||
env.pageserver.assert_log_contains(
|
env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
|
||||||
'failpoint "pageserver-wal-ingest-logical-message-sleep": sleep done'
|
|
||||||
)
|
|
||||||
assert time.time() - before_attach_time > 5
|
assert time.time() - before_attach_time > 5
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
pageserver_http.configure_failpoints(("pageserver-wal-ingest-logical-message-sleep", "off"))
|
pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
|
||||||
|
|
||||||
|
|
||||||
# Simulate hard crash of pageserver and re-attach a tenant with a branch
|
# Simulate hard crash of pageserver and re-attach a tenant with a branch
|
||||||
@@ -585,9 +581,7 @@ def test_emergency_relocate_with_branches_createdb(
|
|||||||
# bug reproduced easily even without this, as there is always some delay between
|
# bug reproduced easily even without this, as there is always some delay between
|
||||||
# loading the timeline and establishing the connection to the safekeeper to stream and
|
# loading the timeline and establishing the connection to the safekeeper to stream and
|
||||||
# ingest the WAL, but let's make this less dependent on accidental timing.
|
# ingest the WAL, but let's make this less dependent on accidental timing.
|
||||||
pageserver_http.configure_failpoints(
|
pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
|
||||||
[("pageserver-wal-ingest-logical-message-sleep", "return(5000)")]
|
|
||||||
)
|
|
||||||
before_attach_time = time.time()
|
before_attach_time = time.time()
|
||||||
env.pageserver.tenant_attach(tenant_id)
|
env.pageserver.tenant_attach(tenant_id)
|
||||||
|
|
||||||
@@ -596,10 +590,8 @@ def test_emergency_relocate_with_branches_createdb(
|
|||||||
assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
|
assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
|
||||||
|
|
||||||
# Sanity check that the failpoint was reached
|
# Sanity check that the failpoint was reached
|
||||||
env.pageserver.assert_log_contains(
|
env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
|
||||||
'failpoint "pageserver-wal-ingest-logical-message-sleep": sleep done'
|
|
||||||
)
|
|
||||||
assert time.time() - before_attach_time > 5
|
assert time.time() - before_attach_time > 5
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
pageserver_http.configure_failpoints(("pageserver-wal-ingest-logical-message-sleep", "off"))
|
pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
|
||||||
|
|||||||
@@ -1998,109 +1998,6 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
|
|||||||
pt_handle.join()
|
pt_handle.join()
|
||||||
|
|
||||||
|
|
||||||
def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
|
|
||||||
"""
|
|
||||||
Verify that when pull_timeline is used on an evicted timeline, it does not result in
|
|
||||||
promoting any segments to local disk on the source, and the timeline is correctly instantiated
|
|
||||||
in evicted state on the destination. This behavior is important to avoid ballooning disk
|
|
||||||
usage when doing mass migration of timelines.
|
|
||||||
"""
|
|
||||||
neon_env_builder.num_safekeepers = 4
|
|
||||||
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
|
|
||||||
|
|
||||||
# Configure safekeepers with ultra-fast eviction policy
|
|
||||||
neon_env_builder.safekeeper_extra_opts = [
|
|
||||||
"--enable-offload",
|
|
||||||
"--partial-backup-timeout",
|
|
||||||
"50ms",
|
|
||||||
"--control-file-save-interval",
|
|
||||||
"1s",
|
|
||||||
# Safekeepers usually wait a while before evicting something: for this test we want them to
|
|
||||||
# evict things as soon as they are inactive.
|
|
||||||
"--eviction-min-resident=100ms",
|
|
||||||
"--delete-offloaded-wal",
|
|
||||||
]
|
|
||||||
|
|
||||||
initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
|
|
||||||
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
|
|
||||||
tenant_id = env.initial_tenant
|
|
||||||
timeline_id = env.initial_timeline
|
|
||||||
|
|
||||||
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1])
|
|
||||||
log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}")
|
|
||||||
|
|
||||||
ep = env.endpoints.create("main")
|
|
||||||
ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id]
|
|
||||||
log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}")
|
|
||||||
ep.active_safekeepers = [1, 2, 3] # Exclude dst_sk from set written by compute initially
|
|
||||||
ep.start()
|
|
||||||
ep.safe_psql("CREATE TABLE t(i int)")
|
|
||||||
ep.safe_psql("INSERT INTO t VALUES (0)")
|
|
||||||
ep.stop()
|
|
||||||
|
|
||||||
wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver)
|
|
||||||
|
|
||||||
src_http = src_sk.http_client()
|
|
||||||
dst_http = dst_sk.http_client()
|
|
||||||
|
|
||||||
def evicted_on_source():
|
|
||||||
# Wait for timeline to go into evicted state
|
|
||||||
assert src_http.get_eviction_state(timeline_id) != "Present"
|
|
||||||
assert (
|
|
||||||
src_http.get_metric_value(
|
|
||||||
"safekeeper_eviction_events_completed_total", {"kind": "evict"}
|
|
||||||
)
|
|
||||||
or 0 > 0
|
|
||||||
)
|
|
||||||
assert src_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0
|
|
||||||
# Check that on source no segment files are present
|
|
||||||
assert src_sk.list_segments(tenant_id, timeline_id) == []
|
|
||||||
|
|
||||||
wait_until(60, 1, evicted_on_source)
|
|
||||||
|
|
||||||
# Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk,
|
|
||||||
# destination should import the control file only & go into evicted mode immediately
|
|
||||||
dst_sk.pull_timeline([src_sk], tenant_id, timeline_id)
|
|
||||||
|
|
||||||
# Check that on source and destination no segment files are present
|
|
||||||
assert src_sk.list_segments(tenant_id, timeline_id) == []
|
|
||||||
assert dst_sk.list_segments(tenant_id, timeline_id) == []
|
|
||||||
|
|
||||||
# Check that the timeline on the destination is in the expected evicted state.
|
|
||||||
evicted_on_source() # It should still be evicted on the source
|
|
||||||
|
|
||||||
def evicted_on_destination():
|
|
||||||
assert dst_http.get_eviction_state(timeline_id) != "Present"
|
|
||||||
assert dst_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0
|
|
||||||
|
|
||||||
# This should be fast, it is a wait_until because eviction state is updated
|
|
||||||
# in the background wrt pull_timeline.
|
|
||||||
wait_until(10, 0.1, evicted_on_destination)
|
|
||||||
|
|
||||||
# Delete the timeline on the source, to prove that deletion works on an
|
|
||||||
# evicted timeline _and_ that the final compute test is really not using
|
|
||||||
# the original location
|
|
||||||
src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True)
|
|
||||||
|
|
||||||
# Check that using the timeline correctly un-evicts it on the new location
|
|
||||||
ep.active_safekeepers = [2, 3, 4]
|
|
||||||
ep.start()
|
|
||||||
ep.safe_psql("INSERT INTO t VALUES (0)")
|
|
||||||
ep.stop()
|
|
||||||
|
|
||||||
def unevicted_on_dest():
|
|
||||||
assert (
|
|
||||||
dst_http.get_metric_value(
|
|
||||||
"safekeeper_eviction_events_completed_total", {"kind": "restore"}
|
|
||||||
)
|
|
||||||
or 0 > 0
|
|
||||||
)
|
|
||||||
n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines")
|
|
||||||
assert n_evicted == 0
|
|
||||||
|
|
||||||
wait_until(10, 1, unevicted_on_dest)
|
|
||||||
|
|
||||||
|
|
||||||
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
|
# In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
|
||||||
# when compute is active, but there are no writes to the timeline. In that case
|
# when compute is active, but there are no writes to the timeline. In that case
|
||||||
# pageserver should maintain a single connection to safekeeper and don't attempt
|
# pageserver should maintain a single connection to safekeeper and don't attempt
|
||||||
|
|||||||
@@ -1,12 +1,11 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import time
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from fixtures.common_types import Lsn, TenantId
|
from fixtures.common_types import Lsn, TenantId
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
|
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
|
||||||
from fixtures.utils import wait_until
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -20,10 +19,6 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
|
|||||||
env = neon_env_builder.init_start()
|
env = neon_env_builder.init_start()
|
||||||
env.pageserver.http_client()
|
env.pageserver.http_client()
|
||||||
|
|
||||||
# In this test we force 'Timed out while waiting for WAL record error' while
|
|
||||||
# fetching basebackup and don't want any retries.
|
|
||||||
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
|
|
||||||
|
|
||||||
tenant_id, timeline_id = env.create_tenant()
|
tenant_id, timeline_id = env.create_tenant()
|
||||||
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
|
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
|
||||||
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
|
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
|
||||||
@@ -54,14 +49,11 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
|
|||||||
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
|
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
|
||||||
# Trigger WAL wait timeout faster
|
# Trigger WAL wait timeout faster
|
||||||
def customize_pageserver_toml(ps_cfg: dict[str, Any]):
|
def customize_pageserver_toml(ps_cfg: dict[str, Any]):
|
||||||
ps_cfg["wait_lsn_timeout"] = "2s"
|
ps_cfg["wait_lsn_timeout"] = "1s"
|
||||||
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
||||||
tenant_config["walreceiver_connect_timeout"] = "2s"
|
tenant_config["walreceiver_connect_timeout"] = "2s"
|
||||||
tenant_config["lagging_wal_timeout"] = "2s"
|
tenant_config["lagging_wal_timeout"] = "2s"
|
||||||
|
|
||||||
# In this test we force 'Timed out while waiting for WAL record error' while
|
|
||||||
# fetching basebackup and don't want any retries.
|
|
||||||
os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
|
|
||||||
neon_env_builder.pageserver_config_override = customize_pageserver_toml
|
neon_env_builder.pageserver_config_override = customize_pageserver_toml
|
||||||
|
|
||||||
# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
|
# Have notable SK ids to ensure we check logs for their presence, not some other random numbers
|
||||||
@@ -72,6 +64,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
|
|||||||
|
|
||||||
tenant_id, timeline_id = env.create_tenant()
|
tenant_id, timeline_id = env.create_tenant()
|
||||||
|
|
||||||
|
elements_to_insert = 1_000_000
|
||||||
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
|
expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
|
||||||
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
|
env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
|
||||||
# we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
|
# we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
|
||||||
@@ -81,50 +74,45 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
|
|||||||
".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
|
".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
|
||||||
)
|
)
|
||||||
|
|
||||||
insert_test_elements(env, tenant_id, start=0, count=1)
|
insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)
|
||||||
|
|
||||||
def all_sks_in_wareceiver_state():
|
try:
|
||||||
try:
|
trigger_wait_lsn_timeout(env, tenant_id)
|
||||||
trigger_wait_lsn_timeout(env, tenant_id)
|
except Exception as e:
|
||||||
except Exception as e:
|
exception_string = str(e)
|
||||||
exception_string = str(e)
|
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
|
||||||
|
|
||||||
|
for safekeeper in env.safekeepers:
|
||||||
assert (
|
assert (
|
||||||
expected_timeout_error in exception_string
|
str(safekeeper.id) in exception_string
|
||||||
), "Should time out during waiting for WAL"
|
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
|
||||||
|
|
||||||
for safekeeper in env.safekeepers:
|
|
||||||
assert (
|
|
||||||
str(safekeeper.id) in exception_string
|
|
||||||
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
|
|
||||||
|
|
||||||
wait_until(60, 0.5, all_sks_in_wareceiver_state)
|
|
||||||
|
|
||||||
stopped_safekeeper = env.safekeepers[-1]
|
stopped_safekeeper = env.safekeepers[-1]
|
||||||
stopped_safekeeper_id = stopped_safekeeper.id
|
stopped_safekeeper_id = stopped_safekeeper.id
|
||||||
log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
|
log.info(f"Stopping safekeeper {stopped_safekeeper.id}")
|
||||||
stopped_safekeeper.stop()
|
stopped_safekeeper.stop()
|
||||||
|
# sleep until stopped safekeeper is removed from candidates
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
def all_but_stopped_sks_in_wareceiver_state():
|
# Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats.
|
||||||
try:
|
insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert)
|
||||||
trigger_wait_lsn_timeout(env, tenant_id)
|
|
||||||
except Exception as e:
|
|
||||||
# Strip out the part before stdout, as it contains full command with the list of all safekeepers
|
|
||||||
exception_string = str(e).split("stdout", 1)[-1]
|
|
||||||
assert (
|
|
||||||
expected_timeout_error in exception_string
|
|
||||||
), "Should time out during waiting for WAL"
|
|
||||||
|
|
||||||
for safekeeper in env.safekeepers:
|
try:
|
||||||
if safekeeper.id == stopped_safekeeper_id:
|
trigger_wait_lsn_timeout(env, tenant_id)
|
||||||
assert (
|
except Exception as e:
|
||||||
str(safekeeper.id) not in exception_string
|
# Strip out the part before stdout, as it contains full command with the list of all safekeepers
|
||||||
), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
|
exception_string = str(e).split("stdout", 1)[-1]
|
||||||
else:
|
assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
|
||||||
assert (
|
|
||||||
str(safekeeper.id) in exception_string
|
|
||||||
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
|
|
||||||
|
|
||||||
wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state)
|
for safekeeper in env.safekeepers:
|
||||||
|
if safekeeper.id == stopped_safekeeper_id:
|
||||||
|
assert (
|
||||||
|
str(safekeeper.id) not in exception_string
|
||||||
|
), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
|
||||||
|
else:
|
||||||
|
assert (
|
||||||
|
str(safekeeper.id) in exception_string
|
||||||
|
), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
|
||||||
|
|
||||||
|
|
||||||
def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):
|
def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):
|
||||||
|
|||||||
Reference in New Issue
Block a user