Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-17 10:22:56 +00:00
Merge pull request #9573 from neondatabase/releases/2024-10-29-compute-only-2
Compute release 2024-10-29
.github/workflows/benchmarking.yml (vendored) | 6
@@ -671,6 +671,10 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init

# Increase timeout to 12h, default timeout is 6h
# we have regression in clickbench causing it to run 2-3x longer
timeout-minutes: 720

steps:
- uses: actions/checkout@v4

@@ -716,7 +720,7 @@ jobs:
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

.github/workflows/build_and_test.yml (vendored) | 6
@@ -839,6 +839,7 @@ jobs:
- name: Build vm image
run: |
./vm-builder \
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
@@ -1116,7 +1117,10 @@ jobs:

gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployProxyLink=true \
-f deployPrivatelinkProxy=true \
-f deployProxyScram=true \
-f deployProxyAuthBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
else

Cargo.lock (generated) | 20
@@ -3749,6 +3749,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"wal_decoder",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -4186,6 +4187,7 @@ dependencies = [
|
||||
"regex",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
"utils",
|
||||
]
|
||||
|
||||
@@ -6272,7 +6274,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6788,7 +6790,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
@@ -6954,6 +6956,20 @@ dependencies = [
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wal_decoder"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.3"
|
||||
|
||||
@@ -33,6 +33,7 @@ members = [
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
"libs/walproposer",
"libs/wal_decoder",
]

[workspace.package]
@@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
walproposer = { version = "0.1", path = "./libs/walproposer/" }
wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }

## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }

@@ -666,7 +666,7 @@ RUN apt-get update && \
|
||||
#
|
||||
# Use new version only for v17
|
||||
# because Release_2024_09_1 has some backward incompatible changes
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
@@ -860,18 +860,98 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
|
||||
esac && \
|
||||
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
case "${PG_VERSION}" in \
|
||||
'v17') \
|
||||
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
|
||||
esac && \
|
||||
cargo install --locked --version 0.11.3 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions pgrx12"
|
||||
#
|
||||
# pgrx started to support Postgres 17 since version 12,
|
||||
# but some older extension aren't compatible with it.
|
||||
# This layer should be used as a base for new pgrx extensions,
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install --no-install-recommends -y curl libclang-dev && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.12.6 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "pg-onnx-build" and "pgrag-pg-build"
|
||||
# Compile "pgrag" extensions
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-onnx-build
|
||||
|
||||
# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25).
|
||||
# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
|
||||
RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
|
||||
python3 -m venv venv && \
|
||||
. venv/bin/activate && \
|
||||
python3 -m pip install cmake==3.30.5 && \
|
||||
wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root
|
||||
|
||||
|
||||
FROM pg-onnx-build AS pgrag-pg-build
|
||||
|
||||
RUN apt-get install -y protobuf-compiler && \
|
||||
wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
|
||||
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
|
||||
\
|
||||
cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
|
||||
\
|
||||
cd ../rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
|
||||
\
|
||||
cd ../rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
|
||||
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
@@ -1041,6 +1121,31 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_mooncake"
|
||||
# compile pg_mooncake extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM rust-extensions-build AS pg-mooncake-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
'v14') \
|
||||
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
|
||||
esac && \
|
||||
git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \
|
||||
cd pg_mooncake-src && \
|
||||
git checkout "${PG_MOONCAKE_VERSION}" && \
|
||||
git submodule update --init --depth 1 --recursive && \
|
||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -1059,6 +1164,7 @@ COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1084,6 +1190,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -1247,6 +1354,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.patch /ext-src/
|
||||
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
|
||||
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
|
||||
@@ -18,7 +18,7 @@ commands:
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
- name: local_proxy
user: postgres
sysvInitAction: respawn

@@ -18,7 +18,7 @@ commands:
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
- name: local_proxy
user: postgres
sysvInitAction: respawn

@@ -17,7 +17,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -399,11 +399,6 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
@@ -499,11 +494,6 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
|
||||
@@ -111,6 +111,11 @@ enum Command {
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Cancel any ongoing reconciliation for this shard
|
||||
TenantShardCancelReconcile {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
},
|
||||
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
||||
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
||||
TenantConfig {
|
||||
@@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardCancelReconcile { tenant_shard_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantConfig { tenant_id, config } => {
|
||||
let tenant_conf = serde_json::from_str(&config)?;
|
||||
|
||||
|
||||
@@ -250,12 +250,6 @@ pub struct TenantConfigToml {
|
||||
// Expresed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: crate::models::AuxFilePolicy,
|
||||
|
||||
/// The length for an explicit LSN lease request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -475,7 +469,6 @@ impl Default for TenantConfigToml {
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
}
|
||||
|
||||
@@ -5,9 +5,11 @@ pub mod controller_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod record;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub mod value;

pub mod config;

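This release also moves the `Key`, `Value` and `NeonWalRecord` types out of pageserver-internal modules and into `pageserver_api` (the new `record` and `value` modules registered above), so that the new `wal_decoder` crate can depend on them. A minimal sketch of the new import style; the helper function below is hypothetical and only restates what `Value::will_init` already does:

use pageserver_api::value::Value;

/// A value needs no older WAL history if it is a full page image, or a WAL
/// record that re-initializes the page from scratch.
fn needs_no_history(v: &Value) -> bool {
    match v {
        Value::Image(_) => true,
        Value::WalRecord(rec) => rec.will_init(),
    }
}
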
@@ -10,7 +10,6 @@ use std::{
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -309,7 +308,6 @@ pub struct TenantConfig {
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
pub lsn_lease_length: Option<String>,
|
||||
pub lsn_lease_length_for_ts: Option<String>,
|
||||
}
|
||||
@@ -350,68 +348,6 @@ pub enum AuxFilePolicy {
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||
matches!(
|
||||
(from, to),
|
||||
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||
)
|
||||
}
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V2
|
||||
}
|
||||
}
|
||||
|
||||
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
|
||||
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||
|
||||
impl AtomicAuxFilePolicy {
|
||||
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||
Self(AtomicUsize::new(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||
0 => None,
|
||||
other => Some(AuxFilePolicy::from_usize(other)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||
self.0.store(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
std::sync::atomic::Ordering::Release,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Self::V1 => 1,
|
||||
Self::CrossValidation => 2,
|
||||
Self::V2 => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||
match this {
|
||||
1 => Some(Self::V1),
|
||||
2 => Some(Self::CrossValidation),
|
||||
3 => Some(Self::V2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_usize(this: usize) -> Self {
|
||||
Self::try_from_usize(this).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
@@ -1633,71 +1569,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_file_migration_path() {
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations not allowed
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations allowed
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_parse() {
|
||||
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(
|
||||
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_image_compression_algorithm_parsing() {
|
||||
use ImageCompressionAlgorithm::*;
|
||||
|
||||
libs/pageserver_api/src/record.rs (new file) | 113
@@ -0,0 +1,113 @@
//! This module defines the WAL record format used within the pageserver.

use bytes::Bytes;
use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember};
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use utils::bin_ser::DeserializeError;

/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NeonWalRecord {
    /// Native PostgreSQL WAL record
    Postgres { will_init: bool, rec: Bytes },

    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
    ClearVisibilityMapFlags {
        new_heap_blkno: Option<u32>,
        old_heap_blkno: Option<u32>,
        flags: u8,
    },
    /// Mark transaction IDs as committed on a CLOG page
    ClogSetCommitted {
        xids: Vec<TransactionId>,
        timestamp: TimestampTz,
    },
    /// Mark transaction IDs as aborted on a CLOG page
    ClogSetAborted { xids: Vec<TransactionId> },
    /// Extend multixact offsets SLRU
    MultixactOffsetCreate {
        mid: MultiXactId,
        moff: MultiXactOffset,
    },
    /// Extend multixact members SLRU.
    MultixactMembersCreate {
        moff: MultiXactOffset,
        members: Vec<MultiXactMember>,
    },
    /// Update the map of AUX files, either writing or dropping an entry
    AuxFile {
        file_path: String,
        content: Option<Bytes>,
    },

    /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
    #[cfg(feature = "testing")]
    Test {
        /// Append a string to the image.
        append: String,
        /// Clear the image before appending.
        clear: bool,
        /// Treat this record as an init record. `clear` should be set to true if this field is set
        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
        /// its references in `timeline.rs`.
        will_init: bool,
    },
}

impl NeonWalRecord {
    /// Does replaying this WAL record initialize the page from scratch, or does
    /// it need to be applied over the previous image of the page?
    pub fn will_init(&self) -> bool {
        // If you change this function, you'll also need to change ValueBytes::will_init
        match self {
            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
            #[cfg(feature = "testing")]
            NeonWalRecord::Test { will_init, .. } => *will_init,
            // None of the special neon record types currently initialize the page
            _ => false,
        }
    }

    #[cfg(feature = "testing")]
    pub fn wal_append(s: impl AsRef<str>) -> Self {
        Self::Test {
            append: s.as_ref().to_string(),
            clear: false,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
    pub fn wal_clear() -> Self {
        Self::Test {
            append: "".to_string(),
            clear: true,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
    pub fn wal_init() -> Self {
        Self::Test {
            append: "".to_string(),
            clear: true,
            will_init: true,
        }
    }
}

/// Build a human-readable string to describe a WAL record
///
/// For debugging purposes
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
    match rec {
        NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
            "will_init: {}, {}",
            will_init,
            describe_postgres_wal_record(rec)?
        )),
        _ => Ok(format!("{:?}", rec)),
    }
}

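The `will_init` flag matters downstream: page reconstruction only needs the records from the most recent initializing record onward. The trimming helper below is a hypothetical illustration of that idea, not code from this commit:

use pageserver_api::record::NeonWalRecord;
use utils::lsn::Lsn;

/// Keep only the suffix of the record chain starting at the newest record that
/// re-initializes the page; everything older is not needed for replay.
fn trim_to_last_init(records: &[(Lsn, NeonWalRecord)]) -> &[(Lsn, NeonWalRecord)] {
    match records.iter().rposition(|(_, rec)| rec.will_init()) {
        Some(idx) => &records[idx..],
        None => records,
    }
}
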
@@ -1,13 +1,16 @@
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
//! This module defines the value type used by the storage engine.
|
||||
//!
|
||||
//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
|
||||
//! or a "delta" of how to get from previous version of the value to the new one
|
||||
//! ([`Value::WalRecord`]])
|
||||
//!
|
||||
//! Note that the [`Value`] type is used for the permananent storage format, so any
|
||||
//! changes to it must be backwards compatible.
|
||||
|
||||
use crate::record::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::AddAssign;
|
||||
use std::time::Duration;
|
||||
|
||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
/// A 'value' stored for a one Key.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum Value {
|
||||
/// An Image value contains a full copy of the value
|
||||
@@ -20,10 +23,12 @@ pub enum Value {
|
||||
}
|
||||
|
||||
impl Value {
|
||||
#[inline(always)]
|
||||
pub fn is_image(&self) -> bool {
|
||||
matches!(self, Value::Image(_))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn will_init(&self) -> bool {
|
||||
match self {
|
||||
Value::Image(_) => true,
|
||||
@@ -33,17 +38,18 @@ impl Value {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) enum InvalidInput {
|
||||
pub enum InvalidInput {
|
||||
TooShortValue,
|
||||
TooShortPostgresRecord,
|
||||
}
|
||||
|
||||
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
|
||||
/// use this type for querying if a slice looks some particular way.
|
||||
pub(crate) struct ValueBytes;
|
||||
pub struct ValueBytes;
|
||||
|
||||
impl ValueBytes {
|
||||
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
#[inline(always)]
|
||||
pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
if raw.len() < 12 {
|
||||
return Err(InvalidInput::TooShortValue);
|
||||
}
|
||||
@@ -79,6 +85,7 @@ impl ValueBytes {
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
macro_rules! roundtrip {
|
||||
@@ -229,56 +236,3 @@ mod test {
|
||||
assert!(!ValueBytes::will_init(&expected).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_needed_by_leases: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||
pub elapsed: Duration,
|
||||
|
||||
/// The layers which were garbage collected.
|
||||
///
|
||||
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
|
||||
/// dropped in tests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[serde(skip)]
|
||||
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
|
||||
}
|
||||
|
||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
||||
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
d.as_millis().serialize(serializer)
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.layers_total += other.layers_total;
|
||||
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
|
||||
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
|
||||
self.layers_needed_by_branches += other.layers_needed_by_branches;
|
||||
self.layers_needed_by_leases += other.layers_needed_by_leases;
|
||||
self.layers_not_updated += other.layers_not_updated;
|
||||
self.layers_removed += other.layers_removed;
|
||||
|
||||
self.elapsed += other.elapsed;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
let mut other = other;
|
||||
self.doomed_layers.append(&mut other.doomed_layers);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15,6 +15,7 @@ memoffset.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
|
||||
@@ -217,6 +217,7 @@ macro_rules! enum_pgversion {
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod walrecord;
|
||||
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::RepOriginId;
|
||||
|
||||
File diff suppressed because it is too large
libs/wal_decoder/Cargo.toml (new file) | 18
@@ -0,0 +1,18 @@
[package]
name = "wal_decoder"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[features]
testing = []

[dependencies]
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libs/wal_decoder/src/decoder.rs (new file) | 1
@@ -0,0 +1 @@

libs/wal_decoder/src/lib.rs (new file) | 2
@@ -0,0 +1,2 @@
pub mod decoder;
pub mod models;
libs/wal_decoder/src/models.rs (new file) | 167
@@ -0,0 +1,167 @@
//! This module houses types which represent decoded PG WAL records
//! ready for the pageserver to interpret. They are derived from the original
//! WAL records, so that each struct corresponds closely to one WAL record of
//! a specific kind. They contain the same information as the original WAL records,
//! just decoded into structs and fields for easier access.
//!
//! The ingestion code uses these structs to help with parsing the WAL records,
//! and it splits them into a stream of modifications to the key-value pairs that
//! are ultimately stored in delta layers. See also the split-out counterparts in
//! [`postgres_ffi::walrecord`].
//!
//! The pipeline which processes WAL records is not super obvious, so let's follow
//! the flow of an example XACT_COMMIT Postgres record:
//!
//! (Postgres XACT_COMMIT record)
//! |
//! |--> pageserver::walingest::WalIngest::decode_xact_record
//! |
//! |--> ([`XactRecord::Commit`])
//! |
//! |--> pageserver::walingest::WalIngest::ingest_xact_record
//! |
//! |--> (NeonWalRecord::ClogSetCommitted)
//! |
//! |--> write to KV store within the pageserver

use bytes::Bytes;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::walrecord::{
    XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
    XlSmgrTruncate, XlXactParsedRecord,
};
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;

pub enum HeapamRecord {
    ClearVmBits(ClearVmBits),
}

pub struct ClearVmBits {
    pub new_heap_blkno: Option<u32>,
    pub old_heap_blkno: Option<u32>,
    pub vm_rel: RelTag,
    pub flags: u8,
}

pub enum NeonrmgrRecord {
    ClearVmBits(ClearVmBits),
}

pub enum SmgrRecord {
    Create(SmgrCreate),
    Truncate(XlSmgrTruncate),
}

pub struct SmgrCreate {
    pub rel: RelTag,
}

pub enum DbaseRecord {
    Create(DbaseCreate),
    Drop(DbaseDrop),
}

pub struct DbaseCreate {
    pub db_id: Oid,
    pub tablespace_id: Oid,
    pub src_db_id: Oid,
    pub src_tablespace_id: Oid,
}

pub struct DbaseDrop {
    pub db_id: Oid,
    pub tablespace_ids: Vec<Oid>,
}

pub enum ClogRecord {
    ZeroPage(ClogZeroPage),
    Truncate(ClogTruncate),
}

pub struct ClogZeroPage {
    pub segno: u32,
    pub rpageno: u32,
}

pub struct ClogTruncate {
    pub pageno: u32,
    pub oldest_xid: TransactionId,
    pub oldest_xid_db: Oid,
}

pub enum XactRecord {
    Commit(XactCommon),
    Abort(XactCommon),
    CommitPrepared(XactCommon),
    AbortPrepared(XactCommon),
    Prepare(XactPrepare),
}

pub struct XactCommon {
    pub parsed: XlXactParsedRecord,
    pub origin_id: u16,
    // Fields below are only used for logging
    pub xl_xid: TransactionId,
    pub lsn: Lsn,
}

pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
}

pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
}

pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
}

pub enum RelmapRecord {
    Update(RelmapUpdate),
}

pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
}

pub enum XlogRecord {
    Raw(RawXlogRecord),
}

pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
}

pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
    #[cfg(feature = "testing")]
    Failpoint,
}

pub struct PutLogicalMessage {
    pub path: String,
    pub buf: Bytes,
}

pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
}

pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
}

pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
}

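Per the module comment above, these decoded structs sit between WAL decoding and ingestion: `WalIngest` turns them into `NeonWalRecord` values that are written to the pageserver's key-value store. The sketch below illustrates that second step for a top-level commit; the function name and the explicit timestamp parameter are assumptions for illustration, not the actual `ingest_xact_record` implementation:

use pageserver_api::record::NeonWalRecord;
use postgres_ffi::TimestampTz;
use wal_decoder::models::XactCommon;

/// Translate a decoded top-level commit into the CLOG update stored per page.
/// The real ingest path also handles subtransaction XIDs carried in `xact.parsed`.
fn commit_to_clog_record(xact: &XactCommon, commit_timestamp: TimestampTz) -> NeonWalRecord {
    NeonWalRecord::ClogSetCommitted {
        xids: vec![xact.xl_xid],
        timestamp: commit_timestamp,
    }
}
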
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing" ]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -83,6 +83,7 @@ enum-map.workspace = true
|
||||
enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
|
||||
@@ -8,13 +8,12 @@ use pageserver::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use criterion::measurement::WallTime;
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
|
||||
@@ -60,7 +60,8 @@ use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
future::Future,
|
||||
|
||||
@@ -51,7 +51,7 @@
|
||||
//!
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver_api::key::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -14,12 +14,12 @@ use std::ops::Range;
|
||||
use std::{fs, str};
|
||||
|
||||
use pageserver::page_cache::{self, PAGE_SZ};
|
||||
use pageserver::repository::{Key, KEY_SIZE};
|
||||
use pageserver::tenant::block_io::FileBlockReader;
|
||||
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
|
||||
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
|
||||
use pageserver::tenant::storage_layer::range_overlaps;
|
||||
use pageserver::virtual_file::{self, VirtualFile};
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
|
||||
@@ -14,13 +14,13 @@ use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::virtual_file::api::IoMode;
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver::{
|
||||
repository::{Key, KEY_SIZE},
|
||||
tenant::{
|
||||
block_io::FileBlockReader, disk_btree::VisitDirection,
|
||||
storage_layer::delta_layer::DELTA_KEY_SIZE,
|
||||
},
|
||||
virtual_file::VirtualFile,
|
||||
};
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use std::fs;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::models::{TenantConfig, TenantConfigRequest};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
mgmt_api_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -696,7 +696,7 @@ impl DeletionQueue {
|
||||
mod test {
|
||||
use camino::Utf8Path;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use std::{io::ErrorKind, time::Duration};
|
||||
use tracing::info;
|
||||
|
||||
@@ -705,7 +705,6 @@ mod test {
|
||||
|
||||
use crate::{
|
||||
controller_upcall_client::RetryForeverError,
|
||||
repository::Key,
|
||||
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
|
||||
};
|
||||
|
||||
|
||||
@@ -2232,13 +2232,13 @@ async fn getpage_at_lsn_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
struct Key(crate::repository::Key);
|
||||
struct Key(pageserver_api::key::Key);
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
crate::repository::Key::from_hex(s).map(Key)
|
||||
pageserver_api::key::Key::from_hex(s).map(Key)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,12 +19,11 @@ use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::decode_wal_record;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use postgres_ffi::DBState_DB_SHUTDOWNED;
|
||||
use postgres_ffi::Oid;
|
||||
|
||||
@@ -24,7 +24,6 @@ pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod repository;
|
||||
pub mod span;
|
||||
pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
@@ -32,7 +31,6 @@ pub mod tenant;
|
||||
pub mod utilization;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -7,14 +7,14 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -22,7 +22,9 @@ use pageserver_api::key::{
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
|
||||
@@ -92,11 +92,11 @@ use crate::metrics::{
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
|
||||
};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
pub use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
@@ -160,6 +160,7 @@ pub(crate) mod timeline;
|
||||
pub mod size;
|
||||
|
||||
mod gc_block;
|
||||
mod gc_result;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -301,6 +302,13 @@ pub struct Tenant {
|
||||
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
|
||||
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
|
||||
|
||||
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
|
||||
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
|
||||
/// each other (this could be optimized to coalesce writes if necessary).
|
||||
///
|
||||
/// The contents of the Mutex are the last manifest we successfully uploaded
|
||||
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
|
||||
|
||||
// This mutex prevents creation of new timelines during GC.
|
||||
// Adding yet another mutex (in addition to `timelines`) is needed because holding
|
||||
// `timelines` mutex during all GC iteration
|
||||
@@ -467,10 +475,10 @@ impl WalRedoManager {
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: crate::repository::Key,
|
||||
key: pageserver_api::key::Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
|
||||
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
@@ -625,19 +633,10 @@ impl TimelineOrOffloaded {
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
|
||||
}
|
||||
}
|
||||
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
|
||||
fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
|
||||
match self {
|
||||
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
|
||||
Some(remote_client) => remote_client,
|
||||
None => {
|
||||
let remote_client = tenant.build_timeline_client(
|
||||
offloaded.timeline_id,
|
||||
tenant.remote_storage.clone(),
|
||||
);
|
||||
Arc::new(remote_client)
|
||||
}
|
||||
},
|
||||
TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -749,6 +748,24 @@ pub enum TimelineArchivalError {
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum TenantManifestError {
|
||||
#[error("Remote storage error: {0}")]
|
||||
RemoteStorage(anyhow::Error),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<TenantManifestError> for TimelineArchivalError {
|
||||
fn from(e: TenantManifestError) -> Self {
|
||||
match e {
|
||||
TenantManifestError::RemoteStorage(e) => Self::Other(e),
|
||||
TenantManifestError::Cancelled => Self::Cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TimelineArchivalError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
@@ -1534,18 +1551,7 @@ impl Tenant {
|
||||
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
|
||||
}
|
||||
if !offloaded_timeline_ids.is_empty() {
|
||||
let manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(TimelineArchivalError::Other)?;
|
||||
self.store_tenant_manifest().await?;
|
||||
}
|
||||
|
||||
// The local filesystem contents are a cache of what's in the remote IndexPart;
|
||||
@@ -1830,6 +1836,18 @@ impl Tenant {
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
|
||||
// We activate the timeline below manually, so this must be called on an active timeline.
|
||||
// We expect callers of this function to ensure this.
|
||||
match self.current_state() {
|
||||
TenantState::Activating { .. }
|
||||
| TenantState::Attaching
|
||||
| TenantState::Broken { .. } => {
|
||||
panic!("Timeline expected to be active")
|
||||
}
|
||||
TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
|
||||
TenantState::Active => {}
|
||||
}
|
||||
let cancel = self.cancel.clone();
|
||||
|
||||
// Protect against concurrent attempts to use this TimelineId
|
||||
@@ -1914,18 +1932,7 @@ impl Tenant {
|
||||
};
|
||||
|
||||
// Upload new list of offloaded timelines to S3
|
||||
let manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(TimelineArchivalError::Other)?;
|
||||
self.store_tenant_manifest().await?;
|
||||
|
||||
// Activate the timeline (if it makes sense)
|
||||
if !(timeline.is_broken() || timeline.is_stopping()) {
|
||||
@@ -3122,7 +3129,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_manifest = self.tenant_manifest();
|
||||
let tenant_manifest = self.build_tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
for child_shard in child_shards {
|
||||
@@ -3317,7 +3324,8 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_manifest(&self) -> TenantManifest {
|
||||
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
|
||||
fn build_tenant_manifest(&self) -> TenantManifest {
|
||||
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
|
||||
|
||||
let mut timeline_manifests = timelines_offloaded
|
||||
@@ -3525,6 +3533,7 @@ impl Tenant {
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
timelines_creating: Mutex::new(HashSet::new()),
|
||||
timelines_offloaded: Mutex::new(HashMap::new()),
|
||||
tenant_manifest_upload: Default::default(),
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
@@ -4704,6 +4713,49 @@ impl Tenant {
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Serialize and write the latest TenantManifest to remote storage.
|
||||
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
|
||||
// Only one manifest write may be done at at time, and the contents of the manifest
|
||||
// must be loaded while holding this lock. This makes it safe to call this function
|
||||
// from anywhere without worrying about colliding updates.
|
||||
let mut guard = tokio::select! {
|
||||
g = self.tenant_manifest_upload.lock() => {
|
||||
g
|
||||
},
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(TenantManifestError::Cancelled);
|
||||
}
|
||||
};
|
||||
|
||||
let manifest = self.build_tenant_manifest();
|
||||
if Some(&manifest) == (*guard).as_ref() {
|
||||
// Optimisation: skip uploads that don't change anything.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
TenantManifestError::Cancelled
|
||||
} else {
|
||||
TenantManifestError::RemoteStorage(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
@@ -4806,7 +4858,8 @@ pub(crate) mod harness {
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::l0_flush::L0FlushConfig;
|
||||
use crate::walredo::apply_neon;
|
||||
use crate::{repository::Key, walrecord::NeonWalRecord};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
use super::*;
|
||||
use hex_literal::hex;
|
||||
@@ -4853,7 +4906,6 @@ pub(crate) mod harness {
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
|
||||
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
|
||||
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
|
||||
}
|
||||
@@ -5076,25 +5128,30 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{thread_rng, Rng};
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{DeltaLayerTestDesc, GcInfo};
|
||||
use timeline::DeltaLayerTestDesc;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::GcInfo;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
|
||||
@@ -7659,6 +7716,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_neon_test_record() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_neon_test_record").await?;
|
||||
@@ -7850,6 +7908,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
|
||||
@@ -8046,6 +8105,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_generate_key_retention() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_generate_key_retention").await?;
|
||||
@@ -8393,6 +8453,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
|
||||
let harness =
|
||||
@@ -8633,6 +8694,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||
{
|
||||
@@ -8841,6 +8903,7 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
@@ -9042,6 +9105,7 @@ mod tests {
|
||||
//
|
||||
// When querying the key range [A, B) we need to read at different LSN ranges
|
||||
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
@@ -341,10 +340,6 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
@@ -410,9 +405,6 @@ impl TenantConfOpt {
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
switch_aux_file_policy: self
|
||||
.switch_aux_file_policy
|
||||
.unwrap_or(global_conf.switch_aux_file_policy),
|
||||
lsn_lease_length: self
|
||||
.lsn_lease_length
|
||||
.unwrap_or(global_conf.lsn_lease_length),
|
||||
@@ -470,7 +462,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
switch_aux_file_policy: value.switch_aux_file_policy,
|
||||
lsn_lease_length: value.lsn_lease_length.map(humantime),
|
||||
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
|
||||
}
|
||||
|
||||
57
pageserver/src/tenant/gc_result.rs
Normal file
@@ -0,0 +1,57 @@
use anyhow::Result;
use serde::Serialize;
use std::ops::AddAssign;
use std::time::Duration;

///
/// Result of performing GC
///
#[derive(Default, Serialize, Debug)]
pub struct GcResult {
    pub layers_total: u64,
    pub layers_needed_by_cutoff: u64,
    pub layers_needed_by_pitr: u64,
    pub layers_needed_by_branches: u64,
    pub layers_needed_by_leases: u64,
    pub layers_not_updated: u64,
    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.

    #[serde(serialize_with = "serialize_duration_as_millis")]
    pub elapsed: Duration,

    /// The layers which were garbage collected.
    ///
    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
    /// dropped in tests.
    #[cfg(feature = "testing")]
    #[serde(skip)]
    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}

// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    d.as_millis().serialize(serializer)
}

impl AddAssign for GcResult {
    fn add_assign(&mut self, other: Self) {
        self.layers_total += other.layers_total;
        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
        self.layers_needed_by_branches += other.layers_needed_by_branches;
        self.layers_needed_by_leases += other.layers_needed_by_leases;
        self.layers_not_updated += other.layers_not_updated;
        self.layers_removed += other.layers_removed;

        self.elapsed += other.elapsed;

        #[cfg(feature = "testing")]
        {
            let mut other = other;
            self.doomed_layers.append(&mut other.doomed_layers);
        }
    }
}
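Since `GcResult` implements `AddAssign`, per-timeline results can be folded into a tenant-wide total with `+=`. A short illustrative sketch (the `summarize` helper and its input are hypothetical, not part of this diff):

// Hypothetical helper: fold per-timeline GC results into one total.
fn summarize(per_timeline: Vec<GcResult>) -> GcResult {
    let mut total = GcResult::default();
    for r in per_timeline {
        total += r; // AddAssign sums the counters and accumulates `elapsed`
    }
    total
}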
@@ -48,9 +48,9 @@ mod layer_coverage;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
@@ -2811,7 +2811,7 @@ where
|
||||
}
|
||||
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
|
||||
@@ -249,7 +249,7 @@ pub(crate) use download::{
|
||||
list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
|
||||
pub(crate) use upload::upload_initdb_dir;
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
|
||||
@@ -403,59 +403,79 @@ async fn do_download_index_part(
|
||||
Ok((index_part, index_generation, index_part_mtime))
|
||||
}

/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
/// Metadata objects are "generationed", meaning that they include a generation suffix. This
/// function downloads the object with the highest generation <= `my_generation`.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
/// Data objects (layer files) also include a generation in their path, but there is no equivalent
/// search process, because their reference from an index includes the generation.
///
/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two
/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding
/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back
/// to listing objects.
///
/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
/// * `what`: for logging, what object are we downloading
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
///   `cancel` has fired. This function does not do its own retries of GET operations, and relies
///   on the function passed in to do so.
/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object.
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
||||
pub(crate) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_shard_id: &'a TenantShardId,
|
||||
timeline_id: &'a TimelineId,
|
||||
my_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
what: &str,
|
||||
prefix: RemotePath,
|
||||
do_download: DF,
|
||||
parse_path: PF,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> Result<(T, Generation, SystemTime), DownloadError>
|
||||
where
|
||||
DF: Fn(
|
||||
&'a GenericRemoteStorage,
|
||||
&'a TenantShardId,
|
||||
&'a TimelineId,
|
||||
Generation,
|
||||
&'a CancellationToken,
|
||||
) -> DFF,
|
||||
DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
|
||||
PF: Fn(RemotePath) -> Option<Generation>,
|
||||
T: 'static,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
}
|
||||
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
// Stale case: If we were intentionally attached in a stale generation, the remote object may already
|
||||
// exist in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
"Found index_part from current generation (this is a stale attachment)"
|
||||
);
|
||||
return Ok(index_part);
|
||||
Ok(decoded) => {
|
||||
tracing::debug!("Found {what} from current generation (this is a stale attachment)");
|
||||
return Ok(decoded);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
|
||||
// and index part. We may safely start from this index without doing a listing, because:
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded the object
|
||||
// we are seeking in that generation. We may safely start from this index without doing a listing, because:
|
||||
// - We checked for current generation case above
|
||||
// - generations > my_generation are to be ignored
|
||||
// - any other indices that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent index from a previous generation.
|
||||
// - any other objects that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent object from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(
|
||||
let res = do_download(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -464,14 +484,12 @@ pub(crate) async fn download_index_part(
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
return Ok(index_part);
|
||||
Ok(decoded) => {
|
||||
tracing::debug!("Found {what} from previous generation");
|
||||
return Ok(decoded);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
tracing::debug!(
|
||||
"No index_part found from previous generation, falling back to listing"
|
||||
);
|
||||
tracing::debug!("No {what} found from previous generation, falling back to listing");
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
@@ -481,12 +499,10 @@ pub(crate) async fn download_index_part(
|
||||
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
||||
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
||||
// to constructing a full index path with no generation, because the generation is a suffix.
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
let paths = download_retry(
|
||||
|| async {
|
||||
storage
|
||||
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.list(Some(&prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.await
|
||||
},
|
||||
"list index_part files",
|
||||
@@ -497,22 +513,22 @@ pub(crate) async fn download_index_part(
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
let max_previous_generation = indices
|
||||
let max_previous_generation = paths
|
||||
.into_iter()
|
||||
.filter_map(|o| parse_remote_index_path(o.key))
|
||||
.filter_map(|o| parse_path(o.key))
|
||||
.filter(|g| g <= &my_generation)
|
||||
.max();
|
||||
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
|
||||
tracing::debug!("Found {what} in generation {g:?}");
|
||||
do_download(storage, tenant_shard_id, timeline_id, g, cancel).await
|
||||
}
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::debug!("No index_part.json* found");
|
||||
do_download_index_part(
|
||||
tracing::debug!("No {what}* found");
|
||||
do_download(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -524,6 +540,33 @@ pub(crate) async fn download_index_part(
|
||||
}
|
||||
}
|
||||
|
||||
/// index_part.json objects are suffixed with a generation number, so we cannot
|
||||
/// directly GET the latest index part without doing some probing.
|
||||
///
|
||||
/// In this function we probe for the most recent index in a generation <= our current generation.
|
||||
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
pub(crate) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
download_generation_object(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
"index_part",
|
||||
index_prefix,
|
||||
do_download_index_part,
|
||||
parse_remote_index_path,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
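The probing order documented above — our own generation first (stale attachment), then the immediately preceding generation (normal restart/migration), then a listing capped at `my_generation` — can be illustrated with a simplified, self-contained sketch. The in-memory `BTreeMap` store and the integer `Generation` alias below are stand-ins for remote storage and the real pageserver types, not the actual API.

use std::collections::BTreeMap;

// Stand-in for a generation number: a simple monotonically increasing integer.
type Generation = u32;

// Find the newest object with generation <= `my_generation`, probing cheap paths first.
fn download_latest<'a>(
    store: &'a BTreeMap<Generation, &'a str>, // generation suffix -> object body
    my_generation: Generation,
) -> Option<&'a str> {
    // Fast path 1: stale attachment, the object was already written in our own generation.
    if let Some(obj) = store.get(&my_generation) {
        return Some(obj);
    }
    // Fast path 2: the immediately preceding generation (normal restart/migration case).
    if my_generation > 0 {
        if let Some(obj) = store.get(&(my_generation - 1)) {
            return Some(obj);
        }
    }
    // Fallback: "list" everything under the prefix and pick the max generation <= ours.
    store.range(..=my_generation).next_back().map(|(_, obj)| *obj)
}

fn main() {
    let store = BTreeMap::from([(2, "index@gen-2"), (5, "index@gen-5")]);
    assert_eq!(download_latest(&store, 5), Some("index@gen-5")); // own generation (stale attachment)
    assert_eq!(download_latest(&store, 3), Some("index@gen-2")); // immediately preceding generation
    assert_eq!(download_latest(&store, 4), Some("index@gen-2")); // listing fallback
    assert_eq!(download_latest(&store, 1), None); // nothing at or below generation 1
}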
|
||||
pub(crate) async fn download_initdb_tar_zst(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
/// Tenant-shard scoped manifest
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct TenantManifest {
|
||||
/// Debugging aid describing the version of this manifest.
|
||||
/// Can also be used for distinguishing breaking changes later on.
|
||||
@@ -23,7 +23,7 @@ pub struct TenantManifest {
|
||||
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
|
||||
/// but the two datastructures serve different needs, this is for a persistent disk format
|
||||
/// that must be backwards compatible, while the other is only for informative purposes.
|
||||
#[derive(Clone, Serialize, Deserialize, Copy)]
|
||||
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
|
||||
pub struct OffloadedTimelineManifest {
|
||||
pub timeline_id: TimelineId,
|
||||
/// Whether the timeline has a parent it has been branched off from or not
|
||||
|
||||
@@ -187,6 +187,8 @@ pub(super) async fn gather_inputs(
|
||||
// but it is unlikely to cause any issues. In the worst case,
|
||||
// the calculation will error out.
|
||||
timelines.retain(|t| t.is_active());
|
||||
// Also filter out archived timelines.
|
||||
timelines.retain(|t| t.is_archived() != Some(true));
|
||||
|
||||
// Build a map of branch points.
|
||||
let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
|
||||
|
||||
@@ -11,11 +11,11 @@ mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
|
||||
@@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use super::layer::S3_UPLOAD_LIMIT;
|
||||
use super::{
|
||||
|
||||
@@ -30,7 +30,6 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
@@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -54,9 +53,11 @@ use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
@@ -1293,7 +1294,7 @@ impl DeltaLayerInner {
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = crate::repository::ValueBytes::will_init(&data)
|
||||
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
@@ -1356,7 +1357,7 @@ impl DeltaLayerInner {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
@@ -1610,7 +1611,6 @@ pub(crate) mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
@@ -1622,6 +1622,7 @@ pub(crate) mod test {
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1974,8 +1975,8 @@ pub(crate) mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn copy_delta_prefix_smoke() {
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
|
||||
.await
|
||||
@@ -2198,6 +2199,7 @@ pub(crate) mod test {
|
||||
(k1, l1).cmp(&(k2, l2))
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub(crate) fn sort_delta_value(
|
||||
(k1, l1, v1): &(Key, Lsn, Value),
|
||||
(k2, l2, v2): &(Key, Lsn, Value),
|
||||
|
||||
@@ -7,7 +7,7 @@ use pageserver_api::{
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::repository::Value;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use super::merge_iterator::MergeIterator;
|
||||
|
||||
@@ -121,8 +121,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
|
||||
@@ -28,7 +28,6 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
@@ -51,8 +50,10 @@ use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
@@ -1125,6 +1126,7 @@ mod test {
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
value::Value,
|
||||
};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
@@ -1134,7 +1136,6 @@ mod test {
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
repository::Value,
|
||||
tenant::{
|
||||
config::TenantConf,
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
|
||||
@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
/// Also checks that the same does not happen on a non-evicted layer (regression test).
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn eviction_cancellation_on_drop() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
let mut writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
crate::repository::Key::from_i128(5),
|
||||
pageserver_api::key::Key::from_i128(5),
|
||||
Lsn(0x20),
|
||||
&Value::Image(Bytes::from_static(b"this does not matter either")),
|
||||
&ctx,
|
||||
|
||||
@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use std::ops::Range;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::repository::Key;
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use super::{DeltaLayerName, ImageLayerName, LayerName};
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//!
|
||||
//! Helper functions for dealing with filenames of the image and delta layer files.
|
||||
//!
|
||||
use crate::repository::Key;
|
||||
use pageserver_api::key::Key;
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
@@ -7,7 +7,8 @@ use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{context::RequestContext, repository::Value};
|
||||
use crate::context::RequestContext;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use super::{
|
||||
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
|
||||
@@ -291,12 +292,16 @@ mod tests {
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
|
||||
},
|
||||
walrecord::NeonWalRecord,
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::tenant::storage_layer::delta_layer::test::sort_delta_value;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
async fn assert_merge_iter_equal(
|
||||
merge_iter: &mut MergeIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
@@ -319,8 +324,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn merge_in_between() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between")
|
||||
.await
|
||||
@@ -384,8 +389,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge")
|
||||
.await
|
||||
@@ -458,10 +463,11 @@ mod tests {
|
||||
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn delta_image_mixed_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
|
||||
.await
|
||||
@@ -586,5 +592,6 @@ mod tests {
|
||||
is_send(merge_iter);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn is_send(_: impl Send) {}
|
||||
}
|
||||
|
||||
@@ -125,11 +125,12 @@ use utils::{
|
||||
simple_rcu::{Rcu, RcuReadGuard},
|
||||
};
|
||||
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
@@ -5822,17 +5823,15 @@ fn is_send() {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
repository::Value,
|
||||
tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
},
|
||||
use crate::tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{
|
||||
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
|
||||
};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -2148,7 +2149,7 @@ struct ResidentDeltaLayer(ResidentLayer);
|
||||
struct ResidentImageLayer(ResidentLayer);
|
||||
|
||||
impl CompactionJobExecutor for TimelineAdaptor {
|
||||
type Key = crate::repository::Key;
|
||||
type Key = pageserver_api::key::Key;
|
||||
|
||||
type Layer = OwnArc<PersistentLayerDesc>;
|
||||
type DeltaLayer = ResidentDeltaLayer;
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, Instrument};
|
||||
use tracing::{error, info, info_span, instrument, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
@@ -14,10 +14,9 @@ use crate::{
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{
|
||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
|
||||
TimelineOrOffloaded,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn upload_new_tenant_manifest(
|
||||
tenant: &Tenant,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
|
||||
// between the deletion of the index-part.json and reaching of this code.
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
let manifest = tenant.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
remote_timeline_client::upload_tenant_manifest(
|
||||
&tenant.remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&tenant.cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -258,7 +231,32 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
let remote_client = timeline.remote_client_maybe_construct(tenant);
|
||||
let remote_client = match timeline.maybe_remote_client() {
|
||||
Some(remote_client) => remote_client,
|
||||
None => {
|
||||
let remote_client = tenant
|
||||
.build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
|
||||
let result = remote_client
|
||||
.download_index_file(&tenant.cancel)
|
||||
.instrument(info_span!("download_index_file"))
|
||||
.await
|
||||
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
|
||||
let index_part = match result {
|
||||
MaybeDeletedIndexPart::Deleted(p) => {
|
||||
tracing::info!("Timeline already set as deleted in remote index");
|
||||
p
|
||||
}
|
||||
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||
};
|
||||
let remote_client = Arc::new(remote_client);
|
||||
|
||||
remote_client
|
||||
.init_upload_queue(&index_part)
|
||||
.map_err(DeleteTimelineError::Other)?;
|
||||
remote_client.shutdown().await;
|
||||
remote_client
|
||||
}
|
||||
};
|
||||
set_deleted_in_remote_index(&remote_client).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
||||
@@ -455,7 +453,15 @@ impl DeleteTimelineFlow {
|
||||
|
||||
remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
|
||||
|
||||
upload_new_tenant_manifest(tenant, &guard).await?;
|
||||
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
|
||||
// between the deletion of the index-part.json and reaching of this code.
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
tenant
|
||||
.store_tenant_manifest()
|
||||
.await
|
||||
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::Arc;
|
||||
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
|
||||
use super::Timeline;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};
|
||||
use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};
|
||||
|
||||
pub(crate) async fn offload_timeline(
|
||||
tenant: &Tenant,
|
||||
@@ -63,17 +63,10 @@ pub(crate) async fn offload_timeline(
|
||||
// at the next restart attach it again.
|
||||
// For that to happen, we'd need to make the manifest reflect our *intended* state,
|
||||
// not our actual state of offloaded timelines.
|
||||
let manifest = tenant.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
remote_timeline_client::upload_tenant_manifest(
|
||||
&tenant.remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&tenant.cancel,
|
||||
)
|
||||
.await?;
|
||||
tenant
|
||||
.store_tenant_manifest()
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -31,11 +31,11 @@ use crate::{
|
||||
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::{decode_wal_record, DecodedWALRecord},
|
||||
};
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
|
||||
use utils::{id::NodeId, lsn::Lsn};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
|
||||
@@ -29,8 +29,10 @@ use std::time::Instant;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::walrecord::*;
|
||||
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
|
||||
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
|
||||
use wal_decoder::models::*;
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
@@ -44,9 +46,9 @@ use crate::pgdatadir_mapping::{DatadirModification, Version};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord::*;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
@@ -108,143 +110,6 @@ struct WarnIngestLag {
|
||||
timestamp_invalid_msg_ratelimit: RateLimit,
|
||||
}
|
||||
|
||||
// These structs are an intermediary representation of the PostgreSQL WAL records.
|
||||
// The ones prefixed with `Xl` are lower level, while the ones that are not have
|
||||
// all the required context to be acted upon by the pageserver.
|
||||
|
||||
enum HeapamRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
struct ClearVmBits {
|
||||
new_heap_blkno: Option<u32>,
|
||||
old_heap_blkno: Option<u32>,
|
||||
vm_rel: RelTag,
|
||||
flags: u8,
|
||||
}
|
||||
|
||||
enum NeonrmgrRecord {
|
||||
ClearVmBits(ClearVmBits),
|
||||
}
|
||||
|
||||
enum SmgrRecord {
|
||||
Create(SmgrCreate),
|
||||
Truncate(XlSmgrTruncate),
|
||||
}
|
||||
|
||||
struct SmgrCreate {
|
||||
rel: RelTag,
|
||||
}
|
||||
|
||||
enum DbaseRecord {
|
||||
Create(DbaseCreate),
|
||||
Drop(DbaseDrop),
|
||||
}
|
||||
|
||||
struct DbaseCreate {
|
||||
db_id: u32,
|
||||
tablespace_id: u32,
|
||||
src_db_id: u32,
|
||||
src_tablespace_id: u32,
|
||||
}
|
||||
|
||||
struct DbaseDrop {
|
||||
db_id: u32,
|
||||
tablespace_ids: Vec<u32>,
|
||||
}
|
||||
|
||||
enum ClogRecord {
|
||||
ZeroPage(ClogZeroPage),
|
||||
Truncate(ClogTruncate),
|
||||
}
|
||||
|
||||
struct ClogZeroPage {
|
||||
segno: u32,
|
||||
rpageno: u32,
|
||||
}
|
||||
|
||||
struct ClogTruncate {
|
||||
pageno: u32,
|
||||
oldest_xid: u32,
|
||||
oldest_xid_db: u32,
|
||||
}
|
||||
|
||||
enum XactRecord {
|
||||
Commit(XactCommon),
|
||||
Abort(XactCommon),
|
||||
CommitPrepared(XactCommon),
|
||||
AbortPrepared(XactCommon),
|
||||
Prepare(XactPrepare),
|
||||
}
|
||||
|
||||
struct XactCommon {
|
||||
parsed: XlXactParsedRecord,
|
||||
origin_id: u16,
|
||||
// Fields below are only used for logging
|
||||
xl_xid: u32,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
struct XactPrepare {
|
||||
xl_xid: u32,
|
||||
data: Bytes,
|
||||
}
|
||||
|
||||
enum MultiXactRecord {
|
||||
ZeroPage(MultiXactZeroPage),
|
||||
Create(XlMultiXactCreate),
|
||||
Truncate(XlMultiXactTruncate),
|
||||
}
|
||||
|
||||
struct MultiXactZeroPage {
|
||||
slru_kind: SlruKind,
|
||||
segno: u32,
|
||||
rpageno: u32,
|
||||
}
|
||||
|
||||
enum RelmapRecord {
|
||||
Update(RelmapUpdate),
|
||||
}
|
||||
|
||||
struct RelmapUpdate {
|
||||
update: XlRelmapUpdate,
|
||||
buf: Bytes,
|
||||
}
|
||||
|
||||
enum XlogRecord {
|
||||
Raw(RawXlogRecord),
|
||||
}
|
||||
|
||||
struct RawXlogRecord {
|
||||
info: u8,
|
||||
lsn: Lsn,
|
||||
buf: Bytes,
|
||||
}
|
||||
|
||||
enum LogicalMessageRecord {
|
||||
Put(PutLogicalMessage),
|
||||
#[cfg(feature = "testing")]
|
||||
Failpoint,
|
||||
}
|
||||
|
||||
struct PutLogicalMessage {
|
||||
path: String,
|
||||
buf: Bytes,
|
||||
}
|
||||
|
||||
enum StandbyRecord {
|
||||
RunningXacts(StandbyRunningXacts),
|
||||
}
|
||||
|
||||
struct StandbyRunningXacts {
|
||||
oldest_running_xid: u32,
|
||||
}
|
||||
|
||||
enum ReploriginRecord {
|
||||
Set(XlReploriginSet),
|
||||
Drop(XlReploriginDrop),
|
||||
}
|
||||
|
||||
impl WalIngest {
|
||||
pub async fn new(
|
||||
timeline: &Timeline,
|
||||
@@ -284,7 +149,6 @@ impl WalIngest {
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
/// This function returns `true` if the record was ingested, and `false` if it was filtered out
|
||||
///
|
||||
pub async fn ingest_record(
|
||||
&mut self,
|
||||
decoded: DecodedWALRecord,
|
||||
@@ -2218,7 +2082,7 @@ impl WalIngest {
|
||||
) -> anyhow::Result<Option<LogicalMessageRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||
let xlrec = crate::walrecord::XlLogicalMessage::decode(buf);
|
||||
let xlrec = XlLogicalMessage::decode(buf);
|
||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -2246,7 +2110,7 @@ impl WalIngest {
|
||||
) -> anyhow::Result<Option<StandbyRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_RUNNING_XACTS {
|
||||
let xlrec = crate::walrecord::XlRunningXacts::decode(buf);
|
||||
let xlrec = XlRunningXacts::decode(buf);
|
||||
return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts {
|
||||
oldest_running_xid: xlrec.oldest_running_xid,
|
||||
})));
|
||||
@@ -2276,10 +2140,10 @@ impl WalIngest {
|
||||
) -> anyhow::Result<Option<ReploriginRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_REPLORIGIN_SET {
|
||||
let xlrec = crate::walrecord::XlReploriginSet::decode(buf);
|
||||
let xlrec = XlReploriginSet::decode(buf);
|
||||
return Ok(Some(ReploriginRecord::Set(xlrec)));
|
||||
} else if info == pg_constants::XLOG_REPLORIGIN_DROP {
|
||||
let xlrec = crate::walrecord::XlReploriginDrop::decode(buf);
|
||||
let xlrec = XlReploriginDrop::decode(buf);
|
||||
return Ok(Some(ReploriginRecord::Drop(xlrec)));
|
||||
}
|
||||
|
||||
@@ -3146,6 +3010,7 @@ mod tests {
|
||||
async fn test_ingest_real_wal() {
|
||||
use crate::tenant::harness::*;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::walrecord::decode_wal_record;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
|
||||
// Define test data path and constants.
|
||||
|
||||
@@ -29,11 +29,11 @@ use crate::metrics::{
|
||||
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
||||
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
|
||||
};
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
@@ -548,9 +548,10 @@ impl PostgresRedoManager {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::PostgresRedoManager;
|
||||
use crate::repository::Key;
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
use crate::config::PageServerConf;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::str::FromStr;
|
||||
use tracing::Instrument;
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
@@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon(
|
||||
// No-op: this record will never be created in aux v2.
|
||||
warn!("AuxFile record should not be created in aux v2");
|
||||
}
|
||||
#[cfg(test)]
|
||||
#[cfg(feature = "testing")]
|
||||
NeonWalRecord::Test {
|
||||
append,
|
||||
clear,
|
||||
|
||||
@@ -8,10 +8,10 @@ use crate::{
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
page_cache::PAGE_SZ,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
|
||||
@@ -16,6 +16,7 @@ OBJS = \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
unstable_extensions.o \
|
||||
walproposer.o \
|
||||
walproposer_pg.o \
|
||||
control_plane_connector.o \
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
@@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg)
|
||||
static bool
|
||||
RoleIsNeonSuperuser(const char *role_name)
|
||||
{
|
||||
Assert(role_name);
|
||||
|
||||
return strcmp(role_name, "neon_superuser") == 0;
|
||||
}
|
||||
|
||||
@@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt)
|
||||
static void
|
||||
HandleAlterRole(AlterRoleStmt *stmt)
|
||||
{
|
||||
const char *role_name = stmt->role->rolename;
|
||||
char *role_name;
|
||||
DefElem *dpass;
|
||||
ListCell *option;
|
||||
bool found = false;
|
||||
@@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
|
||||
InitRoleTableIfNeeded();
|
||||
|
||||
role_name = get_rolespec_name(stmt->role);
|
||||
if (RoleIsNeonSuperuser(role_name) && !superuser())
|
||||
elog(ERROR, "can't ALTER neon_superuser");
|
||||
|
||||
@@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
if (strcmp(defel->defname, "password") == 0)
|
||||
dpass = defel;
|
||||
}
|
||||
|
||||
/* We only care about updates to the password */
|
||||
if (!dpass)
|
||||
{
|
||||
pfree(role_name);
|
||||
return;
|
||||
}
|
||||
|
||||
entry = hash_search(CurrentDdlTable->role_table,
|
||||
role_name,
|
||||
@@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
else
|
||||
entry->password = NULL;
|
||||
entry->type = Op_Set;
|
||||
|
||||
pfree(role_name);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "neon.h"
|
||||
#include "control_plane_connector.h"
|
||||
#include "logical_replication_monitor.h"
|
||||
#include "unstable_extensions.h"
|
||||
#include "walsender_hooks.h"
|
||||
#if PG_MAJORVERSION_NUM >= 16
|
||||
#include "storage/ipc.h"
|
||||
@@ -424,6 +425,7 @@ _PG_init(void)
|
||||
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
|
||||
|
||||
InitUnstableExtensionsSupport();
|
||||
InitLogicalReplicationMonitor();
|
||||
InitControlPlaneConnector();
|
||||
|
||||
|
||||
@@ -42,3 +42,4 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
|
||||
MemoryContextSwitchTo(old_context);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
129
pgxn/neon/unstable_extensions.c
Normal file
@@ -0,0 +1,129 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "nodes/plannodes.h"
|
||||
#include "nodes/parsenodes.h"
|
||||
#include "tcop/utility.h"
|
||||
#include "utils/errcodes.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon_pgversioncompat.h"
|
||||
#include "unstable_extensions.h"
|
||||
|
||||
static bool allow_unstable_extensions = false;
|
||||
static char *unstable_extensions = NULL;
|
||||
|
||||
static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
|
||||
|
||||
static bool
|
||||
list_contains(char const* comma_separated_list, char const* val)
|
||||
{
|
||||
char const* occ = comma_separated_list;
|
||||
size_t val_len = strlen(val);
|
||||
|
||||
if (val_len == 0)
|
||||
return false;
|
||||
|
||||
while ((occ = strstr(occ, val)) != NULL)
|
||||
{
|
||||
if ((occ == comma_separated_list || occ[-1] == ',')
|
||||
&& (occ[val_len] == '\0' || occ[val_len] == ','))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
occ += val_len;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
CheckUnstableExtension(
|
||||
PlannedStmt *pstmt,
|
||||
const char *queryString,
|
||||
bool readOnlyTree,
|
||||
ProcessUtilityContext context,
|
||||
ParamListInfo params,
|
||||
QueryEnvironment *queryEnv,
|
||||
DestReceiver *dest,
|
||||
QueryCompletion *qc)
|
||||
{
|
||||
Node *parseTree = pstmt->utilityStmt;
|
||||
|
||||
if (allow_unstable_extensions || unstable_extensions == NULL)
|
||||
goto process;
|
||||
|
||||
switch (nodeTag(parseTree))
|
||||
{
|
||||
case T_CreateExtensionStmt:
|
||||
{
|
||||
CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree);
|
||||
if (list_contains(unstable_extensions, stmt->extname))
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname),
|
||||
errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'")));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
goto process;
|
||||
}
|
||||
|
||||
process:
|
||||
if (PreviousProcessUtilityHook)
|
||||
{
|
||||
PreviousProcessUtilityHook(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
else
|
||||
{
|
||||
standard_ProcessUtility(
|
||||
pstmt,
|
||||
queryString,
|
||||
readOnlyTree,
|
||||
context,
|
||||
params,
|
||||
queryEnv,
|
||||
dest,
|
||||
qc);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
InitUnstableExtensionsSupport(void)
|
||||
{
|
||||
DefineCustomBoolVariable(
|
||||
"neon.allow_unstable_extensions",
|
||||
"Allow unstable extensions to be installed and used",
|
||||
NULL,
|
||||
&allow_unstable_extensions,
|
||||
false,
|
||||
PGC_USERSET,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable(
|
||||
"neon.unstable_extensions",
|
||||
"List of unstable extensions",
|
||||
NULL,
|
||||
&unstable_extensions,
|
||||
NULL,
|
||||
PGC_SUSET,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
PreviousProcessUtilityHook = ProcessUtility_hook;
|
||||
ProcessUtility_hook = CheckUnstableExtension;
|
||||
}
|
||||
6
pgxn/neon/unstable_extensions.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef __NEON_UNSTABLE_EXTENSIONS_H__
|
||||
#define __NEON_UNSTABLE_EXTENSIONS_H__
|
||||
|
||||
void InitUnstableExtensionsSupport(void);
|
||||
|
||||
#endif
|
||||
53
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
@@ -1521,6 +1521,21 @@ files = [
|
||||
[package.dependencies]
|
||||
six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "jwcrypto"
|
||||
version = "1.5.6"
|
||||
description = "Implementation of JOSE Web standards"
|
||||
optional = false
|
||||
python-versions = ">= 3.8"
|
||||
files = [
|
||||
{file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"},
|
||||
{file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cryptography = ">=3.4"
|
||||
typing-extensions = ">=4.5.0"
|
||||
|
||||
[[package]]
|
||||
name = "kafka-python"
|
||||
version = "2.0.2"
|
||||
@@ -2111,7 +2126,6 @@ files = [
|
||||
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
|
||||
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
|
||||
@@ -2120,8 +2134,6 @@ files = [
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
|
||||
{file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
|
||||
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
|
||||
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
|
||||
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
|
||||
@@ -2603,7 +2615,6 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2912,6 +2923,20 @@ files = [
|
||||
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-jwcrypto"
|
||||
version = "1.5.0.20240925"
|
||||
description = "Typing stubs for jwcrypto"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"},
|
||||
{file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cryptography = "*"
|
||||
|
||||
[[package]]
|
||||
name = "types-psutil"
|
||||
version = "5.9.5.12"
|
||||
@@ -3118,13 +3143,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.3"
|
||||
version = "3.0.6"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
|
||||
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
|
||||
{file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"},
|
||||
{file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -3159,16 +3184,6 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -3406,4 +3421,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"
|
||||
content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
|
||||
use crate::auth::{self, AuthFlow};
|
||||
@@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext(
|
||||
secret: AuthSecret,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> auth::Result<ComputeCredentials> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
debug!("cleartext auth flow override is enabled, proceeding");
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
@@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
debug!("project not specified, resorting to the password hack auth flow");
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
@@ -45,6 +46,7 @@ pub(crate) enum FetchAuthRulesError {
|
||||
RoleJwksNotConfigured,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct AuthRule {
|
||||
pub(crate) id: String,
|
||||
pub(crate) jwks_url: url::Url,
|
||||
@@ -277,7 +279,7 @@ impl JwkCacheEntryLock {
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let (jwk, expected_audience) = loop {
|
||||
match guard.find_jwk_and_audience(kid, role_name) {
|
||||
match guard.find_jwk_and_audience(&kid, role_name) {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
@@ -312,7 +314,9 @@ impl JwkCacheEntryLock {
|
||||
|
||||
if let Some(aud) = expected_audience {
|
||||
if payload.audience.0.iter().all(|s| s != aud) {
|
||||
return Err(JwtError::InvalidJwtTokenAudience);
|
||||
return Err(JwtError::InvalidClaims(
|
||||
JwtClaimsError::InvalidJwtTokenAudience,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,13 +324,15 @@ impl JwkCacheEntryLock {
|
||||
|
||||
if let Some(exp) = payload.expiration {
|
||||
if now >= exp + CLOCK_SKEW_LEEWAY {
|
||||
return Err(JwtError::JwtTokenHasExpired);
|
||||
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(nbf) = payload.not_before {
|
||||
if nbf >= now + CLOCK_SKEW_LEEWAY {
|
||||
return Err(JwtError::JwtTokenNotYetReadyToUse);
|
||||
return Err(JwtError::InvalidClaims(
|
||||
JwtClaimsError::JwtTokenNotYetReadyToUse,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
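The hunk above only changes how claim failures are surfaced, so the leeway arithmetic is easy to miss. A minimal standalone sketch of the same exp/nbf checks, assuming plain u64 epoch seconds and a 30-second CLOCK_SKEW_LEEWAY (the constant's real value is defined outside this hunk):

    use std::time::SystemTime;

    const CLOCK_SKEW_LEEWAY_SECS: u64 = 30; // assumed value for the sketch

    #[derive(Debug, PartialEq)]
    enum JwtClaimsError {
        JwtTokenHasExpired,
        JwtTokenNotYetReadyToUse,
    }

    /// `exp`/`nbf` are seconds since the Unix epoch, as in the RFC 7519 claims.
    fn check_time_claims(exp: Option<u64>, nbf: Option<u64>) -> Result<(), JwtClaimsError> {
        let now = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_secs();

        // Expired only once we are past `exp` plus the allowed clock skew.
        if let Some(exp) = exp {
            if now >= exp + CLOCK_SKEW_LEEWAY_SECS {
                return Err(JwtClaimsError::JwtTokenHasExpired);
            }
        }
        // Not yet valid only while `nbf` is still ahead of `now` plus the allowed skew.
        if let Some(nbf) = nbf {
            if nbf >= now + CLOCK_SKEW_LEEWAY_SECS {
                return Err(JwtClaimsError::JwtTokenNotYetReadyToUse);
            }
        }
        Ok(())
    }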
@@ -420,8 +426,8 @@ struct JwtHeader<'a> {
|
||||
#[serde(rename = "alg")]
|
||||
algorithm: jose_jwa::Algorithm,
|
||||
/// key id, must be provided for our usecase
|
||||
#[serde(rename = "kid")]
|
||||
key_id: Option<&'a str>,
|
||||
#[serde(rename = "kid", borrow)]
|
||||
key_id: Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
|
||||
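The switch from `&'a str` to `Cow<'a, str>` with `#[serde(borrow)]` is presumably what makes escaped header and claim strings parseable: serde_json can only hand out a borrowed `&str` when the JSON string contains no escape sequences, while `Cow` borrows when possible and falls back to an owned copy otherwise (this is also what the Cognito regression test further down exercises). A small self-contained illustration, not code from the commit:

    use std::borrow::Cow;
    use serde::Deserialize;

    #[derive(Deserialize)]
    struct Claims<'a> {
        // With `borrow`, serde borrows from the input when it can and allocates when
        // the string contains escapes such as `\/`.
        #[serde(rename = "iss", borrow)]
        issuer: Option<Cow<'a, str>>,
    }

    fn main() {
        // Escaped slashes: deserializing into Option<&str> would fail here.
        let raw = r#"{"iss":"https:\/\/issuer.example.com"}"#;
        let claims: Claims = serde_json::from_str(raw).unwrap();
        assert_eq!(claims.issuer.as_deref(), Some("https://issuer.example.com"));
    }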
@@ -440,17 +446,17 @@ struct JwtPayload<'a> {
|
||||
|
||||
// the following entries are only extracted for the sake of debug logging.
|
||||
/// Issuer of the JWT
|
||||
#[serde(rename = "iss")]
|
||||
issuer: Option<&'a str>,
|
||||
#[serde(rename = "iss", borrow)]
|
||||
issuer: Option<Cow<'a, str>>,
|
||||
/// Subject of the JWT (the user)
|
||||
#[serde(rename = "sub")]
|
||||
subject: Option<&'a str>,
|
||||
#[serde(rename = "sub", borrow)]
|
||||
subject: Option<Cow<'a, str>>,
|
||||
/// Unique token identifier
|
||||
#[serde(rename = "jti")]
|
||||
jwt_id: Option<&'a str>,
|
||||
#[serde(rename = "jti", borrow)]
|
||||
jwt_id: Option<Cow<'a, str>>,
|
||||
/// Unique session identifier
|
||||
#[serde(rename = "sid")]
|
||||
session_id: Option<&'a str>,
|
||||
#[serde(rename = "sid", borrow)]
|
||||
session_id: Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
/// `OneOrMany` supports parsing either a single item or an array of items.
|
||||
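`OneOrMany` itself is not part of this hunk; below is a sketch of how such a helper is typically written (an assumption about its shape, not the commit's actual definition). An untagged enum lets the same field accept both `"aud": "neon"` and `"aud": ["neon", "other"]`:

    use serde::Deserialize;

    #[derive(Deserialize)]
    #[serde(untagged)]
    enum OneOrMany<T> {
        One(T),
        Many(Vec<T>),
    }

    impl<T> OneOrMany<T> {
        fn iter(&self) -> impl Iterator<Item = &T> {
            match self {
                OneOrMany::One(item) => std::slice::from_ref(item).iter(),
                OneOrMany::Many(items) => items.iter(),
            }
        }
    }

    fn main() {
        let single: OneOrMany<String> = serde_json::from_str(r#""neon""#).unwrap();
        let many: OneOrMany<String> = serde_json::from_str(r#"["neon", "other"]"#).unwrap();
        assert_eq!(single.iter().count(), 1);
        assert_eq!(many.iter().count(), 2);
    }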
@@ -585,14 +591,8 @@ pub(crate) enum JwtError {
|
||||
#[error("Provided authentication token is not a valid JWT encoding")]
|
||||
JwtEncoding(#[from] JwtEncodingError),
|
||||
|
||||
#[error("invalid JWT token audience")]
|
||||
InvalidJwtTokenAudience,
|
||||
|
||||
#[error("JWT token has expired")]
|
||||
JwtTokenHasExpired,
|
||||
|
||||
#[error("JWT token is not yet ready to use")]
|
||||
JwtTokenNotYetReadyToUse,
|
||||
#[error(transparent)]
|
||||
InvalidClaims(#[from] JwtClaimsError),
|
||||
|
||||
#[error("invalid P256 key")]
|
||||
InvalidP256Key(jose_jwk::crypto::Error),
|
||||
@@ -644,6 +644,19 @@ pub enum JwtEncodingError {
|
||||
InvalidCompactForm,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, PartialEq)]
|
||||
#[non_exhaustive]
|
||||
pub enum JwtClaimsError {
|
||||
#[error("invalid JWT token audience")]
|
||||
InvalidJwtTokenAudience,
|
||||
|
||||
#[error("JWT token has expired")]
|
||||
JwtTokenHasExpired,
|
||||
|
||||
#[error("JWT token is not yet ready to use")]
|
||||
JwtTokenNotYetReadyToUse,
|
||||
}
|
||||
|
||||
#[allow(dead_code, reason = "Debug use only")]
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum KeyType {
|
||||
@@ -680,6 +693,8 @@ mod tests {
|
||||
use hyper_util::rt::TokioIo;
|
||||
use rand::rngs::OsRng;
|
||||
use rsa::pkcs8::DecodePrivateKey;
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use signature::Signer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
@@ -693,6 +708,7 @@ mod tests {
|
||||
key: jose_jwk::Key::Ec(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
@@ -706,24 +722,47 @@ mod tests {
|
||||
key: jose_jwk::Key::Rsa(pk),
|
||||
prm: jose_jwk::Parameters {
|
||||
kid: Some(kid),
|
||||
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
|
||||
..Default::default()
|
||||
},
|
||||
};
|
||||
(sk, jwk)
|
||||
}
|
||||
|
||||
fn now() -> u64 {
|
||||
SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs()
|
||||
}
|
||||
|
||||
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
|
||||
let now = now();
|
||||
let body = typed_json::json! {{
|
||||
"exp": now + 3600,
|
||||
"nbf": now,
|
||||
"aud": ["audience1", "neon", "audience2"],
|
||||
"sub": "user1",
|
||||
"sid": "session1",
|
||||
"jti": "token1",
|
||||
"iss": "neon-testing",
|
||||
}};
|
||||
build_custom_jwt_payload(kid, body, sig)
|
||||
}
|
||||
|
||||
fn build_custom_jwt_payload(
|
||||
kid: String,
|
||||
body: impl Serialize,
|
||||
sig: jose_jwa::Signing,
|
||||
) -> String {
|
||||
let header = JwtHeader {
|
||||
algorithm: jose_jwa::Algorithm::Signing(sig),
|
||||
key_id: Some(&kid),
|
||||
key_id: Some(Cow::Owned(kid)),
|
||||
};
|
||||
let body = typed_json::json! {{
|
||||
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
|
||||
}};
|
||||
|
||||
let header =
|
||||
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
|
||||
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
|
||||
let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{header}.{body}")
|
||||
}
|
||||
@@ -738,6 +777,16 @@ mod tests {
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String {
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
|
||||
let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256);
|
||||
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
|
||||
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
|
||||
|
||||
format!("{payload}.{sig}")
|
||||
}
|
||||
|
||||
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
|
||||
use rsa::pkcs1v15::SigningKey;
|
||||
use rsa::signature::SignatureEncoding;
|
||||
@@ -809,37 +858,34 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
-----END PRIVATE KEY-----
|
||||
";
|
||||
|
||||
#[tokio::test]
|
||||
async fn renew() {
|
||||
let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into());
|
||||
let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into());
|
||||
let (ec1, jwk3) = new_ec_jwk("3".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("4".into());
|
||||
#[derive(Clone)]
|
||||
struct Fetch(Vec<AuthRule>);
|
||||
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
let bar_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk2, jwk4],
|
||||
};
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: EndpointId,
|
||||
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
|
||||
Ok(self.0.clone())
|
||||
}
|
||||
}
|
||||
|
||||
async fn jwks_server(
|
||||
router: impl for<'a> Fn(&'a str) -> Option<Vec<u8>> + Send + Sync + 'static,
|
||||
) -> SocketAddr {
|
||||
let router = Arc::new(router);
|
||||
let service = service_fn(move |req| {
|
||||
let foo_jwks = foo_jwks.clone();
|
||||
let bar_jwks = bar_jwks.clone();
|
||||
let router = Arc::clone(&router);
|
||||
async move {
|
||||
let jwks = match req.uri().path() {
|
||||
"/foo" => &foo_jwks,
|
||||
"/bar" => &bar_jwks,
|
||||
_ => {
|
||||
return Response::builder()
|
||||
.status(404)
|
||||
.body(Full::new(Bytes::new()));
|
||||
}
|
||||
};
|
||||
let body = serde_json::to_vec(jwks).unwrap();
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.body(Full::new(Bytes::from(body)))
|
||||
match router(req.uri().path()) {
|
||||
Some(body) => Response::builder()
|
||||
.status(200)
|
||||
.body(Full::new(Bytes::from(body))),
|
||||
None => Response::builder()
|
||||
.status(404)
|
||||
.body(Full::new(Bytes::new())),
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -854,84 +900,61 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
}
|
||||
});
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
addr
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Fetch(SocketAddr, Vec<RoleNameInt>);
|
||||
#[tokio::test]
|
||||
async fn check_jwt_happy_path() {
|
||||
let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into());
|
||||
let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into());
|
||||
let (ec1, jwk3) = new_ec_jwk("ec1".into());
|
||||
let (ec2, jwk4) = new_ec_jwk("ec2".into());
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_ctx: &RequestMonitoring,
|
||||
_endpoint: EndpointId,
|
||||
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
|
||||
Ok(vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: self.1.clone(),
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: self.1.clone(),
|
||||
},
|
||||
])
|
||||
}
|
||||
}
|
||||
let foo_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk1, jwk3],
|
||||
};
|
||||
let bar_jwks = jose_jwk::JwkSet {
|
||||
keys: vec![jwk2, jwk4],
|
||||
};
|
||||
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()),
|
||||
"/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role_name1 = RoleName::from("anonymous");
|
||||
let role_name2 = RoleName::from("authenticated");
|
||||
|
||||
let fetch = Fetch(
|
||||
addr,
|
||||
vec![
|
||||
RoleNameInt::from(&role_name1),
|
||||
RoleNameInt::from(&role_name2),
|
||||
],
|
||||
);
|
||||
let roles = vec![
|
||||
RoleNameInt::from(&role_name1),
|
||||
RoleNameInt::from(&role_name2),
|
||||
];
|
||||
let rules = vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: roles.clone(),
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: roles.clone(),
|
||||
},
|
||||
];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let endpoint = EndpointId::from("ep");
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
let jwt1 = new_rsa_jwt("1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("3".into(), &ec1);
|
||||
let jwt4 = new_ec_jwt("4".into(), &ec2);
|
||||
|
||||
// had the wrong kid, therefore will have the wrong ecdsa signature
|
||||
let bad_jwt = new_ec_jwt("3".into(), &ec2);
|
||||
// this role_name is not accepted
|
||||
let bad_role_name = RoleName::from("cloud_admin");
|
||||
|
||||
let err = jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&bad_jwt,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
&role_name1,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(err.to_string().contains("signature error"));
|
||||
|
||||
let err = jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&jwt1,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
&bad_role_name,
|
||||
&fetch,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(err.to_string().contains("jwk not found"));
|
||||
let jwt1 = new_rsa_jwt("rs1".into(), rs1);
|
||||
let jwt2 = new_rsa_jwt("rs2".into(), rs2);
|
||||
let jwt3 = new_ec_jwt("ec1".into(), &ec1);
|
||||
let jwt4 = new_ec_jwt("ec2".into(), &ec2);
|
||||
|
||||
let tokens = [jwt1, jwt2, jwt3, jwt4];
|
||||
let role_names = [role_name1, role_name2];
|
||||
@@ -940,15 +963,250 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
token,
|
||||
&client,
|
||||
endpoint.clone(),
|
||||
role,
|
||||
&fetch,
|
||||
token,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS Cognito escapes the `/` in the URL.
|
||||
#[tokio::test]
|
||||
async fn check_jwt_regression_cognito_issuer() {
|
||||
let (key, jwk) = new_ec_jwk("key".into());
|
||||
|
||||
let now = now();
|
||||
let token = new_custom_ec_jwt(
|
||||
"key".into(),
|
||||
&key,
|
||||
typed_json::json! {{
|
||||
"sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d",
|
||||
// cognito uses `\/`. I cannot replicate that easily here, as serde_json will refuse
|
||||
// to write that escape character. Instead I will make a bogus URL using `\` instead.
|
||||
"iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh",
|
||||
"client_id": "abcdefghijklmnopqrstuvwxyz",
|
||||
"origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893",
|
||||
"event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065",
|
||||
"token_use": "access",
|
||||
"scope": "aws.cognito.signin.user.admin",
|
||||
"auth_time":now,
|
||||
"exp":now + 60,
|
||||
"iat":now,
|
||||
"jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0",
|
||||
"username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d",
|
||||
}},
|
||||
);
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
|
||||
let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await;
|
||||
|
||||
let role_name = RoleName::from("anonymous");
|
||||
let rules = vec![AuthRule {
|
||||
id: "aws-cognito".to_owned(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![RoleNameInt::from(&role_name)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let endpoint = EndpointId::from("ep");
|
||||
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
endpoint.clone(),
|
||||
&role_name,
|
||||
&fetch,
|
||||
&token,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwt_invalid_signature() {
|
||||
let (_, jwk) = new_ec_jwk("1".into());
|
||||
let (key, _) = new_ec_jwk("1".into());
|
||||
|
||||
// has a matching kid, but signed by the wrong key
|
||||
let bad_jwt = new_ec_jwt("1".into(), &key);
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role = RoleName::from("authenticated");
|
||||
|
||||
let rules = vec![AuthRule {
|
||||
id: String::new(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![RoleNameInt::from(&role)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let ep = EndpointId::from("ep");
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = jwk_cache
|
||||
.check_jwt(&ctx, ep, &role, &fetch, &bad_jwt)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
matches!(err, JwtError::Signature(_)),
|
||||
"expected \"signature error\", got {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwt_unknown_role() {
|
||||
let (key, jwk) = new_rsa_jwk(RS1, "1".into());
|
||||
let jwt = new_rsa_jwt("1".into(), key);
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let role = RoleName::from("authenticated");
|
||||
let rules = vec![AuthRule {
|
||||
id: String::new(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: None,
|
||||
role_names: vec![RoleNameInt::from(&role)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let ep = EndpointId::from("ep");
|
||||
|
||||
// this role_name is not accepted
|
||||
let bad_role_name = RoleName::from("cloud_admin");
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
let err = jwk_cache
|
||||
.check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert!(
|
||||
matches!(err, JwtError::JwkNotFound),
|
||||
"expected \"jwk not found\", got {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_jwt_invalid_claims() {
|
||||
let (key, jwk) = new_ec_jwk("1".into());
|
||||
|
||||
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
|
||||
let jwks_addr = jwks_server(move |path| match path {
|
||||
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
|
||||
let now = SystemTime::now()
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
struct Test {
|
||||
body: serde_json::Value,
|
||||
error: JwtClaimsError,
|
||||
}
|
||||
|
||||
let table = vec![
|
||||
Test {
|
||||
body: json! {{
|
||||
"nbf": now + 60,
|
||||
"aud": "neon",
|
||||
}},
|
||||
error: JwtClaimsError::JwtTokenNotYetReadyToUse,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"exp": now - 60,
|
||||
"aud": ["neon"],
|
||||
}},
|
||||
error: JwtClaimsError::JwtTokenHasExpired,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": [],
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": "foo",
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": ["foo"],
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
Test {
|
||||
body: json! {{
|
||||
"aud": ["foo", "bar"],
|
||||
}},
|
||||
error: JwtClaimsError::InvalidJwtTokenAudience,
|
||||
},
|
||||
];
|
||||
|
||||
let role = RoleName::from("authenticated");
|
||||
|
||||
let rules = vec![AuthRule {
|
||||
id: String::new(),
|
||||
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
|
||||
audience: Some("neon".to_string()),
|
||||
role_names: vec![RoleNameInt::from(&role)],
|
||||
}];
|
||||
|
||||
let fetch = Fetch(rules);
|
||||
let jwk_cache = JwkCache::default();
|
||||
|
||||
let ep = EndpointId::from("ep");
|
||||
|
||||
let ctx = RequestMonitoring::test();
|
||||
for test in table {
|
||||
let jwt = new_custom_ec_jwt("1".into(), &key, test.body);
|
||||
|
||||
match jwk_cache
|
||||
.check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt)
|
||||
.await
|
||||
{
|
||||
Err(JwtError::InvalidClaims(error)) if error == test.error => {}
|
||||
Err(err) => {
|
||||
panic!("expected {:?}, got {err:?}", test.error)
|
||||
}
|
||||
Ok(_payload) => {
|
||||
panic!("expected {:?}, got ok", test.error)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,9 +137,6 @@ struct ProxyCliArgs {
|
||||
/// size of the threadpool for password hashing
|
||||
#[clap(long, default_value_t = 4)]
|
||||
scram_thread_pool_size: u8,
|
||||
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_dynamic_rate_limiter: bool,
|
||||
/// Endpoint rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
||||
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
and metric-collection-interval must be specified"
|
||||
),
|
||||
};
|
||||
if !args.disable_dynamic_rate_limiter {
|
||||
bail!("dynamic rate limiter should be disabled");
|
||||
}
|
||||
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
|
||||
5
proxy/src/serverless/error.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
use http::StatusCode;
|
||||
|
||||
pub trait HttpCodeError {
|
||||
fn get_http_status_code(&self) -> StatusCode;
|
||||
}
|
||||
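The new trait is a one-method contract that lets the SQL-over-HTTP handler map each error onto its own status code instead of always replying 400 (the real mapping for SqlOverHttpError appears further down in this diff). A hypothetical implementor, purely for illustration:

    use http::StatusCode;

    pub trait HttpCodeError {
        fn get_http_status_code(&self) -> StatusCode;
    }

    // Illustrative error type only; the commit's real implementor is SqlOverHttpError.
    enum ExampleError {
        BadInput(String),
        Internal(String),
    }

    impl HttpCodeError for ExampleError {
        fn get_http_status_code(&self) -> StatusCode {
            match self {
                ExampleError::BadInput(_) => StatusCode::BAD_REQUEST,
                ExampleError::Internal(_) => StatusCode::INTERNAL_SERVER_ERROR,
            }
        }
    }

The handler can then build the response from `e.get_http_status_code()` rather than a hard-coded `StatusCode::BAD_REQUEST`, which is exactly the change in the `handle` function below.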
@@ -6,6 +6,7 @@ mod backend;
|
||||
pub mod cancel_set;
|
||||
mod conn_pool;
|
||||
mod conn_pool_lib;
|
||||
mod error;
|
||||
mod http_conn_pool;
|
||||
mod http_util;
|
||||
mod json;
|
||||
|
||||
@@ -28,6 +28,7 @@ use uuid::Uuid;
|
||||
use super::backend::{LocalProxyConnError, PoolingBackend};
|
||||
use super::conn_pool::{AuthData, ConnInfoWithAuth};
|
||||
use super::conn_pool_lib::{self, ConnInfo};
|
||||
use super::error::HttpCodeError;
|
||||
use super::http_util::json_response;
|
||||
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
|
||||
use super::local_conn_pool;
|
||||
@@ -238,7 +239,6 @@ fn get_conn_info(
|
||||
Ok(ConnInfoWithAuth { conn_info, auth })
|
||||
}
|
||||
|
||||
// TODO: return different http error codes
|
||||
pub(crate) async fn handle(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: RequestMonitoring,
|
||||
@@ -319,9 +319,8 @@ pub(crate) async fn handle(
|
||||
"forwarding error to user"
|
||||
);
|
||||
|
||||
// TODO: this shouldn't always be bad request.
|
||||
json_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
e.get_http_status_code(),
|
||||
json!({
|
||||
"message": message,
|
||||
"code": code,
|
||||
@@ -405,6 +404,25 @@ impl UserFacingError for SqlOverHttpError {
|
||||
}
|
||||
}
|
||||
|
||||
impl HttpCodeError for SqlOverHttpError {
|
||||
fn get_http_status_code(&self) -> StatusCode {
|
||||
match self {
|
||||
SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() {
|
||||
ErrorKind::User => StatusCode::BAD_REQUEST,
|
||||
_ => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
},
|
||||
SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE,
|
||||
SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE,
|
||||
SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST,
|
||||
SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum ReadPayloadError {
|
||||
#[error("could not read the HTTP request body: {0}")]
|
||||
|
||||
@@ -23,7 +23,7 @@ backoff = "^2.2.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^3.0.3"
|
||||
Werkzeug = "^3.0.6"
|
||||
pytest-order = "^1.1.0"
|
||||
allure-pytest = "^2.13.2"
|
||||
pytest-asyncio = "^0.21.0"
|
||||
@@ -42,6 +42,9 @@ pytest-repeat = "^0.9.3"
|
||||
websockets = "^12.0"
|
||||
clickhouse-connect = "^0.7.16"
|
||||
kafka-python = "^2.0.2"
|
||||
jwcrypto = "^1.5.6"
|
||||
h2 = "^4.1.0"
|
||||
types-jwcrypto = "^1.5.0.20240925"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
mypy = "==1.3.0"
|
||||
|
||||
@@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
// Note: with evicted timelines it should work better then de-evict them and
|
||||
// stream; probably start_snapshot would copy partial s3 file to dest path
|
||||
// and stream control file, or return WalResidentTimeline if timeline is not
|
||||
// evicted.
|
||||
let tli = tli
|
||||
.wal_residence_guard()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
|
||||
// so create the chan and write to it in another task.
|
||||
|
||||
@@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
cmp::min,
|
||||
io::{self, ErrorKind},
|
||||
sync::Arc,
|
||||
};
|
||||
use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
|
||||
use tokio_tar::{Archive, Builder, Header};
|
||||
@@ -25,8 +26,8 @@ use crate::{
|
||||
routes::TimelineStatus,
|
||||
},
|
||||
safekeeper::Term,
|
||||
state::TimelinePersistentState,
|
||||
timeline::WalResidentTimeline,
|
||||
state::{EvictionState, TimelinePersistentState},
|
||||
timeline::{Timeline, WalResidentTimeline},
|
||||
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
|
||||
wal_backup,
|
||||
wal_storage::open_wal_file,
|
||||
@@ -43,18 +44,33 @@ use utils::{
|
||||
/// Stream tar archive of timeline to tx.
|
||||
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn stream_snapshot(
|
||||
tli: WalResidentTimeline,
|
||||
tli: Arc<Timeline>,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
) {
|
||||
if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
|
||||
// Error type/contents don't matter as they can't reach the client
|
||||
// (hyper likely doesn't do anything with it), but http stream will be
|
||||
// prematurely terminated. It would be nice to try to send the error in
|
||||
// trailers though.
|
||||
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
|
||||
error!("snapshot failed: {:#}", e);
|
||||
match tli.try_wal_residence_guard().await {
|
||||
Err(e) => {
|
||||
tx.send(Err(anyhow!("Error checking residence: {:#}", e)))
|
||||
.await
|
||||
.ok();
|
||||
}
|
||||
Ok(maybe_resident_tli) => {
|
||||
if let Err(e) = match maybe_resident_tli {
|
||||
Some(resident_tli) => {
|
||||
stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
|
||||
.await
|
||||
}
|
||||
None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
|
||||
} {
|
||||
// Error type/contents don't matter as they can't reach the client
|
||||
// (hyper likely doesn't do anything with it), but http stream will be
|
||||
// prematurely terminated. It would be nice to try to send the error in
|
||||
// trailers though.
|
||||
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
|
||||
error!("snapshot failed: {:#}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,12 +96,10 @@ impl Drop for SnapshotContext {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn stream_snapshot_guts(
|
||||
tli: WalResidentTimeline,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel.
|
||||
fn prepare_tar_stream(
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
) -> Result<()> {
|
||||
) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> {
|
||||
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
|
||||
// use SinkWriter as a Write impl. That is,
|
||||
// - create Sink from the tx. It returns PollSendError if chan is closed.
|
||||
@@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts(
|
||||
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
|
||||
// into CopyToBytes. This is a data copy.
|
||||
let copy_to_bytes = CopyToBytes::new(oksink);
|
||||
let mut writer = SinkWriter::new(copy_to_bytes);
|
||||
let pinned_writer = std::pin::pin!(writer);
|
||||
let writer = SinkWriter::new(copy_to_bytes);
|
||||
let pinned_writer = Box::pin(writer);
|
||||
|
||||
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
|
||||
// which is also likely suboptimal.
|
||||
let mut ar = Builder::new_non_terminated(pinned_writer);
|
||||
Builder::new_non_terminated(pinned_writer)
|
||||
}
|
||||
|
||||
/// Implementation of snapshot for an offloaded timeline, only reads control file
|
||||
pub(crate) async fn stream_snapshot_offloaded_guts(
|
||||
tli: Arc<Timeline>,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
) -> Result<()> {
|
||||
let mut ar = prepare_tar_stream(tx);
|
||||
|
||||
tli.snapshot_offloaded(&mut ar, source, destination).await?;
|
||||
|
||||
ar.finish().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Implementation of snapshot for a timeline which is resident (includes some segment data)
|
||||
pub async fn stream_snapshot_resident_guts(
|
||||
tli: WalResidentTimeline,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
) -> Result<()> {
|
||||
let mut ar = prepare_tar_stream(tx);
|
||||
|
||||
let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
|
||||
pausable_failpoint!("sk-snapshot-after-list-pausable");
|
||||
@@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and
|
||||
/// pass a modified control file into the provided tar stream (nothing with data segments on disk, since
|
||||
/// we are offloaded and there aren't any)
|
||||
async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
|
||||
self: &Arc<Timeline>,
|
||||
ar: &mut tokio_tar::Builder<W>,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
) -> Result<()> {
|
||||
// Take initial copy of control file, then release state lock
|
||||
let mut control_file = {
|
||||
let shared_state = self.write_shared_state().await;
|
||||
|
||||
let control_file = TimelinePersistentState::clone(shared_state.sk.state());
|
||||
|
||||
// Rare race: we got unevicted between entering function and reading control file.
|
||||
// We error out and let API caller retry.
|
||||
if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
|
||||
bail!("Timeline was un-evicted during snapshot, please retry");
|
||||
}
|
||||
|
||||
control_file
|
||||
};
|
||||
|
||||
// Modify the partial segment of the in-memory copy for the control file to
|
||||
// point to the destination safekeeper.
|
||||
let replace = control_file
|
||||
.partial_backup
|
||||
.replace_uploaded_segment(source, destination)?;
|
||||
|
||||
let Some(replace) = replace else {
|
||||
// In Manager::ready_for_eviction, we do not permit eviction unless the timeline
|
||||
// has a partial segment. It is unexpected that an evicted timeline would have none.
|
||||
anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
|
||||
};
|
||||
|
||||
tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
|
||||
|
||||
// Optimistically try to copy the partial segment to the destination's path: this
|
||||
// can fail if the timeline was un-evicted and modified in the background.
|
||||
let remote_timeline_path = &self.remote_path;
|
||||
wal_backup::copy_partial_segment(
|
||||
&replace.previous.remote_path(remote_timeline_path),
|
||||
&replace.current.remote_path(remote_timeline_path),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Since the S3 copy succeeded with the path given in our control file snapshot, and
|
||||
// we are sending that snapshot in our response, we are giving the caller a consistent
|
||||
// snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
|
||||
let buf = control_file
|
||||
.write_to_buf()
|
||||
.with_context(|| "failed to serialize control store")?;
|
||||
let mut header = Header::new_gnu();
|
||||
header.set_size(buf.len().try_into().expect("never breaches u64"));
|
||||
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
|
||||
.await
|
||||
.with_context(|| "failed to append to archive")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl WalResidentTimeline {
|
||||
/// Start streaming tar archive with timeline:
|
||||
/// 1) stream control file under lock;
|
||||
|
||||
@@ -21,18 +21,15 @@ use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::future;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tokio::sync::mpsc::channel;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
||||
use tokio::task;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::Duration;
|
||||
use tokio::time::Instant;
|
||||
use tokio::time::{Duration, MissedTickBehavior};
|
||||
use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -444,9 +441,9 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
}
|
||||
|
||||
// Send keepalive messages to walproposer, to make sure it receives updates
|
||||
// even when it writes a steady stream of messages.
|
||||
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to
|
||||
/// walproposer, even when it's writing a steady stream of messages.
|
||||
const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx.
|
||||
@@ -494,67 +491,76 @@ impl WalAcceptor {
|
||||
async fn run(&mut self) -> anyhow::Result<()> {
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
// to the walproposer. walproposer sends at least one AppendRequest per second,
|
||||
// we will send keepalives by replying to these requests once per second.
|
||||
let mut next_keepalive = Instant::now();
|
||||
// Periodically flush the WAL.
|
||||
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
|
||||
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
||||
flush_ticker.tick().await; // skip the initial, immediate tick
|
||||
|
||||
while let Some(mut next_msg) = self.msg_rx.recv().await {
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||
}
|
||||
// Tracks unflushed appends.
|
||||
let mut dirty = false;
|
||||
|
||||
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||
// Loop through AppendRequests while available to write as many WAL records as
|
||||
// possible without fsyncing.
|
||||
//
|
||||
// Make sure the WAL is flushed before returning, see:
|
||||
// https://github.com/neondatabase/neon/issues/9259
|
||||
//
|
||||
// Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
|
||||
// Otherwise, we might end up in a situation where we read a message, but don't
|
||||
// process it.
|
||||
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
|
||||
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
|
||||
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
|
||||
if self.reply_tx.send(reply).await.is_err() {
|
||||
break; // disconnected, flush WAL and return on next send/recv
|
||||
}
|
||||
}
|
||||
|
||||
// get out of this loop if keepalive time is reached
|
||||
if Instant::now() >= next_keepalive {
|
||||
loop {
|
||||
let reply = tokio::select! {
|
||||
// Process inbound message.
|
||||
msg = self.msg_rx.recv() => {
|
||||
// If disconnected, break to flush WAL and return.
|
||||
let Some(mut msg) = msg else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &msg {
|
||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||
}
|
||||
|
||||
// continue pulling AppendRequests if available
|
||||
match self.msg_rx.try_recv() {
|
||||
Ok(msg) => next_msg = msg,
|
||||
Err(TryRecvError::Empty) => break,
|
||||
// on disconnect, flush WAL and return on next send/recv
|
||||
Err(TryRecvError::Disconnected) => break,
|
||||
};
|
||||
// Don't flush the WAL on every append, only periodically via flush_ticker.
|
||||
// This batches multiple appends per fsync. If the channel is empty after
|
||||
// sending the reply, we'll schedule an immediate flush.
|
||||
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
|
||||
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
dirty = true;
|
||||
}
|
||||
|
||||
self.tli.process_msg(&msg).await?
|
||||
}
|
||||
|
||||
// flush all written WAL to the disk
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
} else {
|
||||
// process message other than AppendRequest
|
||||
self.tli.process_msg(&next_msg).await?
|
||||
// While receiving AppendRequests, flush the WAL periodically and respond with an
|
||||
// AppendResponse to let walproposer know we're still alive.
|
||||
_ = flush_ticker.tick(), if dirty => {
|
||||
dirty = false;
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
}
|
||||
|
||||
// If there are no pending messages, flush the WAL immediately.
|
||||
//
|
||||
// TODO: this should be done via flush_ticker.reset_immediately(), but that's always
|
||||
// delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866.
|
||||
_ = future::ready(()), if dirty && self.msg_rx.is_empty() => {
|
||||
dirty = false;
|
||||
flush_ticker.reset();
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(reply) = reply_msg {
|
||||
// Send reply, if any.
|
||||
if let Some(reply) = reply {
|
||||
if self.reply_tx.send(reply).await.is_err() {
|
||||
return Ok(()); // chan closed, streaming terminated
|
||||
break; // disconnected, break to flush WAL and return
|
||||
}
|
||||
// reset keepalive time
|
||||
next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
|
||||
}
|
||||
}
|
||||
|
||||
// Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
|
||||
if dirty {
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
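The rewritten `WalAcceptor::run` loop above replaces the keepalive-deadline logic with a dirty flag plus a flush ticker: appends are written without fsync, the ticker flushes once per second while traffic keeps flowing, and an immediate flush happens as soon as the inbound channel drains. A stripped-down sketch of that batching pattern, with `append_no_flush`/`flush_wal` as stand-in helpers rather than safekeeper APIs:

    use std::time::Duration;
    use tokio::sync::mpsc;
    use tokio::time::MissedTickBehavior;

    async fn append_no_flush(_buf: &[u8]) { /* write without fsync */ }
    async fn flush_wal() { /* fsync the written WAL */ }

    async fn batching_loop(mut rx: mpsc::Receiver<Vec<u8>>) {
        let mut dirty = false; // tracks unflushed appends
        let mut flush_ticker = tokio::time::interval(Duration::from_secs(1));
        flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
        flush_ticker.tick().await; // skip the initial, immediate tick

        loop {
            tokio::select! {
                // Process the next inbound append without flushing.
                msg = rx.recv() => {
                    let Some(buf) = msg else { break }; // disconnected: flush below and exit
                    append_no_flush(&buf).await;
                    dirty = true;
                }
                // Periodic flush while appends keep arriving.
                _ = flush_ticker.tick(), if dirty => {
                    flush_wal().await;
                    dirty = false;
                }
                // Flush immediately once the channel is empty, so a quiet stream isn't delayed.
                _ = std::future::ready(()), if dirty && rx.is_empty() => {
                    flush_wal().await;
                    dirty = false;
                    flush_ticker.reset();
                }
            }
        }
        if dirty {
            flush_wal().await; // flush on disconnect
        }
    }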
@@ -797,14 +797,17 @@ impl Timeline {
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files.
|
||||
/// If WAL files are not present on disk (evicted), they will be automatically
|
||||
/// downloaded from remote storage. This is done in the manager task, which is
|
||||
/// responsible for issuing all guards.
|
||||
///
|
||||
/// NB: don't use this function from timeline_manager, it will deadlock.
|
||||
/// NB: don't use this function while holding shared_state lock.
|
||||
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
|
||||
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
|
||||
async fn do_wal_residence_guard(
|
||||
self: &Arc<Self>,
|
||||
block: bool,
|
||||
) -> Result<Option<WalResidentTimeline>> {
|
||||
let op_label = if block {
|
||||
"wal_residence_guard"
|
||||
} else {
|
||||
"try_wal_residence_guard"
|
||||
};
|
||||
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
@@ -816,10 +819,13 @@ impl Timeline {
|
||||
// Wait 30 seconds for the guard to be acquired. It can time out if someone is
|
||||
// holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
|
||||
// is stuck.
|
||||
let res = tokio::time::timeout_at(
|
||||
started_at + Duration::from_secs(30),
|
||||
self.manager_ctl.wal_residence_guard(),
|
||||
)
|
||||
let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async {
|
||||
if block {
|
||||
self.manager_ctl.wal_residence_guard().await.map(Some)
|
||||
} else {
|
||||
self.manager_ctl.try_wal_residence_guard().await
|
||||
}
|
||||
})
|
||||
.await;
|
||||
|
||||
let guard = match res {
|
||||
@@ -827,14 +833,14 @@ impl Timeline {
|
||||
let finished_at = Instant::now();
|
||||
let elapsed = finished_at - started_at;
|
||||
MISC_OPERATION_SECONDS
|
||||
.with_label_values(&["wal_residence_guard"])
|
||||
.with_label_values(&[op_label])
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
guard
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!(
|
||||
"error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
|
||||
"error acquiring in {op_label}, statuses {:?} => {:?}",
|
||||
status_before,
|
||||
self.mgr_status.get()
|
||||
);
|
||||
@@ -842,7 +848,7 @@ impl Timeline {
|
||||
}
|
||||
Err(_) => {
|
||||
warn!(
|
||||
"timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
|
||||
"timeout acquiring in {op_label} guard, statuses {:?} => {:?}",
|
||||
status_before,
|
||||
self.mgr_status.get()
|
||||
);
|
||||
@@ -850,7 +856,28 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(WalResidentTimeline::new(self.clone(), guard))
|
||||
Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g)))
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files.
|
||||
/// If WAL files are not present on disk (evicted), they will be automatically
|
||||
/// downloaded from remote storage. This is done in the manager task, which is
|
||||
/// responsible for issuing all guards.
|
||||
///
|
||||
/// NB: don't use this function from timeline_manager, it will deadlock.
|
||||
/// NB: don't use this function while holding shared_state lock.
|
||||
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
|
||||
self.do_wal_residence_guard(true)
|
||||
.await
|
||||
.map(|m| m.expect("Always get Some in block=true mode"))
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files if the timeline is resident,
|
||||
/// else return None
|
||||
pub(crate) async fn try_wal_residence_guard(
|
||||
self: &Arc<Self>,
|
||||
) -> Result<Option<WalResidentTimeline>> {
|
||||
self.do_wal_residence_guard(false).await
|
||||
}
|
||||
|
||||
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
|
||||
|
||||
@@ -56,6 +56,9 @@ impl Manager {
|
||||
// This also works for the first segment despite last_removed_segno
|
||||
// being 0 on init because this 0 triggers run of wal_removal_task
|
||||
// on success of which manager updates the horizon.
|
||||
//
|
||||
// **Note** pull_timeline functionality assumes that evicted timelines always have
|
||||
// a partial segment: if we ever change this condition, must also update that code.
|
||||
&& self
|
||||
.partial_backup_uploaded
|
||||
.as_ref()
|
||||
|
||||
@@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
|
||||
pub enum ManagerCtlMessage {
|
||||
/// Request to get a guard for WalResidentTimeline, with WAL files available locally.
|
||||
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
|
||||
/// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None
|
||||
TryGuardRequest(tokio::sync::oneshot::Sender<Option<ResidenceGuard>>),
|
||||
/// Request to drop the guard.
|
||||
GuardDrop(GuardId),
|
||||
/// Request to reset uploaded partial backup state.
|
||||
@@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
|
||||
ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"),
|
||||
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
|
||||
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
|
||||
}
|
||||
@@ -152,6 +155,19 @@ impl ManagerCtl {
|
||||
.and_then(std::convert::identity)
|
||||
}
|
||||
|
||||
/// Issue a new guard if the timeline is currently not offloaded, else return None
|
||||
/// Sends a message to the manager and waits for the response.
|
||||
/// Can be blocked indefinitely if the manager is stuck.
|
||||
pub async fn try_wal_residence_guard(&self) -> anyhow::Result<Option<ResidenceGuard>> {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
self.manager_tx
|
||||
.send(ManagerCtlMessage::TryGuardRequest(tx))?;
|
||||
|
||||
// wait for the manager to respond with the guard
|
||||
rx.await
|
||||
.map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
|
||||
}
|
||||
|
||||
/// Request timeline manager to reset uploaded partial segment state and
|
||||
/// wait for the result.
|
||||
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
|
||||
@@ -674,6 +690,17 @@ impl Manager {
|
||||
warn!("failed to reply with a guard, receiver dropped");
|
||||
}
|
||||
}
|
||||
Some(ManagerCtlMessage::TryGuardRequest(tx)) => {
|
||||
let result = if self.is_offloaded {
|
||||
None
|
||||
} else {
|
||||
Some(self.access_service.create_guard())
|
||||
};
|
||||
|
||||
if tx.send(result).is_err() {
|
||||
warn!("failed to reply with a guard, receiver dropped");
|
||||
}
|
||||
}
|
||||
Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
|
||||
self.access_service.drop_guard(guard_id);
|
||||
}
|
||||
|
||||
@@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_cancel_reconcile(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_shard_cancel_reconcile(tenant_shard_id)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1776,6 +1798,16 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_migrate"),
|
||||
)
|
||||
})
|
||||
.put(
|
||||
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_cancel_reconcile,
|
||||
RequestName("control_v1_tenant_cancel_reconcile"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
|
||||
@@ -4834,6 +4834,43 @@ impl Service {
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
/// 'cancel' in this context means cancel any ongoing reconcile
|
||||
pub(crate) async fn tenant_shard_cancel_reconcile(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), ApiError> {
|
||||
// Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete
|
||||
let waiter = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant shard not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
let waiter = shard.get_waiter();
|
||||
match waiter {
|
||||
None => {
|
||||
tracing::info!("Shard does not have an ongoing Reconciler");
|
||||
return Ok(());
|
||||
}
|
||||
Some(waiter) => {
|
||||
tracing::info!("Cancelling Reconciler");
|
||||
shard.cancel_reconciler();
|
||||
waiter
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Cancellation should be prompt. If this fails we have still done our job of firing the
|
||||
// cancellation token, but by returning an ApiError we will indicate to the caller that
|
||||
// the Reconciler is misbehaving and not respecting the cancellation token
|
||||
self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
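For reference, a minimal sketch (not part of this change) of driving the new endpoint from a support script; the base URL and admin token are assumptions, while the PUT method, path, and Admin scope come from the handler and route added above:

import requests

def cancel_reconcile(base_url: str, tenant_shard_id: str, admin_token: str) -> None:
    # PUT /control/v1/tenant/:tenant_shard_id/cancel_reconcile requires admin scope.
    # A 200 response means the cancellation token was fired and any in-flight
    # reconcile finished within the controller's short reconcile timeout.
    resp = requests.put(
        f"{base_url}/control/v1/tenant/{tenant_shard_id}/cancel_reconcile",
        headers={"Authorization": f"Bearer {admin_token}"},
        timeout=30,
    )
    resp.raise_for_status()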
|
||||
|
||||
/// This is for debug/support only: we simply drop all state for a tenant, without
|
||||
/// detaching or deleting it on pageservers.
|
||||
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
||||
@@ -4921,16 +4958,7 @@ impl Service {
|
||||
stripe_size,
|
||||
},
|
||||
placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking
|
||||
|
||||
// There is no way to know what the tenant's config was: revert to defaults
|
||||
//
|
||||
// TODO: remove `switch_aux_file_policy` once we finish auxv2 migration
|
||||
//
|
||||
// we write to both v1+v2 storage, so that the test case can use either storage format for testing
|
||||
config: TenantConfig {
|
||||
switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation),
|
||||
..TenantConfig::default()
|
||||
},
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -1317,6 +1317,12 @@ impl TenantShard {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn cancel_reconciler(&self) {
|
||||
if let Some(handle) = self.reconciler.as_ref() {
|
||||
handle.cancel.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
||||
/// if it is not already running
|
||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
pytest_plugins = (
|
||||
"fixtures.pg_version",
|
||||
"fixtures.parametrize",
|
||||
"fixtures.h2server",
|
||||
"fixtures.httpserver",
|
||||
"fixtures.compute_reconfigure",
|
||||
"fixtures.storage_controller_proxy",
|
||||
|
||||
198
test_runner/fixtures/h2server.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html
|
||||
|
||||
auth-broker -> local-proxy needs an h2 connection, so we need an h2 server :)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import collections
|
||||
import io
|
||||
import json
|
||||
from collections.abc import AsyncIterable
|
||||
|
||||
import pytest_asyncio
|
||||
from h2.config import H2Configuration
|
||||
from h2.connection import H2Connection
|
||||
from h2.errors import ErrorCodes
|
||||
from h2.events import (
|
||||
ConnectionTerminated,
|
||||
DataReceived,
|
||||
RemoteSettingsChanged,
|
||||
RequestReceived,
|
||||
StreamEnded,
|
||||
StreamReset,
|
||||
WindowUpdated,
|
||||
)
|
||||
from h2.exceptions import ProtocolError, StreamClosedError
|
||||
from h2.settings import SettingCodes
|
||||
|
||||
RequestData = collections.namedtuple("RequestData", ["headers", "data"])
|
||||
|
||||
|
||||
class H2Server:
|
||||
def __init__(self, host, port) -> None:
|
||||
self.host = host
|
||||
self.port = port
|
||||
|
||||
|
||||
class H2Protocol(asyncio.Protocol):
|
||||
def __init__(self):
|
||||
config = H2Configuration(client_side=False, header_encoding="utf-8")
|
||||
self.conn = H2Connection(config=config)
|
||||
self.transport = None
|
||||
self.stream_data = {}
|
||||
self.flow_control_futures = {}
|
||||
|
||||
def connection_made(self, transport: asyncio.Transport): # type: ignore[override]
|
||||
self.transport = transport
|
||||
self.conn.initiate_connection()
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
|
||||
def connection_lost(self, _exc):
|
||||
for future in self.flow_control_futures.values():
|
||||
future.cancel()
|
||||
self.flow_control_futures = {}
|
||||
|
||||
def data_received(self, data: bytes):
|
||||
assert self.transport is not None
|
||||
try:
|
||||
events = self.conn.receive_data(data)
|
||||
except ProtocolError:
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
self.transport.close()
|
||||
else:
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
for event in events:
|
||||
if isinstance(event, RequestReceived):
|
||||
self.request_received(event.headers, event.stream_id)
|
||||
elif isinstance(event, DataReceived):
|
||||
self.receive_data(event.data, event.stream_id)
|
||||
elif isinstance(event, StreamEnded):
|
||||
self.stream_complete(event.stream_id)
|
||||
elif isinstance(event, ConnectionTerminated):
|
||||
self.transport.close()
|
||||
elif isinstance(event, StreamReset):
|
||||
self.stream_reset(event.stream_id)
|
||||
elif isinstance(event, WindowUpdated):
|
||||
self.window_updated(event.stream_id, event.delta)
|
||||
elif isinstance(event, RemoteSettingsChanged):
|
||||
if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings:
|
||||
self.window_updated(None, 0)
|
||||
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
|
||||
def request_received(self, headers: list[tuple[str, str]], stream_id: int):
|
||||
headers_map = collections.OrderedDict(headers)
|
||||
|
||||
# Store off the request data.
|
||||
request_data = RequestData(headers_map, io.BytesIO())
|
||||
self.stream_data[stream_id] = request_data
|
||||
|
||||
def stream_complete(self, stream_id: int):
|
||||
"""
|
||||
When a stream is complete, we can send our response.
|
||||
"""
|
||||
try:
|
||||
request_data = self.stream_data[stream_id]
|
||||
except KeyError:
|
||||
# Just return, we probably 405'd this already
|
||||
return
|
||||
|
||||
headers = request_data.headers
|
||||
body = request_data.data.getvalue().decode("utf-8")
|
||||
|
||||
data = json.dumps({"headers": headers, "body": body}, indent=4).encode("utf8")
|
||||
|
||||
response_headers = (
|
||||
(":status", "200"),
|
||||
("content-type", "application/json"),
|
||||
("content-length", str(len(data))),
|
||||
)
|
||||
self.conn.send_headers(stream_id, response_headers)
|
||||
asyncio.ensure_future(self.send_data(data, stream_id))
|
||||
|
||||
def receive_data(self, data: bytes, stream_id: int):
|
||||
"""
|
||||
We've received some data on a stream. If that stream is one we're
|
||||
expecting data on, save it off. Otherwise, reset the stream.
|
||||
"""
|
||||
try:
|
||||
stream_data = self.stream_data[stream_id]
|
||||
except KeyError:
|
||||
self.conn.reset_stream(stream_id, error_code=ErrorCodes.PROTOCOL_ERROR)
|
||||
else:
|
||||
stream_data.data.write(data)
|
||||
|
||||
def stream_reset(self, stream_id):
|
||||
"""
|
||||
A stream reset was sent. Stop sending data.
|
||||
"""
|
||||
if stream_id in self.flow_control_futures:
|
||||
future = self.flow_control_futures.pop(stream_id)
|
||||
future.cancel()
|
||||
|
||||
async def send_data(self, data, stream_id):
|
||||
"""
|
||||
Send data according to the flow control rules.
|
||||
"""
|
||||
while data:
|
||||
while self.conn.local_flow_control_window(stream_id) < 1:
|
||||
try:
|
||||
await self.wait_for_flow_control(stream_id)
|
||||
except asyncio.CancelledError:
|
||||
return
|
||||
|
||||
chunk_size = min(
|
||||
self.conn.local_flow_control_window(stream_id),
|
||||
len(data),
|
||||
self.conn.max_outbound_frame_size,
|
||||
)
|
||||
|
||||
try:
|
||||
self.conn.send_data(
|
||||
stream_id, data[:chunk_size], end_stream=(chunk_size == len(data))
|
||||
)
|
||||
except (StreamClosedError, ProtocolError):
|
||||
# The stream got closed and we didn't get told. We're done
|
||||
# here.
|
||||
break
|
||||
|
||||
assert self.transport is not None
|
||||
self.transport.write(self.conn.data_to_send())
|
||||
data = data[chunk_size:]
|
||||
|
||||
async def wait_for_flow_control(self, stream_id):
|
||||
"""
|
||||
Waits for a Future that fires when the flow control window is opened.
|
||||
"""
|
||||
f: asyncio.Future[None] = asyncio.Future()
|
||||
self.flow_control_futures[stream_id] = f
|
||||
await f
|
||||
|
||||
def window_updated(self, stream_id, delta):
|
||||
"""
|
||||
A window update frame was received. Unblock some number of flow control
|
||||
Futures.
|
||||
"""
|
||||
if stream_id and stream_id in self.flow_control_futures:
|
||||
f = self.flow_control_futures.pop(stream_id)
|
||||
f.set_result(delta)
|
||||
elif not stream_id:
|
||||
for f in self.flow_control_futures.values():
|
||||
f.set_result(delta)
|
||||
|
||||
self.flow_control_futures = {}
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="function")
|
||||
async def http2_echoserver() -> AsyncIterable[H2Server]:
|
||||
loop = asyncio.get_event_loop()
|
||||
serve = await loop.create_server(H2Protocol, "127.0.0.1", 0)
|
||||
(host, port) = serve.sockets[0].getsockname()
|
||||
|
||||
asyncio.create_task(serve.wait_closed())
|
||||
|
||||
server = H2Server(host, port)
|
||||
yield server
|
||||
|
||||
serve.close()
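As a usage illustration only (not part of the fixture), a test could poke this echo server directly over cleartext HTTP/2 with prior knowledge; the httpx client and the /echo path are assumptions, and the response shape mirrors what stream_complete sends back:

import httpx

async def probe_echo(server: H2Server) -> dict:
    # http1=False makes httpx speak HTTP/2 without TLS/ALPN (prior knowledge),
    # which is how the auth broker is expected to reach local_proxy in these tests.
    async with httpx.AsyncClient(http1=False, http2=True) as client:
        resp = await client.post(
            f"http://{server.host}:{server.port}/echo", json={"ping": 1}
        )
        # The server replies with {"headers": {...}, "body": "<request body as text>"}.
        return resp.json()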
|
||||
@@ -16,7 +16,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.common_types import IndexPartDump
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import AuxFileStore
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import (
|
||||
@@ -201,7 +200,6 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
placement_policy: Optional[str] = None,
|
||||
set_default: bool = False,
|
||||
aux_file_policy: Optional[AuxFileStore] = None,
|
||||
):
|
||||
"""
|
||||
Creates a new tenant, returns its id and its initial timeline's id.
|
||||
@@ -223,13 +221,6 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
)
|
||||
)
|
||||
|
||||
if aux_file_policy is AuxFileStore.V2:
|
||||
args.extend(["-c", "switch_aux_file_policy:v2"])
|
||||
elif aux_file_policy is AuxFileStore.V1:
|
||||
args.extend(["-c", "switch_aux_file_policy:v1"])
|
||||
elif aux_file_policy is AuxFileStore.CrossValidation:
|
||||
args.extend(["-c", "switch_aux_file_policy:cross-validation"])
|
||||
|
||||
if set_default:
|
||||
args.append("--set-default")
|
||||
|
||||
|
||||
@@ -35,11 +35,13 @@ import toml
|
||||
from _pytest.config import Config
|
||||
from _pytest.config.argparsing import Parser
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
from jwcrypto import jwk
|
||||
|
||||
# Type-related stuff
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from psycopg2.extensions import cursor as PgCursor
|
||||
from psycopg2.extensions import make_dsn, parse_dsn
|
||||
from pytest_httpserver import HTTPServer
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from fixtures import overlayfs
|
||||
@@ -53,6 +55,7 @@ from fixtures.common_types import (
|
||||
TimelineId,
|
||||
)
|
||||
from fixtures.endpoint.http import EndpointHttpClient
|
||||
from fixtures.h2server import H2Server
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
|
||||
from fixtures.neon_cli import NeonLocalCli, Pagectl
|
||||
@@ -94,7 +97,6 @@ from fixtures.utils import (
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
|
||||
|
||||
from .neon_api import NeonAPI, NeonApiEndpoint
|
||||
|
||||
@@ -353,7 +355,6 @@ class NeonEnvBuilder:
|
||||
initial_tenant: Optional[TenantId] = None,
|
||||
initial_timeline: Optional[TimelineId] = None,
|
||||
pageserver_virtual_file_io_engine: Optional[str] = None,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
|
||||
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None,
|
||||
safekeeper_extra_opts: Optional[list[str]] = None,
|
||||
storage_controller_port_override: Optional[int] = None,
|
||||
@@ -405,8 +406,6 @@ class NeonEnvBuilder:
|
||||
f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
|
||||
)
|
||||
|
||||
self.pageserver_aux_file_policy = pageserver_aux_file_policy
|
||||
|
||||
self.safekeeper_extra_opts = safekeeper_extra_opts
|
||||
|
||||
self.storage_controller_port_override = storage_controller_port_override
|
||||
@@ -467,7 +466,6 @@ class NeonEnvBuilder:
|
||||
timeline_id=env.initial_timeline,
|
||||
shard_count=initial_tenant_shard_count,
|
||||
shard_stripe_size=initial_tenant_shard_stripe_size,
|
||||
aux_file_policy=self.pageserver_aux_file_policy,
|
||||
)
|
||||
assert env.initial_tenant == initial_tenant
|
||||
assert env.initial_timeline == initial_timeline
|
||||
@@ -1027,7 +1025,6 @@ class NeonEnv:
|
||||
self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
|
||||
|
||||
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
|
||||
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
|
||||
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
|
||||
|
||||
# Create the neon_local's `NeonLocalInitConf`
|
||||
@@ -1323,7 +1320,6 @@ class NeonEnv:
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
placement_policy: Optional[str] = None,
|
||||
set_default: bool = False,
|
||||
aux_file_policy: Optional[AuxFileStore] = None,
|
||||
) -> tuple[TenantId, TimelineId]:
|
||||
"""
|
||||
Creates a new tenant, returns its id and its initial timeline's id.
|
||||
@@ -1340,7 +1336,6 @@ class NeonEnv:
|
||||
shard_stripe_size=shard_stripe_size,
|
||||
placement_policy=placement_policy,
|
||||
set_default=set_default,
|
||||
aux_file_policy=aux_file_policy,
|
||||
)
|
||||
|
||||
return tenant_id, timeline_id
|
||||
@@ -1398,7 +1393,6 @@ def neon_simple_env(
|
||||
compatibility_pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]],
|
||||
pageserver_virtual_file_io_mode: Optional[str],
|
||||
) -> Iterator[NeonEnv]:
|
||||
@@ -1431,7 +1425,6 @@ def neon_simple_env(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
|
||||
pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
|
||||
combination=combination,
|
||||
@@ -1458,7 +1451,6 @@ def neon_env_builder(
|
||||
top_output_dir: Path,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]],
|
||||
pageserver_aux_file_policy: Optional[AuxFileStore],
|
||||
record_property: Callable[[str, object], None],
|
||||
pageserver_virtual_file_io_mode: Optional[str],
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
@@ -1501,7 +1493,6 @@ def neon_env_builder(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
test_overlay_dir=test_overlay_dir,
|
||||
pageserver_aux_file_policy=pageserver_aux_file_policy,
|
||||
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
|
||||
pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
|
||||
) as builder:
|
||||
@@ -3092,6 +3083,31 @@ class PSQL:
|
||||
)
|
||||
|
||||
|
||||
def generate_proxy_tls_certs(common_name: str, key_path: Path, crt_path: Path):
|
||||
if not key_path.exists():
|
||||
r = subprocess.run(
|
||||
[
|
||||
"openssl",
|
||||
"req",
|
||||
"-new",
|
||||
"-x509",
|
||||
"-days",
|
||||
"365",
|
||||
"-nodes",
|
||||
"-text",
|
||||
"-out",
|
||||
str(crt_path),
|
||||
"-keyout",
|
||||
str(key_path),
|
||||
"-subj",
|
||||
f"/CN={common_name}",
|
||||
"-addext",
|
||||
f"subjectAltName = DNS:{common_name}",
|
||||
]
|
||||
)
|
||||
assert r.returncode == 0
|
||||
|
||||
|
||||
class NeonProxy(PgProtocol):
|
||||
link_auth_uri: str = "http://dummy-uri"
|
||||
|
||||
@@ -3190,29 +3206,7 @@ class NeonProxy(PgProtocol):
|
||||
# generate key if it doesn't exist
|
||||
crt_path = self.test_output_dir / "proxy.crt"
|
||||
key_path = self.test_output_dir / "proxy.key"
|
||||
|
||||
if not key_path.exists():
|
||||
r = subprocess.run(
|
||||
[
|
||||
"openssl",
|
||||
"req",
|
||||
"-new",
|
||||
"-x509",
|
||||
"-days",
|
||||
"365",
|
||||
"-nodes",
|
||||
"-text",
|
||||
"-out",
|
||||
str(crt_path),
|
||||
"-keyout",
|
||||
str(key_path),
|
||||
"-subj",
|
||||
"/CN=*.localtest.me",
|
||||
"-addext",
|
||||
"subjectAltName = DNS:*.localtest.me",
|
||||
]
|
||||
)
|
||||
assert r.returncode == 0
|
||||
generate_proxy_tls_certs("*.localtest.me", key_path, crt_path)
|
||||
|
||||
args = [
|
||||
str(self.neon_binpath / "proxy"),
|
||||
@@ -3392,6 +3386,125 @@ class NeonProxy(PgProtocol):
|
||||
assert out == "ok"
|
||||
|
||||
|
||||
class NeonAuthBroker:
|
||||
class ControlPlane:
|
||||
def __init__(self, endpoint: str):
|
||||
self.endpoint = endpoint
|
||||
|
||||
def extra_args(self) -> list[str]:
|
||||
args = [
|
||||
*["--auth-backend", "console"],
|
||||
*["--auth-endpoint", self.endpoint],
|
||||
]
|
||||
return args
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
neon_binpath: Path,
|
||||
test_output_dir: Path,
|
||||
http_port: int,
|
||||
mgmt_port: int,
|
||||
external_http_port: int,
|
||||
auth_backend: NeonAuthBroker.ControlPlane,
|
||||
):
|
||||
self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1
|
||||
self.host = "127.0.0.1"
|
||||
self.http_port = http_port
|
||||
self.external_http_port = external_http_port
|
||||
self.neon_binpath = neon_binpath
|
||||
self.test_output_dir = test_output_dir
|
||||
self.mgmt_port = mgmt_port
|
||||
self.auth_backend = auth_backend
|
||||
self.http_timeout_seconds = 15
|
||||
self._popen: Optional[subprocess.Popen[bytes]] = None
|
||||
|
||||
def start(self) -> NeonAuthBroker:
|
||||
assert self._popen is None
|
||||
|
||||
# generate key if it doesn't exist
|
||||
crt_path = self.test_output_dir / "proxy.crt"
|
||||
key_path = self.test_output_dir / "proxy.key"
|
||||
generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path)
|
||||
|
||||
args = [
|
||||
str(self.neon_binpath / "proxy"),
|
||||
*["--http", f"{self.host}:{self.http_port}"],
|
||||
*["--mgmt", f"{self.host}:{self.mgmt_port}"],
|
||||
*["--wss", f"{self.host}:{self.external_http_port}"],
|
||||
*["-c", str(crt_path)],
|
||||
*["-k", str(key_path)],
|
||||
*["--sql-over-http-pool-opt-in", "false"],
|
||||
*["--is-auth-broker", "true"],
|
||||
*self.auth_backend.extra_args(),
|
||||
]
|
||||
|
||||
logfile = open(self.test_output_dir / "proxy.log", "w")
|
||||
self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
|
||||
self._wait_until_ready()
|
||||
return self
|
||||
|
||||
# Sends SIGTERM to the proxy if it has been started
|
||||
def terminate(self):
|
||||
if self._popen:
|
||||
self._popen.terminate()
|
||||
|
||||
# Waits for the proxy to exit if it has been started, with a default timeout of
|
||||
# two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time.
|
||||
def wait_for_exit(self, timeout=2):
|
||||
if self._popen:
|
||||
self._popen.wait(timeout=timeout)
|
||||
|
||||
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
|
||||
def _wait_until_ready(self):
|
||||
assert (
|
||||
self._popen and self._popen.poll() is None
|
||||
), "Proxy exited unexpectedly. Check test log."
|
||||
requests.get(f"http://{self.host}:{self.http_port}/v1/status")
|
||||
|
||||
async def query(self, query, args, **kwargs):
|
||||
user = kwargs["user"]
|
||||
token = kwargs["token"]
|
||||
expected_code = kwargs.get("expected_code")
|
||||
|
||||
log.info(f"Executing http query: {query}")
|
||||
|
||||
connstr = f"postgresql://{user}@{self.domain}/postgres"
|
||||
async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client:
|
||||
response = await client.post(
|
||||
f"https://{self.domain}:{self.external_http_port}/sql",
|
||||
json={"query": query, "params": args},
|
||||
headers={
|
||||
"Neon-Connection-String": connstr,
|
||||
"Authorization": f"Bearer {token}",
|
||||
},
|
||||
)
|
||||
|
||||
if expected_code is not None:
|
||||
assert response.status_code == expected_code, f"response: {response.json()}"
|
||||
return response.json()
|
||||
|
||||
def get_metrics(self) -> str:
|
||||
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
|
||||
return request_result.text
|
||||
|
||||
def __enter__(self) -> NeonAuthBroker:
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
_exc_type: Optional[type[BaseException]],
|
||||
_exc_value: Optional[BaseException],
|
||||
_traceback: Optional[TracebackType],
|
||||
):
|
||||
if self._popen is not None:
|
||||
self._popen.terminate()
|
||||
try:
|
||||
self._popen.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
log.warning("failed to gracefully terminate proxy; killing")
|
||||
self._popen.kill()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def link_proxy(
|
||||
port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
|
||||
@@ -3456,6 +3569,74 @@ def static_proxy(
|
||||
yield proxy
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def neon_authorize_jwk() -> jwk.JWK:
|
||||
kid = str(uuid.uuid4())
|
||||
key = jwk.JWK.generate(kty="RSA", size=2048, alg="RS256", use="sig", kid=kid)
|
||||
assert isinstance(key, jwk.JWK)
|
||||
return key
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def static_auth_broker(
|
||||
port_distributor: PortDistributor,
|
||||
neon_binpath: Path,
|
||||
test_output_dir: Path,
|
||||
httpserver: HTTPServer,
|
||||
neon_authorize_jwk: jwk.JWK,
|
||||
http2_echoserver: H2Server,
|
||||
) -> Iterable[NeonAuthBroker]:
|
||||
"""Neon Auth Broker that routes to a mocked local_proxy and a mocked cplane HTTP API."""
|
||||
|
||||
local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}"
|
||||
|
||||
# return local_proxy addr on ProxyWakeCompute.
|
||||
httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json(
|
||||
{
|
||||
"address": local_proxy_addr,
|
||||
"aux": {
|
||||
"endpoint_id": "ep-foo-bar-1234",
|
||||
"branch_id": "br-foo-bar",
|
||||
"project_id": "foo-bar",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# return jwks mock addr on GetEndpointJwks
|
||||
httpserver.expect_request(re.compile("^/cplane/endpoints/.+/jwks$")).respond_with_json(
|
||||
{
|
||||
"jwks": [
|
||||
{
|
||||
"id": "foo",
|
||||
"jwks_url": httpserver.url_for("/authorize/jwks.json"),
|
||||
"provider_name": "test",
|
||||
"jwt_audience": None,
|
||||
"role_names": ["anonymous", "authenticated"],
|
||||
}
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
# return static fixture jwks.
|
||||
jwk = neon_authorize_jwk.export_public(as_dict=True)
|
||||
httpserver.expect_request("/authorize/jwks.json").respond_with_json({"keys": [jwk]})
|
||||
|
||||
mgmt_port = port_distributor.get_port()
|
||||
http_port = port_distributor.get_port()
|
||||
external_http_port = port_distributor.get_port()
|
||||
|
||||
with NeonAuthBroker(
|
||||
neon_binpath=neon_binpath,
|
||||
test_output_dir=test_output_dir,
|
||||
http_port=http_port,
|
||||
mgmt_port=mgmt_port,
|
||||
external_http_port=external_http_port,
|
||||
auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")),
|
||||
) as proxy:
|
||||
proxy.start()
|
||||
yield proxy
|
||||
|
||||
|
||||
class Endpoint(PgProtocol, LogUtils):
|
||||
"""An object representing a Postgres compute endpoint managed by the control plane."""
|
||||
|
||||
|
||||
@@ -10,12 +10,6 @@ from _pytest.python import Metafunc
|
||||
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any, Optional
|
||||
|
||||
from fixtures.utils import AuxFileStore
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -50,11 +44,6 @@ def pageserver_virtual_file_io_mode() -> Optional[str]:
|
||||
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
|
||||
return None
|
||||
|
||||
|
||||
def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]:
|
||||
toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
|
||||
if toml_table is None:
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -515,21 +514,6 @@ def assert_no_errors(log_file: Path, service: str, allowed_errors: list[str]):
|
||||
assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add"
|
||||
|
||||
|
||||
@enum.unique
|
||||
class AuxFileStore(str, enum.Enum):
|
||||
V1 = "v1"
|
||||
V2 = "v2"
|
||||
CrossValidation = "cross-validation"
|
||||
|
||||
@override
|
||||
def __repr__(self) -> str:
|
||||
return f"'aux-{self.value}'"
|
||||
|
||||
@override
|
||||
def __str__(self) -> str:
|
||||
return f"'aux-{self.value}'"
|
||||
|
||||
|
||||
def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str]):
|
||||
"""
|
||||
This is essentially:
|
||||
|
||||
@@ -9,7 +9,7 @@ import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
|
||||
from fixtures.neon_fixtures import logical_replication_sync
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.benchmark_fixture import NeonBenchmarker
|
||||
@@ -17,7 +17,6 @@ if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnv, PgBin
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -172,7 +172,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
},
|
||||
"walreceiver_connect_timeout": "13m",
|
||||
"image_layer_creation_check_threshold": 1,
|
||||
"switch_aux_file_policy": "cross-validation",
|
||||
"lsn_lease_length": "1m",
|
||||
"lsn_lease_length_for_ts": "5s",
|
||||
}
|
||||
|
||||
37
test_runner/regress/test_auth_broker.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonAuthBroker
|
||||
from jwcrypto import jwk, jwt
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_auth_broker_happy(
|
||||
static_auth_broker: NeonAuthBroker,
|
||||
neon_authorize_jwk: jwk.JWK,
|
||||
):
|
||||
"""
|
||||
Signs a JWT and uses it to authorize a query to local_proxy.
|
||||
"""
|
||||
|
||||
token = jwt.JWT(
|
||||
header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"}, claims={"sub": "user1"}
|
||||
)
|
||||
token.make_signed_token(neon_authorize_jwk)
|
||||
res = await static_auth_broker.query("foo", ["arg1"], user="anonymous", token=token.serialize())
|
||||
|
||||
# local proxy mock just echoes back the request
|
||||
# check that we forward the correct data
|
||||
|
||||
assert (
|
||||
res["headers"]["authorization"] == f"Bearer {token.serialize()}"
|
||||
), "JWT should be forwarded"
|
||||
|
||||
assert (
|
||||
"anonymous" in res["headers"]["neon-connection-string"]
|
||||
), "conn string should be forwarded"
|
||||
|
||||
assert json.loads(res["body"]) == {
|
||||
"query": "foo",
|
||||
"params": ["arg1"],
|
||||
}, "Query body should be forwarded"
|
||||
@@ -7,6 +7,7 @@ import psycopg2
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, VanillaPostgres
|
||||
from psycopg2.errors import UndefinedObject
|
||||
from pytest_httpserver import HTTPServer
|
||||
from werkzeug.wrappers.request import Request
|
||||
from werkzeug.wrappers.response import Response
|
||||
@@ -335,3 +336,34 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv):
|
||||
if not result:
|
||||
raise AssertionError("Could not count databases")
|
||||
assert result[0] == 0, "Database 'failure' still exists after restart"
|
||||
|
||||
|
||||
def test_ddl_forwarding_role_specs(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Postgres has a concept of role specs:
|
||||
|
||||
ROLESPEC_CSTRING: ALTER ROLE xyz
|
||||
ROLESPEC_CURRENT_USER: ALTER ROLE current_user
|
||||
ROLESPEC_CURRENT_ROLE: ALTER ROLE current_role
|
||||
ROLESPEC_SESSION_USER: ALTER ROLE session_user
|
||||
ROLESPEC_PUBLIC: ALTER ROLE public
|
||||
|
||||
The extension is required to serialize these special role specs into
|
||||
usernames for the purpose of DDL forwarding.
|
||||
"""
|
||||
env = neon_simple_env
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
with endpoint.cursor() as cur:
|
||||
# ROLESPEC_CSTRING
|
||||
cur.execute("ALTER ROLE cloud_admin WITH PASSWORD 'york'")
|
||||
# ROLESPEC_CURRENT_USER
|
||||
cur.execute("ALTER ROLE current_user WITH PASSWORD 'pork'")
|
||||
# ROLESPEC_CURRENT_ROLE
|
||||
cur.execute("ALTER ROLE current_role WITH PASSWORD 'cork'")
|
||||
# ROLESPEC_SESSION_USER
|
||||
cur.execute("ALTER ROLE session_user WITH PASSWORD 'bork'")
|
||||
# ROLESPEC_PUBLIC
|
||||
with pytest.raises(UndefinedObject):
|
||||
cur.execute("ALTER ROLE public WITH PASSWORD 'dork'")
|
||||
|
||||
@@ -5,11 +5,9 @@ from functools import partial
|
||||
from random import choice
|
||||
from string import ascii_lowercase
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
AuxFileStore,
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgProtocol,
|
||||
@@ -23,17 +21,6 @@ def random_string(n: int):
|
||||
return "".join([choice(ascii_lowercase) for _ in range(n)])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation]
|
||||
)
|
||||
def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore):
|
||||
env = neon_simple_env
|
||||
with env.pageserver.http_client() as client:
|
||||
tenant_config = client.tenant_config(env.initial_tenant).effective_config
|
||||
assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
|
||||
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -173,7 +160,6 @@ COMMIT;
|
||||
|
||||
|
||||
# Test that neon.logical_replication_max_snap_files works
|
||||
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
|
||||
def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
def slot_removed(ep):
|
||||
assert (
|
||||
@@ -350,7 +336,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
|
||||
#
|
||||
# Most pages start with a contrecord, so we don't do anything special
|
||||
# to ensure that.
|
||||
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
|
||||
def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -395,7 +380,6 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
# logical replication bug as such, but without logical replication,
|
||||
# records passed to the WAL redo process are never large enough to hit
|
||||
# the bug.
|
||||
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
|
||||
def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -467,7 +451,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
|
||||
ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
|
||||
def test_replication_shutdown(neon_simple_env: NeonEnv):
|
||||
# Ensure Postgres can exit without getting stuck when a replication job is active and the neon extension is installed
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -561,7 +561,7 @@ def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
|
||||
|
||||
# query generates 25 million rows - should hit the 10MB response limit quickly
|
||||
response = query(
|
||||
400,
|
||||
507,
|
||||
"select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
|
||||
)
|
||||
assert "response is too large (max is 10485760 bytes)" in response["message"]
|
||||
|
||||
@@ -18,6 +18,7 @@ from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
NeonPageserver,
|
||||
PageserverAvailability,
|
||||
PageserverSchedulingPolicy,
|
||||
PgBin,
|
||||
@@ -298,17 +299,20 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("warm_up", [True, False])
|
||||
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
|
||||
def prepare_onboarding_env(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
) -> tuple[NeonEnv, NeonPageserver, TenantId, int]:
|
||||
"""
|
||||
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
|
||||
which provides the /location_config API. This is similar to creating a tenant,
|
||||
but imports the generation number.
|
||||
For tests that onboard a tenant to the storage controller, this performs a small dance to
|
||||
set up one pageserver that won't be managed by the storage controller and create
|
||||
a tenant there.
|
||||
"""
|
||||
|
||||
# One pageserver to simulate legacy environment, two to be managed by storage controller
|
||||
neon_env_builder.num_pageservers = 3
|
||||
|
||||
# Enable tests to use methods that require real S3 API
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
|
||||
# Start services by hand so that we can skip registration on one of the pageservers
|
||||
env = neon_env_builder.init_configs()
|
||||
env.broker.start()
|
||||
@@ -329,7 +333,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
# will be attached after onboarding
|
||||
env.pageservers[1].start()
|
||||
env.pageservers[2].start()
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
@@ -339,6 +342,23 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
generation = 123
|
||||
origin_ps.tenant_create(tenant_id, generation=generation)
|
||||
|
||||
origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
|
||||
|
||||
return (env, origin_ps, tenant_id, generation)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("warm_up", [True, False])
|
||||
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
|
||||
"""
|
||||
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
|
||||
which provides the /location_config API. This is similar to creating a tenant,
|
||||
but imports the generation number.
|
||||
"""
|
||||
|
||||
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
|
||||
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
|
||||
# As if doing a live migration, first configure origin into stale mode
|
||||
r = origin_ps.http_client().tenant_location_conf(
|
||||
tenant_id,
|
||||
@@ -475,6 +495,70 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
|
||||
@run_only_on_default_postgres("this test doesn't start an endpoint")
|
||||
def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver,
|
||||
and also wasn't ever registered with the storage controller.
|
||||
|
||||
It may do this by calling /location_conf in mode Detached and then calling the delete API
|
||||
as normal.
|
||||
"""
|
||||
|
||||
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
|
||||
|
||||
remote_prefix = "/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
)
|
||||
|
||||
# Detach it from its original pageserver.
|
||||
origin_ps.http_client().tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Detached",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": None,
|
||||
},
|
||||
)
|
||||
|
||||
# Since we will later assert that remote data is gone, also check (as a control) that it was there to begin with
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=remote_prefix,
|
||||
)
|
||||
|
||||
# Register with storage controller in Detached state
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
generation += 1
|
||||
r = virtual_ps_http.tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Detached",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": generation,
|
||||
},
|
||||
)
|
||||
assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards
|
||||
|
||||
# Onboarding in Detached state shouldn't have attached it to any pageserver
|
||||
for ps in env.pageservers:
|
||||
assert ps.http_client().tenant_list() == []
|
||||
|
||||
# Delete it via the storage controller
|
||||
virtual_ps_http.tenant_delete(tenant_id)
|
||||
|
||||
# Check that we really deleted it
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=remote_prefix,
|
||||
)
|
||||
|
||||
|
||||
def test_storage_controller_compute_hook(
|
||||
httpserver: HTTPServer,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
@@ -872,6 +956,14 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
|
||||
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
|
||||
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
|
||||
|
||||
# Reconciler cancel API should be a no-op when nothing is in flight
|
||||
env.storage_controller.request(
|
||||
"PUT",
|
||||
f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile",
|
||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
# Node unclean drop API
|
||||
response = env.storage_controller.request(
|
||||
"POST",
|
||||
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
|
||||
@@ -879,6 +971,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
assert len(env.storage_controller.node_list()) == 1
|
||||
|
||||
# Tenant unclean drop API
|
||||
response = env.storage_controller.request(
|
||||
"POST",
|
||||
f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
|
||||
@@ -892,7 +985,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
|
||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||
)
|
||||
assert len(response.json()) == 1
|
||||
|
||||
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
|
||||
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
|
||||
env.storage_controller.consistency_check()
|
||||
@@ -1660,6 +1752,11 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
|
||||
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
|
||||
assert "Stop" in storcon_cli(["tenants"])[3]
|
||||
|
||||
# Cancel ongoing reconcile on a tenant
|
||||
storcon_cli(
|
||||
["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
|
||||
)
|
||||
|
||||
# Change a tenant's placement
|
||||
storcon_cli(
|
||||
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
|
||||
|
||||
@@ -137,14 +137,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
|
||||
}
|
||||
)
|
||||
|
||||
# Create two branches and archive them
|
||||
parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id)
|
||||
leaf_timeline_id = env.create_branch(
|
||||
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
|
||||
# Create three branches that depend on each other, starting with two
|
||||
grandparent_timeline_id = env.create_branch(
|
||||
"test_ancestor_branch_archive_grandparent", tenant_id
|
||||
)
|
||||
parent_timeline_id = env.create_branch(
|
||||
"test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent"
|
||||
)
|
||||
|
||||
# write some stuff to the parent
|
||||
with env.endpoints.create_start(
|
||||
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
|
||||
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
|
||||
) as endpoint:
|
||||
endpoint.safe_psql_many(
|
||||
[
|
||||
@@ -154,6 +157,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
|
||||
)
|
||||
sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
|
||||
|
||||
# create the third branch
|
||||
leaf_timeline_id = env.create_branch(
|
||||
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
|
||||
)
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
@@ -171,6 +179,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
|
||||
state=TimelineArchivalState.ARCHIVED,
|
||||
)
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
grandparent_timeline_id,
|
||||
state=TimelineArchivalState.ARCHIVED,
|
||||
)
|
||||
|
||||
def timeline_offloaded_logged(timeline_id: TimelineId) -> bool:
|
||||
return (
|
||||
env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
|
||||
@@ -201,30 +215,34 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
parent_timeline_id,
|
||||
grandparent_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
parent_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
leaf_detail = ps_http.timeline_detail(
|
||||
parent_detail = ps_http.timeline_detail(
|
||||
tenant_id,
|
||||
leaf_timeline_id,
|
||||
parent_timeline_id,
|
||||
)
|
||||
assert leaf_detail["is_archived"] is False
|
||||
assert parent_detail["is_archived"] is False
|
||||
|
||||
with env.endpoints.create_start(
|
||||
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
|
||||
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
|
||||
) as endpoint:
|
||||
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
|
||||
assert sum == sum_again
|
||||
|
||||
# Test that deletion of offloaded timelines works
|
||||
ps_http.timeline_delete(tenant_id, leaf_timeline_id)
|
||||
|
||||
assert not timeline_offloaded_logged(initial_timeline_id)
|
||||
|
||||
|
||||
def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("delete_timeline", [False, True])
|
||||
def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool):
|
||||
"""
|
||||
Test for persistence of timeline offload state
|
||||
"""
|
||||
@@ -306,27 +324,35 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder):
|
||||
assert timeline_offloaded_api(child_timeline_id)
|
||||
assert not timeline_offloaded_api(root_timeline_id)
|
||||
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
child_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
child_detail = ps_http.timeline_detail(
|
||||
tenant_id,
|
||||
child_timeline_id,
|
||||
)
|
||||
assert child_detail["is_archived"] is False
|
||||
if delete_timeline:
|
||||
ps_http.timeline_delete(tenant_id, child_timeline_id)
|
||||
with pytest.raises(PageserverApiException, match="not found"):
|
||||
ps_http.timeline_detail(
|
||||
tenant_id,
|
||||
child_timeline_id,
|
||||
)
|
||||
else:
|
||||
ps_http.timeline_archival_config(
|
||||
tenant_id,
|
||||
child_timeline_id,
|
||||
state=TimelineArchivalState.UNARCHIVED,
|
||||
)
|
||||
child_detail = ps_http.timeline_detail(
|
||||
tenant_id,
|
||||
child_timeline_id,
|
||||
)
|
||||
assert child_detail["is_archived"] is False
|
||||
|
||||
with env.endpoints.create_start(
|
||||
"test_archived_branch_persisted", tenant_id=tenant_id
|
||||
) as endpoint:
|
||||
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
|
||||
assert sum == sum_again
|
||||
with env.endpoints.create_start(
|
||||
"test_archived_branch_persisted", tenant_id=tenant_id
|
||||
) as endpoint:
|
||||
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
|
||||
assert sum == sum_again
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
|
||||
)
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
|
||||
)
|
||||
|
||||
assert not timeline_offloaded_api(root_timeline_id)
|
||||
|
||||
|
||||
50
test_runner/regress/test_unstable_extensions.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
import pytest
|
||||
from psycopg2.errors import InsufficientPrivilege
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
|
||||
def test_unstable_extensions_installation(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test that the unstable extension support within the neon extension can
|
||||
block extension installation.
|
||||
"""
|
||||
env = neon_simple_env
|
||||
|
||||
neon_unstable_extensions = "pg_prewarm,amcheck"
|
||||
|
||||
endpoint = env.endpoints.create(
|
||||
"main",
|
||||
config_lines=[
|
||||
"neon.allow_unstable_extensions=false",
|
||||
f"neon.unstable_extensions='{neon_unstable_extensions}'",
|
||||
],
|
||||
)
|
||||
endpoint.respec(skip_pg_catalog_updates=False)
|
||||
endpoint.start()
|
||||
|
||||
with endpoint.cursor() as cursor:
|
||||
cursor.execute("SELECT current_setting('neon.unstable_extensions')")
|
||||
result = cursor.fetchone()
|
||||
assert result is not None
|
||||
setting = cast("str", result[0])
|
||||
assert setting == neon_unstable_extensions
|
||||
|
||||
with pytest.raises(InsufficientPrivilege):
|
||||
cursor.execute("CREATE EXTENSION pg_prewarm")
|
||||
|
||||
with pytest.raises(InsufficientPrivilege):
|
||||
cursor.execute("CREATE EXTENSION amcheck")
|
||||
|
||||
# Make sure that we can install a "stable" extension
|
||||
cursor.execute("CREATE EXTENSION pageinspect")
|
||||
|
||||
cursor.execute("BEGIN")
|
||||
cursor.execute("SET neon.allow_unstable_extensions TO true")
|
||||
cursor.execute("CREATE EXTENSION pg_prewarm")
|
||||
cursor.execute("COMMIT")
|
||||
Some files were not shown because too many files have changed in this diff.