Merge pull request #9573 from neondatabase/releases/2024-10-29-compute-only-2

Compute release 2024-10-29
Anastasia Lubennikova, committed by GitHub, 2024-10-29 18:53:03 +00:00
113 changed files with 3692 additions and 1511 deletions


@@ -671,6 +671,10 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
# Increase timeout to 12h, default timeout is 6h
# we have a regression in clickbench causing it to run 2-3x longer
timeout-minutes: 720
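# (720 minutes == 12 hours, matching the 43200-second --timeout passed to test_clickbench below)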
steps:
- uses: actions/checkout@v4
@@ -716,7 +720,7 @@ jobs:
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"


@@ -839,6 +839,7 @@ jobs:
- name: Build vm image
run: |
./vm-builder \
-size=2G \
-spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
-src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
@@ -1116,7 +1117,10 @@ jobs:
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployProxyLink=true \
-f deployPrivatelinkProxy=true \
-f deployProxyScram=true \
-f deployProxyAuthBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
else

Cargo.lock (generated)

@@ -3749,6 +3749,7 @@ dependencies = [
"tracing",
"url",
"utils",
"wal_decoder",
"walkdir",
"workspace_hack",
]
@@ -4186,6 +4187,7 @@ dependencies = [
"regex",
"serde",
"thiserror",
"tracing",
"utils",
]
@@ -6272,7 +6274,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6788,7 +6790,7 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
dependencies = [
"bytes",
"io-uring",
@@ -6954,6 +6956,20 @@ dependencies = [
"utils",
]
[[package]]
name = "wal_decoder"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"pageserver_api",
"postgres_ffi",
"serde",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "walkdir"
version = "2.3.3"


@@ -33,6 +33,7 @@ members = [
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
"libs/walproposer",
"libs/wal_decoder",
]
[workspace.package]
@@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
walproposer = { version = "0.1", path = "./libs/walproposer/" }
wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }


@@ -666,7 +666,7 @@ RUN apt-get update && \
#
# Use new version only for v17
# because Release_2024_09_1 has some backward incompatible changes
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN case "${PG_VERSION}" in \
"v17") \
@@ -860,18 +860,98 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot
WORKDIR /home/nonroot
RUN case "${PG_VERSION}" in "v17") \
echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
esac && \
curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
case "${PG_VERSION}" in \
'v17') \
echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
esac && \
cargo install --locked --version 0.11.3 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
#########################################################################################
#
# Layer "rust extensions pgrx12"
#
# pgrx started supporting Postgres 17 in version 12,
# but some older extensions aren't compatible with it.
# This layer should be used as a base for new pgrx extensions,
# and eventually get merged with `rust-extensions-build`
#
#########################################################################################
FROM build-deps AS rust-extensions-build-pgrx12
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install --no-install-recommends -y curl libclang-dev && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot
WORKDIR /home/nonroot
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --locked --version 0.12.6 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
#########################################################################################
#
# Layers "pg-onnx-build" and "pgrag-pg-build"
# Compile "pgrag" extensions
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-onnx-build
# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25).
# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
python3 -m venv venv && \
. venv/bin/activate && \
python3 -m pip install cmake==3.30.5 && \
wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root
FROM pg-onnx-build AS pgrag-pg-build
RUN apt-get install -y protobuf-compiler && \
wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
\
cd exts/rag && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
\
cd ../rag_bge_small_en_v15 && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
cargo pgrx install --release --features remote_onnx && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
\
cd ../rag_jina_reranker_v1_tiny_en && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
cargo pgrx install --release --features remote_onnx && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
#########################################################################################
#
# Layer "pg-jsonschema-pg-build"
@@ -1041,6 +1121,31 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
#
# Layer "pg_mooncake"
# compile pg_mooncake extension
#
#########################################################################################
FROM rust-extensions-build AS pg-mooncake-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
'v14') \
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
esac && \
git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \
cd pg_mooncake-src && \
git checkout "${PG_MOONCAKE_VERSION}" && \
git submodule update --init --depth 1 --recursive && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -1059,6 +1164,7 @@ COPY --from=h3-pg-build /h3/usr /
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1084,6 +1190,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -1247,6 +1354,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
COPY --from=vector-pg-build /pgvector.patch /ext-src/
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src


@@ -18,7 +18,7 @@ commands:
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
- name: local_proxy
user: postgres
sysvInitAction: respawn


@@ -18,7 +18,7 @@ commands:
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
- name: local_proxy
user: postgres
sysvInitAction: respawn


@@ -17,7 +17,7 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -399,11 +399,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
@@ -499,11 +494,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")


@@ -111,6 +111,11 @@ enum Command {
#[arg(long)]
node: NodeId,
},
/// Cancel any ongoing reconciliation for this shard
TenantShardCancelReconcile {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
@@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantShardCancelReconcile { tenant_shard_id } => {
storcon_client
.dispatch::<(), ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
None,
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;


@@ -250,12 +250,6 @@ pub struct TenantConfigToml {
// Expressed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
/// file is written.
pub switch_aux_file_policy: crate::models::AuxFilePolicy,
/// The length for an explicit LSN lease request.
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
#[serde(with = "humantime_serde")]
@@ -475,7 +469,6 @@ impl Default for TenantConfigToml {
lazy_slru_download: false,
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
}


@@ -5,9 +5,11 @@ pub mod controller_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod record;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub mod value;
pub mod config;


@@ -10,7 +10,6 @@ use std::{
io::{BufRead, Read},
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -309,7 +308,6 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_aux_file_policy: Option<AuxFilePolicy>,
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
}
@@ -350,68 +348,6 @@ pub enum AuxFilePolicy {
CrossValidation,
}
impl AuxFilePolicy {
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
matches!(
(from, to),
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
)
}
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V2
}
}
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
pub struct AtomicAuxFilePolicy(AtomicUsize);
impl AtomicAuxFilePolicy {
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
Self(AtomicUsize::new(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
))
}
pub fn load(&self) -> Option<AuxFilePolicy> {
match self.0.load(std::sync::atomic::Ordering::Acquire) {
0 => None,
other => Some(AuxFilePolicy::from_usize(other)),
}
}
pub fn store(&self, policy: Option<AuxFilePolicy>) {
self.0.store(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
std::sync::atomic::Ordering::Release,
);
}
}
impl AuxFilePolicy {
pub fn to_usize(self) -> usize {
match self {
Self::V1 => 1,
Self::CrossValidation => 2,
Self::V2 => 3,
}
}
pub fn try_from_usize(this: usize) -> Option<Self> {
match this {
1 => Some(Self::V1),
2 => Some(Self::CrossValidation),
3 => Some(Self::V2),
_ => None,
}
}
pub fn from_usize(this: usize) -> Self {
Self::try_from_usize(this).unwrap()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
@@ -1633,71 +1569,6 @@ mod tests {
}
}
#[test]
fn test_aux_file_migration_path() {
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V1
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V2
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::CrossValidation
));
// Self-migration is not a valid migration path, and the caller should handle it by itself.
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::CrossValidation
));
// Migrations not allowed
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::CrossValidation
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::CrossValidation
));
// Migrations allowed
assert!(AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V2
));
}
#[test]
fn test_aux_parse() {
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
assert_eq!(
AuxFilePolicy::from_str("cross-validation").unwrap(),
AuxFilePolicy::CrossValidation
);
}
#[test]
fn test_image_compression_algorithm_parsing() {
use ImageCompressionAlgorithm::*;


@@ -0,0 +1,113 @@
//! This module defines the WAL record format used within the pageserver.
use bytes::Bytes;
use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember};
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use utils::bin_ser::DeserializeError;
/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NeonWalRecord {
/// Native PostgreSQL WAL record
Postgres { will_init: bool, rec: Bytes },
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
ClearVisibilityMapFlags {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
flags: u8,
},
/// Mark transaction IDs as committed on a CLOG page
ClogSetCommitted {
xids: Vec<TransactionId>,
timestamp: TimestampTz,
},
/// Mark transaction IDs as aborted on a CLOG page
ClogSetAborted { xids: Vec<TransactionId> },
/// Extend multixact offsets SLRU
MultixactOffsetCreate {
mid: MultiXactId,
moff: MultiXactOffset,
},
/// Extend multixact members SLRU.
MultixactMembersCreate {
moff: MultiXactOffset,
members: Vec<MultiXactMember>,
},
/// Update the map of AUX files, either writing or dropping an entry
AuxFile {
file_path: String,
content: Option<Bytes>,
},
/// A testing record for unit testing purposes. It supports appending data to an existing image, or clearing it.
#[cfg(feature = "testing")]
Test {
/// Append a string to the image.
append: String,
/// Clear the image before appending.
clear: bool,
/// Treat this record as an init record. `clear` should be set to true if this field is set
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
/// its references in `timeline.rs`.
will_init: bool,
},
}
impl NeonWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool {
// If you change this function, you'll also need to change ValueBytes::will_init
match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
#[cfg(feature = "testing")]
NeonWalRecord::Test { will_init, .. } => *will_init,
// None of the special neon record types currently initialize the page
_ => false,
}
}
#[cfg(feature = "testing")]
pub fn wal_append(s: impl AsRef<str>) -> Self {
Self::Test {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_clear() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_init() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: true,
}
}
}
/// Build a human-readable string to describe a WAL record
///
/// For debugging purposes
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
match rec {
NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
"will_init: {}, {}",
will_init,
describe_postgres_wal_record(rec)?
)),
_ => Ok(format!("{:?}", rec)),
}
}
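For illustration, a minimal sketch of how the NeonWalRecord type above is intended to be used inside the pageserver; the byte payload, file path, and function name are placeholders, not real WAL contents:
use bytes::Bytes;
use pageserver_api::record::{describe_wal_record, NeonWalRecord};
fn sketch() {
    // A wrapped native Postgres WAL record that fully (re)initializes its page.
    let pg = NeonWalRecord::Postgres {
        will_init: true,
        rec: Bytes::from_static(b"raw xlog bytes"), // placeholder payload
    };
    assert!(pg.will_init());
    // A neon-specific record: updates to the AUX file map never initialize a page.
    let aux = NeonWalRecord::AuxFile {
        file_path: "pg_logical/placeholder".to_string(),
        content: None,
    };
    assert!(!aux.will_init());
    // Non-Postgres records are described via their Debug representation.
    let _ = describe_wal_record(&aux);
}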


@@ -1,13 +1,16 @@
use crate::walrecord::NeonWalRecord;
use anyhow::Result;
//! This module defines the value type used by the storage engine.
//!
//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
//! or a "delta" of how to get from previous version of the value to the new one
//! ([`Value::WalRecord`]])
//!
//! Note that the [`Value`] type is used for the permanent storage format, so any
//! changes to it must be backwards compatible.
use crate::record::NeonWalRecord;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::AddAssign;
use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
/// A 'value' stored for one Key.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum Value {
/// An Image value contains a full copy of the value
@@ -20,10 +23,12 @@ pub enum Value {
}
impl Value {
#[inline(always)]
pub fn is_image(&self) -> bool {
matches!(self, Value::Image(_))
}
#[inline(always)]
pub fn will_init(&self) -> bool {
match self {
Value::Image(_) => true,
@@ -33,17 +38,18 @@ impl Value {
}
#[derive(Debug, PartialEq)]
pub(crate) enum InvalidInput {
pub enum InvalidInput {
TooShortValue,
TooShortPostgresRecord,
}
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, let's
/// use this type for querying if a slice looks some particular way.
pub(crate) struct ValueBytes;
pub struct ValueBytes;
impl ValueBytes {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
#[inline(always)]
pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {
return Err(InvalidInput::TooShortValue);
}
@@ -79,6 +85,7 @@ impl ValueBytes {
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {
@@ -229,56 +236,3 @@ mod test {
assert!(!ValueBytes::will_init(&expected).unwrap());
}
}
///
/// Result of performing GC
///
#[derive(Default, Serialize, Debug)]
pub struct GcResult {
pub layers_total: u64,
pub layers_needed_by_cutoff: u64,
pub layers_needed_by_pitr: u64,
pub layers_needed_by_branches: u64,
pub layers_needed_by_leases: u64,
pub layers_not_updated: u64,
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
#[serde(serialize_with = "serialize_duration_as_millis")]
pub elapsed: Duration,
/// The layers which were garbage collected.
///
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
/// dropped in tests.
#[cfg(feature = "testing")]
#[serde(skip)]
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
d.as_millis().serialize(serializer)
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.layers_total += other.layers_total;
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
self.layers_needed_by_branches += other.layers_needed_by_branches;
self.layers_needed_by_leases += other.layers_needed_by_leases;
self.layers_not_updated += other.layers_not_updated;
self.layers_removed += other.layers_removed;
self.elapsed += other.elapsed;
#[cfg(feature = "testing")]
{
let mut other = other;
self.doomed_layers.append(&mut other.doomed_layers);
}
}
}
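For illustration, a minimal sketch of the Value type above after its move into pageserver_api; the payloads and function name are placeholders:
use bytes::Bytes;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
fn sketch() {
    // A full page image: always (re)initializes the value.
    let image = Value::Image(Bytes::from_static(b"page image bytes"));
    assert!(image.is_image() && image.will_init());
    // A delta: whether it initializes the page depends on the wrapped WAL record.
    let delta = Value::WalRecord(NeonWalRecord::Postgres {
        will_init: false,
        rec: Bytes::from_static(b"raw xlog bytes"),
    });
    assert!(!delta.is_image() && !delta.will_init());
}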


@@ -15,6 +15,7 @@ memoffset.workspace = true
thiserror.workspace = true
serde.workspace = true
utils.workspace = true
tracing.workspace = true
[dev-dependencies]
env_logger.workspace = true


@@ -217,6 +217,7 @@ macro_rules! enum_pgversion {
pub mod pg_constants;
pub mod relfile_utils;
pub mod walrecord;
// Export some widely used datatypes that are unlikely to change across Postgres versions
pub use v14::bindings::RepOriginId;


@@ -0,0 +1,18 @@
[package]
name = "wal_decoder"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
testing = []
[dependencies]
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }


@@ -0,0 +1 @@


@@ -0,0 +1,2 @@
pub mod decoder;
pub mod models;


@@ -0,0 +1,167 @@
//! This module houses types which represent decoded PG WAL records
//! ready for the pageserver to interpret. They are derived from the original
//! WAL records, so that each struct corresponds closely to one WAL record of
//! a specific kind. They contain the same information as the original WAL records,
//! just decoded into structs and fields for easier access.
//!
//! The ingestion code uses these structs to help with parsing the WAL records,
//! and it splits them into a stream of modifications to the key-value pairs that
//! are ultimately stored in delta layers. See also the split-out counterparts in
//! [`postgres_ffi::walrecord`].
//!
//! The pipeline which processes WAL records is not super obvious, so let's follow
//! the flow of an example XACT_COMMIT Postgres record:
//!
//! (Postgres XACT_COMMIT record)
//! |
//! |--> pageserver::walingest::WalIngest::decode_xact_record
//! |
//! |--> ([`XactRecord::Commit`])
//! |
//! |--> pageserver::walingest::WalIngest::ingest_xact_record
//! |
//! |--> (NeonWalRecord::ClogSetCommitted)
//! |
//! |--> write to KV store within the pageserver
use bytes::Bytes;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::walrecord::{
XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
XlSmgrTruncate, XlXactParsedRecord,
};
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;
pub enum HeapamRecord {
ClearVmBits(ClearVmBits),
}
pub struct ClearVmBits {
pub new_heap_blkno: Option<u32>,
pub old_heap_blkno: Option<u32>,
pub vm_rel: RelTag,
pub flags: u8,
}
pub enum NeonrmgrRecord {
ClearVmBits(ClearVmBits),
}
pub enum SmgrRecord {
Create(SmgrCreate),
Truncate(XlSmgrTruncate),
}
pub struct SmgrCreate {
pub rel: RelTag,
}
pub enum DbaseRecord {
Create(DbaseCreate),
Drop(DbaseDrop),
}
pub struct DbaseCreate {
pub db_id: Oid,
pub tablespace_id: Oid,
pub src_db_id: Oid,
pub src_tablespace_id: Oid,
}
pub struct DbaseDrop {
pub db_id: Oid,
pub tablespace_ids: Vec<Oid>,
}
pub enum ClogRecord {
ZeroPage(ClogZeroPage),
Truncate(ClogTruncate),
}
pub struct ClogZeroPage {
pub segno: u32,
pub rpageno: u32,
}
pub struct ClogTruncate {
pub pageno: u32,
pub oldest_xid: TransactionId,
pub oldest_xid_db: Oid,
}
pub enum XactRecord {
Commit(XactCommon),
Abort(XactCommon),
CommitPrepared(XactCommon),
AbortPrepared(XactCommon),
Prepare(XactPrepare),
}
pub struct XactCommon {
pub parsed: XlXactParsedRecord,
pub origin_id: u16,
// Fields below are only used for logging
pub xl_xid: TransactionId,
pub lsn: Lsn,
}
pub struct XactPrepare {
pub xl_xid: TransactionId,
pub data: Bytes,
}
pub enum MultiXactRecord {
ZeroPage(MultiXactZeroPage),
Create(XlMultiXactCreate),
Truncate(XlMultiXactTruncate),
}
pub struct MultiXactZeroPage {
pub slru_kind: SlruKind,
pub segno: u32,
pub rpageno: u32,
}
pub enum RelmapRecord {
Update(RelmapUpdate),
}
pub struct RelmapUpdate {
pub update: XlRelmapUpdate,
pub buf: Bytes,
}
pub enum XlogRecord {
Raw(RawXlogRecord),
}
pub struct RawXlogRecord {
pub info: u8,
pub lsn: Lsn,
pub buf: Bytes,
}
pub enum LogicalMessageRecord {
Put(PutLogicalMessage),
#[cfg(feature = "testing")]
Failpoint,
}
pub struct PutLogicalMessage {
pub path: String,
pub buf: Bytes,
}
pub enum StandbyRecord {
RunningXacts(StandbyRunningXacts),
}
pub struct StandbyRunningXacts {
pub oldest_running_xid: TransactionId,
}
pub enum ReploriginRecord {
Set(XlReploriginSet),
Drop(XlReploriginDrop),
}
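For illustration, a hedged sketch of constructing one of the decoded record types above; the block numbers, flags, and function name are placeholder values, and the mapping to NeonWalRecord::ClearVisibilityMapFlags reflects the ingest step described in the module docs:
use pageserver_api::reltag::RelTag;
use wal_decoder::models::{ClearVmBits, HeapamRecord};
// Decoded form of a heapam change that only needs visibility-map bits cleared;
// ingestion later turns this into a NeonWalRecord::ClearVisibilityMapFlags
// keyed on the visibility-map relation.
fn sketch(vm_rel: RelTag) -> HeapamRecord {
    HeapamRecord::ClearVmBits(ClearVmBits {
        new_heap_blkno: Some(42), // placeholder heap block numbers
        old_heap_blkno: None,
        vm_rel,
        flags: 0x01, // placeholder bitmap of VM bits to clear
    })
}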


@@ -8,7 +8,7 @@ license.workspace = true
default = []
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing" ]
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
[dependencies]
anyhow.workspace = true
@@ -83,6 +83,7 @@ enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
strum.workspace = true
strum_macros.workspace = true
wal_decoder.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true

View File

@@ -8,13 +8,12 @@ use pageserver::{
context::{DownloadBehavior, RequestContext},
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
use pageserver_api::{key::Key, shard::TenantShardId};
use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},


@@ -1,9 +1,9 @@
use criterion::measurement::WallTime;
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use pageserver_api::key::Key;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};


@@ -60,7 +60,8 @@ use anyhow::Context;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
future::Future,


@@ -51,7 +51,7 @@
//!
use anyhow::{Context, Result};
use pageserver::repository::Key;
use pageserver_api::key::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;


@@ -14,12 +14,12 @@ use std::ops::Range;
use std::{fs, str};
use pageserver::page_cache::{self, PAGE_SZ};
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::range_overlaps;
use pageserver::virtual_file::{self, VirtualFile};
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{bin_ser::BeSer, lsn::Lsn};


@@ -14,13 +14,13 @@ use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::virtual_file::api::IoMode;
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
tenant::{
block_io::FileBlockReader, disk_btree::VisitDirection,
storage_layer::delta_layer::DELTA_KEY_SIZE,
},
virtual_file::VirtualFile,
};
use pageserver_api::key::{Key, KEY_SIZE};
use std::fs;
use utils::bin_ser::BeSer;
use utils::id::{TenantId, TimelineId};


@@ -1,4 +1,4 @@
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
use pageserver_api::models::{TenantConfig, TenantConfigRequest};
use pageserver_api::shard::TenantShardId;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
mgmt_api_client
.tenant_config(&TenantConfigRequest {
tenant_id: timeline.tenant_id,
config: TenantConfig {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
config: TenantConfig::default(),
})
.await?;


@@ -696,7 +696,7 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
use std::{io::ErrorKind, time::Duration};
use tracing::info;
@@ -705,7 +705,6 @@ mod test {
use crate::{
controller_upcall_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
};


@@ -2232,13 +2232,13 @@ async fn getpage_at_lsn_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
struct Key(crate::repository::Key);
struct Key(pageserver_api::key::Key);
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
crate::repository::Key::from_hex(s).map(Key)
pageserver_api::key::Key::from_hex(s).map(Key)
}
}


@@ -19,12 +19,11 @@ use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::decode_wal_record;
use crate::walrecord::DecodedWALRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
use postgres_ffi::ControlFileData;
use postgres_ffi::DBState_DB_SHUTDOWNED;
use postgres_ffi::Oid;


@@ -24,7 +24,6 @@ pub mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod repository;
pub mod span;
pub(crate) mod statvfs;
pub mod task_mgr;
@@ -32,7 +31,6 @@ pub mod tenant;
pub mod utilization;
pub mod virtual_file;
pub mod walingest;
pub mod walrecord;
pub mod walredo;
use camino::Utf8Path;


@@ -7,14 +7,14 @@
//! Clarify that)
//!
use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use pageserver_api::key::Key;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -22,7 +22,9 @@ use pageserver_api::key::{
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::value::Value;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};


@@ -92,11 +92,11 @@ use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::repository::GcResult;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationMode;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::gc_result::GcResult;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
@@ -160,6 +160,7 @@ pub(crate) mod timeline;
pub mod size;
mod gc_block;
mod gc_result;
pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -301,6 +302,13 @@ pub struct Tenant {
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
/// each other (this could be optimized to coalesce writes if necessary).
///
/// The contents of the Mutex are the last manifest we successfully uploaded
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
// This mutex prevents creation of new timelines during GC.
// Adding yet another mutex (in addition to `timelines`) is needed because holding
// `timelines` mutex during all GC iteration
@@ -467,10 +475,10 @@ impl WalRedoManager {
/// This method is cancellation-safe.
pub async fn request_redo(
&self,
key: crate::repository::Key,
key: pageserver_api::key::Key,
lsn: Lsn,
base_img: Option<(Lsn, bytes::Bytes)>,
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
pg_version: u32,
) -> Result<bytes::Bytes, walredo::Error> {
match self {
@@ -625,19 +633,10 @@ impl TimelineOrOffloaded {
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
}
}
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
match self {
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
Some(remote_client) => remote_client,
None => {
let remote_client = tenant.build_timeline_client(
offloaded.timeline_id,
tenant.remote_storage.clone(),
);
Arc::new(remote_client)
}
},
TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(),
}
}
}
@@ -749,6 +748,24 @@ pub enum TimelineArchivalError {
Other(anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum TenantManifestError {
#[error("Remote storage error: {0}")]
RemoteStorage(anyhow::Error),
#[error("Cancelled")]
Cancelled,
}
impl From<TenantManifestError> for TimelineArchivalError {
fn from(e: TenantManifestError) -> Self {
match e {
TenantManifestError::RemoteStorage(e) => Self::Other(e),
TenantManifestError::Cancelled => Self::Cancelled,
}
}
}
impl Debug for TimelineArchivalError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
@@ -1534,18 +1551,7 @@ impl Tenant {
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
}
if !offloaded_timeline_ids.is_empty() {
let manifest = self.tenant_manifest();
// TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
upload_tenant_manifest(
&self.remote_storage,
&self.tenant_shard_id,
generation,
&manifest,
&self.cancel,
)
.await
.map_err(TimelineArchivalError::Other)?;
self.store_tenant_manifest().await?;
}
// The local filesystem contents are a cache of what's in the remote IndexPart;
@@ -1830,6 +1836,18 @@ impl Tenant {
ctx: RequestContext,
) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline");
// We activate the timeline below manually, so this must be called on an active timeline.
// We expect callers of this function to ensure this.
match self.current_state() {
TenantState::Activating { .. }
| TenantState::Attaching
| TenantState::Broken { .. } => {
panic!("Timeline expected to be active")
}
TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
TenantState::Active => {}
}
let cancel = self.cancel.clone();
// Protect against concurrent attempts to use this TimelineId
@@ -1914,18 +1932,7 @@ impl Tenant {
};
// Upload new list of offloaded timelines to S3
let manifest = self.tenant_manifest();
// TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
upload_tenant_manifest(
&self.remote_storage,
&self.tenant_shard_id,
generation,
&manifest,
&cancel,
)
.await
.map_err(TimelineArchivalError::Other)?;
self.store_tenant_manifest().await?;
// Activate the timeline (if it makes sense)
if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -3122,7 +3129,7 @@ impl Tenant {
}
}
let tenant_manifest = self.tenant_manifest();
let tenant_manifest = self.build_tenant_manifest();
// TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
for child_shard in child_shards {
@@ -3317,7 +3324,8 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
}
pub(crate) fn tenant_manifest(&self) -> TenantManifest {
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
fn build_tenant_manifest(&self) -> TenantManifest {
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
let mut timeline_manifests = timelines_offloaded
@@ -3525,6 +3533,7 @@ impl Tenant {
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),
timelines_offloaded: Mutex::new(HashMap::new()),
tenant_manifest_upload: Default::default(),
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
@@ -4704,6 +4713,49 @@ impl Tenant {
.max()
.unwrap_or(0)
}
/// Serialize and write the latest TenantManifest to remote storage.
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
// Only one manifest write may be done at a time, and the contents of the manifest
// must be loaded while holding this lock. This makes it safe to call this function
// from anywhere without worrying about colliding updates.
let mut guard = tokio::select! {
g = self.tenant_manifest_upload.lock() => {
g
},
_ = self.cancel.cancelled() => {
return Err(TenantManifestError::Cancelled);
}
};
let manifest = self.build_tenant_manifest();
if Some(&manifest) == (*guard).as_ref() {
// Optimisation: skip uploads that don't change anything.
return Ok(());
}
upload_tenant_manifest(
&self.remote_storage,
&self.tenant_shard_id,
self.generation,
&manifest,
&self.cancel,
)
.await
.map_err(|e| {
if self.cancel.is_cancelled() {
TenantManifestError::Cancelled
} else {
TenantManifestError::RemoteStorage(e)
}
})?;
// Store the successfully uploaded manifest, so that future callers can avoid
// re-uploading the same thing.
*guard = Some(manifest);
Ok(())
}
}
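For illustration, a hedged sketch of the calling pattern the comments above describe for the new store_tenant_manifest helper; the function name here is illustrative, not part of the commit:
use crate::tenant::{Tenant, TenantManifestError};
async fn persist_manifest_after_offload(tenant: &Tenant) -> Result<(), TenantManifestError> {
    // Safe to call from any code path: writes serialize on tenant_manifest_upload,
    // and the upload is skipped when the manifest is unchanged since the last one.
    tenant.store_tenant_manifest().await
}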
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
@@ -4806,7 +4858,8 @@ pub(crate) mod harness {
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::l0_flush::L0FlushConfig;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use super::*;
use hex_literal::hex;
@@ -4853,7 +4906,6 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
}
@@ -5076,25 +5128,30 @@ mod tests {
use super::*;
use crate::keyspace::KeySpaceAccum;
use crate::repository::{Key, Value};
use crate::tenant::harness::*;
use crate::tenant::timeline::CompactFlags;
use crate::walrecord::NeonWalRecord;
use crate::DEFAULT_PG_VERSION;
use bytes::{Bytes, BytesMut};
use hex_literal::hex;
use itertools::Itertools;
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use pageserver_api::value::Value;
use rand::{thread_rng, Rng};
use storage_layer::PersistentLayerKey;
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
use timeline::{DeltaLayerTestDesc, GcInfo};
use timeline::DeltaLayerTestDesc;
use utils::id::TenantId;
#[cfg(feature = "testing")]
use pageserver_api::record::NeonWalRecord;
#[cfg(feature = "testing")]
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
#[cfg(feature = "testing")]
use timeline::GcInfo;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -7659,6 +7716,7 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_neon_test_record").await?;
@@ -7850,6 +7908,7 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
@@ -8046,6 +8105,7 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_generate_key_retention() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_generate_key_retention").await?;
@@ -8393,6 +8453,7 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
let harness =
@@ -8633,6 +8694,7 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
{
@@ -8841,6 +8903,7 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
@@ -9042,6 +9105,7 @@ mod tests {
//
// When querying the key range [A, B) we need to read at different LSN ranges
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;


@@ -9,7 +9,6 @@
//! may lead to a data loss.
//!
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
@@ -341,10 +340,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
@@ -410,9 +405,6 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
lsn_lease_length: self
.lsn_lease_length
.unwrap_or(global_conf.lsn_lease_length),
@@ -470,7 +462,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
}


@@ -0,0 +1,57 @@
use anyhow::Result;
use serde::Serialize;
use std::ops::AddAssign;
use std::time::Duration;
///
/// Result of performing GC
///
#[derive(Default, Serialize, Debug)]
pub struct GcResult {
pub layers_total: u64,
pub layers_needed_by_cutoff: u64,
pub layers_needed_by_pitr: u64,
pub layers_needed_by_branches: u64,
pub layers_needed_by_leases: u64,
pub layers_not_updated: u64,
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
#[serde(serialize_with = "serialize_duration_as_millis")]
pub elapsed: Duration,
/// The layers which were garbage collected.
///
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
/// dropped in tests.
#[cfg(feature = "testing")]
#[serde(skip)]
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
d.as_millis().serialize(serializer)
}
impl AddAssign for GcResult {
fn add_assign(&mut self, other: Self) {
self.layers_total += other.layers_total;
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
self.layers_needed_by_branches += other.layers_needed_by_branches;
self.layers_needed_by_leases += other.layers_needed_by_leases;
self.layers_not_updated += other.layers_not_updated;
self.layers_removed += other.layers_removed;
self.elapsed += other.elapsed;
#[cfg(feature = "testing")]
{
let mut other = other;
self.doomed_layers.append(&mut other.doomed_layers);
}
}
}
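For illustration, a short sketch of how the AddAssign impl above is meant to fold per-timeline GC results into a tenant-wide total; the in-crate path is the one used elsewhere in this commit, and the function name is illustrative:
use crate::tenant::gc_result::GcResult;
fn accumulate(per_timeline: Vec<GcResult>) -> GcResult {
    let mut total = GcResult::default();
    for r in per_timeline {
        // Sums the layer counters and elapsed time; in testing builds it also
        // appends each result's doomed_layers.
        total += r;
    }
    total
}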


@@ -48,9 +48,9 @@ mod layer_coverage;
use crate::context::RequestContext;
use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
use std::collections::{HashMap, VecDeque};


@@ -2811,7 +2811,7 @@ where
}
use {
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};


@@ -249,7 +249,7 @@ pub(crate) use download::{
list_remote_tenant_shards, list_remote_timelines,
};
pub(crate) use index::LayerFileMetadata;
pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
pub(crate) use upload::upload_initdb_dir;
// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.


@@ -403,59 +403,79 @@ async fn do_download_index_part(
Ok((index_part, index_generation, index_part_mtime))
}
/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
/// Metadata objects are "generationed", meaning that they include a generation suffix. This
/// function downloads the object with the highest generation <= `my_generation`.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
/// Data objects (layer files) also include a generation in their path, but there is no equivalent
/// search process, because their reference from an index includes the generation.
///
/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two
/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding
/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back
/// to listing objects.
///
/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
/// * `what`: for logging, what object are we downloading
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
/// `cancel` has fired. This function does not do its own retries of GET operations, and relies
/// on the function passed in to do so.
/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object.
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(crate) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: &'a TenantShardId,
timeline_id: &'a TimelineId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
what: &str,
prefix: RemotePath,
do_download: DF,
parse_path: PF,
cancel: &'a CancellationToken,
) -> Result<(T, Generation, SystemTime), DownloadError>
where
DF: Fn(
&'a GenericRemoteStorage,
&'a TenantShardId,
&'a TimelineId,
Generation,
&'a CancellationToken,
) -> DFF,
DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
PF: Fn(RemotePath) -> Option<Generation>,
T: 'static,
{
debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path
return do_download_index_part(
storage,
tenant_shard_id,
timeline_id,
my_generation,
cancel,
)
.await;
return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
}
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
// index in our generation.
// Stale case: If we were intentionally attached in a stale generation, the remote object may already
// exist in our generation.
//
// This is an optimization to avoid doing the listing for the general case below.
let res =
do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
match res {
Ok(index_part) => {
tracing::debug!(
"Found index_part from current generation (this is a stale attachment)"
);
return Ok(index_part);
Ok(decoded) => {
tracing::debug!("Found {what} from current generation (this is a stale attachment)");
return Ok(decoded);
}
Err(DownloadError::NotFound) => {}
Err(e) => return Err(e),
};
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
// and index part. We may safely start from this index without doing a listing, because:
// Typical case: the previous generation of this tenant was running healthily, and had uploaded the object
// we are seeking in that generation. We may safely start from this index without doing a listing, because:
// - We checked for current generation case above
// - generations > my_generation are to be ignored
// - any other indices that exist would have an older generation than `previous_gen`, and
// we want to find the most recent index from a previous generation.
// - any other objects that exist would have an older generation than `previous_gen`, and
// we want to find the most recent object from a previous generation.
//
// This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part(
let res = do_download(
storage,
tenant_shard_id,
timeline_id,
@@ -464,14 +484,12 @@ pub(crate) async fn download_index_part(
)
.await;
match res {
Ok(index_part) => {
tracing::debug!("Found index_part from previous generation");
return Ok(index_part);
Ok(decoded) => {
tracing::debug!("Found {what} from previous generation");
return Ok(decoded);
}
Err(DownloadError::NotFound) => {
tracing::debug!(
"No index_part found from previous generation, falling back to listing"
);
tracing::debug!("No {what} found from previous generation, falling back to listing");
}
Err(e) => {
return Err(e);
@@ -481,12 +499,10 @@ pub(crate) async fn download_index_part(
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
// to constructing a full index path with no generation, because the generation is a suffix.
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
let indices = download_retry(
let paths = download_retry(
|| async {
storage
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
.list(Some(&prefix), ListingMode::NoDelimiter, None, cancel)
.await
},
"list index_part files",
@@ -497,22 +513,22 @@ pub(crate) async fn download_index_part(
// General case logic for which index to use: the latest index whose generation
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
let max_previous_generation = indices
let max_previous_generation = paths
.into_iter()
.filter_map(|o| parse_remote_index_path(o.key))
.filter_map(|o| parse_path(o.key))
.filter(|g| g <= &my_generation)
.max();
match max_previous_generation {
Some(g) => {
tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
tracing::debug!("Found {what} in generation {g:?}");
do_download(storage, tenant_shard_id, timeline_id, g, cancel).await
}
None => {
// Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path.
tracing::debug!("No index_part.json* found");
do_download_index_part(
tracing::debug!("No {what}* found");
do_download(
storage,
tenant_shard_id,
timeline_id,
@@ -524,6 +540,33 @@ pub(crate) async fn download_index_part(
}
}
/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
pub(crate) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
download_generation_object(
storage,
tenant_shard_id,
timeline_id,
my_generation,
"index_part",
index_prefix,
do_download_index_part,
parse_remote_index_path,
cancel,
)
.await
}
pub(crate) async fn download_initdb_tar_zst(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use utils::{id::TimelineId, lsn::Lsn};
/// Tenant-shard scoped manifest
#[derive(Clone, Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TenantManifest {
/// Debugging aid describing the version of this manifest.
/// Can also be used for distinguishing breaking changes later on.
@@ -23,7 +23,7 @@ pub struct TenantManifest {
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
/// but the two data structures serve different needs: this is for a persistent disk format
/// that must be backwards compatible, while the other is only for informative purposes.
#[derive(Clone, Serialize, Deserialize, Copy)]
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
pub struct OffloadedTimelineManifest {
pub timeline_id: TimelineId,
/// Whether the timeline has a parent it has been branched off from or not

View File

@@ -187,6 +187,8 @@ pub(super) async fn gather_inputs(
// but it is unlikely to cause any issues. In the worst case,
// the calculation will error out.
timelines.retain(|t| t.is_active());
// Also filter out archived timelines.
timelines.retain(|t| t.is_archived() != Some(true));
// Build a map of branch points.
let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();

View File

@@ -11,11 +11,11 @@ mod layer_name;
pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};

View File

@@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE};
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
use pageserver_api::value::Value;
use super::layer::S3_UPLOAD_LIMIT;
use super::{

View File

@@ -30,7 +30,6 @@
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{
@@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
use crate::virtual_file::IoBufferMut;
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::TEMP_FILE_SUFFIX;
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{anyhow, bail, ensure, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
@@ -54,9 +53,11 @@ use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::DBDIR_KEY;
use pageserver_api::key::{Key, KEY_SIZE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
@@ -1293,7 +1294,7 @@ impl DeltaLayerInner {
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = crate::repository::ValueBytes::will_init(&data)
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
@@ -1356,7 +1357,7 @@ impl DeltaLayerInner {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
@@ -1610,7 +1611,6 @@ pub(crate) mod test {
use rand::RngCore;
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
@@ -1622,6 +1622,7 @@ pub(crate) mod test {
DEFAULT_PG_VERSION,
};
use bytes::Bytes;
use pageserver_api::value::Value;
/// Construct an index for a fictional delta layer and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -1974,8 +1975,8 @@ pub(crate) mod test {
#[tokio::test]
async fn copy_delta_prefix_smoke() {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use pageserver_api::record::NeonWalRecord;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
.await
@@ -2198,6 +2199,7 @@ pub(crate) mod test {
(k1, l1).cmp(&(k2, l2))
}
#[cfg(feature = "testing")]
pub(crate) fn sort_delta_value(
(k1, l1, v1): &(Key, Lsn, Value),
(k2, l2, v2): &(Key, Lsn, Value),

View File

@@ -7,7 +7,7 @@ use pageserver_api::{
};
use utils::lsn::Lsn;
use crate::repository::Value;
use pageserver_api::value::Value;
use super::merge_iterator::MergeIterator;
@@ -121,8 +121,8 @@ mod tests {
#[tokio::test]
async fn filter_keyspace_iterator() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
.await

View File

@@ -28,7 +28,6 @@
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, FileBlockReader};
use crate::tenant::disk_btree::{
@@ -51,8 +50,10 @@ use hex;
use itertools::Itertools;
use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::DBDIR_KEY;
use pageserver_api::key::{Key, KEY_SIZE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::value::Value;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
@@ -1125,6 +1126,7 @@ mod test {
use pageserver_api::{
key::Key,
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
value::Value,
};
use utils::{
generation::Generation,
@@ -1134,7 +1136,6 @@ mod test {
use crate::{
context::RequestContext,
repository::Value,
tenant::{
config::TenantConf,
harness::{TenantHarness, TIMELINE_ID},

View File

@@ -7,7 +7,6 @@
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::repository::{Key, Value};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
@@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Context, Result};
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use pageserver_api::value::Value;
use std::collections::{BTreeMap, HashMap};
use std::sync::{Arc, OnceLock};
use std::time::Instant;

View File

@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
/// Also checks that the same does not happen on a non-evicted layer (regression test).
#[tokio::test(start_paused = true)]
async fn eviction_cancellation_on_drop() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
let mut writer = timeline.writer().await;
writer
.put(
crate::repository::Key::from_i128(5),
pageserver_api::key::Key::from_i128(5),
Lsn(0x20),
&Value::Image(Bytes::from_static(b"this does not matter either")),
&ctx,

View File

@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
use std::ops::Range;
use utils::{id::TimelineId, lsn::Lsn};
use crate::repository::Key;
use pageserver_api::key::Key;
use super::{DeltaLayerName, ImageLayerName, LayerName};

View File

@@ -1,7 +1,7 @@
//!
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use crate::repository::Key;
use pageserver_api::key::Key;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::fmt;

View File

@@ -7,7 +7,8 @@ use anyhow::bail;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
use crate::{context::RequestContext, repository::Value};
use crate::context::RequestContext;
use pageserver_api::value::Value;
use super::{
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
@@ -291,12 +292,16 @@ mod tests {
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
},
walrecord::NeonWalRecord,
DEFAULT_PG_VERSION,
};
#[cfg(feature = "testing")]
use crate::tenant::storage_layer::delta_layer::test::sort_delta_value;
#[cfg(feature = "testing")]
use pageserver_api::record::NeonWalRecord;
async fn assert_merge_iter_equal(
merge_iter: &mut MergeIterator<'_>,
expect: &[(Key, Lsn, Value)],
@@ -319,8 +324,8 @@ mod tests {
#[tokio::test]
async fn merge_in_between() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("merge_iterator_merge_in_between")
.await
@@ -384,8 +389,8 @@ mod tests {
#[tokio::test]
async fn delta_merge() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("merge_iterator_delta_merge")
.await
@@ -458,10 +463,11 @@ mod tests {
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn delta_image_mixed_merge() {
use crate::repository::Value;
use bytes::Bytes;
use pageserver_api::value::Value;
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
.await
@@ -586,5 +592,6 @@ mod tests {
is_send(merge_iter);
}
#[cfg(feature = "testing")]
fn is_send(_: impl Send) {}
}

View File

@@ -125,11 +125,12 @@ use utils::{
simple_rcu::{Rcu, RcuReadGuard},
};
use crate::repository::GcResult;
use crate::repository::{Key, Value};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::gc_result::GcResult;
use crate::ZERO_PAGE;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -5822,17 +5823,15 @@ fn is_send() {
#[cfg(test)]
mod tests {
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use utils::{id::TimelineId, lsn::Lsn};
use crate::{
repository::Value,
tenant::{
harness::{test_img, TenantHarness},
layer_map::LayerMap,
storage_layer::{Layer, LayerName},
timeline::{DeltaLayerTestDesc, EvictionError},
Timeline,
},
use crate::tenant::{
harness::{test_img, TenantHarness},
layer_map::LayerMap,
storage_layer::{Layer, LayerName},
timeline::{DeltaLayerTestDesc, EvictionError},
Timeline,
};
#[tokio::test]

View File

@@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
};
use crate::keyspace::KeySpace;
use crate::repository::{Key, Value};
use crate::walrecord::NeonWalRecord;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
use utils::lsn::Lsn;
@@ -2148,7 +2149,7 @@ struct ResidentDeltaLayer(ResidentLayer);
struct ResidentImageLayer(ResidentLayer);
impl CompactionJobExecutor for TimelineAdaptor {
type Key = crate::repository::Key;
type Key = pageserver_api::key::Key;
type Layer = OwnArc<PersistentLayerDesc>;
type DeltaLayer = ResidentDeltaLayer;

View File

@@ -6,7 +6,7 @@ use std::{
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, instrument, Instrument};
use tracing::{error, info, info_span, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
use crate::{
@@ -14,10 +14,9 @@ use crate::{
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
TimelineOrOffloaded,
},
};
@@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
Ok(())
}
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn upload_new_tenant_manifest(
tenant: &Tenant,
_: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
// between the deletion of the index-part.json and reaching of this code.
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
// However, we handle this case in tenant loading code so the next time we attach, the issue is
// resolved.
let manifest = tenant.tenant_manifest();
// TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
remote_timeline_client::upload_tenant_manifest(
&tenant.remote_storage,
&tenant.tenant_shard_id,
generation,
&manifest,
&tenant.cancel,
)
.await?;
Ok(())
}
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
@@ -258,7 +231,32 @@ impl DeleteTimelineFlow {
))?
});
let remote_client = timeline.remote_client_maybe_construct(tenant);
let remote_client = match timeline.maybe_remote_client() {
Some(remote_client) => remote_client,
None => {
let remote_client = tenant
.build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
let result = remote_client
.download_index_file(&tenant.cancel)
.instrument(info_span!("download_index_file"))
.await
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
let index_part = match result {
MaybeDeletedIndexPart::Deleted(p) => {
tracing::info!("Timeline already set as deleted in remote index");
p
}
MaybeDeletedIndexPart::IndexPart(p) => p,
};
let remote_client = Arc::new(remote_client);
remote_client
.init_upload_queue(&index_part)
.map_err(DeleteTimelineError::Other)?;
remote_client.shutdown().await;
remote_client
}
};
set_deleted_in_remote_index(&remote_client).await?;
fail::fail_point!("timeline-delete-before-schedule", |_| {
@@ -455,7 +453,15 @@ impl DeleteTimelineFlow {
remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
upload_new_tenant_manifest(tenant, &guard).await?;
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
// between the deletion of the index-part.json and reaching this code.
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
// However, we handle this case in tenant loading code so the next time we attach, the issue is
// resolved.
tenant
.store_tenant_manifest()
.await
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
*guard = Self::Finished;

View File

@@ -3,7 +3,7 @@ use std::sync::Arc;
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
use super::Timeline;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};
use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};
pub(crate) async fn offload_timeline(
tenant: &Tenant,
@@ -63,17 +63,10 @@ pub(crate) async fn offload_timeline(
// at the next restart attach it again.
// For that to happen, we'd need to make the manifest reflect our *intended* state,
// not our actual state of offloaded timelines.
let manifest = tenant.tenant_manifest();
// TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
remote_timeline_client::upload_tenant_manifest(
&tenant.remote_storage,
&tenant.tenant_shard_id,
generation,
&manifest,
&tenant.cancel,
)
.await?;
tenant
.store_tenant_manifest()
.await
.map_err(|e| anyhow::anyhow!(e))?;
Ok(())
}

View File

@@ -31,11 +31,11 @@ use crate::{
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::{decode_wal_record, DecodedWALRecord},
};
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
use utils::{id::NodeId, lsn::Lsn};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

View File

@@ -29,8 +29,10 @@ use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::walrecord::*;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use wal_decoder::models::*;
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes, BytesMut};
@@ -44,9 +46,9 @@ use crate::pgdatadir_mapping::{DatadirModification, Version};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::walrecord::*;
use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -108,143 +110,6 @@ struct WarnIngestLag {
timestamp_invalid_msg_ratelimit: RateLimit,
}
// These structs are an intermediary representation of the PostgreSQL WAL records.
// The ones prefixed with `Xl` are lower level, while the ones that are not have
// all the required context to be acted upon by the pageserver.
enum HeapamRecord {
ClearVmBits(ClearVmBits),
}
struct ClearVmBits {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
vm_rel: RelTag,
flags: u8,
}
enum NeonrmgrRecord {
ClearVmBits(ClearVmBits),
}
enum SmgrRecord {
Create(SmgrCreate),
Truncate(XlSmgrTruncate),
}
struct SmgrCreate {
rel: RelTag,
}
enum DbaseRecord {
Create(DbaseCreate),
Drop(DbaseDrop),
}
struct DbaseCreate {
db_id: u32,
tablespace_id: u32,
src_db_id: u32,
src_tablespace_id: u32,
}
struct DbaseDrop {
db_id: u32,
tablespace_ids: Vec<u32>,
}
enum ClogRecord {
ZeroPage(ClogZeroPage),
Truncate(ClogTruncate),
}
struct ClogZeroPage {
segno: u32,
rpageno: u32,
}
struct ClogTruncate {
pageno: u32,
oldest_xid: u32,
oldest_xid_db: u32,
}
enum XactRecord {
Commit(XactCommon),
Abort(XactCommon),
CommitPrepared(XactCommon),
AbortPrepared(XactCommon),
Prepare(XactPrepare),
}
struct XactCommon {
parsed: XlXactParsedRecord,
origin_id: u16,
// Fields below are only used for logging
xl_xid: u32,
lsn: Lsn,
}
struct XactPrepare {
xl_xid: u32,
data: Bytes,
}
enum MultiXactRecord {
ZeroPage(MultiXactZeroPage),
Create(XlMultiXactCreate),
Truncate(XlMultiXactTruncate),
}
struct MultiXactZeroPage {
slru_kind: SlruKind,
segno: u32,
rpageno: u32,
}
enum RelmapRecord {
Update(RelmapUpdate),
}
struct RelmapUpdate {
update: XlRelmapUpdate,
buf: Bytes,
}
enum XlogRecord {
Raw(RawXlogRecord),
}
struct RawXlogRecord {
info: u8,
lsn: Lsn,
buf: Bytes,
}
enum LogicalMessageRecord {
Put(PutLogicalMessage),
#[cfg(feature = "testing")]
Failpoint,
}
struct PutLogicalMessage {
path: String,
buf: Bytes,
}
enum StandbyRecord {
RunningXacts(StandbyRunningXacts),
}
struct StandbyRunningXacts {
oldest_running_xid: u32,
}
enum ReploriginRecord {
Set(XlReploriginSet),
Drop(XlReploriginDrop),
}
impl WalIngest {
pub async fn new(
timeline: &Timeline,
@@ -284,7 +149,6 @@ impl WalIngest {
/// relations/pages that the record affects.
///
/// This function returns `true` if the record was ingested, and `false` if it was filtered out
///
pub async fn ingest_record(
&mut self,
decoded: DecodedWALRecord,
@@ -2218,7 +2082,7 @@ impl WalIngest {
) -> anyhow::Result<Option<LogicalMessageRecord>> {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
let xlrec = crate::walrecord::XlLogicalMessage::decode(buf);
let xlrec = XlLogicalMessage::decode(buf);
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
#[cfg(feature = "testing")]
@@ -2246,7 +2110,7 @@ impl WalIngest {
) -> anyhow::Result<Option<StandbyRecord>> {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_RUNNING_XACTS {
let xlrec = crate::walrecord::XlRunningXacts::decode(buf);
let xlrec = XlRunningXacts::decode(buf);
return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts {
oldest_running_xid: xlrec.oldest_running_xid,
})));
@@ -2276,10 +2140,10 @@ impl WalIngest {
) -> anyhow::Result<Option<ReploriginRecord>> {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_REPLORIGIN_SET {
let xlrec = crate::walrecord::XlReploriginSet::decode(buf);
let xlrec = XlReploriginSet::decode(buf);
return Ok(Some(ReploriginRecord::Set(xlrec)));
} else if info == pg_constants::XLOG_REPLORIGIN_DROP {
let xlrec = crate::walrecord::XlReploriginDrop::decode(buf);
let xlrec = XlReploriginDrop::decode(buf);
return Ok(Some(ReploriginRecord::Drop(xlrec)));
}
@@ -3146,6 +3010,7 @@ mod tests {
async fn test_ingest_real_wal() {
use crate::tenant::harness::*;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::walrecord::decode_wal_record;
use postgres_ffi::WAL_SEGMENT_SIZE;
// Define test data path and constants.

View File

@@ -29,11 +29,11 @@ use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
};
use crate::repository::Key;
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::Key;
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::TenantShardId;
use std::future::Future;
use std::sync::Arc;
@@ -548,9 +548,10 @@ impl PostgresRedoManager {
#[cfg(test)]
mod tests {
use super::PostgresRedoManager;
use crate::repository::Key;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
use crate::config::PageServerConf;
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::shard::TenantShardId;
use std::str::FromStr;
use tracing::Instrument;

View File

@@ -1,8 +1,8 @@
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon(
// No-op: this record will never be created in aux v2.
warn!("AuxFile record should not be created in aux v2");
}
#[cfg(test)]
#[cfg(feature = "testing")]
NeonWalRecord::Test {
append,
clear,

View File

@@ -8,10 +8,10 @@ use crate::{
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
page_cache::PAGE_SZ,
span::debug_assert_current_span_has_tenant_id,
walrecord::NeonWalRecord,
};
use anyhow::Context;
use bytes::Bytes;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
#[cfg(feature = "testing")]

View File

@@ -16,6 +16,7 @@ OBJS = \
neon_walreader.o \
pagestore_smgr.o \
relsize_cache.o \
unstable_extensions.o \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \

View File

@@ -18,6 +18,7 @@
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <curl/curl.h>
@@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg)
static bool
RoleIsNeonSuperuser(const char *role_name)
{
Assert(role_name);
return strcmp(role_name, "neon_superuser") == 0;
}
@@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt)
static void
HandleAlterRole(AlterRoleStmt *stmt)
{
const char *role_name = stmt->role->rolename;
char *role_name;
DefElem *dpass;
ListCell *option;
bool found = false;
@@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
InitRoleTableIfNeeded();
role_name = get_rolespec_name(stmt->role);
if (RoleIsNeonSuperuser(role_name) && !superuser())
elog(ERROR, "can't ALTER neon_superuser");
@@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt)
if (strcmp(defel->defname, "password") == 0)
dpass = defel;
}
/* We only care about updates to the password */
if (!dpass)
{
pfree(role_name);
return;
}
entry = hash_search(CurrentDdlTable->role_table,
role_name,
@@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt)
else
entry->password = NULL;
entry->type = Op_Set;
pfree(role_name);
}
static void

View File

@@ -30,6 +30,7 @@
#include "neon.h"
#include "control_plane_connector.h"
#include "logical_replication_monitor.h"
#include "unstable_extensions.h"
#include "walsender_hooks.h"
#if PG_MAJORVERSION_NUM >= 16
#include "storage/ipc.h"
@@ -424,6 +425,7 @@ _PG_init(void)
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitUnstableExtensionsSupport();
InitLogicalReplicationMonitor();
InitControlPlaneConnector();

View File

@@ -42,3 +42,4 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
MemoryContextSwitchTo(old_context);
}
#endif

View File

@@ -0,0 +1,129 @@
#include <stdlib.h>
#include <string.h>
#include "postgres.h"
#include "nodes/plannodes.h"
#include "nodes/parsenodes.h"
#include "tcop/utility.h"
#include "utils/errcodes.h"
#include "utils/guc.h"
#include "neon_pgversioncompat.h"
#include "unstable_extensions.h"
static bool allow_unstable_extensions = false;
static char *unstable_extensions = NULL;
static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
static bool
list_contains(char const* comma_separated_list, char const* val)
{
char const* occ = comma_separated_list;
size_t val_len = strlen(val);
if (val_len == 0)
return false;
while ((occ = strstr(occ, val)) != NULL)
{
if ((occ == comma_separated_list || occ[-1] == ',')
&& (occ[val_len] == '\0' || occ[val_len] == ','))
{
return true;
}
occ += val_len;
}
return false;
}
static void
CheckUnstableExtension(
PlannedStmt *pstmt,
const char *queryString,
bool readOnlyTree,
ProcessUtilityContext context,
ParamListInfo params,
QueryEnvironment *queryEnv,
DestReceiver *dest,
QueryCompletion *qc)
{
Node *parseTree = pstmt->utilityStmt;
if (allow_unstable_extensions || unstable_extensions == NULL)
goto process;
switch (nodeTag(parseTree))
{
case T_CreateExtensionStmt:
{
CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree);
if (list_contains(unstable_extensions, stmt->extname))
{
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname),
errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'")));
}
break;
}
default:
goto process;
}
process:
if (PreviousProcessUtilityHook)
{
PreviousProcessUtilityHook(
pstmt,
queryString,
readOnlyTree,
context,
params,
queryEnv,
dest,
qc);
}
else
{
standard_ProcessUtility(
pstmt,
queryString,
readOnlyTree,
context,
params,
queryEnv,
dest,
qc);
}
}
void
InitUnstableExtensionsSupport(void)
{
DefineCustomBoolVariable(
"neon.allow_unstable_extensions",
"Allow unstable extensions to be installed and used",
NULL,
&allow_unstable_extensions,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomStringVariable(
"neon.unstable_extensions",
"List of unstable extensions",
NULL,
&unstable_extensions,
NULL,
PGC_SUSET,
0,
NULL, NULL, NULL);
PreviousProcessUtilityHook = ProcessUtility_hook;
ProcessUtility_hook = CheckUnstableExtension;
}

View File

@@ -0,0 +1,6 @@
#ifndef __NEON_UNSTABLE_EXTENSIONS_H__
#define __NEON_UNSTABLE_EXTENSIONS_H__
void InitUnstableExtensionsSupport(void);
#endif

53
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -1521,6 +1521,21 @@ files = [
[package.dependencies]
six = "*"
[[package]]
name = "jwcrypto"
version = "1.5.6"
description = "Implementation of JOSE Web standards"
optional = false
python-versions = ">= 3.8"
files = [
{file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"},
{file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"},
]
[package.dependencies]
cryptography = ">=3.4"
typing-extensions = ">=4.5.0"
[[package]]
name = "kafka-python"
version = "2.0.2"
@@ -2111,7 +2126,6 @@ files = [
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
@@ -2120,8 +2134,6 @@ files = [
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
@@ -2603,7 +2615,6 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2912,6 +2923,20 @@ files = [
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
[[package]]
name = "types-jwcrypto"
version = "1.5.0.20240925"
description = "Typing stubs for jwcrypto"
optional = false
python-versions = ">=3.8"
files = [
{file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"},
{file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"},
]
[package.dependencies]
cryptography = "*"
[[package]]
name = "types-psutil"
version = "5.9.5.12"
@@ -3118,13 +3143,13 @@ files = [
[[package]]
name = "werkzeug"
version = "3.0.3"
version = "3.0.6"
description = "The comprehensive WSGI web application library."
optional = false
python-versions = ">=3.8"
files = [
{file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"},
{file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"},
{file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"},
{file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"},
]
[package.dependencies]
@@ -3159,16 +3184,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3406,4 +3421,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"
content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e"

View File

@@ -1,5 +1,5 @@
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
use tracing::{debug, info};
use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
use crate::auth::{self, AuthFlow};
@@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext(
secret: AuthSecret,
config: &'static AuthenticationConfig,
) -> auth::Result<ComputeCredentials> {
warn!("cleartext auth flow override is enabled, proceeding");
debug!("cleartext auth flow override is enabled, proceeding");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client
@@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
info: ComputeUserInfoNoEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
warn!("project not specified, resorting to the password hack auth flow");
debug!("project not specified, resorting to the password hack auth flow");
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client

View File

@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::future::Future;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
@@ -45,6 +46,7 @@ pub(crate) enum FetchAuthRulesError {
RoleJwksNotConfigured,
}
#[derive(Clone)]
pub(crate) struct AuthRule {
pub(crate) id: String,
pub(crate) jwks_url: url::Url,
@@ -277,7 +279,7 @@ impl JwkCacheEntryLock {
// get the key from the JWKs if possible. If not, wait for the keys to update.
let (jwk, expected_audience) = loop {
match guard.find_jwk_and_audience(kid, role_name) {
match guard.find_jwk_and_audience(&kid, role_name) {
Some(jwk) => break jwk,
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
@@ -312,7 +314,9 @@ impl JwkCacheEntryLock {
if let Some(aud) = expected_audience {
if payload.audience.0.iter().all(|s| s != aud) {
return Err(JwtError::InvalidJwtTokenAudience);
return Err(JwtError::InvalidClaims(
JwtClaimsError::InvalidJwtTokenAudience,
));
}
}
@@ -320,13 +324,15 @@ impl JwkCacheEntryLock {
if let Some(exp) = payload.expiration {
if now >= exp + CLOCK_SKEW_LEEWAY {
return Err(JwtError::JwtTokenHasExpired);
return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired));
}
}
if let Some(nbf) = payload.not_before {
if nbf >= now + CLOCK_SKEW_LEEWAY {
return Err(JwtError::JwtTokenNotYetReadyToUse);
return Err(JwtError::InvalidClaims(
JwtClaimsError::JwtTokenNotYetReadyToUse,
));
}
}
@@ -420,8 +426,8 @@ struct JwtHeader<'a> {
#[serde(rename = "alg")]
algorithm: jose_jwa::Algorithm,
/// key id, must be provided for our usecase
#[serde(rename = "kid")]
key_id: Option<&'a str>,
#[serde(rename = "kid", borrow)]
key_id: Option<Cow<'a, str>>,
}
/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
@@ -440,17 +446,17 @@ struct JwtPayload<'a> {
// the following entries are only extracted for the sake of debug logging.
/// Issuer of the JWT
#[serde(rename = "iss")]
issuer: Option<&'a str>,
#[serde(rename = "iss", borrow)]
issuer: Option<Cow<'a, str>>,
/// Subject of the JWT (the user)
#[serde(rename = "sub")]
subject: Option<&'a str>,
#[serde(rename = "sub", borrow)]
subject: Option<Cow<'a, str>>,
/// Unique token identifier
#[serde(rename = "jti")]
jwt_id: Option<&'a str>,
#[serde(rename = "jti", borrow)]
jwt_id: Option<Cow<'a, str>>,
/// Unique session identifier
#[serde(rename = "sid")]
session_id: Option<&'a str>,
#[serde(rename = "sid", borrow)]
session_id: Option<Cow<'a, str>>,
}
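Presumably the move from `&'a str` to `Cow<'a, str>` with `#[serde(borrow)]` is what the Cognito regression test below exercises: serde_json can only hand out a borrowed `&str` when the JSON string contains no escape sequences, while a borrowed `Cow` falls back to an owned copy. A minimal standalone sketch of that difference, assuming only serde and serde_json:
use std::borrow::Cow;
use serde::Deserialize;

#[derive(Deserialize)]
struct BorrowedIssuer<'a> {
    #[serde(borrow)]
    iss: &'a str,
}

#[derive(Deserialize)]
struct FlexibleIssuer<'a> {
    #[serde(borrow)]
    iss: Cow<'a, str>,
}

fn main() {
    // The `\/` escape (as emitted by AWS Cognito) prevents zero-copy borrowing.
    let json = r#"{"iss":"https:\/\/issuer.example.com"}"#;
    // Borrowing a plain &str fails: the unescaped value cannot point into the input.
    assert!(serde_json::from_str::<BorrowedIssuer>(json).is_err());
    // Cow silently degrades to Cow::Owned and succeeds.
    let parsed: FlexibleIssuer = serde_json::from_str(json).unwrap();
    assert_eq!(parsed.iss, "https://issuer.example.com");
}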
/// `OneOrMany` supports parsing either a single item or an array of items.
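The `.0` access in the audience check earlier in this file suggests `OneOrMany` wraps a `Vec`; a minimal sketch of one way to get that parsing behaviour with serde alone (the proxy's real implementation may use its own visitor):
pub struct OneOrMany<T>(pub Vec<T>);

#[derive(serde::Deserialize)]
#[serde(untagged)]
enum Repr<T> {
    One(T),
    Many(Vec<T>),
}

impl<'de, T: serde::Deserialize<'de>> serde::Deserialize<'de> for OneOrMany<T> {
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        // `"aud": "neon"` and `"aud": ["neon", "other"]` both end up as a Vec.
        Ok(match Repr::deserialize(d)? {
            Repr::One(item) => OneOrMany(vec![item]),
            Repr::Many(items) => OneOrMany(items),
        })
    }
}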
@@ -585,14 +591,8 @@ pub(crate) enum JwtError {
#[error("Provided authentication token is not a valid JWT encoding")]
JwtEncoding(#[from] JwtEncodingError),
#[error("invalid JWT token audience")]
InvalidJwtTokenAudience,
#[error("JWT token has expired")]
JwtTokenHasExpired,
#[error("JWT token is not yet ready to use")]
JwtTokenNotYetReadyToUse,
#[error(transparent)]
InvalidClaims(#[from] JwtClaimsError),
#[error("invalid P256 key")]
InvalidP256Key(jose_jwk::crypto::Error),
@@ -644,6 +644,19 @@ pub enum JwtEncodingError {
InvalidCompactForm,
}
#[derive(Error, Debug, PartialEq)]
#[non_exhaustive]
pub enum JwtClaimsError {
#[error("invalid JWT token audience")]
InvalidJwtTokenAudience,
#[error("JWT token has expired")]
JwtTokenHasExpired,
#[error("JWT token is not yet ready to use")]
JwtTokenNotYetReadyToUse,
}
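For reference, a standalone sketch of the claim checks these variants map to, mirroring the `exp`/`nbf`/`aud` logic earlier in this file; timestamps are unix seconds here and `leeway` stands in for `CLOCK_SKEW_LEEWAY`:
fn validate_claims(
    now: u64,
    expiration: Option<u64>,
    not_before: Option<u64>,
    audience: &[String],
    expected_audience: Option<&str>,
    leeway: u64,
) -> Result<(), JwtClaimsError> {
    if let Some(aud) = expected_audience {
        // Reject if none of the token's audiences match the configured one.
        if audience.iter().all(|s| s != aud) {
            return Err(JwtClaimsError::InvalidJwtTokenAudience);
        }
    }
    if let Some(exp) = expiration {
        // Expired, allowing a little clock skew.
        if now >= exp + leeway {
            return Err(JwtClaimsError::JwtTokenHasExpired);
        }
    }
    if let Some(nbf) = not_before {
        // Not valid yet, beyond the skew allowance.
        if nbf >= now + leeway {
            return Err(JwtClaimsError::JwtTokenNotYetReadyToUse);
        }
    }
    Ok(())
}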
#[allow(dead_code, reason = "Debug use only")]
#[derive(Debug)]
pub(crate) enum KeyType {
@@ -680,6 +693,8 @@ mod tests {
use hyper_util::rt::TokioIo;
use rand::rngs::OsRng;
use rsa::pkcs8::DecodePrivateKey;
use serde::Serialize;
use serde_json::json;
use signature::Signer;
use tokio::net::TcpListener;
@@ -693,6 +708,7 @@ mod tests {
key: jose_jwk::Key::Ec(pk),
prm: jose_jwk::Parameters {
kid: Some(kid),
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
..Default::default()
},
};
@@ -706,24 +722,47 @@ mod tests {
key: jose_jwk::Key::Rsa(pk),
prm: jose_jwk::Parameters {
kid: Some(kid),
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
..Default::default()
},
};
(sk, jwk)
}
fn now() -> u64 {
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs()
}
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
let now = now();
let body = typed_json::json! {{
"exp": now + 3600,
"nbf": now,
"aud": ["audience1", "neon", "audience2"],
"sub": "user1",
"sid": "session1",
"jti": "token1",
"iss": "neon-testing",
}};
build_custom_jwt_payload(kid, body, sig)
}
fn build_custom_jwt_payload(
kid: String,
body: impl Serialize,
sig: jose_jwa::Signing,
) -> String {
let header = JwtHeader {
algorithm: jose_jwa::Algorithm::Signing(sig),
key_id: Some(&kid),
key_id: Some(Cow::Owned(kid)),
};
let body = typed_json::json! {{
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
}};
let header =
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD);
format!("{header}.{body}")
}
@@ -738,6 +777,16 @@ mod tests {
format!("{payload}.{sig}")
}
fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String {
use p256::ecdsa::{Signature, SigningKey};
let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256);
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
format!("{payload}.{sig}")
}
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
use rsa::pkcs1v15::SigningKey;
use rsa::signature::SignatureEncoding;
@@ -809,37 +858,34 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
-----END PRIVATE KEY-----
";
#[tokio::test]
async fn renew() {
let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into());
let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into());
let (ec1, jwk3) = new_ec_jwk("3".into());
let (ec2, jwk4) = new_ec_jwk("4".into());
#[derive(Clone)]
struct Fetch(Vec<AuthRule>);
let foo_jwks = jose_jwk::JwkSet {
keys: vec![jwk1, jwk3],
};
let bar_jwks = jose_jwk::JwkSet {
keys: vec![jwk2, jwk4],
};
impl FetchAuthRules for Fetch {
async fn fetch_auth_rules(
&self,
_ctx: &RequestMonitoring,
_endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
Ok(self.0.clone())
}
}
async fn jwks_server(
router: impl for<'a> Fn(&'a str) -> Option<Vec<u8>> + Send + Sync + 'static,
) -> SocketAddr {
let router = Arc::new(router);
let service = service_fn(move |req| {
let foo_jwks = foo_jwks.clone();
let bar_jwks = bar_jwks.clone();
let router = Arc::clone(&router);
async move {
let jwks = match req.uri().path() {
"/foo" => &foo_jwks,
"/bar" => &bar_jwks,
_ => {
return Response::builder()
.status(404)
.body(Full::new(Bytes::new()));
}
};
let body = serde_json::to_vec(jwks).unwrap();
Response::builder()
.status(200)
.body(Full::new(Bytes::from(body)))
match router(req.uri().path()) {
Some(body) => Response::builder()
.status(200)
.body(Full::new(Bytes::from(body))),
None => Response::builder()
.status(404)
.body(Full::new(Bytes::new())),
}
}
});
@@ -854,84 +900,61 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
}
});
let client = reqwest::Client::new();
addr
}
#[derive(Clone)]
struct Fetch(SocketAddr, Vec<RoleNameInt>);
#[tokio::test]
async fn check_jwt_happy_path() {
let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into());
let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into());
let (ec1, jwk3) = new_ec_jwk("ec1".into());
let (ec2, jwk4) = new_ec_jwk("ec2".into());
impl FetchAuthRules for Fetch {
async fn fetch_auth_rules(
&self,
_ctx: &RequestMonitoring,
_endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
Ok(vec![
AuthRule {
id: "foo".to_owned(),
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
audience: None,
role_names: self.1.clone(),
},
AuthRule {
id: "bar".to_owned(),
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
audience: None,
role_names: self.1.clone(),
},
])
}
}
let foo_jwks = jose_jwk::JwkSet {
keys: vec![jwk1, jwk3],
};
let bar_jwks = jose_jwk::JwkSet {
keys: vec![jwk2, jwk4],
};
let jwks_addr = jwks_server(move |path| match path {
"/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()),
"/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()),
_ => None,
})
.await;
let role_name1 = RoleName::from("anonymous");
let role_name2 = RoleName::from("authenticated");
let fetch = Fetch(
addr,
vec![
RoleNameInt::from(&role_name1),
RoleNameInt::from(&role_name2),
],
);
let roles = vec![
RoleNameInt::from(&role_name1),
RoleNameInt::from(&role_name2),
];
let rules = vec![
AuthRule {
id: "foo".to_owned(),
jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(),
audience: None,
role_names: roles.clone(),
},
AuthRule {
id: "bar".to_owned(),
jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(),
audience: None,
role_names: roles.clone(),
},
];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let endpoint = EndpointId::from("ep");
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
let jwt1 = new_rsa_jwt("1".into(), rs1);
let jwt2 = new_rsa_jwt("2".into(), rs2);
let jwt3 = new_ec_jwt("3".into(), &ec1);
let jwt4 = new_ec_jwt("4".into(), &ec2);
// had the wrong kid, therefore will have the wrong ecdsa signature
let bad_jwt = new_ec_jwt("3".into(), &ec2);
// this role_name is not accepted
let bad_role_name = RoleName::from("cloud_admin");
let err = jwk_cache
.check_jwt(
&RequestMonitoring::test(),
&bad_jwt,
&client,
endpoint.clone(),
&role_name1,
&fetch,
)
.await
.unwrap_err();
assert!(err.to_string().contains("signature error"));
let err = jwk_cache
.check_jwt(
&RequestMonitoring::test(),
&jwt1,
&client,
endpoint.clone(),
&bad_role_name,
&fetch,
)
.await
.unwrap_err();
assert!(err.to_string().contains("jwk not found"));
let jwt1 = new_rsa_jwt("rs1".into(), rs1);
let jwt2 = new_rsa_jwt("rs2".into(), rs2);
let jwt3 = new_ec_jwt("ec1".into(), &ec1);
let jwt4 = new_ec_jwt("ec2".into(), &ec2);
let tokens = [jwt1, jwt2, jwt3, jwt4];
let role_names = [role_name1, role_name2];
@@ -940,15 +963,250 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
jwk_cache
.check_jwt(
&RequestMonitoring::test(),
token,
&client,
endpoint.clone(),
role,
&fetch,
token,
)
.await
.unwrap();
}
}
}
/// AWS Cognito escapes the `/` in the URL.
#[tokio::test]
async fn check_jwt_regression_cognito_issuer() {
let (key, jwk) = new_ec_jwk("key".into());
let now = now();
let token = new_custom_ec_jwt(
"key".into(),
&key,
typed_json::json! {{
"sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d",
// Cognito uses `\/`. I cannot replicate that easily here, as serde_json will refuse
// to write that escape character; instead I make a bogus URL using `\`.
"iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh",
"client_id": "abcdefghijklmnopqrstuvwxyz",
"origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893",
"event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065",
"token_use": "access",
"scope": "aws.cognito.signin.user.admin",
"auth_time":now,
"exp":now + 60,
"iat":now,
"jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0",
"username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d",
}},
);
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await;
let role_name = RoleName::from("anonymous");
let rules = vec![AuthRule {
id: "aws-cognito".to_owned(),
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
audience: None,
role_names: vec![RoleNameInt::from(&role_name)],
}];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let endpoint = EndpointId::from("ep");
jwk_cache
.check_jwt(
&RequestMonitoring::test(),
endpoint.clone(),
&role_name,
&fetch,
&token,
)
.await
.unwrap();
}
#[tokio::test]
async fn check_jwt_invalid_signature() {
let (_, jwk) = new_ec_jwk("1".into());
let (key, _) = new_ec_jwk("1".into());
// has a matching kid, but signed by the wrong key
let bad_jwt = new_ec_jwt("1".into(), &key);
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
let jwks_addr = jwks_server(move |path| match path {
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
_ => None,
})
.await;
let role = RoleName::from("authenticated");
let rules = vec![AuthRule {
id: String::new(),
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
audience: None,
role_names: vec![RoleNameInt::from(&role)],
}];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let ep = EndpointId::from("ep");
let ctx = RequestMonitoring::test();
let err = jwk_cache
.check_jwt(&ctx, ep, &role, &fetch, &bad_jwt)
.await
.unwrap_err();
assert!(
matches!(err, JwtError::Signature(_)),
"expected \"signature error\", got {err:?}"
);
}
#[tokio::test]
async fn check_jwt_unknown_role() {
let (key, jwk) = new_rsa_jwk(RS1, "1".into());
let jwt = new_rsa_jwt("1".into(), key);
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
let jwks_addr = jwks_server(move |path| match path {
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
_ => None,
})
.await;
let role = RoleName::from("authenticated");
let rules = vec![AuthRule {
id: String::new(),
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
audience: None,
role_names: vec![RoleNameInt::from(&role)],
}];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let ep = EndpointId::from("ep");
// this role_name is not accepted
let bad_role_name = RoleName::from("cloud_admin");
let ctx = RequestMonitoring::test();
let err = jwk_cache
.check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt)
.await
.unwrap_err();
assert!(
matches!(err, JwtError::JwkNotFound),
"expected \"jwk not found\", got {err:?}"
);
}
#[tokio::test]
async fn check_jwt_invalid_claims() {
let (key, jwk) = new_ec_jwk("1".into());
let jwks = jose_jwk::JwkSet { keys: vec![jwk] };
let jwks_addr = jwks_server(move |path| match path {
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
_ => None,
})
.await;
let now = SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs();
struct Test {
body: serde_json::Value,
error: JwtClaimsError,
}
let table = vec![
Test {
body: json! {{
"nbf": now + 60,
"aud": "neon",
}},
error: JwtClaimsError::JwtTokenNotYetReadyToUse,
},
Test {
body: json! {{
"exp": now - 60,
"aud": ["neon"],
}},
error: JwtClaimsError::JwtTokenHasExpired,
},
Test {
body: json! {{
}},
error: JwtClaimsError::InvalidJwtTokenAudience,
},
Test {
body: json! {{
"aud": [],
}},
error: JwtClaimsError::InvalidJwtTokenAudience,
},
Test {
body: json! {{
"aud": "foo",
}},
error: JwtClaimsError::InvalidJwtTokenAudience,
},
Test {
body: json! {{
"aud": ["foo"],
}},
error: JwtClaimsError::InvalidJwtTokenAudience,
},
Test {
body: json! {{
"aud": ["foo", "bar"],
}},
error: JwtClaimsError::InvalidJwtTokenAudience,
},
];
let role = RoleName::from("authenticated");
let rules = vec![AuthRule {
id: String::new(),
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
audience: Some("neon".to_string()),
role_names: vec![RoleNameInt::from(&role)],
}];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let ep = EndpointId::from("ep");
let ctx = RequestMonitoring::test();
for test in table {
let jwt = new_custom_ec_jwt("1".into(), &key, test.body);
match jwk_cache
.check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt)
.await
{
Err(JwtError::InvalidClaims(error)) if error == test.error => {}
Err(err) => {
panic!("expected {:?}, got {err:?}", test.error)
}
Ok(_payload) => {
panic!("expected {:?}, got ok", test.error)
}
}
}
}
}
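The invalid-claims table above reduces to a few rules: nbf/exp windows, plus an audience claim that may be a plain string or an array and must contain the configured value. Below is a minimal standalone sketch of just the audience rule; check_audience and ClaimsError are illustrative stand-ins and not the proxy's actual types.

use serde_json::Value;

#[derive(Debug, PartialEq)]
enum ClaimsError {
    InvalidAudience,
}

// Accept the token only if `aud` is the expected string, or an array containing it.
// A missing or empty `aud` is rejected whenever an audience is configured,
// matching the expectations in the test table above.
fn check_audience(aud: Option<&Value>, expected: &str) -> Result<(), ClaimsError> {
    match aud {
        Some(Value::String(s)) if s == expected => Ok(()),
        Some(Value::Array(items)) if items.iter().any(|v| v.as_str() == Some(expected)) => Ok(()),
        _ => Err(ClaimsError::InvalidAudience),
    }
}

fn main() {
    let ok = serde_json::json!(["neon", "other"]);
    let bad = serde_json::json!(["foo"]);
    assert_eq!(check_audience(Some(&ok), "neon"), Ok(()));
    assert_eq!(check_audience(Some(&bad), "neon"), Err(ClaimsError::InvalidAudience));
    assert_eq!(check_audience(None, "neon"), Err(ClaimsError::InvalidAudience));
}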

View File

@@ -137,9 +137,6 @@ struct ProxyCliArgs {
/// size of the threadpool for password hashing
#[clap(long, default_value_t = 4)]
scram_thread_pool_size: u8,
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_dynamic_rate_limiter: bool,
/// Endpoint rate limiter max number of requests per second.
///
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
and metric-collection-interval must be specified"
),
};
if !args.disable_dynamic_rate_limiter {
bail!("dynamic rate limiter should be disabled");
}
let config::ConcurrencyLockOptions {
shards,

View File

@@ -0,0 +1,5 @@
use http::StatusCode;
pub trait HttpCodeError {
fn get_http_status_code(&self) -> StatusCode;
}
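For illustration, a minimal standalone sketch of how an error type might implement this trait and feed its status code into a response. ExampleError is hypothetical; the real mapping for SqlOverHttpError appears further down in this diff.

use http::StatusCode;

// Trait as defined above, repeated here so the sketch compiles on its own.
pub trait HttpCodeError {
    fn get_http_status_code(&self) -> StatusCode;
}

#[derive(Debug)]
enum ExampleError {
    BadInput,
    Internal,
}

impl HttpCodeError for ExampleError {
    fn get_http_status_code(&self) -> StatusCode {
        match self {
            ExampleError::BadInput => StatusCode::BAD_REQUEST,
            ExampleError::Internal => StatusCode::INTERNAL_SERVER_ERROR,
        }
    }
}

fn main() {
    // A handler would pass this status to its JSON error response builder.
    let e = ExampleError::BadInput;
    assert_eq!(e.get_http_status_code(), StatusCode::BAD_REQUEST);
}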

View File

@@ -6,6 +6,7 @@ mod backend;
pub mod cancel_set;
mod conn_pool;
mod conn_pool_lib;
mod error;
mod http_conn_pool;
mod http_util;
mod json;

View File

@@ -28,6 +28,7 @@ use uuid::Uuid;
use super::backend::{LocalProxyConnError, PoolingBackend};
use super::conn_pool::{AuthData, ConnInfoWithAuth};
use super::conn_pool_lib::{self, ConnInfo};
use super::error::HttpCodeError;
use super::http_util::json_response;
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
use super::local_conn_pool;
@@ -238,7 +239,6 @@ fn get_conn_info(
Ok(ConnInfoWithAuth { conn_info, auth })
}
// TODO: return different http error codes
pub(crate) async fn handle(
config: &'static ProxyConfig,
ctx: RequestMonitoring,
@@ -319,9 +319,8 @@ pub(crate) async fn handle(
"forwarding error to user"
);
// TODO: this shouldn't always be bad request.
json_response(
StatusCode::BAD_REQUEST,
e.get_http_status_code(),
json!({
"message": message,
"code": code,
@@ -405,6 +404,25 @@ impl UserFacingError for SqlOverHttpError {
}
}
impl HttpCodeError for SqlOverHttpError {
fn get_http_status_code(&self) -> StatusCode {
match self {
SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST,
SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() {
ErrorKind::User => StatusCode::BAD_REQUEST,
_ => StatusCode::INTERNAL_SERVER_ERROR,
},
SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST,
SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE,
SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE,
SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST,
SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST,
SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR,
SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR,
}
}
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum ReadPayloadError {
#[error("could not read the HTTP request body: {0}")]

View File

@@ -23,7 +23,7 @@ backoff = "^2.2.1"
pytest-lazy-fixture = "^0.6.3"
prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0"
Werkzeug = "^3.0.3"
Werkzeug = "^3.0.6"
pytest-order = "^1.1.0"
allure-pytest = "^2.13.2"
pytest-asyncio = "^0.21.0"
@@ -42,6 +42,9 @@ pytest-repeat = "^0.9.3"
websockets = "^12.0"
clickhouse-connect = "^0.7.16"
kafka-python = "^2.0.2"
jwcrypto = "^1.5.6"
h2 = "^4.1.0"
types-jwcrypto = "^1.5.0.20240925"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"

View File

@@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
// Note: with evicted timelines it should work better than to de-evict them and
// stream; probably start_snapshot would copy partial s3 file to dest path
// and stream control file, or return WalResidentTimeline if timeline is not
// evicted.
let tli = tli
.wal_residence_guard()
.await
.map_err(ApiError::InternalServerError)?;
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
// so create the chan and write to it in another task.

View File

@@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize};
use std::{
cmp::min,
io::{self, ErrorKind},
sync::Arc,
};
use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
use tokio_tar::{Archive, Builder, Header};
@@ -25,8 +26,8 @@ use crate::{
routes::TimelineStatus,
},
safekeeper::Term,
state::TimelinePersistentState,
timeline::WalResidentTimeline,
state::{EvictionState, TimelinePersistentState},
timeline::{Timeline, WalResidentTimeline},
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
wal_backup,
wal_storage::open_wal_file,
@@ -43,18 +44,33 @@ use utils::{
/// Stream tar archive of timeline to tx.
#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
pub async fn stream_snapshot(
tli: WalResidentTimeline,
tli: Arc<Timeline>,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) {
if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
// Error type/contents don't matter as they can't reach the client
// (hyper likely doesn't do anything with it), but http stream will be
// prematurely terminated. It would be nice to try to send the error in
// trailers though.
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
error!("snapshot failed: {:#}", e);
match tli.try_wal_residence_guard().await {
Err(e) => {
tx.send(Err(anyhow!("Error checking residence: {:#}", e)))
.await
.ok();
}
Ok(maybe_resident_tli) => {
if let Err(e) = match maybe_resident_tli {
Some(resident_tli) => {
stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
.await
}
None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
} {
// Error type/contents don't matter as they can't reach the client
// (hyper likely doesn't do anything with it), but http stream will be
// prematurely terminated. It would be nice to try to send the error in
// trailers though.
tx.send(Err(anyhow!("snapshot failed"))).await.ok();
error!("snapshot failed: {:#}", e);
}
}
}
}
@@ -80,12 +96,10 @@ impl Drop for SnapshotContext {
}
}
pub async fn stream_snapshot_guts(
tli: WalResidentTimeline,
source: NodeId,
destination: NodeId,
/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel.
fn prepare_tar_stream(
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> {
// tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
// use SinkWriter as a Write impl. That is,
// - create Sink from the tx. It returns PollSendError if chan is closed.
@@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts(
// - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
// into CopyToBytes. This is a data copy.
let copy_to_bytes = CopyToBytes::new(oksink);
let mut writer = SinkWriter::new(copy_to_bytes);
let pinned_writer = std::pin::pin!(writer);
let writer = SinkWriter::new(copy_to_bytes);
let pinned_writer = Box::pin(writer);
// Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
// which is also likely suboptimal.
let mut ar = Builder::new_non_terminated(pinned_writer);
Builder::new_non_terminated(pinned_writer)
}
/// Implementation of snapshot for an offloaded timeline, only reads control file
pub(crate) async fn stream_snapshot_offloaded_guts(
tli: Arc<Timeline>,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
let mut ar = prepare_tar_stream(tx);
tli.snapshot_offloaded(&mut ar, source, destination).await?;
ar.finish().await?;
Ok(())
}
/// Implementation of snapshot for a timeline which is resident (includes some segment data)
pub async fn stream_snapshot_resident_guts(
tli: WalResidentTimeline,
source: NodeId,
destination: NodeId,
tx: mpsc::Sender<Result<Bytes>>,
) -> Result<()> {
let mut ar = prepare_tar_stream(tx);
let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
pausable_failpoint!("sk-snapshot-after-list-pausable");
@@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts(
Ok(())
}
impl Timeline {
/// Simple snapshot for an offloaded timeline: we only upload a renamed partial segment and
/// pass a modified control file into the provided tar stream (no data segments from disk, since
/// we are offloaded and there aren't any)
async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
self: &Arc<Timeline>,
ar: &mut tokio_tar::Builder<W>,
source: NodeId,
destination: NodeId,
) -> Result<()> {
// Take initial copy of control file, then release state lock
let mut control_file = {
let shared_state = self.write_shared_state().await;
let control_file = TimelinePersistentState::clone(shared_state.sk.state());
// Rare race: we got unevicted between entering function and reading control file.
// We error out and let API caller retry.
if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
bail!("Timeline was un-evicted during snapshot, please retry");
}
control_file
};
// Modify the partial segment of the in-memory copy for the control file to
// point to the destination safekeeper.
let replace = control_file
.partial_backup
.replace_uploaded_segment(source, destination)?;
let Some(replace) = replace else {
// In Manager::ready_for_eviction, we do not permit eviction unless the timeline
// has a partial segment, so it is unexpected for an evicted timeline to lack one.
anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
};
tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
// Optimistically try to copy the partial segment to the destination's path: this
// can fail if the timeline was un-evicted and modified in the background.
let remote_timeline_path = &self.remote_path;
wal_backup::copy_partial_segment(
&replace.previous.remote_path(remote_timeline_path),
&replace.current.remote_path(remote_timeline_path),
)
.await?;
// Since the S3 copy succeeded with the path given in our control file snapshot, and
// we are sending that snapshot in our response, we are giving the caller a consistent
// snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
let buf = control_file
.write_to_buf()
.with_context(|| "failed to serialize control store")?;
let mut header = Header::new_gnu();
header.set_size(buf.len().try_into().expect("never breaches u64"));
ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
.await
.with_context(|| "failed to append to archive")?;
Ok(())
}
}
impl WalResidentTimeline {
/// Start streaming tar archive with timeline:
/// 1) stream control file under lock;

View File

@@ -21,18 +21,15 @@ use postgres_backend::QueryError;
use pq_proto::BeMessage;
use serde::Deserialize;
use serde::Serialize;
use std::future;
use std::net::SocketAddr;
use std::sync::Arc;
use tokio::io::AsyncRead;
use tokio::io::AsyncWrite;
use tokio::sync::mpsc::channel;
use tokio::sync::mpsc::error::TryRecvError;
use tokio::sync::mpsc::Receiver;
use tokio::sync::mpsc::Sender;
use tokio::sync::mpsc::{channel, Receiver, Sender};
use tokio::task;
use tokio::task::JoinHandle;
use tokio::time::Duration;
use tokio::time::Instant;
use tokio::time::{Duration, MissedTickBehavior};
use tracing::*;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -444,9 +441,9 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
}
}
// Send keepalive messages to walproposer, to make sure it receives updates
// even when it writes a steady stream of messages.
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to
/// walproposer, even when it's writing a steady stream of messages.
const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
/// replies to reply_tx.
@@ -494,67 +491,76 @@ impl WalAcceptor {
async fn run(&mut self) -> anyhow::Result<()> {
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
// After this timestamp we will stop processing AppendRequests and send a response
// to the walproposer. walproposer sends at least one AppendRequest per second,
// we will send keepalives by replying to these requests once per second.
let mut next_keepalive = Instant::now();
// Periodically flush the WAL.
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
flush_ticker.tick().await; // skip the initial, immediate tick
while let Some(mut next_msg) = self.msg_rx.recv().await {
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
}
// Tracks unflushed appends.
let mut dirty = false;
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
// Loop through AppendRequests while available to write as many WAL records as
// possible without fsyncing.
//
// Make sure the WAL is flushed before returning, see:
// https://github.com/neondatabase/neon/issues/9259
//
// Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
// Otherwise, we might end up in a situation where we read a message, but don't
// process it.
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
if self.reply_tx.send(reply).await.is_err() {
break; // disconnected, flush WAL and return on next send/recv
}
}
// get out of this loop if keepalive time is reached
if Instant::now() >= next_keepalive {
loop {
let reply = tokio::select! {
// Process inbound message.
msg = self.msg_rx.recv() => {
// If disconnected, break to flush WAL and return.
let Some(mut msg) = msg else {
break;
};
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &msg {
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
}
// continue pulling AppendRequests if available
match self.msg_rx.try_recv() {
Ok(msg) => next_msg = msg,
Err(TryRecvError::Empty) => break,
// on disconnect, flush WAL and return on next send/recv
Err(TryRecvError::Disconnected) => break,
};
// Don't flush the WAL on every append, only periodically via flush_ticker.
// This batches multiple appends per fsync. If the channel is empty after
// sending the reply, we'll schedule an immediate flush.
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
dirty = true;
}
self.tli.process_msg(&msg).await?
}
// flush all written WAL to the disk
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
} else {
// process message other than AppendRequest
self.tli.process_msg(&next_msg).await?
// While receiving AppendRequests, flush the WAL periodically and respond with an
// AppendResponse to let walproposer know we're still alive.
_ = flush_ticker.tick(), if dirty => {
dirty = false;
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
}
// If there are no pending messages, flush the WAL immediately.
//
// TODO: this should be done via flush_ticker.reset_immediately(), but that's always
// delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866.
_ = future::ready(()), if dirty && self.msg_rx.is_empty() => {
dirty = false;
flush_ticker.reset();
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
}
};
if let Some(reply) = reply_msg {
// Send reply, if any.
if let Some(reply) = reply {
if self.reply_tx.send(reply).await.is_err() {
return Ok(()); // chan closed, streaming terminated
break; // disconnected, break to flush WAL and return
}
// reset keepalive time
next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
}
}
// Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
if dirty {
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?;
}
Ok(())
}
}
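The new WalAcceptor loop batches appends and only flushes on the ticker or once the inbox drains. Below is a stripped-down sketch of the same select!-plus-interval pattern, with plain strings standing in for safekeeper messages and println! standing in for FlushWAL; these stand-ins are assumptions for illustration, not the real code.

use std::future;
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::MissedTickBehavior;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<String>(16);
    tokio::spawn(async move {
        for i in 0..5 {
            tx.send(format!("append-{i}")).await.unwrap();
        }
    });

    let mut ticker = tokio::time::interval(Duration::from_secs(1));
    ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
    ticker.tick().await; // skip the initial, immediate tick

    let mut dirty = false; // tracks unflushed appends
    loop {
        tokio::select! {
            // Buffer inbound appends without flushing each one.
            msg = rx.recv() => {
                let Some(msg) = msg else { break }; // sender gone: flush below and exit
                println!("buffering {msg} (no fsync yet)");
                dirty = true;
            }
            // Periodic flush while appends keep arriving.
            _ = ticker.tick(), if dirty => {
                println!("periodic flush");
                dirty = false;
            }
            // Immediate flush once the inbox is empty.
            _ = future::ready(()), if dirty && rx.is_empty() => {
                println!("inbox empty, flush now");
                dirty = false;
                ticker.reset();
            }
        }
    }
    if dirty {
        println!("final flush on disconnect");
    }
}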

View File

@@ -797,14 +797,17 @@ impl Timeline {
state.sk.term_bump(to).await
}
/// Get the timeline guard for reading/writing WAL files.
/// If WAL files are not present on disk (evicted), they will be automatically
/// downloaded from remote storage. This is done in the manager task, which is
/// responsible for issuing all guards.
///
/// NB: don't use this function from timeline_manager, it will deadlock.
/// NB: don't use this function while holding shared_state lock.
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
/// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
async fn do_wal_residence_guard(
self: &Arc<Self>,
block: bool,
) -> Result<Option<WalResidentTimeline>> {
let op_label = if block {
"wal_residence_guard"
} else {
"try_wal_residence_guard"
};
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
@@ -816,10 +819,13 @@ impl Timeline {
// Wait 30 seconds for the guard to be acquired. It can time out if someone is
// holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
// is stuck.
let res = tokio::time::timeout_at(
started_at + Duration::from_secs(30),
self.manager_ctl.wal_residence_guard(),
)
let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async {
if block {
self.manager_ctl.wal_residence_guard().await.map(Some)
} else {
self.manager_ctl.try_wal_residence_guard().await
}
})
.await;
let guard = match res {
@@ -827,14 +833,14 @@ impl Timeline {
let finished_at = Instant::now();
let elapsed = finished_at - started_at;
MISC_OPERATION_SECONDS
.with_label_values(&["wal_residence_guard"])
.with_label_values(&[op_label])
.observe(elapsed.as_secs_f64());
guard
}
Ok(Err(e)) => {
warn!(
"error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
"error acquiring in {op_label}, statuses {:?} => {:?}",
status_before,
self.mgr_status.get()
);
@@ -842,7 +848,7 @@ impl Timeline {
}
Err(_) => {
warn!(
"timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
"timeout acquiring in {op_label} guard, statuses {:?} => {:?}",
status_before,
self.mgr_status.get()
);
@@ -850,7 +856,28 @@ impl Timeline {
}
};
Ok(WalResidentTimeline::new(self.clone(), guard))
Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g)))
}
/// Get the timeline guard for reading/writing WAL files.
/// If WAL files are not present on disk (evicted), they will be automatically
/// downloaded from remote storage. This is done in the manager task, which is
/// responsible for issuing all guards.
///
/// NB: don't use this function from timeline_manager, it will deadlock.
/// NB: don't use this function while holding shared_state lock.
pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
self.do_wal_residence_guard(true)
.await
.map(|m| m.expect("Always get Some in block=true mode"))
}
/// Get the timeline guard for reading/writing WAL files if the timeline is resident,
/// else return None
pub(crate) async fn try_wal_residence_guard(
self: &Arc<Self>,
) -> Result<Option<WalResidentTimeline>> {
self.do_wal_residence_guard(false).await
}
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {

View File

@@ -56,6 +56,9 @@ impl Manager {
// This also works for the first segment despite last_removed_segno
// being 0 on init because this 0 triggers run of wal_removal_task
// on success of which manager updates the horizon.
//
// **Note** pull_timeline functionality assumes that evicted timelines always have
// a partial segment: if we ever change this condition, we must also update that code.
&& self
.partial_backup_uploaded
.as_ref()

View File

@@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
pub enum ManagerCtlMessage {
/// Request to get a guard for WalResidentTimeline, with WAL files available locally.
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
/// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None
TryGuardRequest(tokio::sync::oneshot::Sender<Option<ResidenceGuard>>),
/// Request to drop the guard.
GuardDrop(GuardId),
/// Request to reset uploaded partial backup state.
@@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"),
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
}
@@ -152,6 +155,19 @@ impl ManagerCtl {
.and_then(std::convert::identity)
}
/// Issue a new guard if the timeline is currently not offloaded, else return None
/// Sends a message to the manager and waits for the response.
/// Can be blocked indefinitely if the manager is stuck.
pub async fn try_wal_residence_guard(&self) -> anyhow::Result<Option<ResidenceGuard>> {
let (tx, rx) = tokio::sync::oneshot::channel();
self.manager_tx
.send(ManagerCtlMessage::TryGuardRequest(tx))?;
// wait for the manager to respond with the guard
rx.await
.map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
}
/// Request timeline manager to reset uploaded partial segment state and
/// wait for the result.
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
@@ -674,6 +690,17 @@ impl Manager {
warn!("failed to reply with a guard, receiver dropped");
}
}
Some(ManagerCtlMessage::TryGuardRequest(tx)) => {
let result = if self.is_offloaded {
None
} else {
Some(self.access_service.create_guard())
};
if tx.send(result).is_err() {
warn!("failed to reply with a guard, receiver dropped");
}
}
Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
self.access_service.drop_guard(guard_id);
}

View File

@@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate(
)
}
async fn handle_tenant_shard_cancel_reconcile(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
json_response(
StatusCode::OK,
service
.tenant_shard_cancel_reconcile(tenant_shard_id)
.await?,
)
}
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -1776,6 +1798,16 @@ pub fn make_router(
RequestName("control_v1_tenant_migrate"),
)
})
.put(
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|r| {
tenant_service_handler(
r,
handle_tenant_shard_cancel_reconcile,
RequestName("control_v1_tenant_cancel_reconcile"),
)
},
)
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(
r,
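A hedged sketch of invoking the new cancel_reconcile endpoint from a client. The controller address, tenant shard id, and token below are placeholders, and reqwest is used only for illustration; the endpoint returns 200 with a null body and is a no-op when no reconcile is in flight.

fn main() -> Result<(), reqwest::Error> {
    let base = "http://127.0.0.1:1234"; // placeholder storage controller address
    let tenant_shard_id = "1f359dd625e519a1a4e8d7509690f6fc-0102"; // placeholder shard id
    let admin_jwt = std::env::var("ADMIN_JWT").unwrap_or_default(); // Scope::Admin token

    // PUT /control/v1/tenant/:tenant_shard_id/cancel_reconcile
    let resp = reqwest::blocking::Client::new()
        .put(format!(
            "{base}/control/v1/tenant/{tenant_shard_id}/cancel_reconcile"
        ))
        .bearer_auth(admin_jwt)
        .send()?;
    assert!(resp.status().is_success());
    Ok(())
}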

View File

@@ -4834,6 +4834,43 @@ impl Service {
Ok(TenantShardMigrateResponse {})
}
/// 'cancel' in this context means cancel any ongoing reconcile
pub(crate) async fn tenant_shard_cancel_reconcile(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), ApiError> {
// Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete
let waiter = {
let locked = self.inner.write().unwrap();
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
let waiter = shard.get_waiter();
match waiter {
None => {
tracing::info!("Shard does not have an ongoing Reconciler");
return Ok(());
}
Some(waiter) => {
tracing::info!("Cancelling Reconciler");
shard.cancel_reconciler();
waiter
}
}
};
// Cancellation should be prompt. If this fails we have still done our job of firing the
// cancellation token, but by returning an ApiError we will indicate to the caller that
// the Reconciler is misbehaving and not respecting the cancellation token
self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT)
.await?;
Ok(())
}
/// This is for debug/support only: we simply drop all state for a tenant, without
/// detaching or deleting it on pageservers.
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
@@ -4921,16 +4958,7 @@ impl Service {
stripe_size,
},
placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking
// There is no way to know what the tenant's config was: revert to defaults
//
// TODO: remove `switch_aux_file_policy` once we finish auxv2 migration
//
// we write to both v1+v2 storage, so that the test case can use either storage format for testing
config: TenantConfig {
switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation),
..TenantConfig::default()
},
config: TenantConfig::default(),
})
.await?;

View File

@@ -1317,6 +1317,12 @@ impl TenantShard {
})
}
pub(crate) fn cancel_reconciler(&self) {
if let Some(handle) = self.reconciler.as_ref() {
handle.cancel.cancel()
}
}
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
/// if it is not already running
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
pytest_plugins = (
"fixtures.pg_version",
"fixtures.parametrize",
"fixtures.h2server",
"fixtures.httpserver",
"fixtures.compute_reconfigure",
"fixtures.storage_controller_proxy",

View File

@@ -0,0 +1,198 @@
"""
https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html
auth-broker -> local-proxy needs an h2 connection, so we need an h2 server :)
"""
import asyncio
import collections
import io
import json
from collections.abc import AsyncIterable
import pytest_asyncio
from h2.config import H2Configuration
from h2.connection import H2Connection
from h2.errors import ErrorCodes
from h2.events import (
ConnectionTerminated,
DataReceived,
RemoteSettingsChanged,
RequestReceived,
StreamEnded,
StreamReset,
WindowUpdated,
)
from h2.exceptions import ProtocolError, StreamClosedError
from h2.settings import SettingCodes
RequestData = collections.namedtuple("RequestData", ["headers", "data"])
class H2Server:
def __init__(self, host, port) -> None:
self.host = host
self.port = port
class H2Protocol(asyncio.Protocol):
def __init__(self):
config = H2Configuration(client_side=False, header_encoding="utf-8")
self.conn = H2Connection(config=config)
self.transport = None
self.stream_data = {}
self.flow_control_futures = {}
def connection_made(self, transport: asyncio.Transport): # type: ignore[override]
self.transport = transport
self.conn.initiate_connection()
self.transport.write(self.conn.data_to_send())
def connection_lost(self, _exc):
for future in self.flow_control_futures.values():
future.cancel()
self.flow_control_futures = {}
def data_received(self, data: bytes):
assert self.transport is not None
try:
events = self.conn.receive_data(data)
except ProtocolError:
self.transport.write(self.conn.data_to_send())
self.transport.close()
else:
self.transport.write(self.conn.data_to_send())
for event in events:
if isinstance(event, RequestReceived):
self.request_received(event.headers, event.stream_id)
elif isinstance(event, DataReceived):
self.receive_data(event.data, event.stream_id)
elif isinstance(event, StreamEnded):
self.stream_complete(event.stream_id)
elif isinstance(event, ConnectionTerminated):
self.transport.close()
elif isinstance(event, StreamReset):
self.stream_reset(event.stream_id)
elif isinstance(event, WindowUpdated):
self.window_updated(event.stream_id, event.delta)
elif isinstance(event, RemoteSettingsChanged):
if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings:
self.window_updated(None, 0)
self.transport.write(self.conn.data_to_send())
def request_received(self, headers: list[tuple[str, str]], stream_id: int):
headers_map = collections.OrderedDict(headers)
# Store off the request data.
request_data = RequestData(headers_map, io.BytesIO())
self.stream_data[stream_id] = request_data
def stream_complete(self, stream_id: int):
"""
When a stream is complete, we can send our response.
"""
try:
request_data = self.stream_data[stream_id]
except KeyError:
# Just return, we probably 405'd this already
return
headers = request_data.headers
body = request_data.data.getvalue().decode("utf-8")
data = json.dumps({"headers": headers, "body": body}, indent=4).encode("utf8")
response_headers = (
(":status", "200"),
("content-type", "application/json"),
("content-length", str(len(data))),
)
self.conn.send_headers(stream_id, response_headers)
asyncio.ensure_future(self.send_data(data, stream_id))
def receive_data(self, data: bytes, stream_id: int):
"""
We've received some data on a stream. If that stream is one we're
expecting data on, save it off. Otherwise, reset the stream.
"""
try:
stream_data = self.stream_data[stream_id]
except KeyError:
self.conn.reset_stream(stream_id, error_code=ErrorCodes.PROTOCOL_ERROR)
else:
stream_data.data.write(data)
def stream_reset(self, stream_id):
"""
A stream reset was sent. Stop sending data.
"""
if stream_id in self.flow_control_futures:
future = self.flow_control_futures.pop(stream_id)
future.cancel()
async def send_data(self, data, stream_id):
"""
Send data according to the flow control rules.
"""
while data:
while self.conn.local_flow_control_window(stream_id) < 1:
try:
await self.wait_for_flow_control(stream_id)
except asyncio.CancelledError:
return
chunk_size = min(
self.conn.local_flow_control_window(stream_id),
len(data),
self.conn.max_outbound_frame_size,
)
try:
self.conn.send_data(
stream_id, data[:chunk_size], end_stream=(chunk_size == len(data))
)
except (StreamClosedError, ProtocolError):
# The stream got closed and we didn't get told. We're done
# here.
break
assert self.transport is not None
self.transport.write(self.conn.data_to_send())
data = data[chunk_size:]
async def wait_for_flow_control(self, stream_id):
"""
Waits for a Future that fires when the flow control window is opened.
"""
f: asyncio.Future[None] = asyncio.Future()
self.flow_control_futures[stream_id] = f
await f
def window_updated(self, stream_id, delta):
"""
A window update frame was received. Unblock some number of flow control
Futures.
"""
if stream_id and stream_id in self.flow_control_futures:
f = self.flow_control_futures.pop(stream_id)
f.set_result(delta)
elif not stream_id:
for f in self.flow_control_futures.values():
f.set_result(delta)
self.flow_control_futures = {}
@pytest_asyncio.fixture(scope="function")
async def http2_echoserver() -> AsyncIterable[H2Server]:
loop = asyncio.get_event_loop()
serve = await loop.create_server(H2Protocol, "127.0.0.1", 0)
(host, port) = serve.sockets[0].getsockname()
asyncio.create_task(serve.wait_closed())
server = H2Server(host, port)
yield server
serve.close()

View File

@@ -16,7 +16,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.pageserver.common_types import IndexPartDump
from fixtures.pg_version import PgVersion
from fixtures.utils import AuxFileStore
if TYPE_CHECKING:
from typing import (
@@ -201,7 +200,6 @@ class NeonLocalCli(AbstractNeonCli):
shard_stripe_size: Optional[int] = None,
placement_policy: Optional[str] = None,
set_default: bool = False,
aux_file_policy: Optional[AuxFileStore] = None,
):
"""
Creates a new tenant, returns its id and its initial timeline's id.
@@ -223,13 +221,6 @@ class NeonLocalCli(AbstractNeonCli):
)
)
if aux_file_policy is AuxFileStore.V2:
args.extend(["-c", "switch_aux_file_policy:v2"])
elif aux_file_policy is AuxFileStore.V1:
args.extend(["-c", "switch_aux_file_policy:v1"])
elif aux_file_policy is AuxFileStore.CrossValidation:
args.extend(["-c", "switch_aux_file_policy:cross-validation"])
if set_default:
args.append("--set-default")

View File

@@ -35,11 +35,13 @@ import toml
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from jwcrypto import jwk
# Type-related stuff
from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import cursor as PgCursor
from psycopg2.extensions import make_dsn, parse_dsn
from pytest_httpserver import HTTPServer
from urllib3.util.retry import Retry
from fixtures import overlayfs
@@ -53,6 +55,7 @@ from fixtures.common_types import (
TimelineId,
)
from fixtures.endpoint.http import EndpointHttpClient
from fixtures.h2server import H2Server
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.neon_cli import NeonLocalCli, Pagectl
@@ -94,7 +97,6 @@ from fixtures.utils import (
subprocess_capture,
wait_until,
)
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
from .neon_api import NeonAPI, NeonApiEndpoint
@@ -353,7 +355,6 @@ class NeonEnvBuilder:
initial_tenant: Optional[TenantId] = None,
initial_timeline: Optional[TimelineId] = None,
pageserver_virtual_file_io_engine: Optional[str] = None,
pageserver_aux_file_policy: Optional[AuxFileStore] = None,
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None,
safekeeper_extra_opts: Optional[list[str]] = None,
storage_controller_port_override: Optional[int] = None,
@@ -405,8 +406,6 @@ class NeonEnvBuilder:
f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
)
self.pageserver_aux_file_policy = pageserver_aux_file_policy
self.safekeeper_extra_opts = safekeeper_extra_opts
self.storage_controller_port_override = storage_controller_port_override
@@ -467,7 +466,6 @@ class NeonEnvBuilder:
timeline_id=env.initial_timeline,
shard_count=initial_tenant_shard_count,
shard_stripe_size=initial_tenant_shard_stripe_size,
aux_file_policy=self.pageserver_aux_file_policy,
)
assert env.initial_tenant == initial_tenant
assert env.initial_timeline == initial_timeline
@@ -1027,7 +1025,6 @@ class NeonEnv:
self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
# Create the neon_local's `NeonLocalInitConf`
@@ -1323,7 +1320,6 @@ class NeonEnv:
shard_stripe_size: Optional[int] = None,
placement_policy: Optional[str] = None,
set_default: bool = False,
aux_file_policy: Optional[AuxFileStore] = None,
) -> tuple[TenantId, TimelineId]:
"""
Creates a new tenant, returns its id and its initial timeline's id.
@@ -1340,7 +1336,6 @@ class NeonEnv:
shard_stripe_size=shard_stripe_size,
placement_policy=placement_policy,
set_default=set_default,
aux_file_policy=aux_file_policy,
)
return tenant_id, timeline_id
@@ -1398,7 +1393,6 @@ def neon_simple_env(
compatibility_pg_distrib_dir: Path,
pg_version: PgVersion,
pageserver_virtual_file_io_engine: str,
pageserver_aux_file_policy: Optional[AuxFileStore],
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]],
pageserver_virtual_file_io_mode: Optional[str],
) -> Iterator[NeonEnv]:
@@ -1431,7 +1425,6 @@ def neon_simple_env(
test_name=request.node.name,
test_output_dir=test_output_dir,
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
pageserver_aux_file_policy=pageserver_aux_file_policy,
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
combination=combination,
@@ -1458,7 +1451,6 @@ def neon_env_builder(
top_output_dir: Path,
pageserver_virtual_file_io_engine: str,
pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]],
pageserver_aux_file_policy: Optional[AuxFileStore],
record_property: Callable[[str, object], None],
pageserver_virtual_file_io_mode: Optional[str],
) -> Iterator[NeonEnvBuilder]:
@@ -1501,7 +1493,6 @@ def neon_env_builder(
test_name=request.node.name,
test_output_dir=test_output_dir,
test_overlay_dir=test_overlay_dir,
pageserver_aux_file_policy=pageserver_aux_file_policy,
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
) as builder:
@@ -3092,6 +3083,31 @@ class PSQL:
)
def generate_proxy_tls_certs(common_name: str, key_path: Path, crt_path: Path):
if not key_path.exists():
r = subprocess.run(
[
"openssl",
"req",
"-new",
"-x509",
"-days",
"365",
"-nodes",
"-text",
"-out",
str(crt_path),
"-keyout",
str(key_path),
"-subj",
f"/CN={common_name}",
"-addext",
f"subjectAltName = DNS:{common_name}",
]
)
assert r.returncode == 0
class NeonProxy(PgProtocol):
link_auth_uri: str = "http://dummy-uri"
@@ -3190,29 +3206,7 @@ class NeonProxy(PgProtocol):
# generate key if it doesn't exist
crt_path = self.test_output_dir / "proxy.crt"
key_path = self.test_output_dir / "proxy.key"
if not key_path.exists():
r = subprocess.run(
[
"openssl",
"req",
"-new",
"-x509",
"-days",
"365",
"-nodes",
"-text",
"-out",
str(crt_path),
"-keyout",
str(key_path),
"-subj",
"/CN=*.localtest.me",
"-addext",
"subjectAltName = DNS:*.localtest.me",
]
)
assert r.returncode == 0
generate_proxy_tls_certs("*.localtest.me", key_path, crt_path)
args = [
str(self.neon_binpath / "proxy"),
@@ -3392,6 +3386,125 @@ class NeonProxy(PgProtocol):
assert out == "ok"
class NeonAuthBroker:
class ControlPlane:
def __init__(self, endpoint: str):
self.endpoint = endpoint
def extra_args(self) -> list[str]:
args = [
*["--auth-backend", "console"],
*["--auth-endpoint", self.endpoint],
]
return args
def __init__(
self,
neon_binpath: Path,
test_output_dir: Path,
http_port: int,
mgmt_port: int,
external_http_port: int,
auth_backend: NeonAuthBroker.ControlPlane,
):
self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1
self.host = "127.0.0.1"
self.http_port = http_port
self.external_http_port = external_http_port
self.neon_binpath = neon_binpath
self.test_output_dir = test_output_dir
self.mgmt_port = mgmt_port
self.auth_backend = auth_backend
self.http_timeout_seconds = 15
self._popen: Optional[subprocess.Popen[bytes]] = None
def start(self) -> NeonAuthBroker:
assert self._popen is None
# generate key if it doesn't exist
crt_path = self.test_output_dir / "proxy.crt"
key_path = self.test_output_dir / "proxy.key"
generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path)
args = [
str(self.neon_binpath / "proxy"),
*["--http", f"{self.host}:{self.http_port}"],
*["--mgmt", f"{self.host}:{self.mgmt_port}"],
*["--wss", f"{self.host}:{self.external_http_port}"],
*["-c", str(crt_path)],
*["-k", str(key_path)],
*["--sql-over-http-pool-opt-in", "false"],
*["--is-auth-broker", "true"],
*self.auth_backend.extra_args(),
]
logfile = open(self.test_output_dir / "proxy.log", "w")
self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
self._wait_until_ready()
return self
# Sends SIGTERM to the proxy if it has been started
def terminate(self):
if self._popen:
self._popen.terminate()
# Waits for the proxy to exit if it has been started, with a default timeout of
# two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time.
def wait_for_exit(self, timeout=2):
if self._popen:
self._popen.wait(timeout=timeout)
@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
def _wait_until_ready(self):
assert (
self._popen and self._popen.poll() is None
), "Proxy exited unexpectedly. Check test log."
requests.get(f"http://{self.host}:{self.http_port}/v1/status")
async def query(self, query, args, **kwargs):
user = kwargs["user"]
token = kwargs["token"]
expected_code = kwargs.get("expected_code")
log.info(f"Executing http query: {query}")
connstr = f"postgresql://{user}@{self.domain}/postgres"
async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client:
response = await client.post(
f"https://{self.domain}:{self.external_http_port}/sql",
json={"query": query, "params": args},
headers={
"Neon-Connection-String": connstr,
"Authorization": f"Bearer {token}",
},
)
if expected_code is not None:
assert response.status_code == expected_code, f"response: {response.json()}"
return response.json()
def get_metrics(self) -> str:
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
return request_result.text
def __enter__(self) -> NeonAuthBroker:
return self
def __exit__(
self,
_exc_type: Optional[type[BaseException]],
_exc_value: Optional[BaseException],
_traceback: Optional[TracebackType],
):
if self._popen is not None:
self._popen.terminate()
try:
self._popen.wait(timeout=5)
except subprocess.TimeoutExpired:
log.warning("failed to gracefully terminate proxy; killing")
self._popen.kill()
@pytest.fixture(scope="function")
def link_proxy(
port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
@@ -3456,6 +3569,74 @@ def static_proxy(
yield proxy
@pytest.fixture(scope="function")
def neon_authorize_jwk() -> jwk.JWK:
kid = str(uuid.uuid4())
key = jwk.JWK.generate(kty="RSA", size=2048, alg="RS256", use="sig", kid=kid)
assert isinstance(key, jwk.JWK)
return key
@pytest.fixture(scope="function")
def static_auth_broker(
port_distributor: PortDistributor,
neon_binpath: Path,
test_output_dir: Path,
httpserver: HTTPServer,
neon_authorize_jwk: jwk.JWK,
http2_echoserver: H2Server,
) -> Iterable[NeonAuthBroker]:
"""Neon Auth Broker that routes to a mocked local_proxy and a mocked cplane HTTP API."""
local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}"
# return local_proxy addr on ProxyWakeCompute.
httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json(
{
"address": local_proxy_addr,
"aux": {
"endpoint_id": "ep-foo-bar-1234",
"branch_id": "br-foo-bar",
"project_id": "foo-bar",
},
}
)
# return jwks mock addr on GetEndpointJwks
httpserver.expect_request(re.compile("^/cplane/endpoints/.+/jwks$")).respond_with_json(
{
"jwks": [
{
"id": "foo",
"jwks_url": httpserver.url_for("/authorize/jwks.json"),
"provider_name": "test",
"jwt_audience": None,
"role_names": ["anonymous", "authenticated"],
}
]
}
)
# return static fixture jwks.
jwk = neon_authorize_jwk.export_public(as_dict=True)
httpserver.expect_request("/authorize/jwks.json").respond_with_json({"keys": [jwk]})
mgmt_port = port_distributor.get_port()
http_port = port_distributor.get_port()
external_http_port = port_distributor.get_port()
with NeonAuthBroker(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
http_port=http_port,
mgmt_port=mgmt_port,
external_http_port=external_http_port,
auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")),
) as proxy:
proxy.start()
yield proxy
class Endpoint(PgProtocol, LogUtils):
"""An object representing a Postgres compute endpoint managed by the control plane."""

View File

@@ -10,12 +10,6 @@ from _pytest.python import Metafunc
from fixtures.pg_version import PgVersion
if TYPE_CHECKING:
from typing import Any, Optional
from fixtures.utils import AuxFileStore
if TYPE_CHECKING:
from typing import Any, Optional
@@ -50,11 +44,6 @@ def pageserver_virtual_file_io_mode() -> Optional[str]:
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
@pytest.fixture(scope="function", autouse=True)
def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
return None
def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]:
toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
if toml_table is None:

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import contextlib
import enum
import json
import os
import re
@@ -515,21 +514,6 @@ def assert_no_errors(log_file: Path, service: str, allowed_errors: list[str]):
assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add"
@enum.unique
class AuxFileStore(str, enum.Enum):
V1 = "v1"
V2 = "v2"
CrossValidation = "cross-validation"
@override
def __repr__(self) -> str:
return f"'aux-{self.value}'"
@override
def __str__(self) -> str:
return f"'aux-{self.value}'"
def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str]):
"""
This is essentially:

View File

@@ -9,7 +9,7 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.neon_fixtures import logical_replication_sync
if TYPE_CHECKING:
from fixtures.benchmark_fixture import NeonBenchmarker
@@ -17,7 +17,6 @@ if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv, PgBin
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@pytest.mark.timeout(1000)
def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg):
env = neon_simple_env

View File

@@ -172,7 +172,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
},
"walreceiver_connect_timeout": "13m",
"image_layer_creation_check_threshold": 1,
"switch_aux_file_policy": "cross-validation",
"lsn_lease_length": "1m",
"lsn_lease_length_for_ts": "5s",
}

View File

@@ -0,0 +1,37 @@
import json
import pytest
from fixtures.neon_fixtures import NeonAuthBroker
from jwcrypto import jwk, jwt
@pytest.mark.asyncio
async def test_auth_broker_happy(
static_auth_broker: NeonAuthBroker,
neon_authorize_jwk: jwk.JWK,
):
"""
Signs a JWT and uses it to authorize a query to local_proxy.
"""
token = jwt.JWT(
header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"}, claims={"sub": "user1"}
)
token.make_signed_token(neon_authorize_jwk)
res = await static_auth_broker.query("foo", ["arg1"], user="anonymous", token=token.serialize())
# local proxy mock just echoes back the request
# check that we forward the correct data
assert (
res["headers"]["authorization"] == f"Bearer {token.serialize()}"
), "JWT should be forwarded"
assert (
"anonymous" in res["headers"]["neon-connection-string"]
), "conn string should be forwarded"
assert json.loads(res["body"]) == {
"query": "foo",
"params": ["arg1"],
}, "Query body should be forwarded"

View File

@@ -7,6 +7,7 @@ import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, VanillaPostgres
from psycopg2.errors import UndefinedObject
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -335,3 +336,34 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv):
if not result:
raise AssertionError("Could not count databases")
assert result[0] == 0, "Database 'failure' still exists after restart"
def test_ddl_forwarding_role_specs(neon_simple_env: NeonEnv):
"""
Postgres has a concept of role specs:
ROLESPEC_CSTRING: ALTER ROLE xyz
ROLESPEC_CURRENT_USER: ALTER ROLE current_user
ROLESPEC_CURRENT_ROLE: ALTER ROLE current_role
ROLESPEC_SESSION_USER: ALTER ROLE session_user
ROLESPEC_PUBLIC: ALTER ROLE public
The extension is required to serialize these special role specs into
usernames for the purpose of DDL forwarding.
"""
env = neon_simple_env
endpoint = env.endpoints.create_start("main")
with endpoint.cursor() as cur:
# ROLESPEC_CSTRING
cur.execute("ALTER ROLE cloud_admin WITH PASSWORD 'york'")
# ROLESPEC_CURRENT_USER
cur.execute("ALTER ROLE current_user WITH PASSWORD 'pork'")
# ROLESPEC_CURRENT_ROLE
cur.execute("ALTER ROLE current_role WITH PASSWORD 'cork'")
# ROLESPEC_SESSION_USER
cur.execute("ALTER ROLE session_user WITH PASSWORD 'bork'")
# ROLESPEC_PUBLIC
with pytest.raises(UndefinedObject):
cur.execute("ALTER ROLE public WITH PASSWORD 'dork'")

View File

@@ -5,11 +5,9 @@ from functools import partial
from random import choice
from string import ascii_lowercase
import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
AuxFileStore,
NeonEnv,
NeonEnvBuilder,
PgProtocol,
@@ -23,17 +21,6 @@ def random_string(n: int):
return "".join([choice(ascii_lowercase) for _ in range(n)])
@pytest.mark.parametrize(
"pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation]
)
def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore):
env = neon_simple_env
with env.pageserver.http_client() as client:
tenant_config = client.tenant_config(env.initial_tenant).effective_config
assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"]
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
@@ -173,7 +160,6 @@ COMMIT;
# Test that neon.logical_replication_max_snap_files works
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
def slot_removed(ep):
assert (
@@ -350,7 +336,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
#
# Most pages start with a contrecord, so we don't do anything special
# to ensure that.
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
@@ -395,7 +380,6 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
# logical replication bug as such, but without logical replication,
# records passed to the WAL redo process are never large enough to hit
# the bug.
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
@@ -467,7 +451,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
def test_replication_shutdown(neon_simple_env: NeonEnv):
# Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
env = neon_simple_env

View File

@@ -561,7 +561,7 @@ def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
# query generates millions of rows - should hit the 10MB response limit quickly
response = query(
400,
507,
"select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
)
assert "response is too large (max is 10485760 bytes)" in response["message"]

View File

@@ -18,6 +18,7 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
NeonPageserver,
PageserverAvailability,
PageserverSchedulingPolicy,
PgBin,
@@ -298,17 +299,20 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
env.storage_controller.consistency_check()
@pytest.mark.parametrize("warm_up", [True, False])
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
def prepare_onboarding_env(
neon_env_builder: NeonEnvBuilder,
) -> tuple[NeonEnv, NeonPageserver, TenantId, int]:
"""
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
which provides the /location_config API. This is similar to creating a tenant,
but imports the generation number.
For tests that onboard a tenant to the storage controller: performs a small dance to
set up one pageserver that won't be managed by the storage controller and creates
a tenant there.
"""
# One pageserver to simulate legacy environment, two to be managed by storage controller
neon_env_builder.num_pageservers = 3
# Enable tests to use methods that require real S3 API
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
# Start services by hand so that we can skip registration on one of the pageservers
env = neon_env_builder.init_configs()
env.broker.start()
@@ -329,7 +333,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
# will be attached after onboarding
env.pageservers[1].start()
env.pageservers[2].start()
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
for sk in env.safekeepers:
sk.start()
@@ -339,6 +342,23 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
generation = 123
origin_ps.tenant_create(tenant_id, generation=generation)
origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
return (env, origin_ps, tenant_id, generation)
@pytest.mark.parametrize("warm_up", [True, False])
def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
"""
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
which provides the /location_config API. This is similar to creating a tenant,
but imports the generation number.
"""
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
# As if doing a live migration, first configure origin into stale mode
r = origin_ps.http_client().tenant_location_conf(
tenant_id,
@@ -475,6 +495,70 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
env.storage_controller.consistency_check()
@run_only_on_default_postgres("this test doesn't start an endpoint")
def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder):
"""
Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver,
and also wasn't ever registered with the storage controller.
It may do this by calling /location_conf in mode Detached and then calling the delete API
as normal.
"""
env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder)
remote_prefix = "/".join(
(
"tenants",
str(tenant_id),
)
)
# Detach it from its original pageserver.
origin_ps.http_client().tenant_location_conf(
tenant_id,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": None,
},
)
# Since we will later assert that remote data is gone, as a control, also check that it was there to begin with
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix=remote_prefix,
)
# Register with storage controller in Detached state
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
generation += 1
r = virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": generation,
},
)
assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards
# Onboarding in Detached state shouldn't have attached it to any pageserver
for ps in env.pageservers:
assert ps.http_client().tenant_list() == []
# Delete it via the storage controller
virtual_ps_http.tenant_delete(tenant_id)
# Check that we really deleted it
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=remote_prefix,
)
def test_storage_controller_compute_hook(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
@@ -872,6 +956,14 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
# Reconciler cancel API should be a no-op when nothing is in flight
env.storage_controller.request(
"PUT",
f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile",
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
# Node unclean drop API
response = env.storage_controller.request(
"POST",
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
@@ -879,6 +971,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
)
assert len(env.storage_controller.node_list()) == 1
# Tenant unclean drop API
response = env.storage_controller.request(
"POST",
f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
@@ -892,7 +985,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
assert len(response.json()) == 1
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
env.storage_controller.consistency_check()
@@ -1660,6 +1752,11 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
assert "Stop" in storcon_cli(["tenants"])[3]
# Cancel ongoing reconcile on a tenant
storcon_cli(
["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
)
# Change a tenant's placement
storcon_cli(
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]

View File

@@ -137,14 +137,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
}
)
# Create two branches and archive them
parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id)
leaf_timeline_id = env.create_branch(
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
# Create three branches that depend on each other; the first two are created here
grandparent_timeline_id = env.create_branch(
"test_ancestor_branch_archive_grandparent", tenant_id
)
parent_timeline_id = env.create_branch(
"test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent"
)
# write some stuff to the parent
with env.endpoints.create_start(
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
) as endpoint:
endpoint.safe_psql_many(
[
@@ -154,6 +157,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
)
sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
# create the third branch
leaf_timeline_id = env.create_branch(
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
)
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
@@ -171,6 +179,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
state=TimelineArchivalState.ARCHIVED,
)
ps_http.timeline_archival_config(
tenant_id,
grandparent_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
def timeline_offloaded_logged(timeline_id: TimelineId) -> bool:
return (
env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
@@ -201,30 +215,34 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
ps_http.timeline_archival_config(
tenant_id,
parent_timeline_id,
grandparent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
parent_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
parent_detail = ps_http.timeline_detail(
tenant_id,
leaf_timeline_id,
parent_timeline_id,
)
assert leaf_detail["is_archived"] is False
assert parent_detail["is_archived"] is False
with env.endpoints.create_start(
"test_ancestor_branch_archive_branch1", tenant_id=tenant_id
"test_ancestor_branch_archive_parent", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
assert sum == sum_again
# Test that deletion of offloaded timelines works
ps_http.timeline_delete(tenant_id, leaf_timeline_id)
assert not timeline_offloaded_logged(initial_timeline_id)
def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("delete_timeline", [False, True])
def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool):
"""
Test for persistence of timeline offload state, with and without deleting the offloaded
timeline. A condensed archive/unarchive sketch follows the test.
"""
@@ -306,27 +324,35 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder):
assert timeline_offloaded_api(child_timeline_id)
assert not timeline_offloaded_api(root_timeline_id)
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
child_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert child_detail["is_archived"] is False
if delete_timeline:
ps_http.timeline_delete(tenant_id, child_timeline_id)
with pytest.raises(PageserverApiException, match="not found"):
ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
else:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
child_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert child_detail["is_archived"] is False
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
assert sum == sum_again
with env.endpoints.create_start(
"test_archived_branch_persisted", tenant_id=tenant_id
) as endpoint:
sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
assert sum == sum_again
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
)
assert not timeline_offloaded_api(root_timeline_id)
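# Editor's note: a minimal sketch, not part of this diff, of the archive/unarchive round trip
# whose offloaded state the test above persists across a pageserver restart. Only calls already
# used in this file appear here; the helper itself is hypothetical.
def _archive_roundtrip_sketch(ps_http, tenant_id, timeline_id) -> None:
    ps_http.timeline_archival_config(
        tenant_id, timeline_id, state=TimelineArchivalState.ARCHIVED
    )
    # Once unarchived, the timeline is live again and its detail reflects that.
    ps_http.timeline_archival_config(
        tenant_id, timeline_id, state=TimelineArchivalState.UNARCHIVED
    )
    assert ps_http.timeline_detail(tenant_id, timeline_id)["is_archived"] is False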

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
from typing import TYPE_CHECKING, cast
import pytest
from psycopg2.errors import InsufficientPrivilege
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
def test_unstable_extensions_installation(neon_simple_env: NeonEnv):
"""
Test that the unstable extension support within the neon extension can
block extension installation; a minimal opt-in sketch follows this test.
"""
env = neon_simple_env
neon_unstable_extensions = "pg_prewarm,amcheck"
endpoint = env.endpoints.create(
"main",
config_lines=[
"neon.allow_unstable_extensions=false",
f"neon.unstable_extensions='{neon_unstable_extensions}'",
],
)
endpoint.respec(skip_pg_catalog_updates=False)
endpoint.start()
with endpoint.cursor() as cursor:
cursor.execute("SELECT current_setting('neon.unstable_extensions')")
result = cursor.fetchone()
assert result is not None
setting = cast("str", result[0])
assert setting == neon_unstable_extensions
with pytest.raises(InsufficientPrivilege):
cursor.execute("CREATE EXTENSION pg_prewarm")
with pytest.raises(InsufficientPrivilege):
cursor.execute("CREATE EXTENSION amcheck")
# Make sure that we can install a "stable" extension
cursor.execute("CREATE EXTENSION pageinspect")
cursor.execute("BEGIN")
cursor.execute("SET neon.allow_unstable_extensions TO true")
cursor.execute("CREATE EXTENSION pg_prewarm")
cursor.execute("COMMIT")

Some files were not shown because too many files have changed in this diff