Merge pull request #8451 from neondatabase/rc/2024-07-22

## Storage & Compute release 2024-07-22

This PR has so many commits because the release branch diverged from `main`.

Details https://neondb.slack.com/archives/C033A2WE6BZ/p1721650938949059?thread_ts=1721308848.034069&cid=C033A2WE6BZ

The commits that are truly new since the last storage release are the `main` commits that I cherry-picked using this command:

```
git cherry-pick 8a8b83df27383a07bb7dbba519325c15d2f46357..4e547e6
```
Christian Schwarz
2024-07-22 19:17:01 +02:00
committed by GitHub
115 changed files with 3864 additions and 1460 deletions

View File

@@ -9,8 +9,8 @@ inputs:
description: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
description: 'Postgres version; default is 16'
default: '16'
api_host:
description: 'Neon API host'
default: console-stage.neon.build

View File

@@ -57,9 +57,10 @@ jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
strategy:
fail-fast: false
matrix:
include:
- DEFAULT_PG_VERSION: 14
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
@@ -146,6 +147,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
@@ -190,6 +192,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -215,11 +218,14 @@ jobs:
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
runs-on: ubuntu-22.04
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -230,23 +236,33 @@ jobs:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
region_id_default=${{ env.DEFAULT_REGION_ID }}
matrix='{
"pg_version" : [
16
],
"region_id" : [
"'"$region_id_default"'"
],
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -298,7 +314,7 @@ jobs:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -323,14 +339,14 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
@@ -343,7 +359,7 @@ jobs:
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -368,6 +384,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -381,6 +398,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -394,6 +412,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -420,6 +439,13 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
strategy:
fail-fast: false
matrix:
include:
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -427,8 +453,9 @@ jobs:
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-captest-pgvector"
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -438,17 +465,39 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
# until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
# instead of using Neon artifacts containing pgbench
- name: Install postgresql-16 where pytest expects it
run: |
cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
/tmp/neon/pg_install/v16/bin/pgbench --version
/tmp/neon/pg_install/v16/bin/psql --version
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
case "${PLATFORM}" in
neon-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
@@ -460,6 +509,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -473,6 +523,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -487,11 +538,10 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClickBench DB for rds-aurora and rds-postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -735,6 +785,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

Cargo.lock generated
View File

@@ -1368,6 +1368,7 @@ dependencies = [
"tracing",
"url",
"utils",
"whoami",
"workspace_hack",
]
@@ -3233,16 +3234,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3538,12 +3529,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -4404,6 +4389,7 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"typed-json",
"url",
"urlencoding",
"utils",
@@ -4602,6 +4588,15 @@ dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "regex"
version = "1.10.2"
@@ -5811,6 +5806,28 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_controller_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
"pageserver_client",
"postgres",
"reqwest 0.12.4",
"serde",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"utils",
"workspace_hack",
]
[[package]]
name = "storage_scrubber"
version = "0.1.0"
@@ -5845,6 +5862,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with",
"storage_controller_client",
"thiserror",
"tokio",
"tokio-postgres",
@@ -5874,6 +5892,7 @@ dependencies = [
"reqwest 0.12.4",
"serde",
"serde_json",
"storage_controller_client",
"thiserror",
"tokio",
"tracing",
@@ -6600,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -6665,6 +6683,16 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "typed-json"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -6961,6 +6989,12 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
[[package]]
name = "wasm-bindgen"
version = "0.2.92"
@@ -7113,6 +7147,17 @@ dependencies = [
"once_cell",
]
[[package]]
name = "whoami"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
dependencies = [
"redox_syscall 0.4.1",
"wasite",
"web-sys",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@@ -13,6 +13,7 @@ members = [
"safekeeper",
"storage_broker",
"storage_controller",
"storage_controller/client",
"storage_scrubber",
"workspace_hack",
"libs/compute_api",
@@ -182,14 +183,16 @@ tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.7"
x509-parser = "0.15"
whoami = "1.5.1"
## TODO replace this with tracing
env_logger = "0.10"
@@ -219,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
desim = { version = "0.1", path = "./libs/desim" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
storage_controller_client = { path = "./storage_controller/client" }
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }

View File

@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
FROM build-deps AS rum-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/rum.patch /rum.patch
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control

View File

@@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> {
impl<'m> MigrationRunner<'m> {
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
assert!(migrations.len() + 1 < i64::MAX as usize);
Self { client, migrations }
}
@@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> {
Ok(row.get::<&str, i64>("id"))
}
fn update_migration_id(&mut self) -> Result<()> {
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
self.migrations.len()
);
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
self.client
.simple_query(&setval)
@@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> {
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_migrations()?;
let mut current_migration: usize = self.get_migration_id()? as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
self.client
.simple_query(query)
.context("run_migrations begin")?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
let migration = self.migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
info!("Skipping migration id={}", migration_id!(current_migration));
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
migration_id!(current_migration),
migration
);
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!("run_migration current_migration={}", current_migration)
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
// Migration IDs start at 1
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
current_migration += 1;
}
self.update_migration_id()?;
let query = "COMMIT";
self.client
.simple_query(query)
.context("run_migrations commit")?;
info!(
"Ran {} migrations",
(self.migrations.len() - starting_migration_id)
);
Ok(())
}
}

View File

@@ -777,21 +777,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
// Add new migrations in numerical order.
let migrations = [
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0001-alter_roles.sql"),
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0002-alter_roles.sql"),
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
),
include_str!(
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
include_str!(
"./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
),
];

View File

@@ -40,6 +40,7 @@ safekeeper_api.workspace = true
postgres_connection.workspace = true
storage_broker.workspace = true
utils.workspace = true
whoami.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true

View File

@@ -1,9 +1,9 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//! In the local test environment, the storage broker stores its data directly in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! .neon
//! ```
use std::time::Duration;

View File

@@ -1,8 +1,10 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//! In the local test environment, the data for each pageserver is stored in
//!
//! .neon/
//! ```text
//! .neon/pageserver_<pageserver_id>
//! ```
//!
use std::collections::HashMap;

View File

@@ -29,7 +29,6 @@ use utils::{
pub struct StorageController {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
public_key: Option<String>,
postgres_port: u16,
@@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
const DB_NAME: &str = "storage_controller";
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -65,10 +66,6 @@ pub struct InspectResponse {
impl StorageController {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
.join("attachments.json");
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.control_plane_api.clone().unwrap();
@@ -128,7 +125,6 @@ impl StorageController {
Self {
env: env.clone(),
path,
listen,
private_key,
public_key,
@@ -203,7 +199,6 @@ impl StorageController {
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
const DB_NAME: &str = "storage_controller";
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -232,6 +227,30 @@ impl StorageController {
Ok(database_url)
}
pub async fn connect_to_database(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
tokio_postgres::Config::new()
.host("localhost")
.port(self.postgres_port)
// The user is the ambient operating system user name.
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
//
// Until we get there, use the ambient operating system user name.
// Recent tokio-postgres versions default to this if the user isn't specified.
// But tokio-postgres fork doesn't have this upstream commit:
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
.user(&whoami::username())
.dbname(DB_NAME)
.connect(tokio_postgres::NoTls)
.await
.map_err(anyhow::Error::new)
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
@@ -256,18 +275,21 @@ impl StorageController {
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
};
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
@@ -296,11 +318,38 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
// We support running a startup SQL script to fiddle with the database before we launch storcon.
// This is used by the test suite.
let startup_script_path = self
.env
.base_data_dir
.join("storage_controller_db.startup.sql");
let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
Ok(script) => {
tokio::fs::remove_file(startup_script_path).await?;
script
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
// always run some startup script so that this code path doesn't bit rot
"BEGIN; COMMIT;".to_string()
} else {
anyhow::bail!("Failed to read startup script: {e}")
}
}
};
let (mut client, conn) = self.connect_to_database().await?;
let conn = tokio::spawn(conn);
let tx = client.build_transaction();
let tx = tx.start().await?;
tx.batch_execute(&startup_script).await?;
tx.commit().await?;
drop(client);
conn.await??;
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--dev",
"--database-url",
&database_url,

View File

@@ -17,6 +17,7 @@ pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true

View File

@@ -14,15 +14,15 @@ use pageserver_api::{
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use pageserver_client::mgmt_api::{self};
use reqwest::{Method, StatusCode, Url};
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use storage_controller_client::control_api::Client;
#[derive(Subcommand, Debug)]
enum Command {
@@ -249,64 +249,6 @@ impl FromStr for NodeAvailabilityArg {
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();

View File

@@ -0,0 +1,252 @@
# Ancestor Timeline Deletion
Created on: 2024-02-23
Author: John Spray
# Summary
When a tenant creates a new timeline that they will treat as their 'main' history,
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
this is necessary because it is forbidden to delete a timeline which has descendants.
A new pageserver API is proposed to 'adopt' data from a parent timeline into
one of its children, such that the link between ancestor and child can be severed,
leaving the parent in a state where it may then be deleted.
# Motivation
Retaining parent timelines currently has two costs:
- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
child's timeline point, even if the child fully covers its keyspace with image
layers and will never actually read from the parent.
# Solution
A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
wish to detach from its parent.
On success, this API will leave the following state:
- The detached child timeline will no longer have an ancestor, and will contain all
the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
the detached child timeline will be modified to have the child timeline as their
new parent.
- The parent timeline will still exist, but the child will no longer have it as an
ancestor. If this was the last timeline that depended on the parent, then the
parent will become deletable.
This API's implementation will consist of a series of retryable steps, such that
on failures/timeout it can safely be called again to reach the target state.
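For illustration, here is a minimal sketch of a caller that drives this API to completion. The management endpoint (`localhost:9898`), the placeholder tenant/timeline ids, and the retry policy are assumptions made for this sketch, not part of the RFC:

```rust
// Hedged, illustrative caller: endpoint, port, ids and the retry policy are
// assumptions for this sketch, not prescribed by the RFC.
use std::time::Duration;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let tenant_shard_id = "3fa85f64b4704562b3fc2c963f66afa6"; // placeholder
    let timeline_id = "de201bd42c964b93a0562dfcb0c9a66d";     // placeholder ("new main")
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor"
    );

    let client = reqwest::Client::new();
    loop {
        // Every step of the operation is retryable, so on a timeout we simply
        // call again until the target state is reached.
        match client.put(url.as_str()).timeout(Duration::from_secs(60)).send().await {
            Ok(resp) if resp.status().is_success() => {
                // The response body lists the timelines that were re-parented to the child.
                println!("detached: {}", resp.text().await?);
                break;
            }
            Ok(resp) => anyhow::bail!("detach_ancestor failed: {}", resp.status()),
            Err(e) if e.is_timeout() => continue,
            Err(e) => return Err(e.into()),
        }
    }
    Ok(())
}
```

The same endpoint is also wrapped by the `timeline_detach_ancestor` method added to `pageserver_client::mgmt_api::Client` elsewhere in this PR.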
## Example
### Before
The user has "rolled back" their project to LSN X, resulting in a "new main"
timeline. The parent "old main" timeline still exists, and they would like
to clean it up.
They have two other timelines A and B. A is from before the rollback point,
and B is from after the rollback point.
```
----"old main" timeline-------X-------------------------------------------->
| | |
|-> child A | |
|-> "new main" timeline |
-> child B
```
### After calling detach ancestor API
The "new main" timeline is no longer dependent on old main, and neither
is child A, because it had a branch point before X.
The user may now choose to delete child B and "old main" to get to
a pristine state. Child B is likely to be unwanted since the user
chose to roll back to X, and it branches from after X. However, we
don't assume this in the API; it is up to the user to delete it.
```
|----"old main" timeline---------------------------------------------------->
|
|
|
-> child B
|----"new main" timeline--------->
|
|-> child A
```
### After removing timelines
We end up with a totally clean state that leaves no trace that a rollback
ever happened: there is only one root timeline.
```
| ----"new main" timeline----------->
|
|-> child A
```
## Caveats
Important things for API users to bear in mind:
- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
child, the parent won't be deletable: you must either delete or detach those
children.
- do _not_ simply loop over all children and detach them all: this can have an
extremely high storage cost. The detach ancestor API is intended for use on a single
timeline to make it the new "main".
- The detach ancestor API should also not be
exposed directly to the user as a button/API, because they might decide
to click it for all the children and thereby generate many copies of the
parent's data -- the detach ancestor API should be used as part
of a high level "clean up after rollback" feature.
## `detach_ancestor` API implementation
Terms used in the following sections:
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point" the ancestor_lsn of "the child"
### Phase 1: write out adopted layers to S3
The child will "adopt" layers from the parent, such that its end state contains
all the parent's history as well as its own.
For all layers in the parent's layer map whose high LSN is below the branch
point, issue S3 CopyObject requests to duplicate them into the child timeline's
prefix. Do not add them to the child's layer map yet.
For delta layers in the parent's layer map which straddle the branch point, read them
and write out only content up to the branch point into new layer objects.
This is a long running operation if the parent has many layers: it should be
implemented in a way that resumes rather than restarting from scratch, if the API
times out and is called again.
As an optimization, if there are no other timelines that will be adopted into
the child, _and_ the child's image layers already fully cover the branch LSN,
then we may skip adopting layers.
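As a rough sketch of the copy step, assuming plain `aws-sdk-s3`, made-up bucket/prefix names, and a precomputed list of eligible layer names (the real implementation drives this through the pageserver's layer map and remote storage abstraction):

```rust
// Hedged sketch only: bucket and prefix names are made up, and the list of eligible
// layers is assumed to be precomputed from the parent's layer map.
use aws_sdk_s3::Client;

async fn adopt_layers_below_branch_point(
    s3: &Client,
    bucket: &str,
    parent_prefix: &str,                  // e.g. "tenants/<tenant>/timelines/<parent>"
    child_prefix: &str,                   // e.g. "tenants/<tenant>/timelines/<child>"
    layers_below_branch_point: &[String], // layer names whose high LSN is below the branch point
) -> Result<(), aws_sdk_s3::Error> {
    for layer in layers_below_branch_point {
        // Server-side copy: no layer bytes flow through the pageserver.
        s3.copy_object()
            .copy_source(format!("{bucket}/{parent_prefix}/{layer}"))
            .bucket(bucket)
            .key(format!("{child_prefix}/{layer}"))
            .send()
            .await?;
        // A resumable implementation would persist progress here so that a retried
        // API call can skip layers that were already copied.
    }
    Ok(())
}
```

Delta layers that straddle the branch point are not copied this way; they get the truncating rewrite described above.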
### Phase 2: update the child's index
Having written out all needed layers in phase 1, atomically link them all
into the child's IndexPart and upload to S3. This may be done while the
child Timeline is still running.
### Phase 3: modify timelines ancestry
Modify the child's ancestor to None, and upload its IndexPart to persist the change.
For all timelines which have the same parent as the child, and have a branch
point lower than our branch point, switch their ancestor_timeline to the child,
and upload their IndexPart to persist the change.
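The re-parenting rule can be summarized with a small sketch over a plain in-memory model; the struct and field names here are illustrative, not the pageserver's actual types:

```rust
// Illustrative model only: not the pageserver's real types, just the phase 3 rule
// written out over a plain in-memory structure.
#[derive(Clone, Copy)]
struct Lsn(u64);

struct TimelineMeta {
    id: u64,
    ancestor: Option<u64>,
    ancestor_lsn: Option<Lsn>,
}

/// `child` is the timeline being detached from `parent`; `branch_point` is its ancestor_lsn.
fn reparent(timelines: &mut [TimelineMeta], parent: u64, child: u64, branch_point: Lsn) {
    for tl in timelines.iter_mut() {
        if tl.id == child {
            // The detached child loses its ancestor entirely.
            tl.ancestor = None;
            tl.ancestor_lsn = None;
        } else if tl.ancestor == Some(parent)
            && tl.ancestor_lsn.map_or(false, |lsn| lsn.0 < branch_point.0)
        {
            // Siblings that branched below the branch point now descend from the child;
            // their branch LSNs stay unchanged, only the ancestor pointer moves.
            tl.ancestor = Some(child);
        }
    }
}
```

In the real system each of these pointer updates is followed by an IndexPart upload, as described above.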
## Alternatives considered
### Generate full image layer on child, rather than adopting parent deltas
This would work for the case of a single child, but would prevent re-targeting
other timelines that depended on the parent. If we detached many children this
way, the storage cost would become prohibitive (consider a 1TB database with
100 child timelines: it would cost 100TiB if they all generated their own image layers).
### Don't rewrite anything: just fake it in the API
We could add a layer of indirection that let a child "pretend" that it had no
ancestor, when in reality it still had the parent. The pageserver API could
accept deletion of ancestor timelines, and just update child metadata to make
them look like they have no ancestor.
This would not achieve the desired reduction in storage cost, and may well be more
complex to maintain than simply implementing the API described in this RFC.
### Avoid copying objects: enable child index to use parent layers directly
We could teach IndexPart to store a TimelineId for each layer, such that a child
timeline could reference a parent's layers directly, rather than copying them
into the child's prefix.
This would impose a cost for the normal case of indices that only target the
timeline's own layers, add complexity, and break the useful simplifying
invariant that timelines "own" their own path. If child timelines were
referencing layers from the parent, we would have to ensure that the parent
never runs GC/compaction again, which would make the API less flexible (the
proposal in this RFC enables deletion of the parent but doesn't require it.)
## Performance
### Adopting layers
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
of such requests: this can take up to tens of seconds and will compete for RemoteStorage
semaphore units with other activity on the pageserver.
- If we are running on storage backend that doesn't implement CopyObject, then
this part will be much more expensive as we would stream all layer content
through the pageserver. This is no different to issuing a lot
of reads to a timeline that does not have a warm local cache: it will move
a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for delta that straddle the branch point will
require streaming read/write of all the layers in question.
### Updating timeline ancestry
The simplest way to update timeline ancestry will probably be to stop and start
all the Timeline objects: this is preferable to the complexity of making their
ancestry mutable at runtime.
There will be a corresponding "stutter" in the availability of the timelines,
of the order 10-100ms, which is the time taken to upload their IndexPart, and
restart the Timeline.
# Interaction with other features
## Concurrent timeline creation
If new historic timelines are created using the parent as an ancestor while the
detach ancestor API is running, they will not be re-parented to the child. This
doesn't break anything, but it leaves the parent in a state where it might not
be possible to delete it.
Since timeline creations are an explicit user action, this is not something we need to
worry about as the storage layer: a user who wants to delete their parent timeline will not create
new children, and if they do, they can choose to delete those children to
enable deleting the parent.
To minimize surprise to the user, the control plane should wait for all in-flight
branch creations to complete before starting the detach ancestor operation, and
it should not allow new branches to be created before the branch point on the
ancestor branch while the operation is ongoing.
## WAL based disaster recovery
WAL-based disaster recovery currently supports restoring only the main
branch. Enabling WAL-based disaster recovery in the future requires that we
keep a record of which timeline generated the WAL and at which LSN the parent
was detached. Keep a list of timeline ids and the LSNs at which they were
detached in the `index_part.json`. Limit the list to the first 100 entries,
after which WAL-based disaster recovery will no longer be possible.
## Sharded tenants
For sharded tenants, calls to the detach ancestor API will pass through the storage
controller, which will handle them the same as timeline creations: invoke first
on shard zero, and then on all the other shards.

View File

@@ -44,7 +44,7 @@ If you need to modify the database schema, here's how to create a migration:
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary and run automatically at startup after it is deployed, so once you've committed a migration no further steps are needed.

View File

@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,

View File

@@ -651,6 +651,17 @@ pub struct TenantDetails {
pub timelines: Vec<TimelineId>,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
pub enum TimelineArchivalState {
Archived,
Unarchived,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {

View File

@@ -1,6 +1,6 @@
use utils::id::TimelineId;
#[derive(Default, serde::Serialize)]
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -443,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
impl GenericRemoteStorage {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
let timeout = storage_config.timeout;
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs { local_path: path } => {
@@ -458,7 +458,7 @@ impl GenericRemoteStorage {
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
}
RemoteStorageKind::AzureContainer(azure_config) => {
let storage_account = azure_config

View File

@@ -16,16 +16,10 @@ use std::{
use anyhow::{anyhow, Context as _};
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
profile::ProfileFileCredentialsProvider,
provider_config::ProviderConfig,
default_provider::credentials::DefaultCredentialsChain,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
BehaviorVersion,
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
@@ -76,40 +70,27 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
tracing::debug!(
"Creating s3 remote storage for S3 bucket {}",
remote_storage_config.bucket_name
);
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
let region = Region::new(remote_storage_config.bucket_region.clone());
let region_opt = Some(region.clone());
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
// https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
// https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
// Incomplete list of auth methods used by this:
// * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
// * "AWS_PROFILE" / `aws sso login --profile <profile>`
// * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// * http (ECS/EKS) container credentials
// * imds v2
let credentials_provider = DefaultCredentialsChain::builder()
.region(region)
.build()
.await;
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
@@ -118,9 +99,9 @@ impl S3Bucket {
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
BehaviorVersion::v2023_11_09(),
)
.region(region)
.region(region_opt)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
@@ -1041,8 +1022,8 @@ mod tests {
use crate::{RemotePath, S3Bucket, S3Config};
#[test]
fn relative_path() {
#[tokio::test]
async fn relative_path() {
let all_paths = ["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
@@ -1085,8 +1066,9 @@ mod tests {
max_keys_per_list_response: Some(5),
upload_storage_class: None,
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
.await
.expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];

View File

@@ -31,6 +31,7 @@ struct EnabledAzure {
impl EnabledAzure {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_azure_client(max_keys_in_list_response)
.await
.context("Azure client creation")
.expect("Azure client creation failed");
@@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
}
}
fn create_azure_client(
async fn create_azure_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
@@ -221,6 +222,8 @@ fn create_azure_client(
timeout: Duration::from_secs(120),
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}

View File

@@ -197,6 +197,7 @@ struct EnabledS3 {
impl EnabledS3 {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_s3_client(max_keys_in_list_response)
.await
.context("S3 client creation")
.expect("S3 client creation failed");
@@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
}
}
fn create_s3_client(
async fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
@@ -385,7 +386,9 @@ fn create_s3_client(
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}

View File

@@ -33,6 +33,10 @@ pub enum Scope {
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -1,6 +1,7 @@
use std::collections::HashMap;
use bytes::Bytes;
use detach_ancestor::AncestorDetached;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
@@ -418,6 +419,23 @@ impl Client {
}
}
pub async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",

View File

@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?;
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
let cancel = CancellationToken::new();
storage
.unwrap()

View File

@@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
)),
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
))
}
}
}

View File

@@ -385,7 +385,7 @@ fn start_pageserver(
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -622,7 +622,6 @@ fn start_pageserver(
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
@@ -702,7 +701,7 @@ fn start_pageserver(
}
}
fn create_remote_storage_client(
async fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {
let config = if let Some(config) = &conf.remote_storage_config {
@@ -712,7 +711,7 @@ fn create_remote_storage_client(
};
// Create the client
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
// If `test_remote_failures` is non-zero, wrap the client with a
// wrapper that simulates failures.

View File

@@ -68,7 +68,6 @@ pub mod defaults {
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -123,7 +122,6 @@ pub mod defaults {
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
@@ -238,7 +236,6 @@ pub struct PageServerConf {
// How often to collect metrics and send them to the metrics endpoint.
pub metric_collection_interval: Duration,
// How often to send unchanged cached metrics to the metrics endpoint.
pub cached_metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub metric_collection_bucket: Option<RemoteStorageConfig>,
pub synthetic_size_calculation_interval: Duration,
@@ -370,7 +367,6 @@ struct PageServerConfigBuilder {
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
metric_collection_interval: BuilderValue<Duration>,
cached_metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
synthetic_size_calculation_interval: BuilderValue<Duration>,
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
@@ -454,10 +450,6 @@ impl PageServerConfigBuilder {
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default metric collection interval")),
cached_metric_collection_interval: Set(humantime::parse_duration(
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default cached_metric_collection_interval")),
synthetic_size_calculation_interval: Set(humantime::parse_duration(
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
)
@@ -589,14 +581,6 @@ impl PageServerConfigBuilder {
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
}
pub fn cached_metric_collection_interval(
&mut self,
cached_metric_collection_interval: Duration,
) {
self.cached_metric_collection_interval =
BuilderValue::Set(cached_metric_collection_interval)
}
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
@@ -730,7 +714,6 @@ impl PageServerConfigBuilder {
broker_keepalive_interval,
log_format,
metric_collection_interval,
cached_metric_collection_interval,
metric_collection_endpoint,
metric_collection_bucket,
synthetic_size_calculation_interval,
@@ -947,7 +930,6 @@ impl PageServerConf {
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
}),
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
"metric_collection_endpoint" => {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
@@ -1080,7 +1062,6 @@ impl PageServerConf {
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(60),
@@ -1259,7 +1240,6 @@ initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
cached_metric_collection_interval = '22200 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
synthetic_size_calculation_interval = '333 s'
@@ -1315,9 +1295,6 @@ background_task_maximum_delay = '334 s'
metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
)?,
cached_metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: humantime::parse_duration(
@@ -1396,7 +1373,6 @@ background_task_maximum_delay = '334 s'
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(333),

View File

@@ -46,19 +46,12 @@ pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_bucket: &Option<RemoteStorageConfig>,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
tracing::warn!(
"cached_metric_collection_interval is no longer used, please set it to zero."
)
}
// spin up background worker that calculates tenant sizes
let worker_ctx =
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
@@ -103,7 +96,7 @@ pub async fn collect_metrics(
.expect("Failed to create http client with timeout");
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
match GenericRemoteStorage::from_config(bucket_config) {
match GenericRemoteStorage::from_config(bucket_config).await {
Ok(client) => Some(client),
Err(e) => {
// Non-fatal error: if we were given an invalid config, we will proceed

View File

@@ -828,9 +828,9 @@ mod test {
}
}
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let harness = TenantHarness::create(test_name).await?;
// We do not load() the harness: we only need its config and remote_storage
@@ -844,7 +844,9 @@ mod test {
},
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let storage = GenericRemoteStorage::from_config(&storage_config)
.await
.unwrap();
let mock_control_plane = MockControlPlane::new();
@@ -922,7 +924,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_smoke() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
let ctx = setup("deletion_queue_smoke")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
@@ -992,7 +996,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_validation() -> anyhow::Result<()> {
let ctx = setup("deletion_queue_validation").expect("Failed test setup");
let ctx = setup("deletion_queue_validation")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
@@ -1051,7 +1057,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_recovery() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
let mut ctx = setup("deletion_queue_recovery")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;

View File

@@ -377,7 +377,7 @@ paths:
schema:
$ref: "#/components/schemas/ConflictError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
parameters:
- name: tenant_id
in: path
@@ -397,6 +397,51 @@ paths:
"202":
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
put:
description: |
Either archives or unarchives the given timeline.
An archived timeline may not have any non-archived children.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/ArchivalConfigRequest"
responses:
"200":
description: Timeline (un)archived successfully
"409":
description: |
The tenant/timeline is already being modified, perhaps by a concurrent call to this API
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
- name: tenant_id
@@ -429,7 +474,9 @@ paths:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
text/html:
description: SVG representation of the tenant and it's timelines.
schema:
type: string
description: SVG representation of the tenant and its timelines.
"401":
description: Unauthorized Error
content:
@@ -568,7 +615,7 @@ paths:
type: string
- name: timeline_id
in: path
ŕequired: true
required: true
schema:
type: string
@@ -774,15 +821,13 @@ components:
TenantCreateRequest:
allOf:
- $ref: '#/components/schemas/TenantConfig'
- $ref: '#/components/schemas/TenantLoadRequest'
- type: object
required:
- new_tenant_id
properties:
new_tenant_id:
type: string
generation:
type: integer
description: Attachment generation number.
TenantLoadRequest:
type: object
properties:
@@ -846,6 +891,15 @@ components:
warm:
type: boolean
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest:
type: object
required:
- state
properties:
state:
description: The archival state of a timeline
type: string
enum: ["Archived", "Unarchived"]
TenantConfig:
type: object
properties:
@@ -1106,7 +1160,7 @@ components:
reparented_timelines:
type: array
description: Set of reparented timeline ids
properties:
items:
type: string
format: hex
description: TimelineId
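For reference, a minimal client-side sketch of the new archival_config call described by the spec changes above. The host/port, tenant shard ID, and the use of the `reqwest` (with its `json` feature), `serde_json`, and `tokio` crates are illustrative assumptions rather than part of this PR; only the path shape and the `ArchivalConfigRequest` body follow the spec. Note that the spec hunk declares `put` while the router hunk later in this diff registers the path with POST; the sketch follows the spec.

```rust
// Hedged sketch: archive a timeline via the new endpoint.
// Everything except the URL path shape and the {"state": ...} body is assumed.
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tenant_shard_id = "1f359dd625e519a1a4e8d7509690f6fc"; // hypothetical
    let timeline_id = "11223344556677881122334455667788"; // hypothetical
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config"
    );

    // Body per the ArchivalConfigRequest schema: "Archived" or "Unarchived".
    let body = json!({ "state": "Archived" });

    let resp = reqwest::Client::new().put(&url).json(&body).send().await?;
    println!("archival_config -> {}", resp.status());
    Ok(())
}
```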

View File

@@ -18,14 +18,17 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigRequest;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantScanRemoteStorageResponse;
use pageserver_api::models::TenantScanRemoteStorageShard;
@@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
@@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler(
json_response(StatusCode::OK, ())
}
async fn timeline_archival_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
state = ?request_data.state,
%timeline_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -1721,7 +1755,9 @@ async fn timeline_detach_ancestor_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::timeline::detach_ancestor::Options;
use crate::tenant::timeline::detach_ancestor;
use pageserver_api::models::detach_ancestor::AncestorDetached;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1729,7 +1765,7 @@ async fn timeline_detach_ancestor_handler(
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
async move {
let mut options = Options::default();
let mut options = detach_ancestor::Options::default();
let rewrite_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1757,27 +1793,36 @@ async fn timeline_detach_ancestor_handler(
let timeline = tenant.get_timeline(timeline_id, true)?;
let (_guard, prepared) = timeline
let progress = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
let res = state
.tenant_manager
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
.await;
// uncomment to allow early as possible Tenant::drop
// drop(tenant);
match res {
Ok(reparented_timelines) => {
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
let resp = match progress {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
.complete_detaching_timeline_ancestor(
tenant_shard_id,
timeline_id,
prepared,
ctx,
)
.await
.context("timeline detach ancestor completion")
.map_err(ApiError::InternalServerError)?;
AncestorDetached {
reparented_timelines,
};
json_response(StatusCode::OK, resp)
}
}
Err(e) => Err(ApiError::InternalServerError(
e.context("timeline detach completion"),
)),
}
detach_ancestor::Progress::Done(resp) => resp,
};
json_response(StatusCode::OK, resp)
}
.instrument(span)
.await
@@ -2778,6 +2823,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|r| api_handler(r, timeline_archival_config_handler),
)
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})

View File

@@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
Delta,
Image,
}
static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_bytes",
"Sum of layer physical sizes in bytes",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_count",
"Number of layers that exist",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
@@ -585,6 +610,22 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_total",
"Size of uncompressed data written into image layers"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_out_bytes_total",
"Size of compressed image layer written"
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -1490,7 +1531,6 @@ pub(crate) enum ComputeCommandKind {
Basebackup,
Fullbackup,
LeaseLsn,
Show,
}
pub(crate) struct ComputeCommandCounters {
@@ -2142,6 +2182,10 @@ pub(crate) struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
pub(crate) layer_count_image: UIntGauge,
pub(crate) layer_size_delta: UIntGauge,
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2224,6 +2268,42 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let layer_size_image = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_count_image = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_size_delta = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let layer_count_delta = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2278,6 +2358,10 @@ impl TimelineMetrics {
last_record_gauge,
pitr_history_size,
archival_size,
layer_size_image,
layer_count_image,
layer_size_delta,
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
@@ -2339,6 +2423,31 @@ impl TimelineMetrics {
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -1479,66 +1479,6 @@ where
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
// show <tenant_id>
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
tracing::Span::current().record("tenant_id", field::display(tenant_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Show)
.inc();
let tenant = self
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
RowDescriptor::int8_col(b"compaction_target_size"),
RowDescriptor::int8_col(b"compaction_period"),
RowDescriptor::int8_col(b"compaction_threshold"),
RowDescriptor::int8_col(b"gc_horizon"),
RowDescriptor::int8_col(b"gc_period"),
RowDescriptor::int8_col(b"image_creation_threshold"),
RowDescriptor::int8_col(b"pitr_interval"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
Some(
tenant
.get_checkpoint_timeout()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_target_size().to_string().as_bytes()),
Some(
tenant
.get_compaction_period()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_threshold().to_string().as_bytes()),
Some(tenant.get_gc_horizon().to_string().as_bytes()),
Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"

View File

@@ -2031,7 +2031,7 @@ mod tests {
#[tokio::test]
async fn aux_files_round_trip() -> anyhow::Result<()> {
let name = "aux_files_round_trip";
let harness = TenantHarness::create(name)?;
let harness = TenantHarness::create(name).await?;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));

View File

@@ -21,6 +21,7 @@ use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::TimelineState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::WalRedoManagerStatus;
@@ -1228,6 +1229,14 @@ impl Tenant {
Ok(timeline_preloads)
}
pub async fn apply_timeline_archival_config(
&self,
_timeline_id: TimelineId,
_config: TimelineArchivalState,
) -> anyhow::Result<()> {
Ok(())
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
@@ -2912,7 +2921,7 @@ impl Tenant {
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
}
}
@@ -2928,7 +2937,7 @@ impl Tenant {
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.pitr)
.checked_sub(target.cutoffs.time)
.unwrap_or(Lsn(0))
.0,
);
@@ -3788,7 +3797,7 @@ pub(crate) mod harness {
}
impl TenantHarness {
pub fn create_custom(
pub async fn create_custom(
test_name: &'static str,
tenant_conf: TenantConf,
tenant_id: TenantId,
@@ -3824,7 +3833,7 @@ pub(crate) mod harness {
},
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
Ok(Self {
@@ -3839,7 +3848,7 @@ pub(crate) mod harness {
})
}
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
// Disable automatic GC and compaction to make the unit tests more deterministic.
// The tests perform them manually if needed.
let tenant_conf = TenantConf {
@@ -3856,6 +3865,7 @@ pub(crate) mod harness {
shard,
Generation::new(0xdeadbeef),
)
.await
}
pub fn span(&self) -> tracing::Span {
@@ -3992,7 +4002,7 @@ mod tests {
#[tokio::test]
async fn test_basic() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4039,7 +4049,8 @@ mod tests {
#[tokio::test]
async fn no_duplicate_timelines() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
.await?
.load()
.await;
let _ = tenant
@@ -4071,7 +4082,7 @@ mod tests {
async fn test_branch() -> anyhow::Result<()> {
use std::str::from_utf8;
let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4193,7 +4204,8 @@ mod tests {
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
.await?
.load()
.await;
let tline = tenant
@@ -4240,7 +4252,8 @@ mod tests {
#[tokio::test]
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
.await?
.load()
.await;
@@ -4262,7 +4275,7 @@ mod tests {
.source()
.unwrap()
.to_string()
.contains("is earlier than latest GC horizon"));
.contains("is earlier than latest GC cutoff"));
}
}
@@ -4295,7 +4308,8 @@ mod tests {
#[tokio::test]
async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
.await?
.load()
.await;
let tline = tenant
@@ -4352,7 +4366,8 @@ mod tests {
#[tokio::test]
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
.await?
.load()
.await;
let tline = tenant
@@ -4382,10 +4397,10 @@ mod tests {
}
#[tokio::test]
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
.load()
.await;
let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4423,7 +4438,7 @@ mod tests {
#[tokio::test]
async fn timeline_load() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = TenantHarness::create(TEST_NAME)?;
let harness = TenantHarness::create(TEST_NAME).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -4450,7 +4465,7 @@ mod tests {
#[tokio::test]
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = TenantHarness::create(TEST_NAME)?;
let harness = TenantHarness::create(TEST_NAME).await?;
// create two timelines
{
let (tenant, ctx) = harness.load().await;
@@ -4498,7 +4513,10 @@ mod tests {
#[tokio::test]
async fn delta_layer_dumping() -> anyhow::Result<()> {
use storage_layer::AsLayerDesc;
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4525,7 +4543,7 @@ mod tests {
#[tokio::test]
async fn test_images() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4696,7 +4714,7 @@ mod tests {
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bulk_insert")?;
let harness = TenantHarness::create("test_bulk_insert").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -4727,7 +4745,7 @@ mod tests {
// so the search can stop at the first delta layer and doesn't traverse any deeper.
#[tokio::test]
async fn test_get_vectored() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored")?;
let harness = TenantHarness::create("test_get_vectored").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -4805,7 +4823,7 @@ mod tests {
#[tokio::test]
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -4891,7 +4909,8 @@ mod tests {
TenantId::generate(),
ShardIdentity::unsharded(),
Generation::new(0xdeadbeef),
)?;
)
.await?;
let (tenant, ctx) = harness.load().await;
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5034,7 +5053,7 @@ mod tests {
// ```
#[tokio::test]
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
let (tenant, ctx) = harness.load().await;
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5183,7 +5202,7 @@ mod tests {
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
let mut harness = TenantHarness::create(name).await?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
@@ -5267,7 +5286,8 @@ mod tests {
#[tokio::test]
async fn test_traverse_branches() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
.await?
.load()
.await;
let mut tline = tenant
@@ -5357,7 +5377,8 @@ mod tests {
#[tokio::test]
async fn test_traverse_ancestors() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
.await?
.load()
.await;
let mut tline = tenant
@@ -5423,7 +5444,8 @@ mod tests {
#[tokio::test]
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
.await?
.load()
.await;
@@ -5492,7 +5514,7 @@ mod tests {
#[tokio::test]
async fn test_create_guard_crash() -> anyhow::Result<()> {
let name = "test_create_guard_crash";
let harness = TenantHarness::create(name)?;
let harness = TenantHarness::create(name).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -5545,7 +5567,7 @@ mod tests {
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
let mut harness = TenantHarness::create(name).await?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
@@ -5569,7 +5591,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_scan() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_scan")?;
let harness = TenantHarness::create("test_metadata_scan").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5688,7 +5710,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5747,7 +5769,9 @@ mod tests {
#[tokio::test]
async fn test_branch_copies_dirty_aux_file_flag() {
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
.await
.unwrap();
// the default aux file policy to switch is v1 if not set by the admins
assert_eq!(
@@ -5849,7 +5873,9 @@ mod tests {
#[tokio::test]
async fn aux_file_policy_switch() {
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
let mut harness = TenantHarness::create("aux_file_policy_switch")
.await
.unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
let (tenant, ctx) = harness.load().await;
@@ -6023,7 +6049,9 @@ mod tests {
#[tokio::test]
async fn aux_file_policy_force_switch() {
let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
let mut harness = TenantHarness::create("aux_file_policy_force_switch")
.await
.unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
let (tenant, ctx) = harness.load().await;
@@ -6084,7 +6112,9 @@ mod tests {
#[tokio::test]
async fn aux_file_policy_auto_detect() {
let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
.await
.unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to v2 mode
let (tenant, ctx) = harness.load().await;
@@ -6147,7 +6177,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_image_creation")?;
let harness = TenantHarness::create("test_metadata_image_creation").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -6246,7 +6276,7 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
@@ -6318,7 +6348,7 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6410,7 +6440,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6490,7 +6520,9 @@ mod tests {
#[tokio::test]
async fn test_metadata_tombstone_image_creation() {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6562,8 +6594,9 @@ mod tests {
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() {
let harness =
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6626,7 +6659,7 @@ mod tests {
#[tokio::test]
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -6718,8 +6751,8 @@ mod tests {
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.pitr = Lsn(0x30);
guard.cutoffs.horizon = Lsn(0x30);
guard.cutoffs.time = Lsn(0x30);
guard.cutoffs.space = Lsn(0x30);
}
let expected_result = [
@@ -6810,7 +6843,7 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..get_key(10),
key_range: Key::MIN..Key::MAX,
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
@@ -6834,7 +6867,7 @@ mod tests {
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_neon_test_record")?;
let harness = TenantHarness::create("test_neon_test_record").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -6915,7 +6948,7 @@ mod tests {
#[tokio::test]
async fn test_lsn_lease() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_lsn = Lsn(0x100);
@@ -7004,7 +7037,7 @@ mod tests {
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?;
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -7109,8 +7142,8 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
pitr: Lsn(0x30),
horizon: Lsn(0x30),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,

View File

@@ -262,7 +262,7 @@ where
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
where
R: 'a,
R: 'a + Send,
{
DiskBtreeIterator {
stream: Box::pin(self.into_stream(start_key, ctx)),
@@ -521,7 +521,7 @@ where
pub struct DiskBtreeIterator<'a> {
#[allow(clippy::type_complexity)]
stream: std::pin::Pin<
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
>,
}

View File

@@ -2698,7 +2698,9 @@ mod tests {
// Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
// wait for it to complete before proceeding.
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
.await
.unwrap();
let (t, _ctx) = h.load().await;
// harness loads it to active, which is forced and nothing is running on the tenant

View File

@@ -241,7 +241,7 @@ use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;
pub(crate) use download::{
@@ -1930,6 +1930,31 @@ impl RemoteTimelineClient {
}
}
}
/// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
/// externally to RemoteTimelineClient.
pub(crate) fn initialized_upload_queue(
&self,
) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
let mut inner = self.upload_queue.lock().unwrap();
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
}
pub(crate) struct UploadQueueAccessor<'a> {
inner: std::sync::MutexGuard<'a, UploadQueue>,
}
impl<'a> UploadQueueAccessor<'a> {
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
match &*self.inner {
UploadQueue::Initialized(x) => &x.clean.0,
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
unreachable!("checked before constructing")
}
}
}
}
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -2103,7 +2128,7 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let harness = TenantHarness::create(test_name).await?;
let (tenant, ctx) = harness.load().await;
let timeline = tenant

View File

@@ -176,6 +176,24 @@ pub(crate) struct Lineage {
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
// FIXME: this is insufficient even for path of two timelines for future wal recovery
// purposes:
//
// assuming a "old main" which has received most of the WAL, and has a branch "new main",
// starting a bit before "old main" last_record_lsn. the current version works fine,
// because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
//
// then assuming "new main" would similarly receive a branch right before its last_record_lsn,
// "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
// here. however, we cannot recover from WAL using only that information, we would need the
// whole ancestry here:
//
// ```json
// [
// ["old main", ancestor_lsn("new main"), _],
// ["new main", ancestor_lsn("new new main"), _]
// ]
// ```
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
@@ -217,6 +235,14 @@ impl Lineage {
self.original_ancestor
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
}
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
self.original_ancestor.is_some()
}
pub(crate) fn is_reparented(&self) -> bool {
!self.reparenting_history.is_empty()
}
}
#[cfg(test)]

View File

@@ -135,11 +135,9 @@ pub struct TimelineInputs {
ancestor_lsn: Lsn,
last_record: Lsn,
latest_gc_cutoff: Lsn,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
/// Cutoff point based on GC settings
next_gc_cutoff: Lsn,
next_pitr_cutoff: Lsn,
/// Cutoff point calculated from the user-supplied 'max_retention_period'
retention_param_cutoff: Option<Lsn>,
@@ -150,7 +148,7 @@ pub struct TimelineInputs {
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`TimelineInputs::latest_gc_cutoff`].
///
@@ -158,11 +156,8 @@ pub struct TimelineInputs {
///
/// ```text
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// initdb_lsn branchpoints* next_pitr_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
@@ -172,7 +167,7 @@ pub(super) async fn gather_inputs(
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
// refresh is needed to update [`timeline::GcCutoffs`]
tenant.refresh_gc_info(cancel, ctx).await?;
// Collect information about all the timelines
@@ -236,20 +231,18 @@ pub(super) async fn gather_inputs(
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
//
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
// We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
// than our internal space cutoff. This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// horizon_cutoff.
let pitr_cutoff = gc_info.cutoffs.pitr;
let horizon_cutoff = gc_info.cutoffs.horizon;
let mut next_gc_cutoff = pitr_cutoff;
// the space cutoff.
let mut next_pitr_cutoff = gc_info.cutoffs.time;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
if next_gc_cutoff < param_cutoff {
next_gc_cutoff = param_cutoff;
if next_pitr_cutoff < param_cutoff {
next_pitr_cutoff = param_cutoff;
}
Some(param_cutoff)
} else {
@@ -263,7 +256,7 @@ pub(super) async fn gather_inputs(
.copied()
.collect::<Vec<_>>();
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -291,10 +284,10 @@ pub(super) async fn gather_inputs(
)
}
// Add a point for the GC cutoff
let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
// Add a point for the PITR cutoff
let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
if !branch_start_needed {
lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
}
lsns.sort_unstable();
@@ -333,7 +326,7 @@ pub(super) async fn gather_inputs(
parent: Some(parent),
lsn: lsn.0,
size: None,
needed: lsn > next_gc_cutoff,
needed: lsn > next_pitr_cutoff,
},
timeline_id: timeline.timeline_id,
kind,
@@ -357,8 +350,8 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: Some(lease_parent),
lsn: lsn.0,
size: None, // Filled in later, if necessary
needed: lsn > next_gc_cutoff, // only needed if the point is within retention.
size: None, // Filled in later, if necessary
needed: lsn > next_pitr_cutoff, // only needed if the point is within retention.
},
timeline_id: timeline.timeline_id,
kind: LsnKind::LeaseStart,
@@ -398,9 +391,7 @@ pub(super) async fn gather_inputs(
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
horizon_cutoff,
pitr_cutoff,
next_gc_cutoff,
next_pitr_cutoff,
retention_param_cutoff,
lease_points,
});
@@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/18D3D98",
"last_record": "0/2230CD0",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/2210CD0",
"pitr_cutoff": "0/2210CD0",
"next_gc_cutoff": "0/2210CD0",
"next_pitr_cutoff": "0/2210CD0",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/176D998",
"last_record": "0/1837770",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/1817770",
"pitr_cutoff": "0/1817770",
"next_gc_cutoff": "0/1817770",
"next_pitr_cutoff": "0/1817770",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/0",
"last_record": "0/18D3D98",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/18B3D98",
"pitr_cutoff": "0/18B3D98",
"next_gc_cutoff": "0/18B3D98",
"next_pitr_cutoff": "0/18B3D98",
"retention_param_cutoff": null,
"lease_points": []
}
@@ -820,9 +805,7 @@ fn verify_size_for_one_branch() {
"ancestor_lsn": "0/0",
"last_record": "47/280A5860",
"latest_gc_cutoff": "47/240A5860",
"horizon_cutoff": "47/240A5860",
"pitr_cutoff": "47/240A5860",
"next_gc_cutoff": "47/240A5860",
"next_pitr_cutoff": "47/240A5860",
"retention_param_cutoff": "0/0",
"lease_points": []
}
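To make the retention arithmetic above concrete, here is a small self-contained sketch (not repo code) of how `next_pitr_cutoff` interacts with a caller-supplied `max_retention_period`. `Lsn` is modeled as a plain `u64`, and the retention period is treated as a byte offset subtracted from the last-record LSN, mirroring the `saturating_sub` in the diff.

```rust
// Hypothetical standalone model of the cutoff selection shown above:
// a caller-supplied retention window can only move the cutoff forward
// (retain less history), never extend retention past the PITR cutoff.
fn effective_pitr_cutoff(
    last_record_lsn: u64,
    pitr_cutoff: u64,
    max_retention_period: Option<u64>,
) -> u64 {
    match max_retention_period {
        Some(period) => {
            let param_cutoff = last_record_lsn.saturating_sub(period);
            pitr_cutoff.max(param_cutoff)
        }
        None => pitr_cutoff,
    }
}

fn main() {
    // A short caller-supplied window wins over the PITR cutoff...
    assert_eq!(
        effective_pitr_cutoff(0x1837770, 0x1817770, Some(0x1000)),
        0x1836770
    );
    // ...while with no parameter the PITR cutoff is used as-is.
    assert_eq!(effective_pitr_cutoff(0x1837770, 0x1817770, None), 0x1817770);
    println!("ok");
}
```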

View File

@@ -6,8 +6,6 @@ pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
#[cfg(test)]
pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
@@ -676,6 +674,26 @@ impl LayerAccessStats {
},
}
}
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
///
/// This indicates whether the layer has been used for some purpose that would motivate
/// us to keep it on disk, such as for serving a getpage request.
fn accessed(&self) -> bool {
let locked = self.0.lock().unwrap();
let inner = &locked.for_eviction_policy;
// Consider it accessed if the most recent access is more recent than
// the most recent change in residence status.
match (
inner.last_accesses.recent(),
inner.last_residence_changes.recent(),
) {
(None, _) => false,
(Some(_), None) => true,
(Some(a), Some(r)) => a.when >= r.timestamp,
}
}
}
/// Get a layer descriptor from a layer.

View File

@@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
@@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -747,12 +751,10 @@ impl DeltaLayer {
}
impl DeltaLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.layer_key_range
}
#[cfg(test)]
pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
&self.layer_lsn_range
}
@@ -1180,9 +1182,7 @@ impl DeltaLayerInner {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
Adapter(self),
)),
layer: self,
};
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
@@ -1426,7 +1426,7 @@ impl DeltaLayerInner {
let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let buf = val.load_raw(ctx).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -1461,8 +1461,7 @@ impl DeltaLayerInner {
use pageserver_api::key::CHECKPOINT_KEY;
use postgres_ffi::CheckPoint;
if key == CHECKPOINT_KEY {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
let val = val.load(ctx).await?;
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
@@ -1515,7 +1514,6 @@ impl DeltaLayerInner {
offset
}
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
@@ -1526,7 +1524,7 @@ impl DeltaLayerInner {
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
key_values_batch: std::collections::VecDeque::new(),
is_end: false,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
planner: StreamingVectoredReadPlanner::new(
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
1024, // The default value. Unit tests might use a different value
),
@@ -1547,17 +1545,24 @@ pub struct DeltaEntry<'a> {
/// Reference to an on-disk value
pub struct ValueRef<'a> {
blob_ref: BlobRef,
reader: BlockCursor<'a>,
layer: &'a DeltaLayerInner,
}
impl<'a> ValueRef<'a> {
/// Loads the value from disk
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
let buf = self.load_raw(ctx).await?;
let val = Value::des(&buf)?;
Ok(val)
}
async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
self.layer,
)));
let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
Ok(buf)
}
}
pub(crate) struct Adapter<T>(T);
@@ -1591,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
}
}
#[cfg(test)]
pub struct DeltaLayerIterator<'a> {
delta_layer: &'a DeltaLayerInner,
ctx: &'a RequestContext,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
planner: StreamingVectoredReadPlanner,
index_iter: DiskBtreeIterator<'a>,
key_values_batch: VecDeque<(Key, Lsn, Value)>,
is_end: bool,
}
#[cfg(test)]
impl<'a> DeltaLayerIterator<'a> {
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1668,6 +1671,7 @@ pub(crate) mod test {
use rand::RngCore;
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::Tenant;
@@ -1677,6 +1681,7 @@ pub(crate) mod test {
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
};
use bytes::Bytes;
/// Construct an index for a fictional delta layer and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -1929,7 +1934,7 @@ pub(crate) mod test {
#[tokio::test]
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
let (tenant, ctx) = harness.load().await;
let timeline_id = TimelineId::generate();
@@ -2029,7 +2034,9 @@ pub(crate) mod test {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let ctx = &ctx;
let timeline = tenant
@@ -2245,6 +2252,15 @@ pub(crate) mod test {
(k1, l1).cmp(&(k2, l2))
}
pub(crate) fn sort_delta_value(
(k1, l1, v1): &(Key, Lsn, Value),
(k2, l2, v2): &(Key, Lsn, Value),
) -> std::cmp::Ordering {
let order_1 = if v1.is_image() { 0 } else { 1 };
let order_2 = if v2.is_image() { 0 } else { 1 };
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
pub(crate) async fn produce_delta_layer(
tenant: &Tenant,
tline: &Arc<Timeline>,
@@ -2253,7 +2269,7 @@ pub(crate) mod test {
) -> anyhow::Result<ResidentLayer> {
deltas.sort_by(sort_delta);
let (key_start, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.last().unwrap();
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
let lsn_end = Lsn(lsn_max.0 + 1);
@@ -2298,10 +2314,7 @@ pub(crate) mod test {
#[tokio::test]
async fn delta_layer_iterator() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant

View File

@@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
@@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -369,12 +373,10 @@ impl ImageLayer {
}
impl ImageLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.key_range
}
#[cfg(test)]
pub(crate) fn lsn(&self) -> Lsn {
self.lsn
}
@@ -699,7 +701,6 @@ impl ImageLayerInner {
}
}
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
@@ -708,9 +709,9 @@ impl ImageLayerInner {
image_layer: self,
ctx,
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
key_values_batch: std::collections::VecDeque::new(),
key_values_batch: VecDeque::new(),
is_end: false,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
planner: StreamingVectoredReadPlanner::new(
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
1024, // The default value. Unit tests might use a different value
),
@@ -737,6 +738,9 @@ struct ImageLayerWriterInner {
key_range: Range<Key>,
lsn: Lsn,
// Total uncompressed bytes passed into put_image
uncompressed_bytes: u64,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -792,6 +796,7 @@ impl ImageLayerWriterInner {
lsn,
tree: tree_builder,
blob_writer,
uncompressed_bytes: 0,
};
Ok(writer)
@@ -810,6 +815,7 @@ impl ImageLayerWriterInner {
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let compression = self.conf.image_compression;
self.uncompressed_bytes += img.len() as u64;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
@@ -835,6 +841,11 @@ impl ImageLayerWriterInner {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();
// Write out the index
@@ -974,17 +985,15 @@ impl Drop for ImageLayerWriter {
}
}
#[cfg(test)]
pub struct ImageLayerIterator<'a> {
image_layer: &'a ImageLayerInner,
ctx: &'a RequestContext,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
planner: StreamingVectoredReadPlanner,
index_iter: DiskBtreeIterator<'a>,
key_values_batch: VecDeque<(Key, Lsn, Value)>,
is_end: bool,
}
#[cfg(test)]
impl<'a> ImageLayerIterator<'a> {
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1102,6 +1111,7 @@ mod test {
ShardIdentity::unsharded(),
get_next_gen(),
)
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
@@ -1168,6 +1178,7 @@ mod test {
// But here, all we care about is that the gen number is unique.
get_next_gen(),
)
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
@@ -1299,7 +1310,7 @@ mod test {
#[tokio::test]
async fn image_layer_iterator() {
let harness = TenantHarness::create("image_layer_iterator").unwrap();
let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant

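The image layer writer above now counts every byte passed into `put_image` and, in `finish`, reports that total alongside the on-disk size (minus the header page) to the `COMPRESSION_IMAGE_INPUT_BYTES` / `COMPRESSION_IMAGE_OUTPUT_BYTES` counters. A minimal standalone sketch of the same bookkeeping, using plain `u64` fields instead of the pageserver's Prometheus metrics (all names below are illustrative only):

```rust
/// Standalone sketch of the compression accounting added above.
struct ImageWriterStats {
    uncompressed_bytes: u64, // total bytes passed into put_image
    compressed_bytes: u64,   // bytes actually written to the blob file (header excluded)
}

impl ImageWriterStats {
    fn new() -> Self {
        Self { uncompressed_bytes: 0, compressed_bytes: 0 }
    }

    /// Called once per image: `img_len` is the uncompressed payload size,
    /// `written_len` is what ended up on disk after (optional) compression.
    fn record_image(&mut self, img_len: u64, written_len: u64) {
        self.uncompressed_bytes += img_len;
        self.compressed_bytes += written_len;
    }

    /// Compression ratio derived from the two counters (smaller is better).
    fn ratio(&self) -> f64 {
        if self.uncompressed_bytes == 0 {
            return 1.0;
        }
        self.compressed_bytes as f64 / self.uncompressed_bytes as f64
    }
}

fn main() {
    let mut stats = ImageWriterStats::new();
    stats.record_image(8192, 2048);
    stats.record_image(8192, 8192); // incompressible page
    println!("compression ratio: {:.2}", stats.ratio()); // 0.62
}
```

Dividing the output counter by the input counter (e.g. over a scrape interval) gives the effective image compression ratio.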
View File

@@ -385,6 +385,7 @@ impl Layer {
}
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
#[allow(dead_code)]
pub(crate) async fn load_key_values(
&self,
ctx: &RequestContext,
@@ -693,6 +694,18 @@ impl Drop for LayerInner {
// and we could be delaying shutdown for nothing.
}
if let Some(timeline) = self.timeline.upgrade() {
// Only need to decrement metrics if the timeline still exists: otherwise
// it will have already de-registered these metrics via TimelineMetrics::shutdown
if self.desc.is_delta() {
timeline.metrics.layer_count_delta.dec();
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
} else {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
}
if !*self.wanted_deleted.get_mut() {
return;
}
@@ -791,6 +804,15 @@ impl LayerInner {
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
};
// This object acts as a RAII guard on these metrics: increment on construction
if desc.is_delta() {
timeline.metrics.layer_count_delta.inc();
timeline.metrics.layer_size_delta.add(desc.file_size);
} else {
timeline.metrics.layer_count_image.inc();
timeline.metrics.layer_size_image.add(desc.file_size);
}
LayerInner {
conf,
debug_str: {
@@ -1469,14 +1491,22 @@ impl LayerInner {
let duration = SystemTime::now().duration_since(local_layer_mtime);
match duration {
Ok(elapsed) => {
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
let accessed = self.access_stats.accessed();
if accessed {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
}
tracing::info!(
residence_millis = elapsed.as_millis(),
accessed,
"evicted layer after known residence period"
);
}
@@ -1889,7 +1919,7 @@ impl ResidentLayer {
self.owner.metadata()
}
#[cfg(test)]
/// Cast the layer to a delta, return an error if it is an image layer.
pub(crate) async fn get_as_delta(
&self,
ctx: &RequestContext,
@@ -1901,7 +1931,7 @@ impl ResidentLayer {
}
}
#[cfg(test)]
/// Cast the layer to an image, return an error if it is a delta layer.
pub(crate) async fn get_as_image(
&self,
ctx: &RequestContext,

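The `LayerInner` changes pair an increment of the per-timeline layer count/size gauges in the constructor with a matching decrement in `Drop`, so the gauges track resident layers for as long as the owning timeline is alive. A minimal sketch of that RAII-gauge pattern, with `AtomicU64`s standing in for the real metric types (names are illustrative):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

/// Stand-in for a pair of gauges: number of layers and their total size.
#[derive(Default)]
struct LayerGauges {
    count: AtomicU64,
    bytes: AtomicU64,
}

/// RAII guard: incrementing on construction and decrementing on Drop keeps
/// the gauges consistent with the set of live `Layer` objects.
struct Layer {
    gauges: Arc<LayerGauges>,
    file_size: u64,
}

impl Layer {
    fn new(gauges: Arc<LayerGauges>, file_size: u64) -> Self {
        gauges.count.fetch_add(1, Ordering::Relaxed);
        gauges.bytes.fetch_add(file_size, Ordering::Relaxed);
        Self { gauges, file_size }
    }
}

impl Drop for Layer {
    fn drop(&mut self) {
        // The real code skips this when the owning timeline is already gone,
        // because shutdown de-registers the metrics wholesale.
        self.gauges.count.fetch_sub(1, Ordering::Relaxed);
        self.gauges.bytes.fetch_sub(self.file_size, Ordering::Relaxed);
    }
}

fn main() {
    let gauges = Arc::new(LayerGauges::default());
    {
        let _a = Layer::new(gauges.clone(), 1024);
        let _b = Layer::new(gauges.clone(), 4096);
        assert_eq!(gauges.count.load(Ordering::Relaxed), 2);
        assert_eq!(gauges.bytes.load(Ordering::Relaxed), 5120);
    }
    assert_eq!(gauges.count.load(Ordering::Relaxed), 0);
    assert_eq!(gauges.bytes.load(Ordering::Relaxed), 0);
}
```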
View File

@@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
async fn smoke_test() {
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("smoke_test").unwrap();
let h = TenantHarness::create("smoke_test").await.unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
@@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
.await
.unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;
@@ -258,7 +260,9 @@ fn read_wins_pending_eviction() {
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
let h = TenantHarness::create("read_wins_pending_eviction")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create(name).unwrap();
let h = TenantHarness::create(name).await.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
#[tokio::test(start_paused = true)]
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let handle = tokio::runtime::Handle::current();
let h =
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let timeline = tenant
@@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
#[tokio::test(start_paused = true)]
async fn evict_and_wait_does_not_wait_for_download() {
// let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
let h = TenantHarness::create("eviction_cancellation_on_drop")
.await
.unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;

View File

@@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use std::cmp::Ordering;
let a = self.peek_next_key_lsn();
let b = other.peek_next_key_lsn();
let a = self.peek_next_key_lsn_value();
let b = other.peek_next_key_lsn_value();
match (a, b) {
(Some((k1, l1)), Some((k2, l2))) => {
let loaded_1 = if self.is_loaded() { 1 } else { 0 };
let loaded_2 = if other.is_loaded() { 1 } else { 0 };
(Some((k1, l1, v1)), Some((k2, l2, v2))) => {
fn map_value_to_num(val: &Option<&Value>) -> usize {
match val {
None => 0,
Some(Value::Image(_)) => 1,
Some(Value::WalRecord(_)) => 2,
}
}
let order_1 = map_value_to_num(&v1);
let order_2 = map_value_to_num(&v2);
// When key_lsn are the same, the unloaded iter will always appear before the loaded one.
// And note that we do a reverse at the end of the comparison, so it works with the max heap.
(k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
(Some(_), None) => Ordering::Less,
(None, Some(_)) => Ordering::Greater,
@@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> {
}
}
fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
match self {
Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
Self::Loaded { iter } => iter
.peek()
.as_ref()
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
Self::NotLoaded {
first_key_lower_bound: (key, lsn),
..
} => Some((key, *lsn)),
} => Some((key, *lsn, None)),
}
}
@@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> {
}
}
/// A merge iterator over delta/image layer iterators. When duplicated records are
/// found, the iterator will not perform any deduplication, and the caller should handle
/// these situations. Duplicated records can arise in several ways:
/// * Two identical deltas at the same LSN.
/// * Two identical images at the same LSN.
/// * A delta and an image at the same LSN, where the image has already applied the delta.
/// The iterator will always put the image before the delta.
pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,
}
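The ordering change above keys the heap on the value type rather than on loaded/not-loaded state: at the same `(key, lsn)`, a not-yet-loaded iterator sorts first, then an image, then a WAL record, and the comparison is reversed so it works with `BinaryHeap`'s max-heap semantics. A small self-contained sketch of that sort key, with simplified stand-ins for `Key`, `Lsn`, and `Value`:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Simplified stand-ins for the pageserver's Key/Lsn/Value types.
type Key = u64;
type Lsn = u64;

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Value {
    Image(&'static str),
    WalRecord(&'static str),
}

/// Same idea as `map_value_to_num` above: unloaded (no value yet) < image < delta,
/// so images are yielded before deltas at the same (key, lsn).
fn value_rank(v: &Option<Value>) -> usize {
    match v {
        None => 0,
        Some(Value::Image(_)) => 1,
        Some(Value::WalRecord(_)) => 2,
    }
}

fn main() {
    let entries = vec![
        (1u64, 10u64, Some(Value::WalRecord("+a"))),
        (1, 10, Some(Value::Image("a"))),
        (1, 10, None), // an iterator that has not loaded its first value yet
    ];

    // BinaryHeap is a max-heap, so wrap the sort key in Reverse to pop the
    // smallest (key, lsn, rank) first, mirroring the reverse() in cmp().
    let mut heap: BinaryHeap<Reverse<(Key, Lsn, usize, Option<Value>)>> = entries
        .into_iter()
        .map(|(k, l, v)| Reverse((k, l, value_rank(&v), v)))
        .collect();

    while let Some(Reverse((k, l, _, v))) = heap.pop() {
        println!("{k}@{l}: {v:?}"); // None first, then Image, then WalRecord
    }
}
```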
@@ -245,8 +262,9 @@ mod tests {
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
},
walrecord::NeonWalRecord,
DEFAULT_PG_VERSION,
};
@@ -275,7 +293,9 @@ mod tests {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap();
let harness = TenantHarness::create("merge_iterator_merge_in_between")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -338,7 +358,9 @@ mod tests {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
let harness = TenantHarness::create("merge_iterator_delta_merge")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -407,6 +429,133 @@ mod tests {
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
}
// TODO: image layer merge, delta+image mixed merge
// TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
#[tokio::test]
async fn delta_image_mixed_merge() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// In this test case, we want to test if the iterator still works correctly with multiple copies
// of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
// Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
// An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
// could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
// one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
// correctly process these situations and return everything as-is, and the upper layer of the system
// will handle duplicated LSNs.
let test_deltas1 = vec![
(
get_key(0),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(0),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("a")),
),
(
get_key(5),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(5),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("b")),
),
];
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let mut test_deltas2 = test_deltas1.clone();
test_deltas2.push((
get_key(10),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
.await
.unwrap();
let test_deltas3 = vec![
(
get_key(0),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"")),
),
(
get_key(5),
Lsn(0x18),
Value::Image(Bytes::copy_from_slice(b"b")),
),
(
get_key(15),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
),
];
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
.await
.unwrap();
let mut test_deltas4 = test_deltas3.clone();
test_deltas4.push((
get_key(20),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
.await
.unwrap();
let mut expect = Vec::new();
expect.extend(test_deltas1);
expect.extend(test_deltas2);
expect.extend(test_deltas3);
expect.extend(test_deltas4);
expect.sort_by(sort_delta_value);
// Test with different layer order for MergeIterator::create to ensure the order
// is stable.
let mut merge_iter = MergeIterator::create(
&[
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
let mut merge_iter = MergeIterator::create(
&[
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
is_send(merge_iter);
}
fn is_send(_: impl Send) {}
}

View File

@@ -69,6 +69,7 @@ use std::{
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
config::defaults::DEFAULT_PITR_INTERVAL,
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::PersistentLayerDesc,
@@ -197,7 +198,7 @@ impl PartialOrd for Hole {
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
drop(rlock)
}
@@ -270,7 +271,7 @@ pub struct Timeline {
///
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
@@ -477,37 +478,32 @@ impl GcInfo {
}
}
/// The `GcInfo` component describing which Lsns need to be retained.
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
#[derive(Debug)]
pub(crate) struct GcCutoffs {
/// Keep everything newer than this point.
///
/// This is calculated by subtracting 'gc_horizon' setting from
/// last-record LSN
///
/// FIXME: is this inclusive or exclusive?
pub(crate) horizon: Lsn,
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.
pub(crate) space: Lsn,
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
/// point.
///
/// This is calculated by finding a number such that a record is needed for PITR
/// if only if its LSN is larger than 'pitr_cutoff'.
pub(crate) pitr: Lsn,
/// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
/// history we must keep to enable reading back at least the PITR interval duration.
pub(crate) time: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
horizon: Lsn::INVALID,
pitr: Lsn::INVALID,
space: Lsn::INVALID,
time: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
std::cmp::min(self.horizon, self.pitr)
std::cmp::min(self.space, self.time)
}
}
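After the rename, `GcCutoffs` carries one LSN for space-based retention (derived from `gc_horizon`) and one for time-based retention (derived from `pitr_interval`); the effective GC cutoff is the smaller of the two, i.e. whichever keeps more history. A tiny worked sketch with made-up LSN values:

```rust
/// Minimal stand-in for the pageserver's Lsn newtype.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct GcCutoffs {
    space: Lsn, // keep at least `gc_horizon` bytes of WAL history
    time: Lsn,  // keep at least `pitr_interval` worth of history
}

impl GcCutoffs {
    fn select_min(&self) -> Lsn {
        std::cmp::min(self.space, self.time)
    }
}

fn main() {
    // Say the size-based cutoff would allow GC up to 0/5000, but the PITR
    // window still needs everything from 0/3000 onwards: GC must stop at 0/3000.
    let cutoffs = GcCutoffs { space: Lsn(0x5000), time: Lsn(0x3000) };
    assert_eq!(cutoffs.select_min(), Lsn(0x3000));
    println!("effective GC cutoff: {:?}", cutoffs.select_min());
}
```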
@@ -866,7 +862,7 @@ impl Timeline {
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.pitr)
.checked_sub(gc_info.cutoffs.time)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
@@ -1565,7 +1561,7 @@ impl Timeline {
) -> anyhow::Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
"LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
lsn,
**latest_gc_cutoff_lsn,
);
@@ -3408,6 +3404,7 @@ impl Timeline {
}
}
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
#[allow(clippy::doc_lazy_continuation)]
/// Get the data needed to reconstruct all keys in the provided keyspace
///
@@ -4732,13 +4729,7 @@ impl Timeline {
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
@@ -4945,24 +4936,21 @@ impl Timeline {
}
/// Find the Lsns above which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
/// page versions more aggressively.
/// garbage collection.
///
/// TODO: that's wishful thinking, compaction doesn't actually do that
/// currently.
/// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
/// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
/// the space-based retention.
///
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
/// needed by read-only nodes. (As of this writing, the caller just passes
/// the latest LSN subtracted by a constant, and doesn't do anything smart
/// to figure out what read-only nodes might actually need.)
///
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
/// whether a record is needed for PITR.
/// This function doesn't simply calculate time & space based retention: it treats time-based
/// retention as authoritative if enabled, and falls back to space-based retention if calculating
/// the LSN for a time point isn't possible. Therefore the GcCutoffs::space in the response might
/// be different from the `space_cutoff` input. Callers should treat the min() of the two cutoffs
/// in the response as the GC cutoff point for the timeline.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(super) async fn find_gc_cutoffs(
&self,
cutoff_horizon: Lsn,
space_cutoff: Lsn,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -4975,58 +4963,87 @@ impl Timeline {
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
//
// Some unit tests depend on garbage-collection working even when
// CLOG data is missing, so that find_lsn_for_timestamp() doesn't
// work, so avoid calling it altogether if time-based retention is not
// configured. It would be pointless anyway.
let pitr_cutoff = if pitr != Duration::ZERO {
let now = SystemTime::now();
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
match self
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
.await?
{
LsnForTimestamp::Present(lsn) => lsn,
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
self.get_last_record_lsn()
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
}
} else {
// If we don't have enough data to convert to LSN,
// play safe and don't remove any layers.
*self.get_latest_gc_cutoff_lsn()
if cfg!(test) {
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
if pitr == Duration::ZERO {
return Ok(GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
});
}
}
// Calculate a time-based limit on how much to retain:
// - if PITR interval is set, then this is our cutoff.
// - if PITR interval is not set, then we do a lookup
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
let time_cutoff = {
let now = SystemTime::now();
let time_range = if pitr == Duration::ZERO {
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
} else {
pitr
};
// If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
let timestamp = to_pg_timestamp(time_cutoff);
match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
LsnForTimestamp::Present(lsn) => Some(lsn),
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
Some(self.get_last_record_lsn())
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
None
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
None
}
}
} else {
// No time-based retention was configured. Interpret this as "keep no history".
self.get_last_record_lsn()
};
Ok(GcCutoffs {
horizon: cutoff_horizon,
pitr: pitr_cutoff,
Ok(match (pitr, time_cutoff) {
(Duration::ZERO, Some(time_cutoff)) => {
// PITR is not set. Retain the size-based limit, or the default time retention,
// whichever requires less data.
GcCutoffs {
time: self.get_last_record_lsn(),
space: std::cmp::max(time_cutoff, space_cutoff),
}
}
(Duration::ZERO, None) => {
// PITR is not set, and time lookup failed
GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
}
}
(_, None) => {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_latest_gc_cutoff_lsn(),
space: space_cutoff,
}
}
(_, Some(time_cutoff)) => {
// PITR interval is set and we looked up timestamp successfully. Ignore
// size based retention and make time cutoff authoritative
GcCutoffs {
time: time_cutoff,
space: time_cutoff,
}
}
})
}
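The match at the end of `find_gc_cutoffs` amounts to a small decision table over whether a PITR interval is configured and whether the timestamp-to-LSN lookup succeeded. A condensed restatement of the same four cases, with placeholder names and LSN values (not the pageserver's API):

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct Cutoffs { time: Lsn, space: Lsn }

fn decide(
    pitr_configured: bool,
    time_cutoff: Option<Lsn>, // result of the timestamp -> LSN lookup
    last_record_lsn: Lsn,
    latest_gc_cutoff: Lsn,
    space_cutoff: Lsn,
) -> Cutoffs {
    match (pitr_configured, time_cutoff) {
        // No PITR: time places no limit; space keeps whichever of the size-based
        // and default-time-based limits requires less data.
        (false, Some(t)) => Cutoffs { time: last_record_lsn, space: std::cmp::max(t, space_cutoff) },
        // No PITR and the lookup failed: fall back to the plain size-based cutoff.
        (false, None) => Cutoffs { time: last_record_lsn, space: space_cutoff },
        // PITR configured but the lookup failed: don't let GC advance past what
        // was already collected.
        (true, None) => Cutoffs { time: latest_gc_cutoff, space: space_cutoff },
        // PITR configured and the lookup succeeded: the time cutoff is authoritative.
        (true, Some(t)) => Cutoffs { time: t, space: t },
    }
}

fn main() {
    let c = decide(true, Some(Lsn(0x40)), Lsn(0x100), Lsn(0x10), Lsn(0x80));
    assert_eq!((c.time, c.space), (Lsn(0x40), Lsn(0x40)));
}
```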
@@ -5051,11 +5068,11 @@ impl Timeline {
return Err(GcError::TimelineCancelled);
}
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let gc_info = self.gc_info.read().unwrap();
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
let pitr_cutoff = gc_info.cutoffs.pitr;
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
let time_cutoff = gc_info.cutoffs.time;
let retain_lsns = gc_info.retain_lsns.clone();
// Gets the maximum LSN that holds the valid lease.
@@ -5065,14 +5082,14 @@ impl Timeline {
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
(
horizon_cutoff,
pitr_cutoff,
space_cutoff,
time_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
)
};
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
@@ -5101,8 +5118,8 @@ impl Timeline {
let res = self
.gc_timeline(
horizon_cutoff,
pitr_cutoff,
space_cutoff,
time_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
new_gc_cutoff,
@@ -5120,8 +5137,8 @@ impl Timeline {
async fn gc_timeline(
&self,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
space_cutoff: Lsn,
time_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
max_lsn_with_valid_lease: Option<Lsn>,
new_gc_cutoff: Lsn,
@@ -5182,22 +5199,22 @@ impl Timeline {
result.layers_total += 1;
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > horizon_cutoff {
if l.get_lsn_range().end > space_cutoff {
debug!(
"keeping {} because it's newer than horizon_cutoff {}",
"keeping {} because it's newer than space_cutoff {}",
l.layer_name(),
horizon_cutoff,
space_cutoff,
);
result.layers_needed_by_cutoff += 1;
continue 'outer;
}
// 2. Is it newer than the PiTR cutoff point?
if l.get_lsn_range().end > pitr_cutoff {
if l.get_lsn_range().end > time_cutoff {
debug!(
"keeping {} because it's newer than pitr_cutoff {}",
"keeping {} because it's newer than time_cutoff {}",
l.layer_name(),
pitr_cutoff,
time_cutoff,
);
result.layers_needed_by_pitr += 1;
continue 'outer;
@@ -6029,8 +6046,9 @@ mod tests {
#[tokio::test]
async fn two_layer_eviction_attempts_at_the_same_time() {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant

View File

@@ -26,9 +26,11 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -195,7 +197,7 @@ impl Timeline {
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.pitr
self.gc_info.read().unwrap().cutoffs.time
);
let layers = self.layers.read().await;
@@ -379,7 +381,7 @@ impl Timeline {
};
let begin = tokio::time::Instant::now();
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
let phase1_layers_locked = self.layers.read().await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -399,9 +401,9 @@ impl Timeline {
}
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
async fn compact_level0_phase1(
self: &Arc<Self>,
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
async fn compact_level0_phase1<'a>(
self: &'a Arc<Self>,
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
ctx: &RequestContext,
@@ -415,6 +417,7 @@ impl Timeline {
.map(|x| guard.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -445,6 +448,22 @@ impl Timeline {
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
// Accumulate the size of layers in `deltas_to_compact`
let mut deltas_to_compact_bytes = 0;
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
// work in this function to only operate on this much delta data at once.
//
// Take the max of the configured value & the default, so that tests that configure tiny values
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
// still let them compact a full stack of L0s in one go.
let delta_size_limit = std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
) as u64
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
@@ -453,7 +472,20 @@ impl Timeline {
break;
}
deltas_to_compact.push(l.download_and_keep_resident().await?);
deltas_to_compact_bytes += l.metadata().file_size;
prev_lsn_end = lsn_range.end;
if deltas_to_compact_bytes >= delta_size_limit {
info!(
l0_deltas_selected = deltas_to_compact.len(),
l0_deltas_total = level0_deltas.len(),
"L0 compaction picker hit max delta layer size limit: {}",
delta_size_limit
);
// Proceed with compaction, but only a subset of L0s
break;
}
}
let lsn_range = Range {
start: deltas_to_compact
@@ -990,7 +1022,7 @@ impl Timeline {
"enhanced legacy compaction currently does not support retain_lsns (branches)"
)));
}
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
let gc_cutoff = gc_info.cutoffs.select_min();
let mut selected_layers = Vec::new();
// TODO: consider retain_lsns
drop(gc_info);
@@ -1008,10 +1040,12 @@ impl Timeline {
);
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
// Also, collect the layer information to decide when to split the new delta layers.
let mut all_key_values = Vec::new();
let mut downloaded_layers = Vec::new();
let mut delta_split_points = BTreeSet::new();
for layer in &layer_selection {
all_key_values.extend(layer.load_key_values(ctx).await?);
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
let desc = layer.layer_desc();
if desc.is_delta() {
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
@@ -1021,44 +1055,28 @@ impl Timeline {
delta_split_points.insert(key_range.end);
}
}
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
// image layers, make image appear before than delta.
struct ValueWrapper<'a>(&'a crate::repository::Value);
impl Ord for ValueWrapper<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use crate::repository::Value;
use std::cmp::Ordering;
match (self.0, other.0) {
(Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
(Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
_ => Ordering::Equal,
}
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
delta_layers.push(layer);
} else {
let layer = resident_layer.get_as_image(ctx).await?;
image_layers.push(layer);
}
}
impl PartialOrd for ValueWrapper<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ValueWrapper<'_> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
}
}
impl Eq for ValueWrapper<'_> {}
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
});
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
let mut accumulated_values = Vec::new();
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
let mut last_key: Option<Key> = None;
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
async fn flush_accumulated_states(
tline: &Arc<Timeline>,
key: Key,
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
accumulated_values: &[(Key, Lsn, crate::repository::Value)],
horizon: Lsn,
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
let mut base_image = None;
@@ -1159,7 +1177,7 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
&(Key::MIN..Key::MAX), // covers the full key range
gc_cutoff,
ctx,
)
@@ -1169,20 +1187,24 @@ impl Timeline {
let delta_split_points = delta_split_points.into_iter().collect_vec();
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
for item @ (key, _, _) in &all_key_values {
if &last_key == key {
accumulated_values.push(item);
while let Some((key, lsn, val)) = merge_iter.next().await? {
if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() {
last_key = Some(key);
}
accumulated_values.push((key, lsn, val));
} else {
let last_key = last_key.as_mut().unwrap();
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
image_layer_writer.put_image(last_key, image, ctx).await?;
image_layer_writer.put_image(*last_key, image, ctx).await?;
delta_values.extend(deltas);
delta_layers.extend(
flush_deltas(
&mut delta_values,
last_key,
*last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
@@ -1192,11 +1214,12 @@ impl Timeline {
.await?,
);
accumulated_values.clear();
accumulated_values.push(item);
last_key = *key;
*last_key = key;
accumulated_values.push((key, lsn, val));
}
}
let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
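The new `delta_size_limit` bounds how much L0 delta data a single compaction pass pulls in: the larger of the configured and default compaction thresholds, multiplied by the larger of the configured and default checkpoint distances. A worked sketch with made-up config values (the real defaults live in `tenant::config::defaults` and are not restated here):

```rust
/// Sketch of the L0 selection bound added above. The config values below are
/// illustrative placeholders, not Neon's actual defaults.
fn delta_size_limit(
    compaction_threshold: usize,
    default_compaction_threshold: usize,
    checkpoint_distance: u64,
    default_checkpoint_distance: u64,
) -> u64 {
    std::cmp::max(compaction_threshold, default_compaction_threshold) as u64
        * std::cmp::max(checkpoint_distance, default_checkpoint_distance)
}

fn main() {
    // Hypothetical numbers: threshold of 10 layers, checkpoint distance of 256 MiB.
    let limit = delta_size_limit(10, 10, 256 << 20, 256 << 20);
    assert_eq!(limit, 10 * (256 << 20)); // roughly 2.5 GiB of L0 deltas per pass

    // A test that configures tiny values still gets the default-sized bound,
    // while a bigger production setting raises the bound accordingly.
    assert_eq!(delta_size_limit(2, 10, 1 << 20, 256 << 20), 10 * (256 << 20));
    assert_eq!(delta_size_limit(20, 10, 512 << 20, 256 << 20), 20 * (512 << 20));
}
```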

View File

@@ -10,6 +10,7 @@ use crate::{
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use pageserver_api::models::detach_ancestor::AncestorDetached;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -39,6 +40,9 @@ pub(crate) enum Error {
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
#[error("failpoint: {}", .0)]
Failpoint(&'static str),
}
impl From<Error> for ApiError {
@@ -57,11 +61,41 @@ impl From<Error> for ApiError {
| e @ Error::CopyDeltaPrefix(_)
| e @ Error::UploadRewritten(_)
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
| e @ Error::Unexpected(_)
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
}
}
}
impl From<crate::tenant::upload_queue::NotInitialized> for Error {
fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
// treat all as shutting down signals, even though that is not entirely correct
// (uninitialized state)
Error::ShuttingDown
}
}
impl From<FlushLayerError> for Error {
fn from(value: FlushLayerError) -> Self {
match value {
FlushLayerError::Cancelled => Error::ShuttingDown,
FlushLayerError::NotRunning(_) => {
// FIXME(#6424): technically statically unreachable right now, given how we never
// drop the sender
Error::ShuttingDown
}
FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
Error::FlushAncestor(value)
}
}
}
}
pub(crate) enum Progress {
Prepared(completion::Completion, PreparedTimelineDetach),
Done(AncestorDetached),
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
@@ -88,7 +122,7 @@ pub(super) async fn prepare(
tenant: &Tenant,
options: Options,
ctx: &RequestContext,
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
) -> Result<Progress, Error> {
use Error::*;
let Some((ancestor, ancestor_lsn)) = detached
@@ -96,15 +130,67 @@ pub(super) async fn prepare(
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
// TODO: check if we have already been detached; for this we need to read the stored data
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
// a projection of the commited data.
{
let accessor = detached.remote_client.initialized_upload_queue()?;
// we are safe to inspect the latest uploaded, because we can only witness this after
// restart is complete and ancestor is no more.
let latest = accessor.latest_uploaded_index_part();
if !latest.lineage.is_detached_from_original_ancestor() {
return Err(NoAncestor);
}
}
// detached has previously been detached; let's inspect each of the current timelines and
// report back the timelines which have been reparented by our detach
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early rather than give inconsistent results in
// a sharded environment.
any_shutdown = true;
true
}
},
);
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
//
// the error is wrong per openapi
return Err(NoAncestor);
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
return Ok(Progress::Done(AncestorDetached {
reparented_timelines: reparented
.into_iter()
.map(|(_, tl)| tl.timeline_id)
.collect(),
}));
};
if !ancestor_lsn.is_valid() {
// rare case, probably wouldn't even load
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
return Err(NoAncestor);
}
@@ -131,6 +217,15 @@ pub(super) async fn prepare(
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
fail::fail_point!(
"timeline-detach-ancestor::before_starting_after_locking",
|_| Err(Error::Failpoint(
"timeline-detach-ancestor::before_starting_after_locking"
))
);
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
let span =
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
@@ -151,7 +246,7 @@ pub(super) async fn prepare(
}
};
res.map_err(FlushAncestor)?;
res?;
// we do not need to wait for uploads to complete but we do need `struct Layer`,
// copying delta prefix is unsupported currently for `InMemoryLayer`.
@@ -159,7 +254,7 @@ pub(super) async fn prepare(
elapsed_ms = started_at.elapsed().as_millis(),
"froze and flushed the ancestor"
);
Ok(())
Ok::<_, Error>(())
}
.instrument(span)
.await?;
@@ -283,7 +378,7 @@ pub(super) async fn prepare(
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok((guard, prepared))
Ok(Progress::Prepared(guard, prepared))
}
fn partition_work(
@@ -350,7 +445,11 @@ async fn copy_lsn_prefix(
target_timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
if target_timeline.cancel.is_cancelled() {
return Err(ShuttingDown);
}
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
@@ -529,7 +628,7 @@ pub(super) async fn complete(
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push(timeline.timeline_id);
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
}
Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had
@@ -551,5 +650,12 @@ pub(super) async fn complete(
tracing::info!("failed to reparent some candidates");
}
reparented.sort_unstable();
let reparented = reparented
.into_iter()
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok(reparented)
}
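`prepare` now returns a `Progress` enum instead of a bare `(Completion, PreparedTimelineDetach)` tuple: `Prepared` when there is still an ancestor to detach from, and `Done` when the stored index part already shows the detach happened, in which case the reparented children are reported back. A minimal sketch of how a caller might branch on it (simplified types, not the pageserver's handler code):

```rust
/// Simplified stand-ins for the types used above.
struct Completion;
struct PreparedTimelineDetach;
struct AncestorDetached {
    reparented_timelines: Vec<u64>, // timeline ids, simplified
}

enum Progress {
    Prepared(Completion, PreparedTimelineDetach),
    Done(AncestorDetached),
}

fn handle(progress: Progress) {
    match progress {
        Progress::Prepared(_guard, _prepared) => {
            // First time through: go on to copy layer prefixes, upload, and
            // finally reparent the other children of the old ancestor.
            println!("detach prepared, continuing with the full operation");
        }
        Progress::Done(detached) => {
            // Retry after a restart: the detach already completed, so just
            // report which timelines ended up reparented.
            println!("already detached; reparented: {:?}", detached.reparented_timelines);
        }
    }
}

fn main() {
    handle(Progress::Done(AncestorDetached { reparented_timelines: vec![1, 2] }));
}
```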

View File

@@ -1118,7 +1118,7 @@ mod tests {
#[tokio::test]
async fn no_connection_no_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("no_connection_no_candidate")?;
let harness = TenantHarness::create("no_connection_no_candidate").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1151,7 +1151,7 @@ mod tests {
#[tokio::test]
async fn connection_no_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("connection_no_candidate")?;
let harness = TenantHarness::create("connection_no_candidate").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1216,7 +1216,7 @@ mod tests {
#[tokio::test]
async fn no_connection_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("no_connection_candidate")?;
let harness = TenantHarness::create("no_connection_candidate").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1279,7 +1279,7 @@ mod tests {
#[tokio::test]
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
let harness = TenantHarness::create("candidate_with_many_connection_failures").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1319,7 +1319,7 @@ mod tests {
#[tokio::test]
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1385,7 +1385,8 @@ mod tests {
#[tokio::test]
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
let harness =
TenantHarness::create("timeout_connection_threshold_current_candidate").await?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1448,7 +1449,7 @@ mod tests {
#[tokio::test]
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();
@@ -1550,7 +1551,7 @@ mod tests {
// and pageserver should prefer to connect to it.
let test_az = Some("test_az".to_owned());
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
let harness = TenantHarness::create("switch_to_same_availability_zone").await?;
let mut state = dummy_state(&harness).await;
state.conf.availability_zone.clone_from(&test_az);
let current_lsn = Lsn(100_000).align();

View File

@@ -228,18 +228,20 @@ impl UploadQueue {
Ok(self.initialized_mut().expect("we just set it"))
}
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
pub(crate) fn initialized_mut(
&mut self,
) -> Result<&mut UploadQueueInitialized, NotInitialized> {
use UploadQueue::*;
match self {
Uninitialized => Err(NotInitialized::Uninitialized.into()),
Uninitialized => Err(NotInitialized::Uninitialized),
Initialized(x) => {
if x.shutting_down {
Err(NotInitialized::ShuttingDown.into())
Err(NotInitialized::ShuttingDown)
} else {
Ok(x)
}
}
Stopped(_) => Err(NotInitialized::Stopped.into()),
Stopped(_) => Err(NotInitialized::Stopped),
}
}

View File

@@ -396,7 +396,6 @@ impl<'a> VectoredBlobReader<'a> {
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
/// max_cnt constraints.
#[cfg(test)]
pub struct StreamingVectoredReadPlanner {
read_builder: Option<VectoredReadBuilder>,
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
@@ -410,7 +409,6 @@ pub struct StreamingVectoredReadPlanner {
cnt: usize,
}
#[cfg(test)]
impl StreamingVectoredReadPlanner {
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
assert!(max_cnt > 0);

View File

@@ -1754,7 +1754,7 @@ mod tests {
#[tokio::test]
async fn test_relsize() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -1975,7 +1975,10 @@ mod tests {
// and then created it again within the same layer.
#[tokio::test]
async fn test_drop_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_drop_extend")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -2046,7 +2049,10 @@ mod tests {
// and then extended it again within the same layer.
#[tokio::test]
async fn test_truncate_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -2188,7 +2194,7 @@ mod tests {
/// split into multiple 1 GB segments in Postgres.
#[tokio::test]
async fn test_large_rel() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -2296,7 +2302,7 @@ mod tests {
let startpoint = Lsn::from_hex("14AEC08").unwrap();
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path =

54
patches/rum.patch Normal file
View File

@@ -0,0 +1,54 @@
commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb
Author: Anastasia Lubennikova <anastasia@neon.tech>
Date: Mon Jul 15 12:31:56 2024 +0100
Neon: fix unlogged index build patch
diff --git a/src/ruminsert.c b/src/ruminsert.c
index e8b209d..e89bf2a 100644
--- a/src/ruminsert.c
+++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(index->rd_smgr);
+#endif
+
initRumState(&buildstate.rumstate, index);
buildstate.rumstate.isBuild = true;
buildstate.indtuples = 0;
@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+#endif
+
/*
* Write index to xlog
*/
@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
UnlockReleaseBuffer(buffer);
}
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+
+ smgr_end_unlogged_build(index->rd_smgr);
+ }
+#endif
+
/*
* Return statistics
*/

13
poetry.lock generated
View File

@@ -2641,19 +2641,18 @@ pbr = "*"
[[package]]
name = "setuptools"
version = "65.5.1"
version = "70.0.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
{file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"

View File

@@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
typed-json.workspace = true
url.workspace = true
urlencoding.workspace = true
utils.workspace = true

View File

@@ -181,8 +181,9 @@ pub async fn worker(
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
let rx = rx.map(RequestData::from);
let storage =
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?;
let properties = WriterProperties::builder()
.set_data_page_size_limit(config.parquet_upload_page_size)
@@ -217,6 +218,7 @@ pub async fn worker(
let storage_disconnect =
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
.await
.context("remote storage for disconnect events init")?;
let parquet_config_disconnect = parquet_config.clone();
tokio::try_join!(
@@ -545,7 +547,9 @@ mod tests {
},
timeout: std::time::Duration::from_secs(120),
};
let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
.await
.unwrap();
worker_inner(storage, rx, config).await.unwrap();

View File

@@ -18,7 +18,7 @@ use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use pq_proto::StartupMessageParamsBuilder;
use serde_json::json;
use serde::Serialize;
use serde_json::Value;
use tokio::time;
use tokio_postgres::error::DbError;
@@ -32,6 +32,7 @@ use tokio_postgres::Transaction;
use tokio_util::sync::CancellationToken;
use tracing::error;
use tracing::info;
use typed_json::json;
use url::Url;
use utils::http::error::ApiError;
@@ -263,13 +264,8 @@ pub async fn handle(
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
_ => None,
};
fn get<'a, T: serde::Serialize>(
db: Option<&'a DbError>,
x: impl FnOnce(&'a DbError) -> T,
) -> Value {
db.map(x)
.and_then(|t| serde_json::to_value(t).ok())
.unwrap_or_default()
fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T {
db.map(x).unwrap_or_default()
}
if let Some(db_error) = db_error {
@@ -278,17 +274,11 @@ pub async fn handle(
let position = db_error.and_then(|db| db.position());
let (position, internal_position, internal_query) = match position {
Some(ErrorPosition::Original(position)) => (
Value::String(position.to_string()),
Value::Null,
Value::Null,
),
Some(ErrorPosition::Internal { position, query }) => (
Value::Null,
Value::String(position.to_string()),
Value::String(query.clone()),
),
None => (Value::Null, Value::Null, Value::Null),
Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None),
Some(ErrorPosition::Internal { position, query }) => {
(None, Some(position.to_string()), Some(query.clone()))
}
None => (None, None, None),
};
let code = get(db_error, |db| db.code().code());
@@ -578,10 +568,8 @@ async fn handle_inner(
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/json");
//
// Now execute the query and return the result
//
let result = match payload {
// Now execute the query and return the result.
let json_output = match payload {
Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
Payload::Batch(statements) => {
if parsed_headers.txn_read_only {
@@ -605,11 +593,9 @@ async fn handle_inner(
let metrics = client.metrics();
// how could this possibly fail
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let len = json_output.len();
let response = response
.body(Full::new(Bytes::from(body)))
.body(Full::new(Bytes::from(json_output)))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
@@ -631,7 +617,7 @@ impl QueryData {
cancel: CancellationToken,
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
) -> Result<Value, SqlOverHttpError> {
) -> Result<String, SqlOverHttpError> {
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
@@ -644,7 +630,10 @@ impl QueryData {
// The query successfully completed.
Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
discard.check_idle(status);
Ok(results)
let json_output =
serde_json::to_string(&results).expect("json serialization should not fail");
Ok(json_output)
}
// The query failed with an error
Either::Left((Err(e), __not_yet_cancelled)) => {
@@ -662,7 +651,10 @@ impl QueryData {
// query successed before it was cancelled.
Ok(Ok((status, results))) => {
discard.check_idle(status);
Ok(results)
let json_output = serde_json::to_string(&results)
.expect("json serialization should not fail");
Ok(json_output)
}
// query failed or was cancelled.
Ok(Err(error)) => {
@@ -696,7 +688,7 @@ impl BatchQueryData {
cancel: CancellationToken,
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
) -> Result<Value, SqlOverHttpError> {
) -> Result<String, SqlOverHttpError> {
info!("starting transaction");
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
@@ -718,9 +710,9 @@ impl BatchQueryData {
e
})?;
let results =
let json_output =
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
Ok(results) => {
Ok(json_output) => {
info!("commit");
let status = transaction.commit().await.map_err(|e| {
// if we cannot commit - for now don't return connection to pool
@@ -729,7 +721,7 @@ impl BatchQueryData {
e
})?;
discard.check_idle(status);
results
json_output
}
Err(SqlOverHttpError::Cancelled(_)) => {
if let Err(err) = cancel_token.cancel_query(NoTls).await {
@@ -753,7 +745,7 @@ impl BatchQueryData {
}
};
Ok(json!({ "results": results }))
Ok(json_output)
}
}
@@ -762,7 +754,7 @@ async fn query_batch(
transaction: &Transaction<'_>,
queries: BatchQueryData,
parsed_headers: HttpHeaders,
) -> Result<Vec<Value>, SqlOverHttpError> {
) -> Result<String, SqlOverHttpError> {
let mut results = Vec::with_capacity(queries.queries.len());
let mut current_size = 0;
for stmt in queries.queries {
@@ -787,7 +779,11 @@ async fn query_batch(
}
}
}
Ok(results)
let results = json!({ "results": results });
let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
Ok(json_output)
}
async fn query_to_json<T: GenericClient>(
@@ -795,7 +791,7 @@ async fn query_to_json<T: GenericClient>(
data: QueryData,
current_size: &mut usize,
parsed_headers: HttpHeaders,
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> {
info!("executing query");
let query_params = data.params;
let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
@@ -844,8 +840,8 @@ async fn query_to_json<T: GenericClient>(
for c in row_stream.columns() {
fields.push(json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"name": c.name().to_owned(),
"dataTypeID": c.type_().oid(),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
@@ -863,15 +859,14 @@ async fn query_to_json<T: GenericClient>(
.map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
// resulting JSON format is based on the format of node-postgres result
Ok((
ready,
json!({
"command": command_tag_name,
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
}),
))
// Resulting JSON format is based on the format of node-postgres result.
let results = json!({
"command": command_tag_name.to_string(),
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
});
Ok((ready, results))
}
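
For readers skimming the hunk above, here is a standalone illustration (not part of the change) of the JSON body shape the handler serializes, mirroring node-postgres result objects; all values and the exact set of field attributes are invented for the example.

```rust
// Illustrative only: the response shape assembled by query_to_json above.
use serde_json::json;

fn main() {
    let example = json!({
        "command": "SELECT",
        "rowCount": 1,
        "rows": [{ "id": "1", "name": "alice" }],
        "fields": [
            { "name": "id",   "dataTypeID": 23, "tableID": 0, "columnID": 1, "dataTypeSize": 4 },
            { "name": "name", "dataTypeID": 25, "tableID": 0, "columnID": 2, "dataTypeSize": -1 }
        ],
        "rowAsArray": false
    });
    println!("{}", serde_json::to_string_pretty(&example).unwrap());
}
```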

View File

@@ -357,11 +357,15 @@ pub async fn task_backup(
info!("metrics backup has shut down");
}
// Even if the remote storage is not configured, we still want to clear the metrics.
let storage = backup_config
.remote_storage_config
.as_ref()
.map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
.transpose()?;
let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() {
Some(
GenericRemoteStorage::from_config(config)
.await
.context("remote storage init")?,
)
} else {
None
};
let mut ticker = tokio::time::interval(backup_config.interval);
let mut prev = Utc::now();
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();

View File

@@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
Ok(())
}
(Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Safekeeper auth",
claims.scope
)
.into(),
)),
(Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => {
Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Safekeeper auth",
claims.scope
)
.into(),
))
}
(Scope::SafekeeperData, _) => Ok(()),
}
}

View File

@@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
metrics::register_internal(Box::new(timeline_collector))?;
wal_backup::init_remote_storage(&conf);
wal_backup::init_remote_storage(&conf).await;
// Keep handles to main tasks to die if any of them disappears.
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =

View File

@@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> {
assert!(flush_lsn >= start_lsn);
if request.until_lsn > flush_lsn {
bail!("requested LSN is beyond the end of the timeline");
bail!(format!(
"requested LSN {} is beyond the end of the timeline {}",
request.until_lsn, flush_lsn
));
}
if request.until_lsn < start_lsn {
bail!("requested LSN is before the start of the timeline");
bail!(format!(
"requested LSN {} is before the start of the timeline {}",
request.until_lsn, start_lsn
));
}
if request.until_lsn > commit_lsn {

View File

@@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.expect("Failed to create broker runtime")
});
pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL remover")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL backup worker")
@@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.build()
.expect("Failed to create WAL backup runtime")
});
pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("metric shifter")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});

View File

@@ -199,10 +199,7 @@ async fn redownload_partial_segment(
file.flush().await?;
let final_path = local_segment_path(mgr, partial);
info!(
"downloaded {} bytes, renaming to {}",
final_path, final_path,
);
info!("downloaded {actual_len} bytes, renaming to {final_path}");
if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await {
// Probably rename succeeded, but fsync of it failed. Remove
// the file then to avoid using it.

View File

@@ -4,7 +4,7 @@
use std::collections::HashSet;
use tracing::{debug, warn};
use tracing::debug;
use crate::timeline_manager::ManagerCtlMessage;
@@ -23,7 +23,7 @@ impl Drop for ResidenceGuard {
.manager_tx
.send(ManagerCtlMessage::GuardDrop(self.guard_id));
if let Err(e) = res {
warn!("failed to send GuardDrop message: {:?}", e);
debug!("failed to send GuardDrop message: {:?}", e);
}
}
}

View File

@@ -22,7 +22,7 @@ use tokio::fs::File;
use tokio::select;
use tokio::sync::mpsc::{self, Receiver, Sender};
use tokio::sync::watch;
use tokio::sync::{watch, OnceCell};
use tokio::time::sleep;
use tracing::*;
@@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline};
use crate::timeline_manager::{Manager, StateSnapshot};
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
use once_cell::sync::OnceCell;
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
@@ -167,7 +165,7 @@ fn determine_offloader(
}
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
// Storage must be configured and initialized when this is called.
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
@@ -178,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
.unwrap()
}
pub fn init_remote_storage(conf: &SafeKeeperConf) {
pub async fn init_remote_storage(conf: &SafeKeeperConf) {
// TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
// dependencies to all tasks instead.
REMOTE_STORAGE.get_or_init(|| {
conf.remote_storage
.as_ref()
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
});
REMOTE_STORAGE
.get_or_init(|| async {
if let Some(conf) = conf.remote_storage.as_ref() {
Some(
GenericRemoteStorage::from_config(conf)
.await
.expect("failed to create remote storage"),
)
} else {
None
}
})
.await;
}
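
Context for the change above: `GenericRemoteStorage::from_config` is now async, so the sync `once_cell::sync::OnceCell` is swapped for `tokio::sync::OnceCell`. A minimal, self-contained sketch of that async initialization pattern (independent of the safekeeper code; names invented):

```rust
use tokio::sync::OnceCell;

static VALUE: OnceCell<u32> = OnceCell::const_new();

// Stands in for an async constructor such as GenericRemoteStorage::from_config(..).await
async fn expensive_init() -> u32 {
    42
}

#[tokio::main]
async fn main() {
    // The future passed to get_or_init runs at most once, even with concurrent callers.
    let v = VALUE.get_or_init(|| async { expensive_init().await }).await;
    assert_eq!(*v, 42);

    // Subsequent calls return the cached value; the initializer is not invoked again.
    let again = VALUE.get_or_init(|| async { expensive_init().await }).await;
    assert_eq!(*again, 42);
}
```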
struct WalBackupTask {

View File

@@ -289,6 +289,18 @@ impl PartialBackup {
})
.collect();
if new_segments.len() == 1 {
// we have an uploaded segment; it must not be deleted from remote storage
segments_to_delete.retain(|name| name != &new_segments[0].name);
} else {
// there should always be zero or one uploaded segment
assert!(
new_segments.is_empty(),
"too many uploaded segments: {:?}",
new_segments
);
}
info!("deleting objects: {:?}", segments_to_delete);
let mut objects_to_delete = vec![];
for seg in segments_to_delete.iter() {

View File

@@ -0,0 +1,23 @@
[package]
name = "storage_controller_client"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
pageserver_api.workspace = true
pageserver_client.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio.workspace = true
futures.workspace = true
tokio-util.workspace = true
anyhow.workspace = true
postgres.workspace = true
bytes.workspace = true

View File

@@ -0,0 +1,62 @@
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::{Method, Url};
use serde::{de::DeserializeOwned, Serialize};
use std::str::FromStr;
pub struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
pub fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
pub async fn dispatch<RQ, RS>(
&self,
method: Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
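
A hypothetical usage sketch of the new client (not part of the diff): fetching a tenant description from the storage controller, as the scrubber does later in this PR. The base URL, token, and error conversion here are assumptions for illustration only.

```rust
use pageserver_api::controller_api::TenantDescribeResponse;
use reqwest::{Method, Url};
use storage_controller_client::control_api::Client;
use utils::id::TenantId;

async fn describe_tenant(tenant_id: TenantId) -> anyhow::Result<TenantDescribeResponse> {
    // Base URL as handed out for pageserver upcalls; dispatch() keeps only the
    // host and port when building the request URL.
    let client = Client::new(
        Url::parse("http://127.0.0.1:1234/upcall/v1/")?,
        Some("example-jwt-with-scrubber-scope".to_string()), // placeholder token
    );
    let desc = client
        .dispatch::<(), TenantDescribeResponse>(
            Method::GET,
            format!("control/v1/tenant/{tenant_id}"),
            None,
        )
        .await?;
    Ok(desc)
}
```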

View File

@@ -0,0 +1 @@
pub mod control_api;

View File

@@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete(
.await
}
async fn handle_tenant_timeline_detach_ancestor(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
let res = service
.tenant_timeline_detach_ancestor(tenant_id, timeline_id)
.await?;
json_response(StatusCode::OK, res)
}
async fn handle_tenant_timeline_passthrough(
service: Arc<Service>,
req: Request<Body>,
@@ -414,7 +430,7 @@ async fn handle_tenant_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
check_permissions(&req, Scope::Scrubber)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
@@ -1006,6 +1022,16 @@ pub fn make_router(
RequestName("v1_tenant_timeline"),
)
})
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor",
|r| {
tenant_service_handler(
r,
handle_tenant_timeline_detach_ancestor,
RequestName("v1_tenant_timeline_detach_ancestor"),
)
},
)
// Tenant detail GET passthrough to shard zero:
.get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(

View File

@@ -1,11 +1,11 @@
use anyhow::{anyhow, Context};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
@@ -51,10 +51,6 @@ struct Cli {
#[arg(long)]
compute_hook_url: Option<String>,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
#[arg(long)]
database_url: Option<String>,
@@ -206,11 +202,10 @@ async fn async_main() -> anyhow::Result<()> {
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
"version: {}, launch_timestamp: {}, build_tag {}, listening on {}",
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
args.listen
);
@@ -277,8 +272,7 @@ async fn async_main() -> anyhow::Result<()> {
.await
.context("Running database migrations")?;
let json_path = args.path;
let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
let persistence = Arc::new(Persistence::new(secrets.database_url));
let service = Service::spawn(config, persistence.clone()).await?;
@@ -316,22 +310,23 @@ async fn async_main() -> anyhow::Result<()> {
}
tracing::info!("Terminating on signal");
if json_path.is_some() {
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
// full postgres dumps around.
if let Err(e) = persistence.write_tenants_json().await {
tracing::error!("Failed to write JSON on shutdown: {e}")
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service.
server_shutdown.cancel();
match tokio::time::timeout(Duration::from_secs(5), server_task).await {
Ok(Ok(_)) => {
tracing::info!("Joined HTTP server task");
}
Ok(Err(e)) => {
tracing::error!("Error joining HTTP server task: {e}")
}
Err(_) => {
tracing::warn!("Timed out joining HTTP server task");
// We will fall through and shut down the service anyway, any request handlers
// in flight will experience cancellation & their clients will see a torn connection.
}
}
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service
server_shutdown.cancel();
if let Err(e) = server_task.await {
tracing::error!("Error joining HTTP server task: {e}")
}
tracing::info!("Joined HTTP server task");
service.shutdown().await;
tracing::info!("Service shutdown complete");

View File

@@ -1,8 +1,9 @@
use pageserver_api::{
models::{
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
TopTenantShardsRequest, TopTenantShardsResponse,
},
shard::TenantShardId,
};
@@ -226,6 +227,21 @@ impl PageserverClient {
)
}
pub(crate) async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
measured_request!(
"timeline_detach_ancestor",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.timeline_detach_ancestor(tenant_shard_id, timeline_id)
.await
)
}
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
measured_request!(
"utilization",

View File

@@ -5,8 +5,6 @@ use std::time::Duration;
use std::time::Instant;
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
@@ -55,11 +53,6 @@ use crate::node::Node;
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
pub struct Persistence {
connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
// compatible just yet.
json_path: Option<Utf8PathBuf>,
}
/// Legacy format, for use in JSON compat objects in test environment
@@ -124,7 +117,7 @@ impl Persistence {
const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
pub fn new(database_url: String) -> Self {
let manager = diesel::r2d2::ConnectionManager::<PgConnection>::new(database_url);
// We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
@@ -139,10 +132,7 @@ impl Persistence {
.build(manager)
.expect("Could not build connection pool");
Self {
connection_pool,
json_path,
}
Self { connection_pool }
}
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
@@ -302,85 +292,13 @@ impl Persistence {
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.await?;
if loaded.is_empty() {
if let Some(path) = &self.json_path {
if tokio::fs::try_exists(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
{
tracing::info!("Importing from legacy JSON format at {path}");
return self.list_tenant_shards_json(path).await;
}
}
}
Ok(loaded)
}
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
pub(crate) async fn list_tenant_shards_json(
&self,
path: &Utf8Path,
) -> DatabaseResult<Vec<TenantShardPersistence>> {
let bytes = tokio::fs::read(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for shard in decoded.tenants.values_mut() {
if shard.placement_policy == "\"Single\"" {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
}
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
// Synchronize database with what is in the JSON file
self.insert_tenant_shards(tenants.clone()).await?;
Ok(tenants)
}
/// For use in testing environments, where we dump out JSON on shutdown.
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
let Some(path) = &self.json_path else {
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
};
tracing::info!("Writing state to {path}...");
let tenants = self.list_tenant_shards().await?;
let mut tenants_map = HashMap::new();
for tsp in tenants {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
}
let json = serde_json::to_string(&JsonPersistence {
tenants: tenants_map,
})?;
tokio::fs::write(path, &json).await?;
tracing::info!("Wrote {} bytes to {path}...", json.len());
Ok(())
self.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.await
}
/// Tenants must be persisted before we schedule them for the first time. This enables us

View File

@@ -117,6 +117,7 @@ enum TenantOperations {
TimelineCreate,
TimelineDelete,
AttachHook,
TimelineDetachAncestor,
}
#[derive(Clone, strum_macros::Display)]
@@ -2376,18 +2377,18 @@ impl Service {
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
client
.tenant_time_travel_remote_storage(
tenant_shard_id,
&timestamp,
&done_if_after,
)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
node
))
})?;
.tenant_time_travel_remote_storage(
tenant_shard_id,
&timestamp,
&done_if_after,
)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
node
))
})?;
}
}
Ok(())
@@ -2757,7 +2758,7 @@ impl Service {
// Create timeline on remaining shards with number >0
if !targets.is_empty() {
// If we had multiple shards, issue requests for the remainder now.
let jwt = self.config.jwt_token.clone();
let jwt = &self.config.jwt_token;
self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
let create_req = create_req.clone();
Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
@@ -2768,6 +2769,114 @@ impl Service {
Ok(timeline_info)
}
pub(crate) async fn tenant_timeline_detach_ancestor(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<models::detach_ancestor::AncestorDetached, ApiError> {
tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",);
let _tenant_lock = trace_shared_lock(
&self.tenant_op_locks,
tenant_id,
TenantOperations::TimelineDetachAncestor,
)
.await;
self.ensure_attached_wait(tenant_id).await?;
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.get_attached().ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
}
async fn detach_one(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
node: Node,
jwt: Option<String>,
) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
tracing::info!(
"Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
);
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
client
.timeline_detach_ancestor(tenant_shard_id, timeline_id)
.await
.map_err(|e| {
use mgmt_api::Error;
match e {
// no ancestor (ever)
Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
"{node}: {}",
msg.strip_prefix("Conflict: ").unwrap_or(&msg)
)),
// too many ancestors
Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
}
// rest can be mapped
other => passthrough_api_error(&node, other),
}
})
.map(|res| (tenant_shard_id.shard_number, res))
}
// no shard needs to go first/last; the operation should be idempotent
// TODO: it would be great to ensure that all shards return the same error
let mut results = self
.tenant_for_shards(targets, |tenant_shard_id, node| {
futures::FutureExt::boxed(detach_one(
tenant_shard_id,
timeline_id,
node,
self.config.jwt_token.clone(),
))
})
.await?;
let any = results.pop().expect("we must have at least one response");
let mismatching = results
.iter()
.filter(|(_, res)| res != &any.1)
.collect::<Vec<_>>();
if !mismatching.is_empty() {
let matching = results.len() - mismatching.len();
tracing::error!(
matching,
compared_against=?any,
?mismatching,
"shards returned different results"
);
}
Ok(any.1)
}
/// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
///
/// On success, the returned vector contains exactly the same number of elements as the input `locations`.
@@ -2894,8 +3003,8 @@ impl Service {
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
))
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
))
})
}
@@ -3847,6 +3956,8 @@ impl Service {
"failpoint".to_string()
)));
failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
tracing::info!(
"Split {} into {}",
parent_id,

View File

@@ -34,6 +34,7 @@ camino.workspace = true
rustls.workspace = true
rustls-native-certs.workspace = true
once_cell.workspace = true
storage_controller_client.workspace = true
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }

View File

@@ -24,6 +24,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use remote_storage::RemotePath;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use tokio::io::AsyncReadExt;
@@ -31,7 +32,7 @@ use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use utils::fs_ext;
use utils::id::{TenantId, TimelineId};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
const MAX_RETRIES: usize = 20;
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -54,7 +55,7 @@ pub struct S3Target {
/// in the pageserver, as all timeline objects existing in the scope of a particular
/// tenant: the scrubber is different in that it handles collections of data referring to many
/// TenantShardTimelineIds in one place.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct TenantShardTimelineId {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
@@ -67,6 +68,10 @@ impl TenantShardTimelineId {
timeline_id,
}
}
fn as_tenant_timeline_id(&self) -> TenantTimelineId {
TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id)
}
}
impl Display for TenantShardTimelineId {
@@ -179,6 +184,22 @@ impl RootTarget {
.with_sub_segment(&id.timeline_id.to_string())
}
/// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal
/// key in the S3 bucket.
pub fn absolute_key(&self, key: &RemotePath) -> String {
let root = match self {
Self::Pageserver(root) => root,
Self::Safekeeper(root) => root,
};
let prefix = &root.prefix_in_bucket;
if prefix.ends_with('/') {
format!("{prefix}{key}")
} else {
format!("{prefix}/{key}")
}
}
pub fn bucket_name(&self) -> &str {
match self {
Self::Pageserver(root) => &root.bucket_name,
@@ -216,6 +237,14 @@ impl BucketConfig {
}
}
pub struct ControllerClientConfig {
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
pub controller_api: Url,
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
pub controller_jwt: String,
}
pub struct ConsoleConfig {
pub token: String,
pub base_url: Url,

View File

@@ -1,11 +1,12 @@
use anyhow::bail;
use anyhow::{anyhow, bail};
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use storage_scrubber::find_large_objects;
use reqwest::Url;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
use storage_scrubber::{find_large_objects, ControllerClientConfig};
use storage_scrubber::{
init_logging, pageserver_physical_gc::pageserver_physical_gc,
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
@@ -24,6 +25,14 @@ struct Cli {
#[arg(short, long, default_value_t = false)]
delete: bool,
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
controller_api: Option<Url>,
#[arg(long)]
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
controller_jwt: Option<String>,
}
#[derive(Subcommand, Debug)]
@@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> {
min_age,
mode,
} => {
let summary =
pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?;
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match (&controller_client_conf, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set
}
(None, GcMode::Full) => {
// The part of physical GC where we erase ancestor layers cannot be done safely without
// confirming the most recent complete shard split with the controller. Refuse to run, rather
// than doing it unsafely.
return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
}
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
// These GcModes do not require the controller to run.
}
}
let summary = pageserver_physical_gc(
bucket_config,
controller_client_conf,
tenant_ids,
min_age.into(),
mode,
)
.await?;
println!("{}", serde_json::to_string(&summary).unwrap());
Ok(())
}

View File

@@ -1,22 +1,50 @@
use std::time::{Duration, UNIX_EPOCH};
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use crate::{
init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
use aws_sdk_s3::Client;
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::IndexPart;
use pageserver_api::shard::TenantShardId;
use pageserver_api::controller_api::TenantDescribeResponse;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use remote_storage::RemotePath;
use reqwest::Method;
use serde::Serialize;
use storage_controller_client::control_api;
use tracing::{info_span, Instrument};
use utils::generation::Generation;
use utils::id::{TenantId, TenantTimelineId};
#[derive(Serialize, Default)]
pub struct GcSummary {
indices_deleted: usize,
remote_storage_errors: usize,
controller_api_errors: usize,
ancestor_layers_deleted: usize,
}
impl GcSummary {
fn merge(&mut self, other: Self) {
let Self {
indices_deleted,
remote_storage_errors,
ancestor_layers_deleted,
controller_api_errors,
} = other;
self.indices_deleted += indices_deleted;
self.remote_storage_errors += remote_storage_errors;
self.ancestor_layers_deleted += ancestor_layers_deleted;
self.controller_api_errors += controller_api_errors;
}
}
#[derive(clap::ValueEnum, Debug, Clone, Copy)]
@@ -26,9 +54,9 @@ pub enum GcMode {
// Enable only removing old-generation indices
IndicesOnly,
// Enable all forms of GC
// TODO: this will be used when shard split ancestor layer deletion is added
// All,
Full,
}
impl std::fmt::Display for GcMode {
@@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode {
match self {
GcMode::DryRun => write!(f, "dry-run"),
GcMode::IndicesOnly => write!(f, "indices-only"),
GcMode::Full => write!(f, "full"),
}
}
}
mod refs {
use super::*;
// Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other
// shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that
// do have cross-shard refs should eventually drop most of them via compaction.
//
// In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor
// which is referenced_.
#[derive(Default)]
pub(super) struct AncestorRefs(
BTreeMap<TenantTimelineId, HashMap<(ShardIndex, LayerName), usize>>,
);
impl AncestorRefs {
/// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline.
pub(super) fn update(
&mut self,
ttid: TenantShardTimelineId,
layers: Vec<(LayerName, LayerFileMetadata)>,
) {
let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default();
for (layer_name, layer_metadata) in layers {
// Increment refcount of this layer in the ancestor shard
*(ttid_refs
.entry((layer_metadata.shard, layer_name))
.or_default()) += 1;
}
}
/// For a particular TTID, return the map of all ancestor layers referenced by a descendant to their refcount
///
/// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendant.
pub(super) fn get_ttid_refcounts(
&self,
ttid: &TenantTimelineId,
) -> Option<&HashMap<(ShardIndex, LayerName), usize>> {
self.0.get(ttid)
}
}
}
use refs::AncestorRefs;
// As we see shards for a tenant, accumulate knowledge needed for cross-shard GC:
// - Are there any ancestor shards?
// - Are there any refs to ancestor shards' layers?
#[derive(Default)]
struct TenantRefAccumulator {
shards_seen: HashMap<TenantId, Vec<ShardIndex>>,
// For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
ancestor_ref_shards: AncestorRefs,
}
impl TenantRefAccumulator {
fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) {
let this_shard_idx = ttid.tenant_shard_id.to_index();
(*self
.shards_seen
.entry(ttid.tenant_shard_id.tenant_id)
.or_default())
.push(this_shard_idx);
let mut ancestor_refs = Vec::new();
for (layer_name, layer_metadata) in &index_part.layer_metadata {
if layer_metadata.shard != this_shard_idx {
// This is a reference from this shard to a layer in an ancestor shard: we must track this
// as a marker to not GC this layer from the parent.
ancestor_refs.push((layer_name.clone(), layer_metadata.clone()));
}
}
if !ancestor_refs.is_empty() {
tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len());
self.ancestor_ref_shards.update(ttid, ancestor_refs);
}
}
/// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and a map of referenced ancestor layers to preserve
async fn into_gc_ancestors(
self,
controller_client: &control_api::Client,
summary: &mut GcSummary,
) -> (Vec<TenantShardId>, AncestorRefs) {
let mut ancestors_to_gc = Vec::new();
for (tenant_id, mut shard_indices) in self.shards_seen {
// Find the highest shard count
let latest_count = shard_indices
.iter()
.map(|i| i.shard_count)
.max()
.expect("Always at least one shard");
let (mut latest_shards, ancestor_shards) = {
let at =
itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
(shard_indices[0..at].to_owned(), &shard_indices[at..])
};
// Sort shards, as we will later compare them with a sorted list from the controller
latest_shards.sort();
// Check that we have a complete view of the latest shard count: this should always be the case unless we happened
// to scan the S3 bucket halfway through a shard split.
if latest_shards.len() != latest_count.count() as usize {
// This should be extremely rare, so we warn on it.
tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
continue;
}
// Check if we have any non-latest-count shards
if ancestor_shards.is_empty() {
tracing::debug!(%tenant_id, "No ancestor shards to clean up");
continue;
}
// Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We
// must only do this work if the tenant is not currently being split: otherwise, it is not safe
// to GC ancestors, because if the split fails then the controller will try to attach ancestor
// shards again.
match controller_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await
{
Err(e) => {
// We were not able to learn the latest shard split state from the controller, so we will not
// do ancestor GC on this tenant.
tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}");
summary.controller_api_errors += 1;
continue;
}
Ok(desc) => {
// We expect to see that the latest shard count matches the one we saw in S3, and that none
// of the shards indicate splitting in progress.
let controller_indices: Vec<ShardIndex> = desc
.shards
.iter()
.map(|s| s.tenant_shard_id.to_index())
.collect();
if controller_indices != latest_shards {
tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
continue;
}
if desc.shards.iter().any(|s| s.is_splitting) {
tracing::info!(%tenant_id, "One or more shards is currently splitting");
continue;
}
// This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs.
tracing::info!(%tenant_id, "Validated state with controller: {desc:?}");
}
}
// GC ancestor shards
for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId {
tenant_id,
shard_count: idx.shard_count,
shard_number: idx.shard_number,
}) {
ancestors_to_gc.push(ancestor_shard);
}
}
(ancestors_to_gc, self.ancestor_ref_shards)
}
}
async fn is_old_enough(
s3_client: &Client,
bucket_config: &BucketConfig,
min_age: &Duration,
key: &str,
summary: &mut GcSummary,
) -> bool {
// Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident
// it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects.
let age: Duration = match s3_client
.head_object()
.bucket(&bucket_config.bucket)
.key(key)
.send()
.await
{
Ok(response) => match response.last_modified {
None => {
tracing::warn!("Missing last_modified");
summary.remote_storage_errors += 1;
return false;
}
Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) {
Ok(Ok(e)) => e,
Err(_) | Ok(Err(_)) => {
tracing::warn!("Bad last_modified time: {last_modified:?}");
return false;
}
},
},
Err(e) => {
tracing::warn!("Failed to HEAD {key}: {e}");
summary.remote_storage_errors += 1;
return false;
}
};
let old_enough = &age > min_age;
if !old_enough {
tracing::info!(
"Skipping young object {} < {}",
humantime::format_duration(age),
humantime::format_duration(*min_age)
);
}
old_enough
}
async fn maybe_delete_index(
s3_client: &Client,
bucket_config: &BucketConfig,
@@ -79,45 +329,7 @@ async fn maybe_delete_index(
return;
}
// Validation: we will only delete indices after one week, so that during incidents we will have
// easy access to recent indices.
let age: Duration = match s3_client
.head_object()
.bucket(&bucket_config.bucket)
.key(key)
.send()
.await
{
Ok(response) => match response.last_modified {
None => {
tracing::warn!("Missing last_modified");
summary.remote_storage_errors += 1;
return;
}
Some(last_modified) => {
let last_modified =
UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64());
match last_modified.elapsed() {
Ok(e) => e,
Err(_) => {
tracing::warn!("Bad last_modified time: {last_modified:?}");
return;
}
}
}
},
Err(e) => {
tracing::warn!("Failed to HEAD {key}: {e}");
summary.remote_storage_errors += 1;
return;
}
};
if &age < min_age {
tracing::info!(
"Skipping young object {} < {}",
age.as_secs_f64(),
min_age.as_secs_f64()
);
if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await {
return;
}
@@ -145,6 +357,108 @@ async fn maybe_delete_index(
}
}
#[allow(clippy::too_many_arguments)]
async fn gc_ancestor(
s3_client: &Client,
bucket_config: &BucketConfig,
root_target: &RootTarget,
min_age: &Duration,
ancestor: TenantShardId,
refs: &AncestorRefs,
mode: GcMode,
summary: &mut GcSummary,
) -> anyhow::Result<()> {
// Scan timelines in the ancestor
let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?;
let mut timelines = std::pin::pin!(timelines);
// Build a list of keys to retain
while let Some(ttid) = timelines.next().await {
let ttid = ttid?;
let data = list_timeline_blobs(s3_client, ttid, root_target).await?;
let s3_layers = match data.blob_data {
BlobDataParseResult::Parsed {
index_part: _,
index_part_generation: _,
s3_layers,
} => s3_layers,
BlobDataParseResult::Relic => {
// Post-deletion tenant location: don't try and GC it.
continue;
}
BlobDataParseResult::Incorrect(reasons) => {
// Our primary purpose isn't to report on bad data, but we log this rather than skipping silently
tracing::warn!(
"Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
);
continue;
}
};
let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id());
let ancestor_shard_index = ttid.tenant_shard_id.to_index();
for (layer_name, layer_gen) in s3_layers {
let ref_count = ttid_refs
.and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone())))
.copied()
.unwrap_or(0);
if ref_count > 0 {
tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs");
continue;
}
tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced");
// Build the key for the layer we are considering deleting
let key = root_target.absolute_key(&remote_layer_path(
&ttid.tenant_shard_id.tenant_id,
&ttid.timeline_id,
ancestor_shard_index,
&layer_name,
layer_gen,
));
// We apply a time threshold to GCing objects that are un-referenced: this preserves our ability
// to roll back a shard split if we have to, by avoiding deleting ancestor layers right away
if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await {
continue;
}
if !matches!(mode, GcMode::Full) {
tracing::info!("Dry run: would delete key {key}");
continue;
}
// All validations passed: erase the object
match s3_client
.delete_object()
.bucket(&bucket_config.bucket)
.key(&key)
.send()
.await
{
Ok(_) => {
tracing::info!("Successfully deleted unreferenced ancestor layer {key}");
summary.ancestor_layers_deleted += 1;
}
Err(e) => {
tracing::warn!("Failed to delete layer {key}: {e}");
summary.remote_storage_errors += 1;
}
}
}
// TODO: if all the layers are gone, clean up the whole timeline dir (remove index)
}
Ok(())
}
/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection
/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection
/// is about removing:
@@ -156,22 +470,26 @@ async fn maybe_delete_index(
/// make sure that object listings don't get slowed down by large numbers of garbage objects.
pub async fn pageserver_physical_gc(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantShardId>,
controller_client_conf: Option<ControllerClientConfig>,
tenant_shard_ids: Vec<TenantShardId>,
min_age: Duration,
mode: GcMode,
) -> anyhow::Result<GcSummary> {
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = if tenant_ids.is_empty() {
let tenants = if tenant_shard_ids.is_empty() {
futures::future::Either::Left(stream_tenants(&s3_client, &target))
} else {
futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
};
// How many tenants to process in parallel. We need to be mindful of pageservers
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
const CONCURRENCY: usize = 32;
// Accumulate information about each tenant for cross-shard GC step we'll do at the end
let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
// Generate a stream of TenantTimelineId
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
let timelines = timelines.try_buffered(CONCURRENCY);
@@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc(
target: &RootTarget,
mode: GcMode,
ttid: TenantShardTimelineId,
accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
) -> anyhow::Result<GcSummary> {
let mut summary = GcSummary::default();
let data = list_timeline_blobs(s3_client, ttid, target).await?;
let (latest_gen, candidates) = match &data.blob_data {
let (index_part, latest_gen, candidates) = match &data.blob_data {
BlobDataParseResult::Parsed {
index_part: _index_part,
index_part,
index_part_generation,
s3_layers: _s3_layers,
} => (*index_part_generation, data.unused_index_keys),
} => (index_part, *index_part_generation, data.unused_index_keys),
BlobDataParseResult::Relic => {
// Post-deletion tenant location: don't try and GC it.
return Ok(summary);
@@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc(
}
};
accumulator.lock().unwrap().update(ttid, index_part);
for key in candidates {
maybe_delete_index(
s3_client,
@@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc(
Ok(summary)
}
let timelines = timelines
.map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid));
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
let mut summary = GcSummary::default();
while let Some(i) = timelines.next().await {
let tl_summary = i?;
// Drain futures for per-shard GC, populating accumulator as a side effect
{
let timelines = timelines.map_ok(|ttid| {
gc_timeline(
&s3_client,
&bucket_config,
&min_age,
&target,
mode,
ttid,
&accumulator,
)
});
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
summary.indices_deleted += tl_summary.indices_deleted;
summary.remote_storage_errors += tl_summary.remote_storage_errors;
while let Some(i) = timelines.next().await {
summary.merge(i?);
}
}
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
let ControllerClientConfig {
controller_api,
controller_jwt,
} = c;
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
}) else {
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
return Ok(summary);
};
let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator)
.unwrap()
.into_inner()
.unwrap()
.into_gc_ancestors(&controller_client, &mut summary)
.await;
for ancestor_shard in ancestor_shards {
gc_ancestor(
&s3_client,
&bucket_config,
&target,
&min_age,
ancestor_shard,
&ancestor_refs,
mode,
&mut summary,
)
.instrument(info_span!("gc_ancestor", %ancestor_shard))
.await?;
}
Ok(summary)

View File

@@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_smgr_query_seconds_sum",
"pageserver_archive_size",
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",

View File

@@ -261,3 +261,47 @@ class NeonAPI:
if op["status"] in {"scheduling", "running", "cancelling"}:
has_running = True
time.sleep(0.5)
class NeonApiEndpoint:
def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]):
self.neon_api = neon_api
if project_id is None:
project = neon_api.create_project(pg_version)
neon_api.wait_for_operation_to_finish(project["project"]["id"])
self.project_id = project["project"]["id"]
self.endpoint_id = project["endpoints"][0]["id"]
self.connstr = project["connection_uris"][0]["connection_uri"]
self.pgbench_env = connection_parameters_to_env(
project["connection_uris"][0]["connection_parameters"]
)
self.is_new = True
else:
project = neon_api.get_project_details(project_id)
if int(project["project"]["pg_version"]) != int(pg_version):
raise Exception(
f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})"
)
self.project_id = project_id
eps = neon_api.get_endpoints(project_id)["endpoints"]
self.endpoint_id = eps[0]["id"]
self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
"uri"
]
pw = self.connstr.split("@")[0].split(":")[-1]
self.pgbench_env = {
"PGHOST": eps[0]["host"],
"PGDATABASE": "neondb",
"PGUSER": "neondb_owner",
"PGPASSWORD": pw,
}
self.is_new = False
def restart(self):
self.neon_api.restart_endpoint(self.project_id, self.endpoint_id)
self.neon_api.wait_for_operation_to_finish(self.project_id)
def get_synthetic_storage_size(self) -> int:
return int(
self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"]
)

View File

@@ -31,6 +31,7 @@ import backoff
import httpx
import jwt
import psycopg2
import psycopg2.sql
import pytest
import requests
import toml
@@ -87,7 +88,7 @@ from fixtures.utils import (
)
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
from .neon_api import NeonAPI
from .neon_api import NeonAPI, NeonApiEndpoint
"""
This file contains pytest fixtures. A fixture is a test resource that can be
@@ -727,8 +728,30 @@ class NeonEnvBuilder:
self.repo_dir / "local_fs_remote_storage",
)
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
# restore storage controller (the db is small, don't bother with overlayfs)
storcon_db_from_dir = repo_dir / "storage_controller_db"
storcon_db_to_dir = self.repo_dir / "storage_controller_db"
log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}")
assert storcon_db_from_dir.is_dir()
assert not storcon_db_to_dir.exists()
def ignore_postgres_log(path: str, _names):
if Path(path) == storcon_db_from_dir:
return {"postgres.log"}
return set()
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
assert not (storcon_db_to_dir / "postgres.log").exists()
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
for ps in self.env.pageservers:
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
patch_script_path.write_text(patch_script)
# Update the config with info about tenants and timelines
with (self.repo_dir / "config").open("r") as f:
@@ -974,7 +997,7 @@ class NeonEnvBuilder:
if self.scrub_on_exit:
try:
StorageScrubber(self).scan_metadata()
self.env.storage_scrubber.scan_metadata()
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -1135,6 +1158,7 @@ class NeonEnv:
"listen_http_addr": f"localhost:{pageserver_port.http}",
"pg_auth_type": pg_auth_type,
"http_auth_type": http_auth_type,
"image_compression": "zstd",
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
@@ -1201,6 +1225,9 @@ class NeonEnv:
)
cfg["safekeepers"].append(sk_cfg)
# Scrubber instance for tests that use it, and for use during teardown checks
self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir)
log.info(f"Config: {cfg}")
self.neon_cli.init(
cfg,
@@ -2400,7 +2427,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
"""
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
"""
response = self.request(
"GET",
@@ -2786,8 +2813,8 @@ class NeonPageserver(PgProtocol, LogUtils):
)
return client.tenant_attach(
tenant_id,
generation,
config,
generation=generation,
)
def tenant_detach(self, tenant_id: TenantId):
@@ -3158,6 +3185,18 @@ class RemotePostgres(PgProtocol):
pass
@pytest.fixture(scope="function")
def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint:
project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB")
return NeonApiEndpoint(neon_api, pg_version, project_id)
@pytest.fixture(scope="function")
def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint:
project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB")
return NeonApiEndpoint(neon_api, pg_version, project_id)
@pytest.fixture(scope="function")
def remote_pg(
test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion
@@ -3773,12 +3812,12 @@ class Endpoint(PgProtocol, LogUtils):
self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers
)
def respec(self, **kwargs):
def respec(self, **kwargs: Any) -> None:
"""Update the endpoint.json file used by control_plane."""
# Read config
config_path = os.path.join(self.endpoint_path(), "endpoint.json")
with open(config_path, "r") as f:
data_dict = json.load(f)
data_dict: dict[str, Any] = json.load(f)
# Write it back updated
with open(config_path, "w") as file:
@@ -3786,13 +3825,13 @@ class Endpoint(PgProtocol, LogUtils):
json.dump(dict(data_dict, **kwargs), file, indent=4)
# Please note: Migrations only run if pg_skip_catalog_updates is false
def wait_for_migrations(self):
def wait_for_migrations(self, num_migrations: int = 10):
with self.cursor() as cur:
def check_migrations_done():
cur.execute("SELECT id FROM neon_migration.migration_id")
migration_id = cur.fetchall()[0][0]
assert migration_id != 0
migration_id: int = cur.fetchall()[0][0]
assert migration_id >= num_migrations
wait_until(20, 0.5, check_migrations_done)
@@ -4042,6 +4081,22 @@ class Safekeeper(LogUtils):
self.id = id
self.running = running
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
if extra_opts is None:
# Testing defaults: enable everything, and set short timeouts so that background
# work will happen during short tests.
# **Note**: Any test that explicitly sets extra_opts will not get these defaults.
extra_opts = [
"--enable-offload",
"--delete-offloaded-wal",
"--partial-backup-timeout",
"10s",
"--control-file-save-interval",
"1s",
"--eviction-min-resident",
"10s",
]
self.extra_opts = extra_opts
def start(
@@ -4213,9 +4268,9 @@ class Safekeeper(LogUtils):
class StorageScrubber:
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
def __init__(self, env: NeonEnv, log_dir: Path):
self.env = env
self.log_dir = log_dir or env.test_output_dir
self.log_dir = log_dir
def scrubber_cli(self, args: list[str], timeout) -> str:
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -4232,11 +4287,14 @@ class StorageScrubber:
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [str(self.env.neon_binpath / "storage_scrubber")]
base_args = [
str(self.env.neon_binpath / "storage_scrubber"),
f"--controller-api={self.env.storage_controller_api}",
]
args = base_args + args
(output_path, stdout, status_code) = subprocess_capture(
self.env.test_output_dir,
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
@@ -4275,7 +4333,10 @@ class StorageScrubber:
log.info(f"tenant-snapshot output: {stdout}")
def pageserver_physical_gc(
self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
self,
min_age_secs: int,
tenant_ids: Optional[list[TenantId]] = None,
mode: Optional[str] = None,
):
args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
@@ -4285,6 +4346,9 @@ class StorageScrubber:
for tenant_id in tenant_ids:
args.extend(["--tenant-id", str(tenant_id)])
if mode is not None:
args.extend(["--mode", mode])
stdout = self.scrubber_cli(
args,
timeout=30,
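For reference, a hedged sketch of invoking the extended scrubber helper. The diff only shows the new `--mode` flag being forwarded to the CLI, so the `"full"` value and the surrounding wiring are assumptions:
```
# Sketch: the constructor now requires an explicit log directory (see above);
# the "full" mode string is an assumption, the diff only forwards --mode verbatim.
scrubber = StorageScrubber(env, log_dir=test_output_dir)
scrubber.pageserver_physical_gc(
    min_age_secs=1,
    tenant_ids=[env.initial_tenant],
    mode="full",
)
```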

View File

@@ -117,6 +117,9 @@ class LayerMapInfo:
def image_layers(self) -> List[HistoricLayerInfo]:
return [x for x in self.historic_layers if x.kind == "Image"]
def delta_l0_layers(self) -> List[HistoricLayerInfo]:
return [x for x in self.historic_layers if x.kind == "Delta" and x.l0]
def historic_by_name(self) -> Set[str]:
return set(x.layer_file_name for x in self.historic_layers)
@@ -172,6 +175,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"
def without_status_retrying(self) -> PageserverHttpClient:
retries = Retry(
status=0,
connect=5,
read=False,
backoff_factor=0.2,
status_forcelist=[],
allowed_methods=None,
remove_headers_on_redirect=[],
)
return PageserverHttpClient(
self.port, self.is_testing_enabled_or_skip, self.auth_token, retries
)
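The new helper builds a client whose retry policy only covers connection errors, so a test can observe an error status directly instead of having it retried away. A minimal sketch; the specific call and its failure mode are illustrative assumptions:
```
# Sketch: obtain a non-retrying client when a test expects (and wants to assert
# on) a non-2xx response rather than having urllib3 retry it.
client = env.pageserver.http_client().without_status_retrying()
with pytest.raises(PageserverApiException):
    client.timeline_detail(tenant_id, timeline_id)  # illustrative call against a timeline expected to be gone
```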
@property
def base_url(self) -> str:
return f"http://localhost:{self.port}"
@@ -223,8 +241,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
def tenant_attach(
self,
tenant_id: Union[TenantId, TenantShardId],
generation: int,
config: None | Dict[str, Any] = None,
generation: Optional[int] = None,
):
config = config or {}
@@ -814,17 +832,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
batch_size: int | None = None,
) -> Set[TimelineId]:
**kwargs,
) -> List[TimelineId]:
params = {}
if batch_size is not None:
params["batch_size"] = batch_size
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor",
params=params,
**kwargs,
)
self.verbose_error(res)
json = res.json()
return set(map(TimelineId, json["reparented_timelines"]))
return list(map(TimelineId, json["reparented_timelines"]))
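The detach-ancestor helper now returns a list rather than a set, presumably to preserve the order reported in `reparented_timelines`. A hedged sketch; the method name is assumed from the endpoint path shown above:
```
# Sketch: the helper keeps the API's ordering of reparented timelines.
reparented = ps_http.detach_ancestor(tenant_id, timeline_id)  # method name assumed from the URL path
for tl in reparented:
    log.info(f"reparented timeline: {tl}")
```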
def evict_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str

View File

@@ -255,11 +255,3 @@ def run_pagebench_benchmark(
unit="ms",
report=MetricReport.LOWER_IS_BETTER,
)
env.storage_controller.allowed_errors.append(
# The test setup swaps NeonEnv instances, hence different
# pg instances are used for the storage controller db. This means
# the storage controller doesn't know about the nodes mentioned
# in attachments.json at start-up.
".* Scheduler missing node 1",
)

View File

@@ -2,6 +2,7 @@ from contextlib import closing
import pytest
from fixtures.compare_fixtures import NeonCompare
from fixtures.log_helper import log
from fixtures.neon_fixtures import wait_for_last_flush_lsn
@@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare):
pageserver_http.timeline_compact(tenant_id, timeline_id)
neon_compare.report_size()
def test_compaction_l0_memory(neon_compare: NeonCompare):
"""
Generate a large stack of L0s pending compaction into L1s, and
measure the pageserver's peak RSS while doing so
"""
env = neon_compare.env
pageserver_http = env.pageserver.http_client()
tenant_id, timeline_id = env.neon_cli.create_tenant(
conf={
# Initially disable compaction so that we will build up a stack of L0s
"compaction_period": "0s",
"gc_period": "0s",
}
)
neon_compare.tenant = tenant_id
neon_compare.timeline = timeline_id
endpoint = env.endpoints.create_start(
"main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
)
# Read tenant effective config and assert on checkpoint_distance and compaction_threshold,
# as we do want to test with the defaults (the same values used in the field), but this test's workload size makes assumptions about them.
#
# If these assertions fail, it probably means we changed the default.
tenant_conf = pageserver_http.tenant_config(tenant_id)
assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024
assert tenant_conf.effective_config["compaction_threshold"] == 10
# Aim to write about 20 L0s, so that we will hit the limit on how many
# to compact at once
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
for i in range(200):
cur.execute(f"create table tbl{i} (i int, j int);")
cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);")
for j in range(100):
cur.execute(f"update tbl{i} set j = {j};")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()
# Check we have generated the L0 stack we expected
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
initial_l0s = len(layers.delta_l0_layers())
initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})")
def rss_hwm():
v = pageserver_http.get_metric_value("libmetrics_maxrss_kb")
assert v is not None
assert v > 0
return v * 1024
before = rss_hwm()
pageserver_http.timeline_compact(tenant_id, timeline_id)
after = rss_hwm()
log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})")
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})")
assert after > before # If we didn't use some memory the test is probably buggy
compaction_mapped_rss = after - before
# During L0 compaction, we require as much memory as the physical size of what we compacted, and then some,
# because the key->value mapping in L0 compaction is exhaustive, non-streaming, and does not de-duplicate
# repeated references to the same key.
#
# To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
# this memory estimate can be revised far downwards to something that doesn't scale
# linearly with the layer sizes.
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
# If we find that compaction is using more memory, this may indicate a regression
assert compaction_mapped_rss < MEMORY_ESTIMATE
# If we find that compaction is using less than half the expected memory then:
# - maybe we made a big efficiency improvement, in which case update the test
# - maybe something is functionally wrong with the test and it's not driving the system as expected
assert compaction_mapped_rss > MEMORY_ESTIMATE / 2
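A worked example of the budget above, with purely illustrative numbers:
```
# Illustrative numbers only: if the L0 stack is 2.0 GiB before compaction and
# 0.5 GiB of L0s remain afterwards, MEMORY_ESTIMATE = (2.0 - 0.5) * 1.25 = 1.875 GiB,
# so the observed RSS growth must fall between 1.875 / 2 = 0.9375 GiB and 1.875 GiB.
```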
# We should have compacted some but not all of the L0s, based on the limit on how much
# L0 to compact in one go
assert len(layers.delta_l0_layers()) > 0
assert len(layers.delta_l0_layers()) < initial_l0s
# The pageserver should have logged when it hit the compaction size limit
env.pageserver.assert_log_contains(".*hit max delta layer size limit.*")

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import time
import traceback
from typing import TYPE_CHECKING
import psycopg2
@@ -10,15 +9,12 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.pg_version import PgVersion
if TYPE_CHECKING:
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.neon_api import NeonAPI
from fixtures.neon_api import NeonApiEndpoint
from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.pg_version import PgVersion
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
@pytest.mark.timeout(2 * 60 * 60)
def test_subscriber_lag(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
benchmark_project_pub: NeonApiEndpoint,
benchmark_project_sub: NeonApiEndpoint,
zenbenchmark: NeonBenchmarker,
):
"""
@@ -99,125 +95,82 @@ def test_subscriber_lag(
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"
pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
pub_env = benchmark_project_pub.pgbench_env
sub_env = benchmark_project_sub.pgbench_env
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
if benchmark_project_pub.is_new:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
if benchmark_project_sub.is_new:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
sub_endpoint_id = sub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
benchmark_project_sub.restart()
pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
neon_api.restart_endpoint(
sub_project_id,
sub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)
finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
# Measure storage to make sure replication information isn't bloating storage
sub_storage = benchmark_project_sub.get_synthetic_storage_size()
pub_storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER)
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
sub_workload.terminate()
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
pub_workload.terminate()
@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_publisher_restart(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
benchmark_project_pub: NeonApiEndpoint,
benchmark_project_sub: NeonApiEndpoint,
zenbenchmark: NeonBenchmarker,
):
"""
@@ -229,114 +182,70 @@ def test_publisher_restart(
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"
pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
pub_endpoint_id = pub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
pub_env = benchmark_project_pub.pgbench_env
sub_env = benchmark_project_sub.pgbench_env
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
if benchmark_project_pub.is_new:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
if benchmark_project_sub.is_new:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pub_workload.terminate()
neon_api.restart_endpoint(
pub_project_id,
pub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(pub_project_id)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)
finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
benchmark_project_pub.restart()
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
# Measure storage to make sure replication information isn't bloating storage
sub_storage = benchmark_project_sub.get_synthetic_storage_size()
pub_storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER)
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
sub_workload.terminate()
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
pub_workload.terminate()

View File

@@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
def check_pageserver(expect_success: bool, **conn_kwargs):
check_connection(
env.pageserver,
f"show {env.initial_tenant}",
f"pagestream {env.initial_tenant} {env.initial_timeline}",
expect_success,
**conn_kwargs,
)

View File

@@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
"compaction_period": "1 s",
"compaction_threshold": "2",
"image_creation_threshold": "1",
# set PITR interval to be small, so we can do GC
"pitr_interval": "1 s",
# Disable PITR, this test will set an explicit space-based GC limit
"pitr_interval": "0 s",
}
)

View File

@@ -6,7 +6,10 @@ from typing import Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
from fixtures.neon_fixtures import (
NeonEnvBuilder,
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -142,6 +145,10 @@ def test_sharding_compaction(
"image_layer_creation_check_threshold": 0,
}
# Disable compression, as we can't estimate the size of layers with compression enabled
# TODO: implement eager layer cutting during compaction
neon_env_builder.pageserver_config_override = "image_compression='disabled'"
neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF,
@@ -320,3 +327,87 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
or 0
) == 0
assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")
@pytest.mark.parametrize("enabled", [True, False])
def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
tenant_conf = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{128 * 1024}",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers as eagerly as possible
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
}
# Explicitly enable/disable compression, rather than using default
if enabled:
neon_env_builder.pageserver_config_override = "image_compression='zstd'"
else:
neon_env_builder.pageserver_config_override = "image_compression='disabled'"
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
pageserver = env.pageserver
ps_http = env.pageserver.http_client()
with env.endpoints.create_start(
"main", tenant_id=tenant_id, pageserver_id=pageserver.id
) as endpoint:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
# Generate around 800k worth of easily compressible data to store
for v in range(100):
endpoint.safe_psql(
f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))"
)
# run compaction to create image layers
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
layer_map = ps_http.layer_map_info(tenant_id, timeline_id)
image_layer_count = 0
delta_layer_count = 0
for layer in layer_map.historic_layers:
if layer.kind == "Image":
image_layer_count += 1
elif layer.kind == "Delta":
delta_layer_count += 1
assert image_layer_count > 0
assert delta_layer_count > 0
log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}")
bytes_in = pageserver.http_client().get_metric_value(
"pageserver_compression_image_in_bytes_total"
)
bytes_out = pageserver.http_client().get_metric_value(
"pageserver_compression_image_out_bytes_total"
)
assert bytes_in is not None
assert bytes_out is not None
log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)")
if enabled:
# We are writing highly compressible, repetitive plain text, so expect excellent compression
EXPECT_RATIO = 0.2
assert bytes_out / bytes_in < EXPECT_RATIO
else:
# Nothing should be compressed if we disabled it.
assert bytes_out >= bytes_in
# Destroy the endpoint and create a new one to reset the caches
with env.endpoints.create_start(
"main", tenant_id=tenant_id, pageserver_id=pageserver.id
) as endpoint:
for v in range(100):
res = endpoint.safe_psql(
f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
)
assert res[0][0] == 1

Some files were not shown because too many files have changed in this diff