Compare commits

..

4 Commits

Author SHA1 Message Date
Anna Khanova
396d939f3e Upd 2024-06-06 14:38:04 +02:00
Anna Khanova
00d65b4ea3 Upd 2024-06-06 14:23:26 +02:00
Anna Khanova
d1270f2571 Added logging 2024-06-06 14:06:03 +02:00
Anna Khanova
34b52467f0 Add comments 2024-06-04 20:45:30 +02:00
129 changed files with 1507 additions and 4059 deletions

View File

@@ -8,7 +8,6 @@
!scripts/combine_control_files.py
!scripts/ninstall.sh
!vm-cgconfig.conf
!docker-compose/run-tests.sh
# Directories
!.cargo/
@@ -21,7 +20,7 @@
!patches/
!pgxn/
!proxy/
!storage_scrubber/
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/

View File

@@ -69,41 +69,15 @@ jobs:
with:
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Look for existing PR
id: get-pr
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
- name: Get changed labels
id: get-labels
if: steps.get-pr.outputs.ALREADY_CREATED != ''
env:
ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \
<(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -)
LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \
<(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
paste -sd , -)
echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}
- run: gh pr checkout "${PR_NUMBER}"
- run: git checkout -b "${BRANCH}"
- run: git push --force origin "${BRANCH}"
if: steps.get-pr.outputs.ALREADY_CREATED == ''
- name: Create a Pull Request for CI run (if required)
if: steps.get-pr.outputs.ALREADY_CREATED == ''
env:
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
cat << EOF > body.md
@@ -114,33 +88,16 @@ jobs:
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
EOF
LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \
grep -E '^run' | paste -sd , -)
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
if [ -z "${ALREADY_CREATED}" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
--body-file "body.md" \
--head "${BRANCH}" \
--base "main" \
--label ${LABELS} \
--label "run-e2e-tests-in-draft" \
--draft
- name: Modify the existing pull request (if required)
if: steps.get-pr.outputs.ALREADY_CREATED != ''
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }}
LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }}
ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
run: |
ADD_CMD=
REMOVE_CMD=
[ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}"
[ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}"
if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD}
fi
- run: git push --force origin "${BRANCH}"
if: steps.get-pr.outputs.ALREADY_CREATED != ''
cleanup:
# Close PRs and delete branchs if the original PR is closed.

View File

@@ -55,7 +55,7 @@ jobs:
exit 1
fi
- uses: actions/checkout@v4
- uses: actions/checkout@v3
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker

View File

@@ -859,26 +859,6 @@ jobs:
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Build neon extensions test image
if: matrix.version == 'v16'
uses: docker/build-push-action@v5
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
provenance: false
push: true
pull: true
file: Dockerfile.compute-node
target: neon-pg-ext-test
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
tags: |
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v16'
@@ -922,13 +902,6 @@ jobs:
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
- name: Create multi-arch neon-test-extensions image
if: matrix.version == 'v16'
run: |
docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version == 'v16'
run: |
@@ -965,7 +938,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1
with:
fetch-depth: 0
@@ -1047,7 +1020,7 @@ jobs:
exit 1
fi
- name: Verify docker-compose example and test extensions
- name: Verify docker-compose example
timeout-minutes: 20
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
@@ -1101,8 +1074,6 @@ jobs:
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]

97
Cargo.lock generated
View File

@@ -5109,6 +5109,54 @@ version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
[[package]]
name = "s3_scrubber"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
"hex",
"histogram",
"humantime",
"itertools",
"once_cell",
"pageserver",
"pageserver_api",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
"utils",
"workspace_hack",
]
[[package]]
name = "safekeeper"
version = "0.1.0"
@@ -5765,54 +5813,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_scrubber"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
"hex",
"histogram",
"humantime",
"itertools",
"once_cell",
"pageserver",
"pageserver_api",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
"utils",
"workspace_hack",
]
[[package]]
name = "storcon_cli"
version = "0.1.0"
@@ -5820,7 +5820,6 @@ dependencies = [
"anyhow",
"clap",
"comfy-table",
"futures",
"humantime",
"hyper 0.14.26",
"pageserver_api",

View File

@@ -13,7 +13,7 @@ members = [
"safekeeper",
"storage_broker",
"storage_controller",
"storage_scrubber",
"s3_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
@@ -120,7 +120,7 @@ num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
@@ -128,7 +128,7 @@ parquet_derive = "51.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
@@ -184,7 +184,7 @@ tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"

View File

@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.79.0
ENV RUSTC_VERSION=1.78.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -246,8 +246,8 @@ COPY patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -928,69 +928,6 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Layer neon-pg-ext-test
#
#########################################################################################
FROM neon-pg-ext-build AS neon-pg-ext-test
ARG PG_VERSION
RUN mkdir /ext-src
#COPY --from=postgis-build /postgis.tar.gz /ext-src/
#COPY --from=postgis-build /sfcgal/* /usr
COPY --from=plv8-build /plv8.tar.gz /ext-src/
COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
COPY --from=vector-pg-build /pgvector.patch /ext-src/
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY patches/pg_hintplan.patch /ext-src
#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
COPY patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN cd /ext-src/ && for f in *.tar.gz; \
do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
ENV PGPORT=55433
ENV PGUSER=cloud_admin
ENV PGDATABASE=postgres
#########################################################################################
#
# Final layer

View File

@@ -124,8 +124,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install
.PHONY: postgres-clean-%
postgres-clean-%:

View File

@@ -735,7 +735,7 @@ fn cli() -> clap::Command {
Arg::new("filecache-connstr")
.long("filecache-connstr")
.default_value(
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
)
.value_name("FILECACHE_CONNSTR"),
)

View File

@@ -1 +0,0 @@
ALTER ROLE neon_superuser BYPASSRLS;

View File

@@ -1,18 +0,0 @@
DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;

View File

@@ -1,6 +0,0 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
END IF;
END $$;

View File

@@ -1 +0,0 @@
GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;

View File

@@ -1,4 +0,0 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
-- interacted with by neon_superuser without permission issues.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;

View File

@@ -1,4 +0,0 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
-- interacted with by neon_superuser without permission issues.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;

View File

@@ -1,3 +0,0 @@
-- SKIP: Moved inline to the handle_grants() functions.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;

View File

@@ -1,3 +0,0 @@
-- SKIP: Moved inline to the handle_grants() functions.
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;

View File

@@ -1,13 +0,0 @@
-- SKIP: The original goal of this migration was to prevent creating
-- subscriptions, but this migration was insufficient.
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END $$;

View File

@@ -774,21 +774,44 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// Add new migrations in numerical order.
let migrations = [
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0001-alter_roles.sql"),
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
),
include_str!(
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
"ALTER ROLE neon_superuser BYPASSRLS",
r#"
DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;
"#,
r#"
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
END IF;
END
$$;"#,
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
"",
"",
"",
"",
"",
// Add new migrations below.
];
let mut func = || {
@@ -824,13 +847,10 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
if migration.is_empty() {
info!("Skip migration id={}", current_migration);
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
);
info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;

View File

@@ -9,7 +9,6 @@ license.workspace = true
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
humantime.workspace = true
hyper.workspace = true
pageserver_api.workspace = true

View File

@@ -1,4 +1,3 @@
use futures::StreamExt;
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
@@ -149,22 +148,6 @@ enum Command {
#[arg(long)]
threshold: humantime::Duration,
},
// Drain a set of specified pageservers by moving the primary attachments to pageservers
// outside of the specified set.
Drain {
// Set of pageserver node ids to drain.
#[arg(long)]
nodes: Vec<NodeId>,
// Optional: migration concurrency (default is 8)
#[arg(long)]
concurrency: Option<usize>,
// Optional: maximum number of shards to migrate
#[arg(long)]
max_shards: Option<usize>,
// Optional: when set to true, nothing is migrated, but the plan is printed to stdout
#[arg(long)]
dry_run: Option<bool>,
},
}
#[derive(Parser)]
@@ -754,194 +737,6 @@ async fn main() -> anyhow::Result<()> {
})
.await?;
}
Command::Drain {
nodes,
concurrency,
max_shards,
dry_run,
} => {
// Load the list of nodes, split them up into the drained and filled sets,
// and validate that draining is possible.
let node_descs = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let mut node_to_drain_descs = Vec::new();
let mut node_to_fill_descs = Vec::new();
for desc in node_descs {
let to_drain = nodes.iter().any(|id| *id == desc.id);
if to_drain {
node_to_drain_descs.push(desc);
} else {
node_to_fill_descs.push(desc);
}
}
if nodes.len() != node_to_drain_descs.len() {
anyhow::bail!("Drain requested for node which doesn't exist.")
}
node_to_fill_descs.retain(|desc| {
matches!(desc.availability, NodeAvailabilityWrapper::Active)
&& matches!(
desc.scheduling,
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
)
});
if node_to_fill_descs.is_empty() {
anyhow::bail!("There are no nodes to drain to")
}
// Set the node scheduling policy to draining for the nodes which
// we plan to drain.
for node_desc in node_to_drain_descs.iter() {
let req = NodeConfigureRequest {
node_id: node_desc.id,
availability: None,
scheduling: Some(NodeSchedulingPolicy::Draining),
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{}/config", node_desc.id),
Some(req),
)
.await?;
}
// Perform the drain: move each tenant shard scheduled on a node to
// be drained to a node which is being filled. A simple round robin
// strategy is used to pick the new node.
let tenants = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
let mut selected_node_idx = 0;
struct DrainMove {
tenant_shard_id: TenantShardId,
from: NodeId,
to: NodeId,
}
let mut moves: Vec<DrainMove> = Vec::new();
let shards = tenants
.into_iter()
.flat_map(|tenant| tenant.shards.into_iter());
for shard in shards {
if let Some(max_shards) = max_shards {
if moves.len() >= max_shards {
println!(
"Stop planning shard moves since the requested maximum was reached"
);
break;
}
}
let should_migrate = {
if let Some(attached_to) = shard.node_attached {
node_to_drain_descs
.iter()
.map(|desc| desc.id)
.any(|id| id == attached_to)
} else {
false
}
};
if !should_migrate {
continue;
}
moves.push(DrainMove {
tenant_shard_id: shard.tenant_shard_id,
from: shard
.node_attached
.expect("We only migrate attached tenant shards"),
to: node_to_fill_descs[selected_node_idx].id,
});
selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
}
let total_moves = moves.len();
if dry_run == Some(true) {
println!("Dryrun requested. Planned {total_moves} moves:");
for mv in &moves {
println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
}
return Ok(());
}
const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
let mut stream = futures::stream::iter(moves)
.map(|mv| {
let client = Client::new(cli.api.clone(), cli.jwt.clone());
async move {
client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
tenant_shard_id: mv.tenant_shard_id,
node_id: mv.to,
}),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
}
})
.buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
let mut success = 0;
let mut failure = 0;
while let Some(res) = stream.next().await {
match res {
Ok(_) => {
success += 1;
}
Err((tenant_shard_id, from, to, error)) => {
failure += 1;
println!(
"Failed to migrate {} from node {} to node {}: {}",
tenant_shard_id, from, to, error
);
}
}
if (success + failure) % 20 == 0 {
println!(
"Processed {}/{} shards: {} succeeded, {} failed",
success + failure,
total_moves,
success,
failure
);
}
}
println!(
"Processed {}/{} shards: {} succeeded, {} failed",
success + failure,
total_moves,
success,
failure
);
}
}
Ok(())

View File

@@ -8,11 +8,6 @@ USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
python3-pip \
netcat
#Faker is required for the pg_anon test
RUN pip3 install Faker
#This is required for the pg_hintplan test
RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src
USER postgres
USER postgres

View File

@@ -95,7 +95,7 @@
},
{
"name": "shared_preload_libraries",
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
"value": "neon",
"vartype": "string"
},
{
@@ -127,16 +127,6 @@
"name": "max_replication_flush_lag",
"value": "10GB",
"vartype": "string"
},
{
"name": "cron.database",
"value": "postgres",
"vartype": "string"
},
{
"name": "session_preload_libraries",
"value": "anon",
"vartype": "string"
}
]
},

View File

@@ -1,3 +1,5 @@
version: '3'
services:
minio:
restart: always
@@ -159,12 +161,12 @@ services:
context: ./compute_wrapper/
args:
- REPOSITORY=${REPOSITORY:-neondatabase}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
- TAG=${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-16}
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
@@ -192,14 +194,3 @@ services:
done"
depends_on:
- compute
neon-test-extensions:
profiles: ["test-extensions"]
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
entrypoint:
- "/bin/bash"
- "-c"
command:
- sleep 1800
depends_on:
- compute

View File

@@ -7,94 +7,52 @@
# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
# to verify custom image builds (e.g pre-published ones).
#
# A test script for postgres extensions
# Currently supports only v16
#
set -eux -o pipefail
COMPOSE_FILE='docker-compose.yml'
cd $(dirname $0)
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
: ${http_proxy:=}
: ${https_proxy:=}
export http_proxy https_proxy
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
cleanup() {
echo "show container information"
docker ps
docker compose --profile test-extensions -f $COMPOSE_FILE logs
docker compose -f $COMPOSE_FILE logs
echo "stop containers..."
docker compose --profile test-extensions -f $COMPOSE_FILE down
docker compose -f $COMPOSE_FILE down
}
echo "clean up containers if exists"
cleanup
for pg_version in 14 15 16; do
echo "clean up containers if exists"
cleanup
PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
echo "start containers (pg_version=$pg_version)."
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
echo "wait until the compute is ready. timeout after 60s. "
cnt=0
while sleep 3; do
while sleep 1; do
# check timeout
cnt=`expr $cnt + 3`
cnt=`expr $cnt + 1`
if [ $cnt -gt 60 ]; then
echo "timeout before the compute is ready."
cleanup
exit 1
fi
if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
# check if the compute is ready
set +o pipefail
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
set -o pipefail
if [ $result -eq 1 ]; then
echo "OK. The compute is ready to connect."
echo "execute simple queries."
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
cleanup
break
fi
done
if [ $pg_version -ge 16 ]
then
echo Enabling trust connection
docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
echo Adding postgres role
docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config
docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
# This block is required for the pg_anon extension test.
# The test assumes that it is running on the same host with the postgres engine.
# In our case it's not true, that's why we are copying files to the compute node
TMPDIR=$(mktemp -d)
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
rm -rf $TMPDIR
TMPDIR=$(mktemp -d)
# The following block does the same for the pg_hintplan test
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
rm -rf $TMPDIR
# We are running tests now
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
then
cleanup
else
FAILED=$(tail -1 testout.txt)
for d in $FAILED
do
mkdir $d
docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
cat $d/regression.out $d/regression.diffs || true
done
rm -rf $FAILED
cleanup
exit 1
fi
fi
cleanup
done

View File

@@ -1,15 +0,0 @@
#!/bin/bash
set -x
cd /ext-src
FAILED=
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d ${d} ] || continue
psql -c "select 1" >/dev/null || break
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
done
[ -z "${FAILED}" ] && exit 0
echo ${FAILED}
exit 1

View File

@@ -4,18 +4,18 @@
Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
And additional intermediate image:
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
## Build pipeline
## Building pipeline
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
2. `neondatabase/neon`
@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
1. create containers
You can specify version of neon cluster using following environment values.
- PG_VERSION: postgres version for compute (default is 16 as of this writing)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest'
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
```
$ cd docker-compose/
$ docker-compose down # remove the containers if exists
$ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version
$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating docker-compose_storage_broker_1 ... done
(...omit...)
@@ -47,31 +47,29 @@ Creating docker-compose_storage_broker_1 ... done
2. connect compute node
```
$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
psql (16.3)
Type "help" for help.
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ chmod 600 ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1, 1);
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
key | value
key | value
-----+-------
1 | 1
(1 row)
```
3. If you want to see the log, you can use `docker-compose logs` command.
```
# check the container name you want to see
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
(...omit...)
$ docker logs -f docker-compose_compute_1
$ docker logs -f dockercompose_compute_1
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
(...omit...)

View File

@@ -1,139 +0,0 @@
# Postgres Bundle for Pageserver
Created on 2024-06-17
## Summary
This RFC defines the responsibilities of Compute and Storage team regarding the
build & deployment of the Postgres code that Pageserver must run
(`initdb`, `postgres --wal-redo`).
## Motivation
Pageserver has to run Postgres binaries to do its job, specifically
* `initdb`
* `postgres --wal-redo` mode
Currently there is **no clear ownership** of
* how these binaries are built
* including, critically, dynamic linkage against other libraries such as `libicu`
* what build of the binaries ends up running on Pageservers
* how the binaries and runtime dependencies (e.g., shared libraries) are delivered to Pageservers
Further, these binaries have dependencies (e.g., libicu) which
1. prevent the Storage team from switching Pageserver distro and/or version, and
2. some dependencies impact compatibility between Storage and Compute (e.g., [libicu version impacts collation incompatibilty](https://github.com/neondatabase/neon/pull/8074))
3. some dependencies can cause database corruption if updated carelessly (locale => libc)
## Why Is This Worth Solving
1. Clearly defined ownership generally boosts execution speed & bug triage.
* Example for why execution speed matters: CVE in dependency => who takes care of patching & updating.
2. Centralize understanding of risks involved with some dependencies.
Currently, there is no team clearly responsible for assessing / tracking the risks. As a reminder from previous section, these are
* runtime incompatibilities
* database corruption
Also, it is an unlock for additional future value, see "Future Work" section.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver (neon.git)
Compute (neon.git)
Deployment process (aws.git)
## Design
The basic interface between Compute and Storage team is as follows:
* Compute team publishes a "bundle" of the binaries required by Pageserver
* Storage team uses a pinned bundle in the Pageserver build process
* Storage team code review is required to update the pinned version
The "bundle" provides an interface agreed upon by Compute and Storage teams to run
* for each supported Postgres version at Neon (v14, v15, v16, ...)
* the `initdb` process
* behaving like a vanilla Postgres `initdb`
* `postgres --wal-redo` mode process
* following the walredo protocol specified elsewhere
The bundle is self-contained, i.e., it behaves the same way on any Linux system.
The only ambient runtime dependency is the Linux kernel.
The minimum Linux kernel version is 5.10.
### Variant 1: bundle = fully statically linked binaries
The "bundle" is a tarball of fully statically linked binaries
```
v14/initdb
v14/postgres
v15/initdb
v15/postgres
v16/initdb
v16/postgres
...
```
The directory structure is part of the interface.
### Variant 2: bundle = chrooted directory
The "bundle" is a tarball that contains all sorts of files, plus a launcher script.
```
LAUNCHER
storage
storage/does
storage/does/not
storage/does/not/care
```
To launch `initdb` or `postgres --wal-redo`, the Pageserver does
1. fork child process
2. `chroot` into the extracted directory
3. inside the chroot, run `/LAUNCHER VERSION PG_BINARY [FLAGS...]`
4. The `LAUNCHER` script sets up library search paths, etc, and then `exec`s the correct binary
We acknowledge this is half-way reinventing OCI + linux containers.
However, our needs are much simpler than what OCI & Docker provide.
Specifically, we do not want Pageserver to be runtime-dependent on e.g. Docker as the launcher.
The `chroot` is to enforce that the "bundle" be self-contained.
The special path `/inout` int he bundle is reserved, e.g., for `initdb` output.
### Variant 3: ???
Your design here, feedback welcome.
## Security implications
It's an improvement because a single team (Compute) will be responsible for runtime dependencies.
## Implementation & Rollout
Storage and Compute teams agree on a bundle definition.
Compute team changes their build process to produce both
1. existing: compute image / vm compute image
2. existing: pg_install tarball (currently built by `neon.git:Dockerfile`)
2. new: the bundle
Storage makes `neon.git` Pageserver changes to support using bundle (behind feature flag).
With feature flag disabled, existing `pg_install` tarball is used instead.
Storage & infra make `aws.git` changes to deploy bundle to pageservers, with feature flag disabled.
Storage team does gradual rollout.
Storage & infra teams remove support for `pg_install`, delete it from the nodes (experimentation in staging to ensure no hidden runtime deps!)
Compute team stops producing `pg_install` tarball.
## Future Work
We know that we can easily make pageserver fully statically linked.
Together with the self-contained "bundle" proposed above, Pageserver can then be deployed to different OSes.
For example, we have been entertaining the idea of trying Amazon Linux instead of Debian for Pageserver.
That experiment would be a lot simpler.

View File

@@ -3,7 +3,6 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::env;
use std::fmt::Display;
use std::io;
use std::num::NonZeroU32;
use std::pin::Pin;
@@ -30,7 +29,6 @@ use http_types::{StatusCode, Url};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use utils::backoff;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::{
@@ -453,58 +451,26 @@ impl RemoteStorage for AzureBlobStorage {
// TODO batch requests are not supported by the SDK
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
for path in paths {
#[derive(Debug)]
enum AzureOrTimeout {
AzureError(azure_core::Error),
Timeout,
Cancel,
}
impl Display for AzureOrTimeout {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self:?}")
}
}
let warn_threshold = 3;
let max_retries = 5;
backoff::retry(
|| async {
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
let request = blob_client.delete().into_future();
let request = blob_client.delete().into_future();
let res = tokio::time::timeout(self.timeout, request).await;
let res = tokio::time::timeout(self.timeout, request).await;
match res {
Ok(Ok(_v)) => Ok(()),
Ok(Err(azure_err)) => {
if let Some(http_err) = azure_err.as_http_error() {
if http_err.status() == StatusCode::NotFound {
return Ok(());
}
}
Err(AzureOrTimeout::AzureError(azure_err))
match res {
Ok(Ok(_response)) => continue,
Ok(Err(e)) => {
if let Some(http_err) = e.as_http_error() {
if http_err.status() == StatusCode::NotFound {
continue;
}
Err(_elapsed) => Err(AzureOrTimeout::Timeout),
}
},
|err| match err {
AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false,
AzureOrTimeout::Cancel => true,
},
warn_threshold,
max_retries,
"deleting remote object",
cancel,
)
.await
.ok_or_else(|| AzureOrTimeout::Cancel)
.and_then(|x| x)
.map_err(|e| match e {
AzureOrTimeout::AzureError(err) => anyhow::Error::from(err),
AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(),
AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(),
})?;
return Err(e.into());
}
Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
}
}
Ok(())
};

View File

@@ -7,7 +7,7 @@ license.workspace = true
[dependencies]
hyper.workspace = true
opentelemetry = { workspace = true, features=["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }

View File

@@ -34,9 +34,6 @@ pub enum ApiError {
#[error("Timeout")]
Timeout(Cow<'static, str>),
#[error("Request cancelled")]
Cancelled,
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -77,10 +74,6 @@ impl ApiError {
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
self.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
@@ -140,7 +133,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
_ => info!("Error processing HTTP request: {api_error:#}"),
}

View File

@@ -1,6 +1,11 @@
use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::IndexPart;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
#[derive(clap::Subcommand)]
pub(crate) enum IndexPartCmd {
@@ -12,7 +17,20 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
IndexPartCmd::Dump { path } => {
let bytes = tokio::fs::read(path).await.context("read file")?;
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
let output = serde_json::to_string_pretty(&des).context("serialize output")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}
let output = Output {
layer_metadata: &des.layer_metadata,
disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
timeline_metadata: &des.metadata,
};
let output = serde_json::to_string_pretty(&output).context("serialize output")?;
println!("{output}");
Ok(())
}

View File

@@ -99,6 +99,8 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
///
/// Default built-in configuration file.
///
@@ -144,6 +146,8 @@ pub mod defaults {
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -296,6 +300,8 @@ pub struct PageServerConf {
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
pub walredo_process_kind: crate::walredo::ProcessKind,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -401,6 +407,8 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
}
impl PageServerConfigBuilder {
@@ -489,6 +497,8 @@ impl PageServerConfigBuilder {
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
}
}
}
@@ -676,6 +686,10 @@ impl PageServerConfigBuilder {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
self.walredo_process_kind = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -733,6 +747,7 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
walredo_process_kind,
}
CUSTOM LOGIC
{
@@ -1029,6 +1044,9 @@ impl PageServerConf {
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"walredo_process_kind" => {
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1112,6 +1130,7 @@ impl PageServerConf {
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}
}
}
@@ -1351,6 +1370,7 @@ background_task_maximum_delay = '334 s'
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1424,6 +1444,7 @@ background_task_maximum_delay = '334 s'
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -2,9 +2,10 @@
//! and push them to a HTTP endpoint.
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::size::CalculateSyntheticSizeError;
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
use crate::tenant::{
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -349,12 +350,19 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
Ok(_) => {}
Err(CalculateSyntheticSizeError::Cancelled) => {}
Err(e) => {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
return;
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled)
);
if !shutting_down {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
}

View File

@@ -81,10 +81,8 @@ paths:
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
404 means that deletion successfully finished"
responses:
"200":
description: Tenant was successfully deleted, or was already not found.
"404":
description: Tenant not found. This is a success result, equivalent to 200.
description: Tenant not found. This is the success path.
content:
application/json:
schema:

View File

@@ -181,7 +181,9 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::MissingKey(e) => {
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
}
PageReconstructError::Cancelled => ApiError::Cancelled,
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
@@ -1071,7 +1073,7 @@ async fn tenant_delete_handler(
let state = get_state(&request);
let status = state
state
.tenant_manager
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
.instrument(info_span!("tenant_delete_handler",
@@ -1080,14 +1082,7 @@ async fn tenant_delete_handler(
))
.await?;
// Callers use 404 as success for deletions, for historical reasons.
if status == StatusCode::NOT_FOUND {
return Err(ApiError::NotFound(
anyhow::anyhow!("Deletion complete").into(),
));
}
json_response(status, ())
json_response(StatusCode::ACCEPTED, ())
}
/// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1135,10 +1130,7 @@ async fn tenant_size_handler(
&ctx,
)
.await
.map_err(|e| match e {
crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})?;
.map_err(ApiError::InternalServerError)?;
let mut sizes = None;
let accepts_html = headers
@@ -1146,7 +1138,9 @@ async fn tenant_size_handler(
.map(|v| v == "text/html")
.unwrap_or_default();
if !inputs_only.unwrap_or(false) {
let storage_model = inputs.calculate_model();
let storage_model = inputs
.calculate_model()
.map_err(ApiError::InternalServerError)?;
let size = storage_model.calculate();
// If request header expects html, return html
@@ -2430,25 +2424,6 @@ async fn list_aux_files(
json_response(StatusCode::OK, files)
}
async fn perf_info(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline.perf_info().await;
json_response(StatusCode::OK, result)
}
async fn ingest_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -2876,9 +2851,5 @@ pub fn make_router(
|r| testing_api_handler("list_aux_files", r, list_aux_files),
)
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|r| testing_api_handler("perf_info", r, perf_info),
)
.any(handler_404))
}

View File

@@ -2108,7 +2108,6 @@ pub(crate) struct TimelineMetrics {
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
shutdown: std::sync::atomic::AtomicBool,
}
impl TimelineMetrics {
@@ -2228,7 +2227,6 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
shutdown: std::sync::atomic::AtomicBool::default(),
}
}
@@ -2251,17 +2249,6 @@ impl TimelineMetrics {
}
pub(crate) fn shutdown(&self) {
let was_shutdown = self
.shutdown
.swap(true, std::sync::atomic::Ordering::Relaxed);
if was_shutdown {
// this happens on tenant deletion because tenant first shuts down timelines, then
// invokes timeline deletion which first shuts down the timeline again.
// TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
return;
}
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;

View File

@@ -36,7 +36,6 @@ use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -410,8 +409,6 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<LsnForTimestamp, PageReconstructError> {
pausable_failpoint!("find-lsn-for-timestamp-pausable");
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
// We use this method to figure out the branching LSN for the new branch, but the
// GC cutoff could be before the branching point and we cannot create a new branch
@@ -427,7 +424,6 @@ impl Timeline {
let mut found_smaller = false;
let mut found_larger = false;
while low < high {
if cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);
@@ -919,14 +915,6 @@ impl Timeline {
result.add_key(AUX_FILES_KEY);
}
#[cfg(test)]
{
let guard = self.extra_test_dense_keyspace.load();
for kr in &guard.ranges {
result.add_range(kr.clone());
}
}
Ok((
result.to_keyspace(),
/* AUX sparse key space */

View File

@@ -509,24 +509,11 @@ pub(crate) enum GcError {
#[error(transparent)]
Remote(anyhow::Error),
// An error reading while calculating GC cutoffs
#[error(transparent)]
GcCutoffs(PageReconstructError),
// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}
impl From<PageReconstructError> for GcError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => Self::TimelineCancelled,
other => Self::GcCutoffs(other),
}
}
}
impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -1046,6 +1033,7 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
timeline_get_throttle: self.timeline_get_throttle.clone(),
},
ctx,
@@ -1071,6 +1059,7 @@ impl Tenant {
timeline_id,
&index_part.metadata,
remote_timeline_client,
self.deletion_queue_client.clone(),
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
.await
@@ -2932,9 +2921,17 @@ impl Tenant {
.checked_sub(horizon)
.unwrap_or(Lsn(0));
let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
match res {
Ok(cutoffs) => {
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
}
Err(e) => {
tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
}
}
}
if !self.is_active() || self.cancel.is_cancelled() {
@@ -3398,12 +3395,6 @@ impl Tenant {
let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
// Flush the new layer files to disk, before we make the timeline as available to
// the outside world.
//
// Flush loop needs to be spawned in order to be able to flush.
unfinished_timeline.maybe_spawn_flush_loop();
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
&pgdata_path,
@@ -3415,6 +3406,12 @@ impl Tenant {
format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
})?;
// Flush the new layer files to disk, before we make the timeline as available to
// the outside world.
//
// Flush loop needs to be spawned in order to be able to flush.
unfinished_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
anyhow::bail!("failpoint before-checkpoint-new-timeline");
});
@@ -3446,6 +3443,7 @@ impl Tenant {
);
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
timeline_get_throttle: self.timeline_get_throttle.clone(),
}
}
@@ -3555,7 +3553,7 @@ impl Tenant {
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
) -> anyhow::Result<size::ModelInputs> {
let logical_sizes_at_once = self
.conf
.concurrent_tenant_size_logical_size_queries
@@ -3570,8 +3568,8 @@ impl Tenant {
// See more for on the issue #2748 condenced out of the initial PR review.
let mut shared_cache = tokio::select! {
locked = self.cached_logical_sizes.lock() => locked,
_ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
_ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
_ = cancel.cancelled() => anyhow::bail!("cancelled"),
_ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
};
size::gather_inputs(
@@ -3595,10 +3593,10 @@ impl Tenant {
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, size::CalculateSyntheticSizeError> {
) -> anyhow::Result<u64> {
let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
let size = inputs.calculate();
let size = inputs.calculate()?;
self.set_cached_synthetic_size(size);
@@ -4043,16 +4041,13 @@ mod tests {
use crate::repository::{Key, Value};
use crate::tenant::harness::*;
use crate::tenant::timeline::CompactFlags;
use crate::walrecord::NeonWalRecord;
use crate::DEFAULT_PG_VERSION;
use bytes::{Bytes, BytesMut};
use hex_literal::hex;
use itertools::Itertools;
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use rand::{thread_rng, Rng};
use storage_layer::PersistentLayerKey;
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
use utils::bin_ser::BeSer;
@@ -5267,9 +5262,6 @@ mod tests {
let cancel = CancellationToken::new();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut test_key_end = test_key;
test_key_end.field6 = NUM_KEYS as u32;
tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));
let mut keyspace = KeySpaceAccum::new();
@@ -6229,8 +6221,8 @@ mod tests {
let cancel = CancellationToken::new();
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let mut test_key = base_key;
let mut lsn = Lsn(0x10);
@@ -6335,7 +6327,6 @@ mod tests {
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
)
.await?;
tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));
let child = tenant
.branch_timeline_test_with_layers(
@@ -6593,8 +6584,8 @@ mod tests {
}
#[tokio::test]
async fn test_metadata_tombstone_image_creation() {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6622,8 +6613,7 @@ mod tests {
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x30),
)
.await
.unwrap();
.await?;
let cancel = CancellationToken::new();
@@ -6638,24 +6628,23 @@ mod tests {
},
&ctx,
)
.await
.unwrap();
.await?;
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await
.unwrap()
.await?
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
Ok(())
}
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() {
let harness =
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
let (tenant, ctx) = harness.load().await;
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6677,8 +6666,7 @@ mod tests {
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x30),
)
.await
.unwrap();
.await?;
let cancel = CancellationToken::new();
@@ -6693,249 +6681,16 @@ mod tests {
},
&ctx,
)
.await
.unwrap();
.await?;
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await
.unwrap()
.await?
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
//
// | D1 | | D3 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expact from this compaction is:
// | Part of D1 | | D3 |
// --------- img layer with D1+D2 at GC horizon------------------
// img layer at 0x10
let img_layer = (0..10)
.map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
// TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
(
get_key(1),
Lsn(0x20),
Value::Image(test_img("value 1@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::Image(test_img("value 2@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::Image(test_img("value 3@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::Image(test_img("value 5@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::Image(test_img("value 6@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x40),
Value::Image(test_img("value 8@0x40")),
),
(
get_key(9),
Lsn(0x40),
Value::Image(test_img("value 9@0x40")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![delta1, delta2, delta3], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.pitr = Lsn(0x30);
guard.cutoffs.horizon = Lsn(0x30);
}
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
// Check if the image layer at the GC horizon contains exactly what we want
let image_at_gc_horizon = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await
.unwrap()
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(image_at_gc_horizon.len(), 10);
let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
for idx in 0..10 {
assert_eq!(
image_at_gc_horizon[idx],
(
get_key(idx as u32),
test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
)
);
}
// Check if old layers are removed / new layers have the expected LSN
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
all_layers.sort_by(|k1, k2| {
(
k1.is_delta,
k1.key_range.start,
k1.key_range.end,
k1.lsn_range.start,
k1.lsn_range.end,
)
.cmp(&(
k2.is_delta,
k2.key_range.start,
k2.key_range.end,
k2.lsn_range.start,
k2.lsn_range.end,
))
});
assert_eq!(
all_layers,
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..get_key(10),
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
// The delta layer that is cut in the middle
PersistentLayerKey {
key_range: Key::MIN..get_key(9),
lsn_range: Lsn(0x30)..Lsn(0x41),
is_delta: true
},
// The delta layer we created and should not be picked for the compaction
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x40)..Lsn(0x41),
is_delta: true
}
]
);
Ok(())
}
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_neon_test_record")?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
),
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
),
(get_key(2), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(2),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
),
(get_key(3), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(3),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_clear()),
),
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(4),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_init()),
),
];
let image1 = vec![(get_key(1), "0x10".into())];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![delta1], // delta layers
vec![(Lsn(0x10), image1)], // image layers
Lsn(0x50),
)
.await?;
assert_eq!(
tline.get(get_key(1), Lsn(0x50), &ctx).await?,
Bytes::from_static(b"0x10,0x20,0x30")
);
assert_eq!(
tline.get(get_key(2), Lsn(0x50), &ctx).await?,
Bytes::from_static(b"0x10,0x20,0x30")
);
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
Ok(())
}

View File

@@ -1,23 +1,15 @@
//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
//! this struct and it's original serialization format is still needed because they were written a
//! long time ago.
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
//! has a metadata that needs to be stored persistently.
//!
//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
//! versioning.
//! Later, the file gets used in [`remote_timeline_client`] as a part of
//! external storage import and export operations.
//!
//! To clean up this module we need to migrate all index_part.json files to a later version.
//! While doing this, we need to be mindful about s3 based recovery as well, so it might take
//! however long we keep the old versions to be able to delete the old code. After that, we can
//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and
//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
//! compatibility.
//! The module contains all structs and related helper methods related to timeline metadata.
//!
//! [`remote_timeline_client`]: super::remote_timeline_client
//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
use anyhow::ensure;
use serde::{Deserialize, Serialize};
use serde::{de::Error, Deserialize, Serialize, Serializer};
use utils::bin_ser::SerializeError;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
@@ -25,37 +17,17 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
const METADATA_FORMAT_VERSION: u16 = 4;
/// Previous supported format versions.
///
/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
/// that requires a scrubber run which is yet to be done.
const METADATA_OLD_FORMAT_VERSION: u16 = 3;
/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
///
/// This is the same assumption that PostgreSQL makes with the control file,
///
/// see PG_CONTROL_MAX_SAFE_SIZE
const METADATA_MAX_SIZE: usize = 512;
/// Legacy metadata stored as a component of `index_part.json` per timeline.
/// Metadata stored on disk for each timeline
///
/// Do not make new changes to this type or the module. In production, we have two different kinds
/// of serializations of this type: bincode and json. Bincode version reflects what used to be
/// stored on disk in earlier versions and does internal crc32 checksumming.
///
/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
/// as-exists in `index_part.json` ([`self::modern_serde`]).
///
/// ```compile_fail
/// #[derive(serde::Serialize)]
/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
/// ```
///
/// ```compile_fail
/// #[derive(serde::Deserialize)]
/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
/// ```
/// The fields correspond to the values we hold in memory, in Timeline.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimelineMetadata {
hdr: TimelineMetadataHeader,
@@ -68,49 +40,6 @@ struct TimelineMetadataHeader {
size: u16, // size of serialized metadata
format_version: u16, // metadata format version (used for compatibility checks)
}
impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
type Error = Crc32CalculationFailed;
fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
#[derive(Default)]
struct Crc32Sink {
crc: u32,
count: usize,
}
impl std::io::Write for Crc32Sink {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.crc = crc32c::crc32c_append(self.crc, buf);
self.count += buf.len();
Ok(buf.len())
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
// jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
// across serialization versions
let mut sink = Crc32Sink::default();
<TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut sink)
.map_err(Crc32CalculationFailed)?;
let size = METADATA_HDR_SIZE + sink.count;
Ok(TimelineMetadataHeader {
checksum: sink.crc,
size: size as u16,
format_version: METADATA_FORMAT_VERSION,
})
}
}
#[derive(thiserror::Error, Debug)]
#[error("re-serializing for crc32 failed")]
struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -182,12 +111,6 @@ impl TimelineMetadata {
}
}
#[cfg(test)]
pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
Ok(self)
}
fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
@@ -338,8 +261,32 @@ impl TimelineMetadata {
}
}
impl<'de> Deserialize<'de> for TimelineMetadata {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let bytes = Vec::<u8>::deserialize(deserializer)?;
Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
}
}
impl Serialize for TimelineMetadata {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
bytes.serialize(serializer)
}
}
pub(crate) mod modern_serde {
use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
use crate::tenant::metadata::METADATA_FORMAT_VERSION;
use super::{
TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
};
use serde::{Deserialize, Serialize};
pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
@@ -375,15 +322,71 @@ pub(crate) mod modern_serde {
let de = serde::de::value::MapAccessDeserializer::new(map);
let body = TimelineMetadataBodyV2::deserialize(de)?;
let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
Ok(TimelineMetadata { hdr, body })
// jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
// across serialization versions
let mut sink = Crc32Sink::default();
<TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
.map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
let size = METADATA_HDR_SIZE + sink.count;
Ok(TimelineMetadata {
hdr: TimelineMetadataHeader {
checksum: sink.crc,
size: size as u16,
format_version: METADATA_FORMAT_VERSION,
},
body,
})
}
}
deserializer.deserialize_any(Visitor)
}
#[derive(Default)]
struct Crc32Sink {
crc: u32,
count: usize,
}
impl std::io::Write for Crc32Sink {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.crc = crc32c::crc32c_append(self.crc, buf);
self.count += buf.len();
Ok(buf.len())
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
#[derive(thiserror::Error)]
#[error("re-serializing for crc32 failed")]
struct Crc32CalculationFailed<E>(#[source] E);
// this should be true for one release, after that we can change it to false
// remember to check the IndexPart::metadata field TODO comment as well
const LEGACY_BINCODED_BYTES: bool = true;
#[derive(serde::Serialize)]
#[serde(transparent)]
struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
struct JustTheBodyV2<'a>(&'a TimelineMetadata);
impl serde::Serialize for JustTheBodyV2<'_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
// header is not needed, upon reading we've upgraded all v1 to v2
self.0.body.serialize(serializer)
}
}
pub(crate) fn serialize<S>(
metadata: &TimelineMetadata,
serializer: S,
@@ -391,23 +394,25 @@ pub(crate) mod modern_serde {
where
S: serde::Serializer,
{
// header is not needed, upon reading we've upgraded all v1 to v2
metadata.body.serialize(serializer)
// we cannot use TimelineMetadata::serialize for now because it'll do
// TimelineMetadata::to_bytes
if LEGACY_BINCODED_BYTES {
LegacyPaddedBytes(metadata).serialize(serializer)
} else {
JustTheBodyV2(metadata).serialize(serializer)
}
}
#[test]
fn deserializes_bytes_as_well_as_equivalent_body_v2() {
#[derive(serde::Deserialize, serde::Serialize)]
struct Wrapper(
#[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
TimelineMetadata,
);
struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
assert_eq!(
serialized,
@@ -548,6 +553,59 @@ mod tests {
);
}
#[test]
fn test_metadata_bincode_serde() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let metadata_bytes = original_metadata
.to_bytes()
.expect("Cannot create bytes array from metadata");
let metadata_bincode_be_bytes = original_metadata
.ser()
.expect("Cannot serialize the metadata");
// 8 bytes for the length of the vector
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
let expected_bincode_bytes = {
let mut temp = vec![];
let len_bytes = metadata_bytes.len().to_be_bytes();
temp.extend_from_slice(&len_bytes);
temp.extend_from_slice(&metadata_bytes);
temp
};
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
// Deserialized metadata has the metadata header, which is different from the serialized one.
// Reference: TimelineMetaData::to_bytes()
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
assert_eq!(deserialized_metadata, expected_metadata);
}
#[test]
fn test_metadata_bincode_serde_ensure_roundtrip() {
let original_metadata = TimelineMetadata::new(
@@ -561,6 +619,8 @@ mod tests {
crate::DEFAULT_PG_VERSION,
);
let expected_bytes = vec![
/* bincode length encoding bytes */
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
/* TimelineMetadataHeader */
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
/* TimelineMetadataBodyV2 */
@@ -590,7 +650,7 @@ mod tests {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
];
let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
let metadata_ser_bytes = original_metadata.ser().unwrap();
assert_eq!(metadata_ser_bytes, expected_bytes);
let expected_metadata = {
@@ -608,7 +668,7 @@ mod tests {
temp_metadata.hdr = hdr;
temp_metadata
};
let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
assert_eq!(des_metadata, expected_metadata);
}
}

View File

@@ -3,7 +3,6 @@
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
@@ -55,7 +54,6 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::remote_timeline_client::remote_tenant_path;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;
@@ -1371,7 +1369,7 @@ impl TenantManager {
&self,
tenant_shard_id: TenantShardId,
activation_timeout: Duration,
) -> Result<StatusCode, DeleteTenantError> {
) -> Result<(), DeleteTenantError> {
super::span::debug_assert_current_span_has_tenant_id();
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
@@ -1384,79 +1382,18 @@ impl TenantManager {
//
// See https://github.com/neondatabase/neon/issues/5080
// Tenant deletion can happen two ways:
// - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
// state until deletion is complete.
// - New: called on a pageserver without an attached location. We proceed with deletion from
// remote storage.
//
// See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
match &slot_guard.old_value {
Some(TenantSlot::Attached(tenant)) => {
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
// deletion will be resumed across restarts.
let tenant = tenant.clone();
return self
.delete_tenant_attached(slot_guard, tenant, activation_timeout)
.await;
// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
TenantSlot::Attached(tenant) => tenant.clone(),
_ => {
// Express "not attached" as equivalent to "not found"
return Err(DeleteTenantError::NotAttached);
}
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant.shutdown().await;
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})?;
spawn_background_purge(tmp_dir);
}
Some(TenantSlot::InProgress(_)) => unreachable!(),
None => {}
};
// Fall through: local state for this tenant is no longer present, proceed with remote delete
let remote_path = remote_tenant_path(&tenant_shard_id);
let keys = match self
.resources
.remote_storage
.list(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
)
.await
{
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
// Callers use 404 as success for deletions, for historical reasons.
Ok(StatusCode::NOT_FOUND)
}
async fn delete_tenant_attached(
&self,
slot_guard: SlotGuard,
tenant: Arc<Tenant>,
activation_timeout: Duration,
) -> Result<StatusCode, DeleteTenantError> {
match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If deletion is already in progress, return success (the semantics of this
@@ -1466,7 +1403,7 @@ impl TenantManager {
// The `delete_progress` lock is held: deletion is already happening
// in the bacckground
slot_guard.revert();
return Ok(StatusCode::ACCEPTED);
return Ok(());
}
}
_ => {
@@ -1499,8 +1436,7 @@ impl TenantManager {
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
let () = result?;
Ok(StatusCode::ACCEPTED)
result
}
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]

View File

@@ -38,17 +38,14 @@ pub struct IndexPart {
/// that latest version stores.
pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
/// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
/// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
/// reused.
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
// private because internally we would read from metadata instead.
pub(super) disk_consistent_lsn: Lsn,
// TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding
// the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes"
// for backwards compatibility.
// TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
#[serde(
rename = "metadata_bytes",
alias = "metadata",
with = "crate::tenant::metadata::modern_serde"
)]
pub metadata: TimelineMetadata,
@@ -79,11 +76,10 @@ impl IndexPart {
/// - 4: timeline_layers is fully removed.
/// - 5: lineage was added
/// - 6: last_aux_file_policy is added.
/// - 7: metadata_bytes is no longer written, but still read
const LATEST_VERSION: usize = 7;
const LATEST_VERSION: usize = 6;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -99,7 +95,7 @@ impl IndexPart {
}
}
pub fn version(&self) -> usize {
pub fn get_version(&self) -> usize {
self.version
}
@@ -221,9 +217,9 @@ impl Lineage {
#[cfg(test)]
mod tests {
use super::*;
use std::str::FromStr;
use utils::id::TimelineId;
use super::*;
#[test]
fn v1_indexpart_is_parsed() {
@@ -342,7 +338,8 @@ mod tests {
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage::default(),
last_aux_file_policy: None,
};
@@ -518,7 +515,8 @@ mod tests {
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -531,60 +529,6 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v7_indexpart_is_parsed() {
let example = r#"{
"version": 7,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
version: 7,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
lineage: Default::default(),
last_aux_file_policy: Default::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}

View File

@@ -513,7 +513,7 @@ impl<'a> TenantDownloader<'a> {
// cover our access to local storage.
let Ok(_guard) = self.secondary_state.gate.enter() else {
// Shutting down
return Err(UpdateError::Cancelled);
return Ok(());
};
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
@@ -846,7 +846,7 @@ impl<'a> TenantDownloader<'a> {
for layer in timeline.layers {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!("Cancelled -- dropping out of layer loop");
return Err(UpdateError::Cancelled);
return Ok(());
}
// Existing on-disk layers: just update their access time.

View File

@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use anyhow::{bail, Context};
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
@@ -10,7 +11,7 @@ use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use super::{GcError, LogicalSizeCalculationCause, Tenant};
use super::{LogicalSizeCalculationCause, Tenant};
use crate::tenant::Timeline;
use utils::id::TimelineId;
use utils::lsn::Lsn;
@@ -42,44 +43,6 @@ pub struct SegmentMeta {
pub kind: LsnKind,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum CalculateSyntheticSizeError {
/// Something went wrong internally to the calculation of logical size at a particular branch point
#[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")]
LogicalSize {
timeline_id: TimelineId,
lsn: Lsn,
error: CalculateLogicalSizeError,
},
/// Something went wrong internally when calculating GC parameters at start of size calculation
#[error(transparent)]
GcInfo(GcError),
/// Totally unexpected errors, like panics joining a task
#[error(transparent)]
Fatal(anyhow::Error),
/// The LSN we are trying to calculate a size at no longer exists at the point we query it
#[error("Could not find size at {lsn} in timeline {timeline_id}")]
LsnNotFound { timeline_id: TimelineId, lsn: Lsn },
/// Tenant shut down while calculating size
#[error("Cancelled")]
Cancelled,
}
impl From<GcError> for CalculateSyntheticSizeError {
fn from(value: GcError) -> Self {
match value {
GcError::TenantCancelled | GcError::TimelineCancelled => {
CalculateSyntheticSizeError::Cancelled
}
other => CalculateSyntheticSizeError::GcInfo(other),
}
}
}
impl SegmentMeta {
fn size_needed(&self) -> bool {
match self.kind {
@@ -153,9 +116,12 @@ pub(super) async fn gather_inputs(
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
) -> anyhow::Result<ModelInputs> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
tenant.refresh_gc_info(cancel, ctx).await?;
tenant
.refresh_gc_info(cancel, ctx)
.await
.context("Failed to refresh gc_info before gathering inputs")?;
// Collect information about all the timelines
let mut timelines = tenant.list_timelines();
@@ -361,12 +327,6 @@ pub(super) async fn gather_inputs(
)
.await?;
if tenant.cancel.is_cancelled() {
// If we're shutting down, return an error rather than a sparse result that might include some
// timelines from before we started shutting down
return Err(CalculateSyntheticSizeError::Cancelled);
}
Ok(ModelInputs {
segments,
timeline_inputs,
@@ -385,7 +345,7 @@ async fn fill_logical_sizes(
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
) -> Result<(), CalculateSyntheticSizeError> {
) -> anyhow::Result<()> {
let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
timelines
.iter()
@@ -427,7 +387,7 @@ async fn fill_logical_sizes(
}
// Perform the size lookups
let mut have_any_error = None;
let mut have_any_error = false;
while let Some(res) = joinset.join_next().await {
// each of these come with Result<anyhow::Result<_>, JoinError>
// because of spawn + spawn_blocking
@@ -438,36 +398,21 @@ async fn fill_logical_sizes(
Err(join_error) => {
// cannot really do anything, as this panic is likely a bug
error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
have_any_error = Some(CalculateSyntheticSizeError::Fatal(
anyhow::anyhow!(join_error)
.context("task that calls spawn_ondemand_logical_size_calculation"),
));
have_any_error = true;
}
Ok(Err(recv_result_error)) => {
// cannot really do anything, as this panic is likely a bug
error!("failed to receive logical size query result: {recv_result_error:#}");
have_any_error = Some(CalculateSyntheticSizeError::Fatal(
anyhow::anyhow!(recv_result_error)
.context("Receiving logical size query result"),
));
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
if matches!(error, CalculateLogicalSizeError::Cancelled) {
// Skip this: it's okay if one timeline among many is shutting down while we
// calculate inputs for the overall tenant.
continue;
} else {
if !matches!(error, CalculateLogicalSizeError::Cancelled) {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
timeline_id: timeline.timeline_id,
lsn,
error,
});
}
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
@@ -481,10 +426,10 @@ async fn fill_logical_sizes(
// prune any keys not needed anymore; we record every used key and added key.
logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));
if let Some(error) = have_any_error {
if have_any_error {
// we cannot complete this round, because we are missing data.
// we have however cached all we were able to request calculation on.
return Err(error);
anyhow::bail!("failed to calculate some logical_sizes");
}
// Insert the looked up sizes to the Segments
@@ -499,29 +444,32 @@ async fn fill_logical_sizes(
if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
seg.segment.size = Some(*size);
} else {
return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn });
bail!("could not find size at {} in timeline {}", lsn, timeline_id);
}
}
Ok(())
}
impl ModelInputs {
pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
// Convert SegmentMetas into plain Segments
StorageModel {
let storage = StorageModel {
segments: self
.segments
.iter()
.map(|seg| seg.segment.clone())
.collect(),
}
};
Ok(storage)
}
// calculate total project size
pub fn calculate(&self) -> u64 {
let storage = self.calculate_model();
pub fn calculate(&self) -> anyhow::Result<u64> {
let storage = self.calculate_model()?;
let sizes = storage.calculate();
sizes.total_size
Ok(sizes.total_size)
}
}
@@ -708,7 +656,7 @@ fn verify_size_for_multiple_branches() {
"#;
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
assert_eq!(inputs.calculate(), 37_851_408);
assert_eq!(inputs.calculate().unwrap(), 37_851_408);
}
#[test]
@@ -763,7 +711,7 @@ fn verify_size_for_one_branch() {
let model: ModelInputs = serde_json::from_str(doc).unwrap();
let res = model.calculate_model().calculate();
let res = model.calculate_model().unwrap().calculate();
println!("calculated synthetic size: {}", res.total_size);
println!("result: {:?}", serde_json::to_string(&res.segments));

View File

@@ -219,6 +219,7 @@ pub struct DeltaLayerInner {
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
lsn_range: Range<Lsn>,
file: VirtualFile,
file_id: FileId,
@@ -784,6 +785,7 @@ impl DeltaLayerInner {
file_id,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn_range: actual_summary.lsn_range,
max_vectored_read_bytes,
}))
}
@@ -909,7 +911,7 @@ impl DeltaLayerInner {
let reads = Self::plan_reads(
&keyspace,
lsn_range.clone(),
lsn_range,
data_end_offset,
index_reader,
planner,
@@ -922,50 +924,11 @@ impl DeltaLayerInner {
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
.await;
reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
Ok(())
}
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
#[cfg(test)]
pub(super) async fn load_key_values(
&self,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
block_reader,
);
let mut result = Vec::new();
let mut stream =
Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let cursor = block_reader.block_cursor();
let mut buf = Vec::new();
while let Some(item) = stream.next().await {
let (key, lsn, pos) = item?;
// TODO: dedup code with get_reconstruct_value
// TODO: ctx handling and sharding
cursor
.read_blob_into_buf(pos.pos(), &mut buf, ctx)
.await
.with_context(|| {
format!("Failed to read blob from virtual file {}", self.file.path)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
self.file.path
)
})?;
result.push((key, lsn, val));
}
Ok(result)
}
async fn plan_reads<Reader>(
keyspace: &KeySpace,
lsn_range: Range<Lsn>,

View File

@@ -485,34 +485,6 @@ impl ImageLayerInner {
Ok(())
}
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
#[cfg(test)]
pub(super) async fn load_key_values(
&self,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
let mut result = Vec::new();
let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let cursor = block_reader.block_cursor();
while let Some(item) = stream.next().await {
// TODO: dedup code with get_reconstruct_value
let (raw_key, offset) = item?;
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
// TODO: ctx handling and sharding
let blob = cursor
.read_blob(offset, ctx)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);
result.push((key, self.lsn, Value::Image(value)));
}
Ok(result)
}
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
/// and the keys in this layer.
///

View File

@@ -52,7 +52,7 @@ pub struct InMemoryLayer {
/// Frozen layers have an exclusive end LSN.
/// Writes are only allowed when this is `None`.
pub(crate) end_lsn: OnceLock<Lsn>,
end_lsn: OnceLock<Lsn>,
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
local_path_str: Arc<str>,

View File

@@ -388,23 +388,6 @@ impl Layer {
})
}
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
#[cfg(test)]
pub(crate) async fn load_key_values(
&self,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
let layer = self
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| match err {
DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
layer.load_key_values(&self.0, ctx).await
}
/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
@@ -1774,20 +1757,6 @@ impl DownloadedLayer {
}
}
#[cfg(test)]
async fn load_key_values(
&self,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
use LayerKind::*;
match self.get(owner, ctx).await? {
Delta(d) => d.load_key_values(ctx).await,
Image(i) => i.load_key_values(ctx).await,
}
}
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
use LayerKind::*;
match self.get(owner, ctx).await? {

View File

@@ -815,7 +815,6 @@ async fn eviction_cancellation_on_drop() {
/// A test case to remind you the cost of these structures. You can bump the size limit
/// below if it is really necessary to add more fields to the structures.
#[test]
#[cfg(target_arch = "x86_64")]
fn layer_size() {
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);

View File

@@ -1,4 +1,3 @@
pub(crate) mod analysis;
mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
@@ -62,7 +61,6 @@ use std::{
ops::ControlFlow,
};
use crate::metrics::GetKind;
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
use crate::{
aux_file::AuxFileSizeEstimator,
@@ -76,6 +74,7 @@ use crate::{
disk_usage_eviction_task::DiskUsageEvictionInfo,
pgdatadir_mapping::CollectKeySpaceError,
};
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
@@ -205,6 +204,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub deletion_queue_client: DeletionQueueClient,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
@@ -321,8 +321,6 @@ pub struct Timeline {
/// Locked automatically by [`TimelineWriter`] and checkpointer.
/// Must always be acquired before the layer map/individual layer lock
/// to avoid deadlock.
///
/// The state is cleared upon freezing.
write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
/// Used to avoid multiple `flush_loop` tasks running
@@ -425,14 +423,6 @@ pub struct Timeline {
/// Indicate whether aux file v2 storage is enabled.
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
/// Some test cases directly place keys into the timeline without actually modifying the directory
/// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
/// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
/// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and
/// in the future, add `extra_test_sparse_keyspace` if necessary.
#[cfg(test)]
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
}
pub struct WalReceiverInfo {
@@ -1578,15 +1568,7 @@ impl Timeline {
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
let to_lsn = {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await;
to_lsn
};
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
@@ -1595,7 +1577,7 @@ impl Timeline {
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
// ephemeral layer bytes has been breached.
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
let Ok(mut write_guard) = self.write_lock.try_lock() else {
let Ok(_write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
return;
@@ -1672,35 +1654,24 @@ impl Timeline {
self.last_freeze_at.load(),
open_layer.get_opened_at(),
) {
let at_lsn = match open_layer.info() {
match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
// We may reach this point if the layer was already frozen by not yet flushed: flushing
// happens asynchronously in the background.
tracing::debug!(
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
);
None
}
InMemoryLayerInfo::Open { .. } => {
// Upgrade to a write lock and freeze the layer
drop(layers_guard);
let mut layers_guard = self.layers.write().await;
let froze = layers_guard
.try_freeze_in_memory_layer(
current_lsn,
&self.last_freeze_at,
&mut write_guard,
)
layers_guard
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
.await;
Some(current_lsn).filter(|_| froze)
}
};
if let Some(lsn) = at_lsn {
let res: Result<u64, _> = self.flush_frozen_layers(lsn);
if let Err(e) = res {
tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
}
}
self.flush_frozen_layers();
}
}
@@ -2064,11 +2035,11 @@ impl Timeline {
true
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
info!(
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
projected_lsn,
layer_size,
opened_at.elapsed()
);
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
projected_lsn,
layer_size,
opened_at.elapsed()
);
true
} else {
@@ -2351,9 +2322,6 @@ impl Timeline {
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
#[cfg(test)]
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2412,7 +2380,7 @@ impl Timeline {
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
*flush_loop_state = FlushLoopState::Exited;
Ok(())
}
@@ -3675,21 +3643,28 @@ impl Timeline {
self.last_record_lsn.advance(new_lsn);
}
async fn freeze_inmem_layer_at(
&self,
at: Lsn,
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
) {
let frozen = {
let mut guard = self.layers.write().await;
guard
.try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
.await
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
let _write_guard = if write_lock_held {
None
} else {
Some(self.write_lock.lock().await)
};
if frozen {
let now = Instant::now();
*(self.last_freeze_ts.write().unwrap()) = now;
}
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn).await;
to_lsn
}
async fn freeze_inmem_layer_at(&self, at: Lsn) {
let mut guard = self.layers.write().await;
guard
.try_freeze_in_memory_layer(at, &self.last_freeze_at)
.await;
}
/// Layer flusher task's main loop.
@@ -3783,14 +3758,18 @@ impl Timeline {
}
}
/// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
///
/// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
/// case, it means no data will be written between the top of the highest frozen layer and
/// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data
/// locally for that part of the WAL.
fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(
&self,
last_record_lsn: Lsn,
) -> Result<(), FlushLayerError> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
// Remember the new value, so that when we listen for the flush
// to finish, we know when the flush that we initiated has
@@ -3805,18 +3784,13 @@ impl Timeline {
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
*lsn = std::cmp::max(at_lsn, *lsn);
*lsn = std::cmp::max(last_record_lsn, *lsn);
});
Ok(my_flush_request)
}
async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
let mut rx = self.layer_flush_done_tx.subscribe();
loop {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= request {
if *last_result_counter >= my_flush_request {
if let Err(err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
@@ -3843,9 +3817,12 @@ impl Timeline {
}
}
async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
let token = self.flush_frozen_layers(at_lsn)?;
self.wait_flush_completion(token).await
fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
*counter += 1;
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
});
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
@@ -4469,12 +4446,6 @@ impl Timeline {
if mode == ImageLayerCreationMode::Initial {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
// might mess up with evictions.
start = img_range.end;
continue;
}
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4822,7 +4793,7 @@ impl Timeline {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcCutoffs, PageReconstructError> {
) -> anyhow::Result<GcCutoffs> {
let _timer = self
.metrics
.find_gc_cutoffs_histo
@@ -5559,33 +5530,10 @@ impl Timeline {
all_data.sort();
Ok(all_data)
}
/// Get all historic layer descriptors in the layer map
#[cfg(test)]
pub(crate) async fn inspect_historic_layers(
self: &Arc<Timeline>,
) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
let mut layers = Vec::new();
let guard = self.layers.read().await;
for layer in guard.layer_map().iter_historic_layers() {
layers.push(layer.key());
}
Ok(layers)
}
#[cfg(test)]
pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
keyspace.merge(&ks);
self.extra_test_dense_keyspace.store(Arc::new(keyspace));
}
}
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
/// Tracking writes ingestion does to a particular in-memory layer.
///
/// Cleared upon freezing a layer.
struct TimelineWriterState {
open_layer: Arc<InMemoryLayer>,
current_size: u64,
@@ -5626,6 +5574,12 @@ impl Deref for TimelineWriter<'_> {
}
}
impl Drop for TimelineWriter<'_> {
fn drop(&mut self) {
self.write_guard.take();
}
}
#[derive(PartialEq)]
enum OpenLayerAction {
Roll,
@@ -5708,16 +5662,17 @@ impl<'a> TimelineWriter<'a> {
}
async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
assert!(self.write_guard.is_some());
self.tl.freeze_inmem_layer_at(freeze_at).await;
let now = Instant::now();
*(self.last_freeze_ts.write().unwrap()) = now;
self.tl.flush_frozen_layers();
let current_size = self.write_guard.as_ref().unwrap().current_size;
// self.write_guard will be taken by the freezing
self.tl
.freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
.await;
self.tl.flush_frozen_layers(freeze_at)?;
if current_size >= self.get_checkpoint_distance() * 2 {
if current_size > self.get_checkpoint_distance() {
warn!("Flushed oversized open layer with size {}", current_size)
}
@@ -5730,27 +5685,9 @@ impl<'a> TimelineWriter<'a> {
return OpenLayerAction::Open;
};
#[cfg(feature = "testing")]
if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
// this check and assertion are not really needed because
// LayerManager::try_freeze_in_memory_layer will always clear out the
// TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
// is no TimelineWriterState.
assert!(
state.open_layer.end_lsn.get().is_some(),
"our open_layer must be outdated"
);
// this would be a memory leak waiting to happen because the in-memory layer always has
// an index
panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
}
if state.prev_lsn == Some(lsn) {
// Rolling mid LSN is not supported by [downstream code].
// Rolling mid LSN is not supported by downstream code.
// Hence, only roll at LSN boundaries.
//
// [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
return OpenLayerAction::None;
}

View File

@@ -1,90 +0,0 @@
use std::{collections::BTreeSet, ops::Range};
use utils::lsn::Lsn;
use super::Timeline;
#[derive(serde::Serialize)]
pub(crate) struct RangeAnalysis {
start: String,
end: String,
has_image: bool,
num_of_deltas_above_image: usize,
total_num_of_deltas: usize,
}
impl Timeline {
pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
// First, collect all split points of the layers.
let mut split_points = BTreeSet::new();
let mut delta_ranges = Vec::new();
let mut image_ranges = Vec::new();
let all_layer_files = {
let guard = self.layers.read().await;
guard.all_persistent_layers()
};
let lsn = self.get_last_record_lsn();
for key in all_layer_files {
split_points.insert(key.key_range.start);
split_points.insert(key.key_range.end);
if key.is_delta {
delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
} else {
image_ranges.push((key.key_range.clone(), key.lsn_range.start));
}
}
// For each split range, compute the estimated read amplification.
let split_points = split_points.into_iter().collect::<Vec<_>>();
let mut result = Vec::new();
for i in 0..(split_points.len() - 1) {
let start = split_points[i];
let end = split_points[i + 1];
// Find the latest image layer that contains the information.
let mut maybe_image_layers = image_ranges
.iter()
// We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
.filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
.cloned()
.collect::<Vec<_>>();
maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
let image_layer = maybe_image_layers.last().cloned();
let lsn_filter_start = image_layer
.as_ref()
.map(|(_, lsn)| *lsn)
.unwrap_or(Lsn::INVALID);
fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
!(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
}
let maybe_delta_layers = delta_ranges
.iter()
.filter(|(key_range, lsn_range)| {
key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
})
.cloned()
.collect::<Vec<_>>();
let pitr_delta_layers = delta_ranges
.iter()
.filter(|(key_range, _)| key_range.contains(&start))
.cloned()
.collect::<Vec<_>>();
result.push(RangeAnalysis {
start: start.to_string(),
end: end.to_string(),
has_image: image_layer.is_some(),
num_of_deltas_above_image: maybe_delta_layers.len(),
total_num_of_deltas: pitr_delta_layers.len(),
});
}
result
}
}

View File

@@ -952,178 +952,6 @@ impl Timeline {
adaptor.flush_updates().await?;
Ok(())
}
/// An experimental compaction building block that combines compaction with garbage collection.
///
/// The current implementation picks all delta + image layers that are below or intersecting with
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
/// and create delta layers with all deltas >= gc horizon.
#[cfg(test)]
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
_cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
use crate::tenant::storage_layer::ValueReconstructState;
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let (layer_selection, gc_cutoff) = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let gc_info = self.gc_info.read().unwrap();
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
let mut selected_layers = Vec::new();
// TODO: consider retain_lsns
drop(gc_info);
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().start <= gc_cutoff {
selected_layers.push(guard.get_from_desc(&desc));
}
}
(selected_layers, gc_cutoff)
};
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
let mut all_key_values = Vec::new();
for layer in &layer_selection {
all_key_values.extend(layer.load_key_values(ctx).await?);
}
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
// image layers, make image appear later than delta.
struct ValueWrapper<'a>(&'a crate::repository::Value);
impl Ord for ValueWrapper<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use crate::repository::Value;
use std::cmp::Ordering;
match (self.0, other.0) {
(Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
(Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
_ => Ordering::Equal,
}
}
}
impl PartialOrd for ValueWrapper<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ValueWrapper<'_> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
}
}
impl Eq for ValueWrapper<'_> {}
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
});
let max_lsn = all_key_values
.iter()
.map(|(_, lsn, _)| lsn)
.max()
.copied()
.unwrap()
+ 1;
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
let mut accumulated_values = Vec::new();
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
async fn flush_accumulated_states(
tline: &Arc<Timeline>,
key: Key,
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
horizon: Lsn,
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
let mut base_image = None;
let mut keys_above_horizon = Vec::new();
let mut delta_above_base_image = Vec::new();
// We have a list of deltas/images. We want to create image layers while collect garbages.
for (key, lsn, val) in accumulated_values.iter().rev() {
if *lsn > horizon {
keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
} else if *lsn <= horizon {
match val {
crate::repository::Value::Image(image) => {
if lsn <= &horizon {
base_image = Some((*lsn, image.clone()));
break;
}
}
crate::repository::Value::WalRecord(wal) => {
delta_above_base_image.push((*lsn, wal.clone()));
}
}
}
}
delta_above_base_image.reverse();
keys_above_horizon.reverse();
let state = ValueReconstructState {
img: base_image,
records: delta_above_base_image,
};
let img = tline.reconstruct_value(key, horizon, state).await?;
Ok((keys_above_horizon, img))
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
all_key_values.first().unwrap().0,
gc_cutoff..max_lsn, // TODO: off by one?
ctx,
)
.await?;
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
gc_cutoff,
ctx,
)
.await?;
for item @ (key, _, _) in &all_key_values {
if &last_key == key {
accumulated_values.push(item);
} else {
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
.await?;
image_layer_writer.put_image(last_key, image, ctx).await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
accumulated_values.clear();
accumulated_values.push(item);
last_key = *key;
}
}
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
image_layer_writer.put_image(last_key, image, ctx).await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
accumulated_values.clear();
// TODO: split layers
let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
let image_layer = image_layer_writer.finish(self, ctx).await?;
// Step 3: Place back to the layer map.
{
let mut guard = self.layers.write().await;
guard.finish_gc_compaction(
&layer_selection,
&[delta_layer.clone(), image_layer.clone()],
&self.metrics,
)
};
Ok(())
}
}
struct TimelineAdaptor {

View File

@@ -11,6 +11,7 @@ use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -262,6 +263,7 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: RemoteTimelineClient,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
@@ -272,6 +274,7 @@ impl DeleteTimelineFlow {
None, // Ancestor is not needed for deletion.
TimelineResources {
remote_client,
deletion_queue_client,
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
},
// Important. We dont pass ancestor above because it can be missing.

View File

@@ -1,5 +1,4 @@
use anyhow::{bail, ensure, Context, Result};
use itertools::Itertools;
use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc};
use tracing::trace;
@@ -21,8 +20,6 @@ use crate::{
},
};
use super::TimelineWriterState;
/// Provides semantic APIs to manipulate the layer map.
#[derive(Default)]
pub(crate) struct LayerManager {
@@ -122,20 +119,18 @@ impl LayerManager {
Ok(layer)
}
/// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
///
/// Returns true if anything was frozen.
pub(super) async fn try_freeze_in_memory_layer(
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
pub(crate) async fn try_freeze_in_memory_layer(
&mut self,
lsn: Lsn,
last_freeze_at: &AtomicLsn,
write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
) -> bool {
) {
let Lsn(last_record_lsn) = lsn;
let end_lsn = Lsn(last_record_lsn + 1);
let froze = if let Some(open_layer) = &self.layer_map.open_layer {
if let Some(open_layer) = &self.layer_map.open_layer {
let open_layer_rc = Arc::clone(open_layer);
// Does this layer need freezing?
open_layer.freeze(end_lsn).await;
// The layer is no longer open, update the layer map to reflect this.
@@ -143,25 +138,11 @@ impl LayerManager {
self.layer_map.frozen_layers.push_back(open_layer_rc);
self.layer_map.open_layer = None;
self.layer_map.next_open_layer_at = Some(end_lsn);
true
} else {
false
};
}
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
last_freeze_at.store(end_lsn);
// the writer state must no longer have a reference to the frozen layer
let taken = write_lock.take();
assert_eq!(
froze,
taken.is_some(),
"should only had frozen a layer when TimelineWriterState existed"
);
froze
}
/// Add image layers to the layer map, called from `create_image_layers`.
@@ -226,18 +207,6 @@ impl LayerManager {
updates.flush();
}
/// Called when a GC-compaction is completed.
#[cfg(test)]
pub(crate) fn finish_gc_compaction(
&mut self,
compact_from: &[Layer],
compact_to: &[ResidentLayer],
metrics: &TimelineMetrics,
) {
// We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
self.finish_compact_l0(compact_from, compact_to, metrics)
}
/// Called when compaction is completed.
pub(crate) fn rewrite_layers(
&mut self,
@@ -339,10 +308,6 @@ impl LayerManager {
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.layer_fmgr.contains(layer)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layer_fmgr.0.keys().cloned().collect_vec()
}
}
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

View File

@@ -49,19 +49,6 @@ pub enum NeonWalRecord {
file_path: String,
content: Option<Bytes>,
},
/// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
#[cfg(test)]
Test {
/// Append a string to the image.
append: String,
/// Clear the image before appending.
clear: bool,
/// Treat this record as an init record. `clear` should be set to true if this field is set
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
/// its references in `timeline.rs`.
will_init: bool,
},
}
impl NeonWalRecord {
@@ -71,39 +58,11 @@ impl NeonWalRecord {
// If you change this function, you'll also need to change ValueBytes::will_init
match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
#[cfg(test)]
NeonWalRecord::Test { will_init, .. } => *will_init,
// None of the special neon record types currently initialize the page
_ => false,
}
}
#[cfg(test)]
pub(crate) fn wal_append(s: impl AsRef<str>) -> Self {
Self::Test {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
}
}
#[cfg(test)]
pub(crate) fn wal_clear() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: false,
}
}
#[cfg(test)]
pub(crate) fn wal_init() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: true,
}
}
}
/// DecodedBkpBlock represents per-page data contained in a WAL record.

View File

@@ -20,6 +20,7 @@
/// Process lifecycle and abstracction for the IPC protocol.
mod process;
pub use process::Kind as ProcessKind;
/// Code to apply [`NeonWalRecord`]s.
pub(crate) mod apply_neon;
@@ -53,7 +54,7 @@ pub struct PostgresRedoManager {
tenant_shard_id: TenantShardId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
/// The current [`process::Process`] that is used by new redo requests.
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
/// their process object; we use [`Arc::clone`] for that.
@@ -65,7 +66,7 @@ pub struct PostgresRedoManager {
/// still be using the old redo process. But, those other tasks will most likely
/// encounter an error as well, and errors are an unexpected condition anyway.
/// So, probably we could get rid of the `Arc` in the future.
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
}
///
@@ -210,31 +211,26 @@ impl PostgresRedoManager {
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {
let proc: Arc<process::WalRedoProcess> =
match self.redo_process.get_or_init_detached().await {
Ok(guard) => Arc::clone(&guard),
Err(permit) => {
// don't hold poison_guard, the launch code can bail
let start = Instant::now();
let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
Ok(guard) => Arc::clone(&guard),
Err(permit) => {
// don't hold poison_guard, the launch code can bail
let start = Instant::now();
let proc = Arc::new(
process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
.context("launch walredo process")?,
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process.set(Arc::clone(&proc), permit);
proc
}
};
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process.set(Arc::clone(&proc), permit);
proc
}
};
let started_at = std::time::Instant::now();

View File

@@ -244,20 +244,6 @@ pub(crate) fn apply_in_neon(
let mut writer = page.writer();
dir.ser_into(&mut writer)?;
}
#[cfg(test)]
NeonWalRecord::Test {
append,
clear,
will_init,
} => {
if *will_init {
assert!(*clear, "init record must be clear to ensure correctness");
}
if *clear {
page.clear();
}
page.put_slice(append.as_bytes());
}
}
Ok(())
}

View File

@@ -1,187 +1,64 @@
/// Layer of indirection previously used to support multiple implementations.
/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
use std::time::Duration;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use tracing::warn;
use utils::lsn::Lsn;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
mod no_leak_child;
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
mod protocol;
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
span::debug_assert_current_span_has_tenant_id,
walrecord::NeonWalRecord,
};
use anyhow::Context;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
process::{Command, Stdio},
time::Duration,
};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, poison::Poison};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
#[cfg(feature = "testing")]
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
mod process_impl {
pub(super) mod process_async;
}
struct ProcessInput {
stdin: tokio::process::ChildStdin,
n_requests: usize,
#[derive(
Clone,
Copy,
Debug,
PartialEq,
Eq,
strum_macros::EnumString,
strum_macros::Display,
strum_macros::IntoStaticStr,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[repr(u8)]
pub enum Kind {
Sync,
Async,
}
struct ProcessOutput {
stdout: tokio::process::ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
impl Process {
#[inline(always)]
pub fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
let stdin =
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
let stdout = tokio::process::ChildStdout::from_std(stdout)
.context("convert to tokio::ChildStdout")?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
if conf.walredo_process_kind != Kind::Async {
warn!(
configured = %conf.walredo_process_kind,
"the walredo_process_kind setting has been turned into a no-op, using async implementation"
);
}
Ok(Self(process_impl::process_async::WalRedoProcess::launch(
conf,
#[cfg(feature = "testing")]
tenant_shard_id,
child: Some(child),
stdin: tokio::sync::Mutex::new(Poison::new(
"stdin",
ProcessInput {
stdin,
n_requests: 0,
},
)),
stdout: tokio::sync::Mutex::new(Poison::new(
"stdout",
ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
},
)),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
pg_version,
)?))
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
/// Apply given WAL records ('records') over an old page image. Returns
/// new page image.
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[instrument(skip_all, fields(pid=%self.id()))]
#[inline(always)]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
@@ -190,193 +67,12 @@ impl WalRedoProcess {
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
debug_assert_current_span_has_tenant_id();
let tag = protocol::BufferTag { rel, blknum };
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let Ok(res) =
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
else {
anyhow::bail!("WAL redo timed out");
};
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
self.0
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
/// # Cancel-Safety
///
/// When not polled to completion (e.g. because in `tokio::select!` another
/// branch becomes ready before this future), concurrent and subsequent
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
/// Dispose of this process instance and create a new one.
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
let request_no = {
let mut lock_guard = self.stdin.lock().await;
let mut poison_guard = lock_guard.check_and_arm()?;
let input = poison_guard.data_mut();
input
.stdin
.write_all(writebuf)
.await
.context("write to walredo stdin")?;
let request_no = input.n_requests;
input.n_requests += 1;
poison_guard.disarm();
request_no
};
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut lock_guard = self.stdout.lock().await;
let mut poison_guard = lock_guard.check_and_arm()?;
let output = poison_guard.data_mut();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
output
.stdout
.read_exact(&mut resultbuf)
.await
.context("read walredo stdout")?;
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
poison_guard.disarm();
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
use std::io::Write;
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
pub(crate) fn id(&self) -> u32 {
self.0.id()
}
}

View File

@@ -0,0 +1,374 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
process::{Command, Stdio},
time::Duration,
};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, poison::Poison};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
struct ProcessInput {
stdin: tokio::process::ChildStdin,
n_requests: usize,
}
struct ProcessOutput {
stdout: tokio::process::ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
let stdin =
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
let stdout = tokio::process::ChildStdout::from_std(stdout)
.context("convert to tokio::ChildStdout")?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: tokio::sync::Mutex::new(Poison::new(
"stdin",
ProcessInput {
stdin,
n_requests: 0,
},
)),
stdout: tokio::sync::Mutex::new(Poison::new(
"stdout",
ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
},
)),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
/// Apply given WAL records ('records') over an old page image. Returns
/// new page image.
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let tag = protocol::BufferTag { rel, blknum };
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let Ok(res) =
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
else {
anyhow::bail!("WAL redo timed out");
};
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
/// # Cancel-Safety
///
/// When not polled to completion (e.g. because in `tokio::select!` another
/// branch becomes ready before this future), concurrent and subsequent
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
/// Dispose of this process instance and create a new one.
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
let request_no = {
let mut lock_guard = self.stdin.lock().await;
let mut poison_guard = lock_guard.check_and_arm()?;
let input = poison_guard.data_mut();
input
.stdin
.write_all(writebuf)
.await
.context("write to walredo stdin")?;
let request_no = input.n_requests;
input.n_requests += 1;
poison_guard.disarm();
request_no
};
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut lock_guard = self.stdout.lock().await;
let mut poison_guard = lock_guard.check_and_arm()?;
let output = poison_guard.data_mut();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
output
.stdout
.read_exact(&mut resultbuf)
.await
.context("read walredo stdout")?;
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
poison_guard.disarm();
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
use std::io::Write;
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
}
}

View File

@@ -1,223 +0,0 @@
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000
These alternative expected files were added to consider the neon features
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
new file mode 100644
index 0000000..2539cfd
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
@@ -0,0 +1,101 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE mallory_the_masked_user;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.anonymize_table('t1');
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ERROR: Only supersusers can start the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT * FROM mask.t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ SELECT * FROM public.t1;
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: Only supersusers can stop the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
new file mode 100644
index 0000000..8b090fe
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
@@ -0,0 +1,104 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE oscar_the_owner;
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE oscar_the_owner;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+SELECT anon.anonymize_table('t1');
+ anonymize_table
+-----------------
+ t
+(1 row)
+
+SELECT * FROM t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+UPDATE t1 SET t='test' WHERE i=1;
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+SELECT * FROM t1;
+ i | t
+---+------
+ 1 | test
+(1 row)
+
+--SELECT * FROM mask.t1;
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: permission denied for schema mask
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
+SQL statement "SELECT anon.mask_drop_view(oid)
+ FROM pg_catalog.pg_class
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
+ AND relkind IN ('r','p','f')"
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;

View File

@@ -1,19 +0,0 @@
commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master)
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri Jun 7 19:23:42 2024 +0000
Disable REGRESS_OPTIONS causing initdb
diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile
index 053314c..fbd5fb5 100644
--- a/ext-src/pg_cron-src/Makefile
+++ b/ext-src/pg_cron-src/Makefile
@@ -5,7 +5,7 @@ EXTENSION = pg_cron
DATA_built = $(EXTENSION)--1.0.sql
DATA = $(wildcard $(EXTENSION)--*--*.sql)
-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
+#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
REGRESS = pg_cron-test
# compilation configuration

View File

@@ -1,39 +0,0 @@
commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Thu Jun 6 08:02:42 2024 +0000
Patch expected files to consider Neon's log messages
diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
index da723b8..f8d0102 100644
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
@@ -9,13 +9,16 @@ SET search_path TO public;
----
-- No.A-1-1-3
CREATE EXTENSION pg_hint_plan;
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
-- No.A-1-2-3
DROP EXTENSION pg_hint_plan;
-- No.A-1-1-4
CREATE SCHEMA other_schema;
CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan"
CREATE EXTENSION pg_hint_plan;
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
DROP SCHEMA other_schema;
----
---- No. A-5-1 comment pattern
diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
index d372459..6282afe 100644
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
SET client_min_messages TO LOG;
SET pg_hint_plan.enable_hint TO on;
CREATE EXTENSION file_fdw;
+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');

View File

@@ -1,8 +1,19 @@
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 2 Feb 2024 22:26:45 +0200
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
Now that the WAL-logging happens as a separate step at the end of the
build, we need a few neon-specific hints to make it work.
---
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index dcfb2bd..d5189ee 100644
index 680789b..ec54dea 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
@@ -20,7 +31,7 @@ index dcfb2bd..d5189ee 100644
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
@@ -32,13 +43,14 @@ index dcfb2bd..d5189ee 100644
BuildGraph(buildstate, forkNum);
- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
if (RelationNeedsWAL(index))
+ {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
@@ -48,7 +60,7 @@ index dcfb2bd..d5189ee 100644
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
@@ -57,6 +69,10 @@ index dcfb2bd..d5189ee 100644
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}
--
2.39.2

View File

@@ -1,19 +0,0 @@
;; see also src/tools/editors/emacs.samples for more complete settings
((c-mode . ((c-basic-offset . 4)
(c-file-style . "bsd")
(fill-column . 78)
(indent-tabs-mode . t)
(tab-width . 4)))
(nxml-mode . ((fill-column . 78)
(indent-tabs-mode . nil)))
(perl-mode . ((perl-indent-level . 4)
(perl-continued-statement-offset . 2)
(perl-continued-brace-offset . -2)
(perl-brace-offset . 0)
(perl-brace-imaginary-offset . 0)
(perl-label-offset . -2)
(indent-tabs-mode . t)
(tab-width . 4)))
(sgml-mode . ((fill-column . 78)
(indent-tabs-mode . nil))))

View File

@@ -1,14 +0,0 @@
root = true
[*.{c,h,l,y,pl,pm}]
indent_style = tab
indent_size = tab
tab_width = 4
[*.{sgml,xml}]
indent_style = space
indent_size = 1
[*.xsl]
indent_style = space
indent_size = 2

View File

@@ -19,7 +19,6 @@
#include "catalog/pg_type.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/logical.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/procsignal.h"
@@ -281,7 +280,6 @@ _PG_init(void)
pg_init_libpagestore();
pg_init_walproposer();
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitLogicalReplicationMonitor();

View File

@@ -605,7 +605,7 @@ prefetch_read(PrefetchRequest *slot)
}
else
{
neon_shard_log(slot->shard_no, LOG,
neon_shard_log(slot->shard_no, WARNING,
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
(long)slot->my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
@@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsn = UINT64_MAX;
/*
* GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
* segment has not changed since the basebackup, because in order to
* modify it, we would have had to download it already. And once
* downloaded, we never evict SLRU segments from local disk.
*/
not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
not_modified_since = GetRedoStartLsn();
SlruKind kind;

View File

@@ -24,12 +24,8 @@
#include "walproposer.h"
static NeonWALReader *wal_reader = NULL;
struct WalSnd;
extern struct WalSnd *MyWalSnd;
extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
static XLogRecPtr
NeonWALReadWaitForWAL(XLogRecPtr loc)
@@ -40,28 +36,7 @@ NeonWALReadWaitForWAL(XLogRecPtr loc)
CHECK_FOR_INTERRUPTS();
}
// Walsender sends keepalives and stuff, so better use its normal wait
if (MyWalSnd != NULL)
return WalSndWaitForWal(loc);
for (;;)
{
XLogRecPtr flush_ptr;
if (!RecoveryInProgress())
#if PG_VERSION_NUM >= 150000
flush_ptr = GetFlushRecPtr(NULL);
#else
flush_ptr = GetFlushRecPtr();
#endif
else
flush_ptr = GetXLogReplayRecPtr(NULL);
if (loc <= flush_ptr)
return flush_ptr;
CHECK_FOR_INTERRUPTS();
pg_usleep(1000);
}
return WalSndWaitForWal(loc);
}
static int

View File

@@ -1,183 +1,16 @@
use measured::FixedCardinalityLabel;
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display};
use std::fmt;
use crate::auth::IpPattern;
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
use crate::proxy::retry::ShouldRetry;
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
#[derive(Debug, Deserialize)]
pub struct ConsoleError {
pub error: Box<str>,
#[serde(skip)]
pub http_status_code: http::StatusCode,
pub status: Option<Status>,
}
impl ConsoleError {
pub fn get_reason(&self) -> Reason {
self.status
.as_ref()
.and_then(|s| s.details.error_info.as_ref())
.map(|e| e.reason)
.unwrap_or(Reason::Unknown)
}
pub fn get_user_facing_message(&self) -> String {
use super::provider::errors::REQUEST_FAILED;
self.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
.map(|m| m.message.clone().into())
.unwrap_or_else(|| {
// Ask @neondatabase/control-plane for review before adding more.
match self.http_status_code {
http::StatusCode::NOT_FOUND => {
// Status 404: failed to get a project-related resource.
format!("{REQUEST_FAILED}: endpoint cannot be found")
}
http::StatusCode::NOT_ACCEPTABLE => {
// Status 406: endpoint is disabled (we don't allow connections).
format!("{REQUEST_FAILED}: endpoint is disabled")
}
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
}
_ => REQUEST_FAILED.to_owned(),
}
})
}
}
impl Display for ConsoleError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let msg = self
.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
.map(|m| m.message.as_ref())
.unwrap_or_else(|| &self.error);
write!(f, "{}", msg)
}
}
impl ShouldRetry for ConsoleError {
fn could_retry(&self) -> bool {
if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() {
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
return match &self {
ConsoleError {
http_status_code: http::StatusCode::BAD_REQUEST,
..
} => true,
// don't retry when quotas are exceeded
ConsoleError {
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
ref error,
..
} => !error.contains("compute time quota of non-primary branches is exceeded"),
// locked can be returned when the endpoint was in transition
// or when quotas are exceeded. don't retry when quotas are exceeded
ConsoleError {
http_status_code: http::StatusCode::LOCKED,
ref error,
..
} => {
!error.contains("quota exceeded")
&& !error.contains("the limit for current plan reached")
}
_ => false,
};
}
// retry if the response has a retry delay
if let Some(retry_info) = self
.status
.as_ref()
.and_then(|s| s.details.retry_info.as_ref())
{
retry_info.retry_delay_ms > 0
} else {
false
}
}
}
#[derive(Debug, Deserialize)]
pub struct Status {
pub code: Box<str>,
pub message: Box<str>,
pub details: Details,
}
#[derive(Debug, Deserialize)]
pub struct Details {
pub error_info: Option<ErrorInfo>,
pub retry_info: Option<RetryInfo>,
pub user_facing_message: Option<UserFacingMessage>,
}
#[derive(Debug, Deserialize)]
pub struct ErrorInfo {
pub reason: Reason,
// Schema could also have `metadata` field, but it's not structured. Skip it for now.
}
#[derive(Clone, Copy, Debug, Deserialize, Default)]
pub enum Reason {
#[serde(rename = "ROLE_PROTECTED")]
RoleProtected,
#[serde(rename = "RESOURCE_NOT_FOUND")]
ResourceNotFound,
#[serde(rename = "PROJECT_NOT_FOUND")]
ProjectNotFound,
#[serde(rename = "ENDPOINT_NOT_FOUND")]
EndpointNotFound,
#[serde(rename = "BRANCH_NOT_FOUND")]
BranchNotFound,
#[serde(rename = "RATE_LIMIT_EXCEEDED")]
RateLimitExceeded,
#[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
NonPrimaryBranchComputeTimeExceeded,
#[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
ActiveTimeQuotaExceeded,
#[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
ComputeTimeQuotaExceeded,
#[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
WrittenDataQuotaExceeded,
#[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
DataTransferQuotaExceeded,
#[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
LogicalSizeQuotaExceeded,
#[default]
#[serde(other)]
Unknown,
}
impl Reason {
pub fn is_not_found(&self) -> bool {
matches!(
self,
Reason::ResourceNotFound
| Reason::ProjectNotFound
| Reason::EndpointNotFound
| Reason::BranchNotFound
)
}
}
#[derive(Debug, Deserialize)]
pub struct RetryInfo {
pub retry_delay_ms: u64,
}
#[derive(Debug, Deserialize)]
pub struct UserFacingMessage {
pub message: Box<str>,
}
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].

View File

@@ -25,8 +25,8 @@ use tracing::info;
pub mod errors {
use crate::{
console::messages::{self, ConsoleError},
error::{io_error, ReportableError, UserFacingError},
http,
proxy::retry::ShouldRetry,
};
use thiserror::Error;
@@ -34,14 +34,17 @@ pub mod errors {
use super::ApiLockError;
/// A go-to error message which doesn't leak any detail.
pub const REQUEST_FAILED: &str = "Console request failed";
const REQUEST_FAILED: &str = "Console request failed";
/// Common console API error.
#[derive(Debug, Error)]
pub enum ApiError {
/// Error returned by the console itself.
#[error("{REQUEST_FAILED} with {0}")]
Console(ConsoleError),
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
Console {
status: http::StatusCode,
text: Box<str>,
},
/// Various IO errors like broken pipe or malformed payload.
#[error("{REQUEST_FAILED}: {0}")]
@@ -50,11 +53,11 @@ pub mod errors {
impl ApiError {
/// Returns HTTP status code if it's the reason for failure.
pub fn get_reason(&self) -> messages::Reason {
pub fn http_status_code(&self) -> Option<http::StatusCode> {
use ApiError::*;
match self {
Console(e) => e.get_reason(),
_ => messages::Reason::Unknown,
Console { status, .. } => Some(*status),
_ => None,
}
}
}
@@ -64,7 +67,22 @@ pub mod errors {
use ApiError::*;
match self {
// To minimize risks, only select errors are forwarded to users.
Console(c) => c.get_user_facing_message(),
// Ask @neondatabase/control-plane for review before adding more.
Console { status, .. } => match *status {
http::StatusCode::NOT_FOUND => {
// Status 404: failed to get a project-related resource.
format!("{REQUEST_FAILED}: endpoint cannot be found")
}
http::StatusCode::NOT_ACCEPTABLE => {
// Status 406: endpoint is disabled (we don't allow connections).
format!("{REQUEST_FAILED}: endpoint is disabled")
}
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
}
_ => REQUEST_FAILED.to_owned(),
},
_ => REQUEST_FAILED.to_owned(),
}
}
@@ -73,56 +91,29 @@ pub mod errors {
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::Console(e) => {
use crate::error::ErrorKind::*;
match e.get_reason() {
crate::console::messages::Reason::RoleProtected => User,
crate::console::messages::Reason::ResourceNotFound => User,
crate::console::messages::Reason::ProjectNotFound => User,
crate::console::messages::Reason::EndpointNotFound => User,
crate::console::messages::Reason::BranchNotFound => User,
crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit,
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
User
}
crate::console::messages::Reason::ActiveTimeQuotaExceeded => User,
crate::console::messages::Reason::ComputeTimeQuotaExceeded => User,
crate::console::messages::Reason::WrittenDataQuotaExceeded => User,
crate::console::messages::Reason::DataTransferQuotaExceeded => User,
crate::console::messages::Reason::LogicalSizeQuotaExceeded => User,
crate::console::messages::Reason::Unknown => match &e {
ConsoleError {
http_status_code:
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ConsoleError {
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
error,
..
} if error.contains(
"compute time quota of non-primary branches is exceeded",
) =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::LOCKED,
error,
..
} if error.contains("quota exceeded")
|| error.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
},
}
ApiError::Console {
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ApiError::Console {
status: http::StatusCode::UNPROCESSABLE_ENTITY,
text,
} if text.contains("compute time quota of non-primary branches is exceeded") => {
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::LOCKED,
text,
} if text.contains("quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
@@ -133,7 +124,31 @@ pub mod errors {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
Self::Console(e) => e.could_retry(),
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
Self::Console {
status: http::StatusCode::BAD_REQUEST,
..
} => true,
// don't retry when quotas are exceeded
Self::Console {
status: http::StatusCode::UNPROCESSABLE_ENTITY,
ref text,
} => !text.contains("compute time quota of non-primary branches is exceeded"),
// locked can be returned when the endpoint was in transition
// or when quotas are exceeded. don't retry when quotas are exceeded
Self::Console {
status: http::StatusCode::LOCKED,
ref text,
} => {
// written data quota exceeded
// data transfer quota exceeded
// compute time quota exceeded
// logical size quota exceeded
!text.contains("quota exceeded")
&& !text.contains("the limit for current plan reached")
}
_ => false,
}
}
}

View File

@@ -94,14 +94,12 @@ impl Api {
let body = match parse_body::<GetRoleSecret>(response).await {
Ok(body) => body,
// Error 404 is special: it's ok not to have a secret.
// TODO(anna): retry
Err(e) => {
if e.get_reason().is_not_found() {
Err(e) => match e.http_status_code() {
Some(http::StatusCode::NOT_FOUND) => {
return Ok(AuthInfo::default());
} else {
return Err(e.into());
}
}
_otherwise => return Err(e.into()),
},
};
let secret = if body.role_secret.is_empty() {
@@ -330,24 +328,26 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
info!("request succeeded, processing the body");
return Ok(response.json().await?);
}
let s = response.bytes().await?;
// Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
info!("response_error plaintext: {:?}", s);
info!("response_error: {:?}", response);
let s = response.text().await?;
info!("response_error: {:?}", s);
return Err(ApiError::Console {
status,
text: s.into(),
});
// Don't throw an error here because it's not as important
// as the fact that the request itself has failed.
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
let body = response.json().await.unwrap_or_else(|e| {
warn!("failed to parse error body: {e}");
ConsoleError {
error: "reason unclear (malformed error message)".into(),
http_status_code: status,
status: None,
}
});
body.http_status_code = status;
error!("console responded with an error ({status}): {body:?}");
Err(ApiError::Console(body))
let text = body.error;
error!("console responded with an error ({status}): {text}");
Err(ApiError::Console { status, text })
}
fn parse_host_port(input: &str) -> Option<(&str, u16)> {

View File

@@ -12,7 +12,7 @@ use crate::auth::backend::{
};
use crate::config::{CertResolver, RetryConfig};
use crate::console::caches::NodeInfoCache;
use crate::console::messages::{ConsoleError, MetricsAuxInfo};
use crate::console::messages::MetricsAuxInfo;
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
@@ -484,20 +484,18 @@ impl TestBackend for TestConnectMechanism {
match action {
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
ConnectAction::WakeFail => {
let err = console::errors::ApiError::Console(ConsoleError {
http_status_code: http::StatusCode::FORBIDDEN,
error: "TEST".into(),
status: None,
});
let err = console::errors::ApiError::Console {
status: http::StatusCode::FORBIDDEN,
text: "TEST".into(),
};
assert!(!err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}
ConnectAction::WakeRetry => {
let err = console::errors::ApiError::Console(ConsoleError {
http_status_code: http::StatusCode::BAD_REQUEST,
error: "TEST".into(),
status: None,
});
let err = console::errors::ApiError::Console {
status: http::StatusCode::BAD_REQUEST,
text: "TEST".into(),
};
assert!(err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}

View File

@@ -1,5 +1,4 @@
use crate::config::RetryConfig;
use crate::console::messages::ConsoleError;
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
use crate::context::RequestMonitoring;
use crate::metrics::{
@@ -89,76 +88,36 @@ fn report_error(e: &WakeComputeError, retry: bool) {
let kind = match e {
WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() {
crate::console::messages::Reason::RoleProtected => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::ResourceNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::ProjectNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::EndpointNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::BranchNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::RateLimitExceeded => {
WakeupFailureKind::ApiConsoleLocked
}
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::ActiveTimeQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::ComputeTimeQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::WrittenDataQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::DataTransferQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::LogicalSizeQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::Unknown => match e {
ConsoleError {
http_status_code: StatusCode::LOCKED,
ref error,
..
} if error.contains("written data quota exceeded")
|| error.contains("the limit for current plan reached") =>
{
WakeupFailureKind::QuotaExceeded
}
ConsoleError {
http_status_code: StatusCode::UNPROCESSABLE_ENTITY,
ref error,
..
} if error.contains("compute time quota of non-primary branches is exceeded") => {
WakeupFailureKind::QuotaExceeded
}
ConsoleError {
http_status_code: StatusCode::LOCKED,
..
} => WakeupFailureKind::ApiConsoleLocked,
ConsoleError {
http_status_code: StatusCode::BAD_REQUEST,
..
} => WakeupFailureKind::ApiConsoleBadRequest,
ConsoleError {
http_status_code, ..
} if http_status_code.is_server_error() => {
WakeupFailureKind::ApiConsoleOtherServerError
}
ConsoleError { .. } => WakeupFailureKind::ApiConsoleOtherError,
},
},
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
WakeupFailureKind::QuotaExceeded
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::UNPROCESSABLE_ENTITY,
ref text,
}) if text.contains("compute time quota of non-primary branches is exceeded") => {
WakeupFailureKind::QuotaExceeded
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => WakeupFailureKind::ApiConsoleBadRequest,
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
WakeupFailureKind::ApiConsoleOtherServerError
}
WakeComputeError::ApiError(ApiError::Console { .. }) => {
WakeupFailureKind::ApiConsoleOtherError
}
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
};

View File

@@ -92,6 +92,12 @@ impl LimiterInner {
}
fn take(&mut self, ready: &Notify) -> Option<()> {
tracing::info!(
"available: {}, in_flight: {}, limit: {}",
self.available,
self.in_flight,
self.limit
);
if self.available >= 1 {
self.available -= 1;
self.in_flight += 1;

View File

@@ -1,3 +1,5 @@
use std::usize;
use super::{LimitAlgorithm, Outcome, Sample};
/// Loss-based congestion avoidance.
@@ -26,6 +28,8 @@ pub struct Aimd {
impl LimitAlgorithm for Aimd {
fn update(&self, old_limit: usize, sample: Sample) -> usize {
use Outcome::*;
tracing::info!(old_limit, "updating limit");
tracing::info!(sample.in_flight, "in flight");
match sample.outcome {
Success => {
let utilisation = sample.in_flight as f32 / old_limit as f32;

View File

@@ -32,6 +32,8 @@ pub struct ClientFirstMessage<'a> {
pub bare: &'a str,
/// Channel binding mode.
pub cbind_flag: ChannelBinding<&'a str>,
/// (Client username)[<https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf/src/backend/libpq/auth-scram.c#L13>].
pub username: &'a str,
/// Client nonce.
pub nonce: &'a str,
}
@@ -56,14 +58,6 @@ impl<'a> ClientFirstMessage<'a> {
// In theory, these might be preceded by "reserved-mext" (i.e. "m=")
let username = parts.next()?.strip_prefix("n=")?;
// https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14
if !username.is_empty() {
tracing::warn!(username, "scram username provided, but is not expected")
// TODO(conrad):
// return None;
}
let nonce = parts.next()?.strip_prefix("r=")?;
// Validate but ignore auth extensions
@@ -72,6 +66,7 @@ impl<'a> ClientFirstMessage<'a> {
Some(Self {
bare,
cbind_flag,
username,
nonce,
})
}
@@ -193,18 +188,19 @@ mod tests {
// (Almost) real strings captured during debug sessions
let cases = [
(NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"),
(NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"),
(NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"),
(NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"),
(
Required("tls-server-end-point"),
"p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju",
"p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju",
),
];
for (cb, input) in cases {
let msg = ClientFirstMessage::parse(input).unwrap();
assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju");
assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju");
assert_eq!(msg.username, "pepe");
assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju");
assert_eq!(msg.cbind_flag, cb);
}
@@ -212,13 +208,14 @@ mod tests {
#[test]
fn parse_client_first_message_with_invalid_gs2_authz() {
assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none())
assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none())
}
#[test]
fn parse_client_first_message_with_extra_params() {
let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap();
assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz");
let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap();
assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz");
assert_eq!(msg.username, "user");
assert_eq!(msg.nonce, "nonce");
assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient);
}
@@ -226,9 +223,9 @@ mod tests {
#[test]
fn parse_client_first_message_with_extra_params_invalid() {
// must be of the form `<ascii letter>=<...>`
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none());
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none());
}
#[test]

View File

@@ -45,10 +45,6 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
this.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.79.0"
channel = "1.78.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -1,5 +1,5 @@
[package]
name = "storage_scrubber"
name = "s3_scrubber"
version = "0.1.0"
edition.workspace = true
license.workspace = true

View File

@@ -1,4 +1,4 @@
# Neon Storage Scrubber
# Neon S3 scrubber
This tool directly accesses the S3 buckets used by the Neon `pageserver`
and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist.

View File

@@ -78,16 +78,17 @@ pub(crate) fn branch_cleanup_and_check_errors(
index_part_generation: _index_part_generation,
s3_layers: _s3_layers,
} => {
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
result
.errors
.push(format!("index_part.json version: {}", index_part.version()))
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
result.errors.push(format!(
"index_part.json version: {}",
index_part.get_version()
))
}
if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
result.warnings.push(format!(
"index_part.json version is not latest: {}",
index_part.version()
index_part.get_version()
))
}

View File

@@ -1,11 +1,11 @@
use anyhow::bail;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
use storage_scrubber::{
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use s3_scrubber::pageserver_physical_gc::GcMode;
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
use s3_scrubber::{
init_logging, pageserver_physical_gc::pageserver_physical_gc,
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
TraversingDepth,

View File

@@ -125,7 +125,7 @@ impl MetadataSummary {
{
*self
.indices_by_version
.entry(index_part.version())
.entry(index_part.get_version())
.or_insert(0) += 1;
if let Err(e) = self.update_histograms(index_part) {

View File

@@ -213,9 +213,6 @@ pub async fn main_task(
}
};
// remove timeline from the broker active set sooner, before waiting for background tasks
tli_broker_active.set(false);
// shutdown background tasks
if conf.is_wal_backup_enabled() {
wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;

View File

@@ -20,6 +20,7 @@
use camino::Utf8PathBuf;
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
use rand::Rng;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
@@ -275,6 +276,13 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
debug!("started");
let await_duration = conf.partial_backup_timeout;
// sleep for random time to avoid thundering herd
{
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
let sleep_duration = await_duration.mul_f64(randf64);
tokio::time::sleep(sleep_duration).await;
}
let (_, persistent_state) = tli.get_state().await;
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();

View File

@@ -142,6 +142,52 @@ async fn handle_tenant_create(
)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
// needing to track a "deleting" state for tenants.
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
{
let started_at = Instant::now();
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
// completed.
let mut retry_period = Duration::from_secs(1);
// On subsequent retries, wait longer.
let max_retry_period = Duration::from_secs(5);
// Enable callers with a 30 second request timeout to reliably get a response
let max_wait = Duration::from_secs(25);
loop {
let status = f(service.clone()).await?;
match status {
StatusCode::ACCEPTED => {
tracing::info!("Deletion accepted, waiting to try again...");
tokio::time::sleep(retry_period).await;
retry_period = max_retry_period;
}
StatusCode::NOT_FOUND => {
tracing::info!("Deletion complete");
return json_response(StatusCode::OK, ());
}
_ => {
tracing::warn!("Unexpected status {status}");
return json_response(status, ());
}
}
let now = Instant::now();
if now + retry_period > started_at + max_wait {
tracing::info!("Deletion timed out waiting for 404");
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
// the pageserver's swagger definition for this endpoint, and has the same desired
// effect of causing the control plane to retry later.
return json_response(StatusCode::CONFLICT, ());
}
}
}
async fn handle_tenant_location_config(
service: Arc<Service>,
mut req: Request<Body>,
@@ -237,17 +283,13 @@ async fn handle_tenant_delete(
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let status_code = service
.tenant_delete(tenant_id)
.await
.and_then(map_reqwest_hyper_status)?;
if status_code == StatusCode::NOT_FOUND {
// The pageserver uses 404 for successful deletion, but we use 200
json_response(StatusCode::OK, ())
} else {
json_response(status_code, ())
}
deletion_wrapper(service, move |service| async move {
service
.tenant_delete(tenant_id)
.await
.and_then(map_reqwest_hyper_status)
})
.await
}
async fn handle_tenant_timeline_create(
@@ -275,51 +317,6 @@ async fn handle_tenant_timeline_delete(
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
// For timeline deletions, which both implement an "initially return 202, then 404 once
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
{
let started_at = Instant::now();
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
// completed.
let mut retry_period = Duration::from_secs(1);
// On subsequent retries, wait longer.
let max_retry_period = Duration::from_secs(5);
// Enable callers with a 30 second request timeout to reliably get a response
let max_wait = Duration::from_secs(25);
loop {
let status = f(service.clone()).await?;
match status {
StatusCode::ACCEPTED => {
tracing::info!("Deletion accepted, waiting to try again...");
tokio::time::sleep(retry_period).await;
retry_period = max_retry_period;
}
StatusCode::NOT_FOUND => {
tracing::info!("Deletion complete");
return json_response(StatusCode::OK, ());
}
_ => {
tracing::warn!("Unexpected status {status}");
return json_response(status, ());
}
}
let now = Instant::now();
if now + retry_period > started_at + max_wait {
tracing::info!("Deletion timed out waiting for 404");
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
// the pageserver's swagger definition for this endpoint, and has the same desired
// effect of causing the control plane to retry later.
return json_response(StatusCode::CONFLICT, ());
}
}
}
deletion_wrapper(service, move |service| async move {
service
.tenant_timeline_delete(tenant_id, timeline_id)

View File

@@ -29,8 +29,6 @@ pub enum MaySchedule {
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
shard_count: usize,
/// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
attached_shard_count: usize,
/// Whether this node is currently elegible to have new shards scheduled (this is derived
/// from a node's availability state and scheduling policy).
@@ -44,9 +42,7 @@ impl PartialEq for SchedulerNode {
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
);
may_schedule_matches
&& self.shard_count == other.shard_count
&& self.attached_shard_count == other.attached_shard_count
may_schedule_matches && self.shard_count == other.shard_count
}
}
@@ -142,15 +138,6 @@ impl ScheduleContext {
}
}
pub(crate) enum RefCountUpdate {
PromoteSecondary,
Attach,
Detach,
DemoteAttached,
AddSecondary,
RemoveSecondary,
}
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
@@ -159,7 +146,6 @@ impl Scheduler {
node.get_id(),
SchedulerNode {
shard_count: 0,
attached_shard_count: 0,
may_schedule: node.may_schedule(),
},
);
@@ -185,7 +171,6 @@ impl Scheduler {
node.get_id(),
SchedulerNode {
shard_count: 0,
attached_shard_count: 0,
may_schedule: node.may_schedule(),
},
);
@@ -194,10 +179,7 @@ impl Scheduler {
for shard in shards {
if let Some(node_id) = shard.intent.get_attached() {
match expect_nodes.get_mut(node_id) {
Some(node) => {
node.shard_count += 1;
node.attached_shard_count += 1;
}
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
@@ -245,42 +227,31 @@ impl Scheduler {
Ok(())
}
/// Update the reference counts of a node. These reference counts are used to guide scheduling
/// decisions, not for memory management: they represent the number of tenant shard whose IntentState
/// targets this node and the number of tenants shars whose IntentState is attached to this
/// node.
/// Increment the reference count of a node. This reference count is used to guide scheduling
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
/// this node.
///
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
/// [`Self::new`] or [`Self::node_upsert`])
pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
tracing::error!("Scheduler missing node {node_id}");
debug_assert!(false);
return;
};
node.shard_count += 1;
}
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return;
};
match update {
RefCountUpdate::PromoteSecondary => {
node.attached_shard_count += 1;
}
RefCountUpdate::Attach => {
node.shard_count += 1;
node.attached_shard_count += 1;
}
RefCountUpdate::Detach => {
node.shard_count -= 1;
node.attached_shard_count -= 1;
}
RefCountUpdate::DemoteAttached => {
node.attached_shard_count -= 1;
}
RefCountUpdate::AddSecondary => {
node.shard_count += 1;
}
RefCountUpdate::RemoveSecondary => {
node.shard_count -= 1;
}
}
node.shard_count -= 1;
}
pub(crate) fn node_upsert(&mut self, node: &Node) {
@@ -292,7 +263,6 @@ impl Scheduler {
Vacant(entry) => {
entry.insert(SchedulerNode {
shard_count: 0,
attached_shard_count: 0,
may_schedule: node.may_schedule(),
});
}
@@ -415,11 +385,6 @@ impl Scheduler {
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().shard_count
}
#[cfg(test)]
pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().attached_shard_count
}
}
#[cfg(test)]
@@ -472,28 +437,18 @@ mod tests {
let scheduled = scheduler.schedule_shard(&[], &context)?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
t1_intent.clear(&mut scheduler);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
+ scheduler.get_node_attached_shard_count(NodeId(2));
assert_eq!(total_attached, 1);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
if cfg!(debug_assertions) {
// Dropping an IntentState without clearing it causes a panic in debug mode,
@@ -504,12 +459,8 @@ mod tests {
assert!(result.is_err());
} else {
t2_intent.clear(&mut scheduler);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
}
Ok(())

View File

@@ -2376,86 +2376,63 @@ impl Service {
let _tenant_lock =
trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;
// Detach all shards
let (detach_waiters, shard_ids, node) = {
let mut shard_ids = Vec::new();
let mut detach_waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
self.ensure_attached_wait(tenant_id).await?;
// TODO: refactor into helper
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
tenants.range_mut(TenantShardId::tenant_range(tenant_id))
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
shard_ids.push(*tenant_shard_id);
let node_id = shard.intent.get_attached().ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
// Update the tenant's intent to remove all attachments
shard.policy = PlacementPolicy::Detached;
shard
.schedule(scheduler, &mut ScheduleContext::default())
.expect("De-scheduling is infallible");
debug_assert!(shard.intent.get_attached().is_none());
debug_assert!(shard.intent.get_secondary().is_empty());
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
detach_waiters.push(waiter);
}
targets.push((*tenant_shard_id, node.clone()));
}
// Pick an arbitrary node to use for remote deletions (does not have to be where the tenant
// was attached, just has to be able to see the S3 content)
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
let node = nodes
.get(&node_id)
.expect("Pageservers may not be deleted while lock is active");
(detach_waiters, shard_ids, node.clone())
targets
};
// This reconcile wait can fail in a few ways:
// A there is a very long queue for the reconciler semaphore
// B some pageserver is failing to handle a detach promptly
// C some pageserver goes offline right at the moment we send it a request.
//
// A and C are transient: the semaphore will eventually become available, and once a node is marked offline
// the next attempt to reconcile will silently skip detaches for an offline node and succeed. If B happens,
// it's a bug, and needs resolving at the pageserver level (we shouldn't just leave attachments behind while
// deleting the underlying data).
self.await_waiters(detach_waiters, RECONCILE_TIMEOUT)
.await?;
let locations = shard_ids
.into_iter()
.map(|s| (s, node.clone()))
.collect::<Vec<_>>();
let results = self.tenant_for_shards_api(
locations,
|tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await },
1,
3,
RECONCILE_TIMEOUT,
&self.cancel,
)
.await;
for result in results {
match result {
Ok(StatusCode::ACCEPTED) => {
// This should never happen: we waited for detaches to finish above
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Unexpectedly still attached on {}",
node
)));
}
Ok(_) => {}
Err(mgmt_api::Error::Cancelled) => {
return Err(ApiError::ShuttingDown);
}
Err(e) => {
// This is unexpected: remote deletion should be infallible, unless the object store
// at large is unavailable.
tracing::error!("Error deleting via node {}: {e}", node);
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
}
// Phase 1: delete on the pageservers
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
);
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
// surface immediately as an error to our caller.
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting shard {tenant_shard_id} on node {node}: {e}",
))
})?;
tracing::info!(
"Shard {tenant_shard_id} on node {node}, delete returned {}",
status
);
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
// Caller should call us again later. When we eventually see 404s from
// all the shards, we may proceed to delete our records of the tenant.
tracing::info!(
"Tenant {} has some shards pending deletion, returning 202",
tenant_id
);
return Ok(StatusCode::ACCEPTED);
}
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
// our in-memory state and database state.
@@ -4316,7 +4293,7 @@ impl Service {
continue;
}
if tenant_shard.intent.demote_attached(scheduler, node_id) {
if tenant_shard.intent.demote_attached(node_id) {
tenant_shard.sequence = tenant_shard.sequence.next();
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters

View File

@@ -8,7 +8,7 @@ use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
reconciler::ReconcileUnits,
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
use pageserver_api::{
@@ -153,7 +153,7 @@ impl IntentState {
}
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
if let Some(node_id) = node_id {
scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach);
scheduler.node_inc_ref(node_id);
}
Self {
attached: node_id,
@@ -164,10 +164,10 @@ impl IntentState {
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
if self.attached != new_attached {
if let Some(old_attached) = self.attached.take() {
scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
scheduler.node_dec_ref(old_attached);
}
if let Some(new_attached) = &new_attached {
scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach);
scheduler.node_inc_ref(*new_attached);
}
self.attached = new_attached;
}
@@ -177,27 +177,22 @@ impl IntentState {
/// secondary to attached while maintaining the scheduler's reference counts.
pub(crate) fn promote_attached(
&mut self,
scheduler: &mut Scheduler,
_scheduler: &mut Scheduler,
promote_secondary: NodeId,
) {
// If we call this with a node that isn't in secondary, it would cause incorrect
// scheduler reference counting, since we assume the node is already referenced as a secondary.
debug_assert!(self.secondary.contains(&promote_secondary));
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.secondary.retain(|n| n != &promote_secondary);
let demoted = self.attached;
self.attached = Some(promote_secondary);
scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary);
if let Some(demoted) = demoted {
scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached);
}
}
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
debug_assert!(!self.secondary.contains(&new_secondary));
scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary);
scheduler.node_inc_ref(new_secondary);
self.secondary.push(new_secondary);
}
@@ -205,27 +200,27 @@ impl IntentState {
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
let index = self.secondary.iter().position(|n| *n == node_id);
if let Some(index) = index {
scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
scheduler.node_dec_ref(node_id);
self.secondary.remove(index);
}
}
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
for secondary in self.secondary.drain(..) {
scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary);
scheduler.node_dec_ref(secondary);
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
scheduler.node_dec_ref(old_attached);
}
self.clear_secondary(scheduler);
@@ -256,11 +251,12 @@ impl IntentState {
/// forget the location on the offline node.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool {
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
self.attached = None;
self.secondary.push(node_id);
scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached);
true
} else {
false
@@ -597,7 +593,7 @@ impl TenantShard {
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(scheduler, *node_id);
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
@@ -787,7 +783,7 @@ impl TenantShard {
old_attached_node_id,
new_attached_node_id,
}) => {
self.intent.demote_attached(scheduler, old_attached_node_id);
self.intent.demote_attached(old_attached_node_id);
self.intent
.promote_attached(scheduler, new_attached_node_id);
}
@@ -1325,9 +1321,7 @@ pub(crate) mod tests {
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_shard
.intent
.demote_attached(&mut scheduler, attached_node_id);
let changed = tenant_shard.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_shard.intent.attached.is_none());
assert_eq!(tenant_shard.intent.secondary.len(), 2);
@@ -1610,14 +1604,7 @@ pub(crate) mod tests {
// We should see equal number of locations on the two nodes.
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
// Scheduling does not consider the number of attachments picking the initial
// pageserver to attach to (hence the assertion that all primaries are on the
// same node)
// TODO: Tweak the scheduling to evenly distribute attachments for new shards.
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
// Add another two nodes: we should see the shards spread out when their optimize
// methods are called
@@ -1626,16 +1613,9 @@ pub(crate) mod tests {
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);
for shard in shards.iter_mut() {
shard.intent.clear(&mut scheduler);

View File

@@ -285,21 +285,6 @@ def test_foobar(neon_env_builder: NeonEnvBuilder):
...
```
The env includes a default tenant and timeline. Therefore, you do not need to create your own
tenant/timeline for testing.
```python
def test_foobar2(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start() # Start the environment
with env.endpoints.create_start("main") as endpoint:
# Start the compute endpoint
client = env.pageserver.http_client() # Get the pageserver client
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)
```
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
At the end of a test, all the nodes in the environment are automatically stopped, so you

View File

@@ -833,7 +833,7 @@ class NeonEnvBuilder:
def enable_scrub_on_exit(self):
"""
Call this if you would like the fixture to automatically run
storage_scrubber at the end of the test, as a bidirectional test
s3_scrubber at the end of the test, as a bidirectional test
that the scrubber is working properly, and that the code within
the test didn't produce any invalid remote state.
"""
@@ -948,7 +948,7 @@ class NeonEnvBuilder:
if self.scrub_on_exit:
try:
StorageScrubber(self).scan_metadata()
S3Scrubber(self).scan_metadata()
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -3386,7 +3386,7 @@ def static_proxy(
yield proxy
class Endpoint(PgProtocol, LogUtils):
class Endpoint(PgProtocol):
"""An object representing a Postgres compute endpoint managed by the control plane."""
def __init__(
@@ -3452,7 +3452,6 @@ class Endpoint(PgProtocol, LogUtils):
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
self.pgdata_dir = os.path.join(self.env.repo_dir, path)
self.logfile = self.endpoint_path() / "compute.log"
config_lines = config_lines or []
@@ -3937,7 +3936,7 @@ class Safekeeper(LogUtils):
wait_until(20, 0.5, paused)
class StorageScrubber:
class S3Scrubber:
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
self.env = env
self.log_dir = log_dir or env.test_output_dir
@@ -3957,7 +3956,7 @@ class StorageScrubber:
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [str(self.env.neon_binpath / "storage_scrubber")]
base_args = [str(self.env.neon_binpath / "s3_scrubber")]
args = base_args + args
(output_path, stdout, status_code) = subprocess_capture(

View File

@@ -94,6 +94,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*WARN.*path=/v1/utilization .*request was dropped before completing",
# Can happen during shutdown
".*scheduling deletion on drop failed: queue is in state Stopped.*",
# Can happen during shutdown
".*ignoring failure to find gc cutoffs: timeline shutting down.*",
)

Some files were not shown because too many files have changed in this diff Show More