Compare commits

..

7 Commits

Author SHA1 Message Date
Alex Chi Z
5713ff31fc remove stale comments
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-30 10:54:53 -04:00
Alex Chi Z
6e0682e9ad index_part is source of truth
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-30 10:53:18 -04:00
Alex Chi Z
fd984a2750 fix mypy
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-30 10:51:01 -04:00
Alex Chi Z
00091a4fb1 fix tests
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-29 17:56:22 -04:00
Alex Chi Z
d37cd4b282 always put a placeholder even in v2
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-29 14:16:19 -04:00
Alex Chi Z
7442d47c3e run some tests in validation mode
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-29 12:09:48 -04:00
Alex Chi Z
f6b1891564 feat(pageserver): enable reldirv2 by default in regress tests, try 2
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-28 17:02:55 -04:00
218 changed files with 2705 additions and 11805 deletions

View File

@@ -146,9 +146,7 @@ jobs:
with:
file: build-tools/Dockerfile
context: .
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
provenance: false
push: true
pull: true
build-args: |

View File

@@ -634,9 +634,7 @@ jobs:
DEBIAN_VERSION=bookworm
secrets: |
SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }}
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
provenance: false
push: true
pull: true
file: Dockerfile
@@ -749,9 +747,7 @@ jobs:
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
DEBIAN_VERSION=${{ matrix.version.debian }}
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
@@ -770,9 +766,7 @@ jobs:
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
DEBIAN_VERSION=${{ matrix.version.debian }}
attests: |
type=provenance,mode=max
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile

View File

@@ -72,10 +72,9 @@ jobs:
options: --init --user root
services:
clickhouse:
image: clickhouse/clickhouse-server:25.6
image: clickhouse/clickhouse-server:24.8
env:
CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }}
PGSSLCERT: /tmp/postgresql.crt
ports:
- 9000:9000
- 8123:8123

1
.gitignore vendored
View File

@@ -15,7 +15,6 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
pgxn/neon/communicator/communicator_bindings.h
docker-compose/docker-compose-parallel.yml
# Coverage

72
Cargo.lock generated
View File

@@ -259,17 +259,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -1307,31 +1296,13 @@ dependencies = [
name = "communicator"
version = "0.1.0"
dependencies = [
"atomic_enum",
"axum",
"bytes",
"cbindgen",
"clashmap",
"fiemap",
"http 1.3.1",
"itertools 0.10.5",
"libc",
"measured",
"neon-shmem",
"nix 0.30.1",
"pageserver_api",
"pageserver_client_grpc",
"pageserver_page_api",
"prometheus",
"prost 0.13.5",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"workspace_hack",
]
@@ -1672,9 +1643,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -2338,15 +2309,6 @@ version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
[[package]]
name = "fiemap"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92e387bc8b3342ba5cd115fb566e6bf2c82562433dffcecbc2474265cf8a875a"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "filetime"
version = "0.2.22"
@@ -4429,16 +4391,13 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"axum",
"bytes",
"camino",
"clap",
"futures",
"hdrhistogram",
"http 1.3.1",
"humantime",
"humantime-serde",
"metrics",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
@@ -4528,7 +4487,6 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pageserver_page_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4542,7 +4500,6 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.5",
"rand 0.9.1",
"range-set-blaze",
"regex",
@@ -4837,15 +4794,6 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -5129,6 +5077,8 @@ dependencies = [
"crc32c",
"criterion",
"env_logger",
"log",
"memoffset 0.9.0",
"once_cell",
"postgres",
"postgres_ffi_types",
@@ -5569,7 +5519,6 @@ dependencies = [
"workspace_hack",
"x509-cert",
"zerocopy 0.8.24",
"zeroize",
]
[[package]]
@@ -7698,16 +7647,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -9075,8 +9014,8 @@ dependencies = [
"clap",
"clap_builder",
"const-oid",
"criterion",
"crossbeam-epoch",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
@@ -9119,6 +9058,7 @@ dependencies = [
"num-iter",
"num-rational",
"num-traits",
"once_cell",
"p256 0.13.2",
"parquet",
"portable-atomic",

View File

@@ -93,7 +93,6 @@ clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
comfy-table = "7.1"
const_format = "0.2"
crossbeam-utils = "0.8.21"
crc32c = "0.6"
diatomic-waker = { version = "0.2.3" }
either = "1.8"
@@ -136,6 +135,7 @@ lock_api = "0.4.13"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
moka = { version = "0.12", features = ["sync"] }
nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
# Do not update to >= 7.0.0, at least. The update will have a significant impact
@@ -153,7 +153,6 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -192,7 +191,6 @@ smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
spin = "0.9.8"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -204,6 +202,7 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -235,18 +234,14 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.8"
whoami = "1.5.1"
zerocopy = { version = "0.8", features = ["derive", "simd"] }
json-structural-diff = { version = "0.2.0" }
x509-cert = { version = "0.2.5" }
zerocopy = { version = "0.8", features = ["derive", "simd"] }
zeroize = "1.8"
## TODO replace this with tracing
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

View File

@@ -103,7 +103,7 @@ RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
&& if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
export CARGO_FEATURES="rest_broker"; \
fi \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--features $CARGO_FEATURES \
--bin pg_sni_router \
--bin pageserver \

View File

@@ -299,7 +299,6 @@ WORKDIR /home/nonroot
ENV RUSTC_VERSION=1.88.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG CARGO_AUDITABLE_VERSION=0.7.0
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.36
ARG CARGO_DENY_VERSION=0.18.2
@@ -315,16 +314,14 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
. "$HOME/.cargo/env" && \
cargo --version && rustup --version && \
rustup component add llvm-tools rustfmt clippy && \
cargo install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" && \
cargo auditable install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" --force && \
cargo auditable install rustfilt --version "${RUSTFILT_VERSION}" && \
cargo auditable install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
cargo auditable install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
cargo auditable install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
cargo auditable install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
cargo auditable install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
cargo auditable install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
--features postgres-bundled --no-default-features && \
cargo install rustfilt --locked --version "${RUSTFILT_VERSION}" && \
cargo install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
cargo install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
cargo install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
cargo install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
cargo install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
cargo install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
--features postgres-bundled --no-default-features && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git

View File

@@ -1 +1 @@
SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;
SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer;

View File

@@ -1 +1 @@
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;
SELECT checkpoints_req FROM pg_stat_bgwriter;

View File

@@ -1 +1 @@
SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter;
SELECT checkpoints_timed FROM pg_stat_bgwriter;

View File

@@ -1 +1 @@
SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled;
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;

View File

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8
ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
END AS lsn;

View File

@@ -1,7 +1,7 @@
SELECT
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;

View File

@@ -1,7 +1,7 @@
SELECT
(SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id,
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

View File

@@ -1,9 +1,9 @@
SELECT
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

View File

@@ -1 +1 @@
SELECT pg_catalog.current_setting('max_connections') AS max_connections;
SELECT current_setting('max_connections') as max_connections;

View File

@@ -1,4 +1,4 @@
SELECT datname database_name,
pg_catalog.age(datfrozenxid) frozen_xid_age
FROM pg_catalog.pg_database
age(datfrozenxid) frozen_xid_age
FROM pg_database
ORDER BY frozen_xid_age DESC LIMIT 10;

View File

@@ -1,4 +1,4 @@
SELECT datname database_name,
pg_catalog.mxid_age(datminmxid) min_mxid_age
FROM pg_catalog.pg_database
mxid_age(datminmxid) min_mxid_age
FROM pg_database
ORDER BY min_mxid_age DESC LIMIT 10;

View File

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
ELSE 0
END AS lsn;

View File

@@ -1 +1 @@
SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled;
SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;

View File

@@ -1 +1 @@
SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state;
SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;

View File

@@ -1,5 +1,5 @@
SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total
FROM pg_catalog.pg_database
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;

View File

@@ -3,6 +3,6 @@
-- minutes.
SELECT
x::pg_catalog.text AS duration_seconds,
x::text as duration_seconds,
neon.approximate_working_set_size_seconds(x) AS size
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);

View File

@@ -3,6 +3,6 @@
SELECT
x AS duration,
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM (
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
VALUES ('5m'), ('15m'), ('1h')
) AS t (x);

View File

@@ -1 +1 @@
SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;

View File

@@ -1,3 +1,3 @@
SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn
FROM pg_catalog.pg_replication_slots
SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
FROM pg_replication_slots
WHERE slot_type = 'logical';

View File

@@ -1 +1 @@
SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size';
SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';

View File

@@ -1,13 +1,13 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_catalog.pg_database_size(datname) AS db_size,
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
FROM pg_catalog.pg_stat_database
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with

View File

@@ -3,4 +3,4 @@
-- replay LSN may have advanced past the receive LSN we are using for the
-- calculation.
SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes;
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;

View File

@@ -1,5 +1,5 @@
SELECT
CASE
WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp()))
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
END AS replication_delay_seconds;

View File

@@ -1,10 +1,10 @@
SELECT
slot_name,
pg_catalog.pg_wal_lsn_diff(
pg_wal_lsn_diff(
CASE
WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn()
ELSE pg_catalog.pg_current_wal_lsn()
WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
ELSE pg_current_wal_lsn()
END,
restart_lsn)::pg_catalog.FLOAT8 AS retained_wal
FROM pg_catalog.pg_replication_slots
restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
WHERE active = false;

View File

@@ -4,4 +4,4 @@ SELECT
WHEN wal_status = 'lost' THEN 1
ELSE 0
END AS wal_is_lost
FROM pg_catalog.pg_replication_slots;
FROM pg_replication_slots;

View File

@@ -1,11 +1,5 @@
commit 5eb393810cf7c7bafa4e394dad2e349e2a8cb2cb
Author: Alexey Masterov <alexey.masterov@databricks.com>
Date: Mon Jul 28 18:11:02 2025 +0200
Patch for pg_repack
diff --git a/regress/Makefile b/regress/Makefile
index bf6edcb..110e734 100644
index bf6edcb..89b4c7f 100644
--- a/regress/Makefile
+++ b/regress/Makefile
@@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\}
@@ -13,36 +7,18 @@ index bf6edcb..110e734 100644
#
-REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger
+REGRESS := init-extension noautovacuum repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger autovacuum
+REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger
USE_PGXS = 1 # use pgxs if not in contrib directory
PGXS := $(shell $(PG_CONFIG) --pgxs)
diff --git a/regress/expected/autovacuum.out b/regress/expected/autovacuum.out
new file mode 100644
index 0000000..e7f2363
--- /dev/null
+++ b/regress/expected/autovacuum.out
@@ -0,0 +1,7 @@
+ALTER SYSTEM SET autovacuum='on';
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
diff --git a/regress/expected/noautovacuum.out b/regress/expected/noautovacuum.out
new file mode 100644
index 0000000..fc7978e
--- /dev/null
+++ b/regress/expected/noautovacuum.out
@@ -0,0 +1,7 @@
+ALTER SYSTEM SET autovacuum='off';
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
index 9f2e171..f6e4f8d 100644
--- a/regress/expected/init-extension.out
+++ b/regress/expected/init-extension.out
@@ -1,3 +1,2 @@
SET client_min_messages = warning;
CREATE EXTENSION pg_repack;
-RESET client_min_messages;
diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
index 8d0a94e..63b68bf 100644
--- a/regress/expected/nosuper.out
@@ -74,22 +50,14 @@ index 8d0a94e..63b68bf 100644
INFO: repacking table "public.tbl_cluster"
ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block
DETAIL: query was: RESET lock_timeout
diff --git a/regress/sql/autovacuum.sql b/regress/sql/autovacuum.sql
new file mode 100644
index 0000000..a8eda63
--- /dev/null
+++ b/regress/sql/autovacuum.sql
@@ -0,0 +1,2 @@
+ALTER SYSTEM SET autovacuum='on';
+SELECT pg_reload_conf();
diff --git a/regress/sql/noautovacuum.sql b/regress/sql/noautovacuum.sql
new file mode 100644
index 0000000..13d4836
--- /dev/null
+++ b/regress/sql/noautovacuum.sql
@@ -0,0 +1,2 @@
+ALTER SYSTEM SET autovacuum='off';
+SELECT pg_reload_conf();
diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
index 9f2e171..f6e4f8d 100644
--- a/regress/sql/init-extension.sql
+++ b/regress/sql/init-extension.sql
@@ -1,3 +1,2 @@
SET client_min_messages = warning;
CREATE EXTENSION pg_repack;
-RESET client_min_messages;
diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
index 072f0fa..dbe60f8 100644
--- a/regress/sql/nosuper.sql

View File

@@ -82,15 +82,6 @@ struct Cli {
#[arg(long, default_value_t = 3081)]
pub internal_http_port: u16,
/// Backwards-compatible --http-port for Hadron deployments. Functionally the
/// same as --external-http-port.
#[arg(
long,
conflicts_with = "external_http_port",
conflicts_with = "internal_http_port"
)]
pub http_port: Option<u16>,
#[arg(short = 'D', long, value_name = "DATADIR")]
pub pgdata: String,
@@ -190,26 +181,6 @@ impl Cli {
}
}
// Hadron helpers to get compatible compute_ctl http ports from Cli. The old `--http-port`
// arg is used and acts the same as `--external-http-port`. The internal http port is defined
// to be http_port + 1. Hadron runs in the dblet environment which uses the host network, so
// we need to be careful with the ports to choose.
fn get_external_http_port(cli: &Cli) -> u16 {
if cli.lakebase_mode {
return cli.http_port.unwrap_or(cli.external_http_port);
}
cli.external_http_port
}
fn get_internal_http_port(cli: &Cli) -> u16 {
if cli.lakebase_mode {
return cli
.http_port
.map(|p| p + 1)
.unwrap_or(cli.internal_http_port);
}
cli.internal_http_port
}
fn main() -> Result<()> {
let cli = Cli::parse();
@@ -234,18 +205,13 @@ fn main() -> Result<()> {
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
if cli.lakebase_mode {
installed_extensions::initialize_metrics();
hadron_metrics::initialize_metrics();
}
installed_extensions::initialize_metrics();
hadron_metrics::initialize_metrics();
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
let config = get_config(&cli)?;
let external_http_port = get_external_http_port(&cli);
let internal_http_port = get_internal_http_port(&cli);
let compute_node = ComputeNode::new(
ComputeNodeParams {
compute_id: cli.compute_id,
@@ -254,8 +220,8 @@ fn main() -> Result<()> {
pgdata: cli.pgdata.clone(),
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port,
internal_http_port,
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
remote_ext_base_url: cli.remote_ext_base_url.clone(),
resize_swap_on_bind: cli.resize_swap_on_bind,
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
@@ -279,7 +245,7 @@ fn main() -> Result<()> {
config,
)?;
let exit_code = compute_node.run().context("running compute node")?;
let exit_code = compute_node.run()?;
scenario.teardown();

View File

@@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
});
let query = "
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = pg_catalog.now();";
SET updated_at = now();";
match client.simple_query(query).await {
Result::Ok(result) => {

View File

@@ -6,8 +6,7 @@ use compute_api::responses::{
LfcPrewarmState, PromoteState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption,
PageserverConnectionInfo, PageserverProtocol, PgIdent, Role,
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
};
use futures::StreamExt;
use futures::future::join_all;
@@ -32,17 +31,13 @@ use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::backoff::{
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration,
};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::measured_stream::MeasuredReader;
use utils::pid_file;
use utils::shard::{ShardIndex, ShardNumber, ShardStripeSize};
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
use crate::configurator::launch_configurator;
use crate::disk_quota::set_disk_quota;
@@ -196,7 +191,6 @@ pub struct ComputeState {
pub startup_span: Option<tracing::span::Span>,
pub lfc_prewarm_state: LfcPrewarmState,
pub lfc_prewarm_token: CancellationToken,
pub lfc_offload_state: LfcOffloadState,
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
@@ -222,7 +216,6 @@ impl ComputeState {
lfc_offload_state: LfcOffloadState::default(),
terminate_flush_lsn: None,
promote_state: None,
lfc_prewarm_token: CancellationToken::new(),
}
}
@@ -255,7 +248,7 @@ pub struct ParsedSpec {
pub spec: ComputeSpec,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_conninfo: PageserverConnectionInfo,
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
/// k8s dns name and port
@@ -303,47 +296,25 @@ impl ParsedSpec {
}
impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = anyhow::Error;
fn try_from(spec: ComputeSpec) -> Result<Self, anyhow::Error> {
type Error = String;
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
// Extract the options from the spec file that are needed to connect to
// the storage system.
//
// In compute specs generated by old control plane versions, the spec file might
// be missing the `pageserver_connection_info` field. In that case, we need to dig
// the pageserver connection info from the `pageserver_connstr` field instead, or
// if that's missing too, from the GUC in the cluster.settings field.
let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
if pageserver_conninfo.is_none() {
if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
pageserver_conninfo = Some(PageserverConnectionInfo::from_connstr(
pageserver_connstr_field,
spec.shard_stripe_size,
)?);
}
}
if pageserver_conninfo.is_none() {
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
{
Some(ShardStripeSize(u32::from_str(&guc)?))
} else {
None
};
pageserver_conninfo =
Some(PageserverConnectionInfo::from_connstr(&guc, stripe_size)?);
}
}
let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!(
"pageserver connection information should be provided"
))?;
// Similarly for safekeeper connection strings
// For backwards-compatibility, the top-level fields in the spec file
// may be empty. In that case, we need to dig them from the GUCs in the
// cluster.settings field.
let pageserver_connstr = spec
.pageserver_connstring
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))?
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
@@ -358,22 +329,22 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
} else {
let guc = spec
.cluster
spec.cluster
.settings
.find("neon.tenant_id")
.ok_or(anyhow::anyhow!("tenant id should be provided"))?;
TenantId::from_str(&guc).context("invalid tenant id")?
.ok_or("tenant id should be provided")
.map(|s| TenantId::from_str(&s))?
.or(Err("invalid tenant id"))?
};
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
timeline_id
} else {
let guc = spec
.cluster
spec.cluster
.settings
.find("neon.timeline_id")
.ok_or(anyhow::anyhow!("timeline id should be provided"))?;
TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))?
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?
};
let endpoint_storage_addr: Option<String> = spec
@@ -387,7 +358,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
let res = ParsedSpec {
spec,
pageserver_conninfo,
pageserver_connstr,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
@@ -397,7 +368,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
};
// Now check validity of the parsed specification
res.validate().map_err(anyhow::Error::msg)?;
res.validate()?;
Ok(res)
}
}
@@ -442,66 +413,6 @@ struct StartVmMonitorResult {
vm_monitor: Option<JoinHandle<Result<()>>>,
}
// BEGIN_HADRON
/// This function creates roles that are used by Databricks.
/// These roles are not needs to be botostrapped at PG Compute provisioning time.
/// The auth method for these roles are configured in databricks_pg_hba.conf in universe repository.
pub(crate) fn create_databricks_roles() -> Vec<String> {
let roles = vec![
// Role for prometheus_stats_exporter
Role {
name: "databricks_monitor".to_string(),
// This uses "local" connection and auth method for that is "trust", so no password is needed.
encrypted_password: None,
options: Some(vec![GenericOption {
name: "IN ROLE pg_monitor".to_string(),
value: None,
vartype: "string".to_string(),
}]),
},
// Role for brickstore control plane
Role {
name: "databricks_control_plane".to_string(),
// Certificate user does not need password.
encrypted_password: None,
options: Some(vec![GenericOption {
name: "SUPERUSER".to_string(),
value: None,
vartype: "string".to_string(),
}]),
},
// Role for brickstore httpgateway.
Role {
name: "databricks_gateway".to_string(),
// Certificate user does not need password.
encrypted_password: None,
options: None,
},
];
roles
.into_iter()
.map(|role| {
let query = format!(
r#"
DO $$
BEGIN
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = '{}')
THEN
CREATE ROLE {} {};
END IF;
END
$$;"#,
role.name,
role.name.pg_quote(),
role.to_pg_options(),
);
query
})
.collect()
}
/// Databricks-specific environment variables to be passed to the `postgres` sub-process.
pub struct DatabricksEnvVars {
/// The Databricks "endpoint ID" of the compute instance. Used by `postgres` to check
@@ -510,27 +421,14 @@ pub struct DatabricksEnvVars {
/// Hostname of the Databricks workspace URL this compute instance belongs to.
/// Used by postgres to verify Databricks PAT tokens.
pub workspace_host: String,
pub lakebase_mode: bool,
}
impl DatabricksEnvVars {
pub fn new(
compute_spec: &ComputeSpec,
compute_id: Option<&String>,
instance_id: Option<String>,
lakebase_mode: bool,
) -> Self {
let endpoint_id = if let Some(instance_id) = instance_id {
// Use instance_id as endpoint_id if it is set. This code path is for PuPr model.
instance_id
} else {
// Use compute_id as endpoint_id if instance_id is not set. The code path is for PrPr model.
// compute_id is a string format of "{endpoint_id}/{compute_idx}"
// endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
// Panics if compute_id is not set or not in the expected format.
compute_id.unwrap().split('/').next().unwrap().to_string()
};
pub fn new(compute_spec: &ComputeSpec, compute_id: Option<&String>) -> Self {
// compute_id is a string format of "{endpoint_id}/{compute_idx}"
// endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
// Panics if compute_id is not set or not in the expected format.
let endpoint_id = compute_id.unwrap().split('/').next().unwrap().to_string();
let workspace_host = compute_spec
.databricks_settings
.as_ref()
@@ -539,7 +437,6 @@ impl DatabricksEnvVars {
Self {
endpoint_id,
workspace_host,
lakebase_mode,
}
}
@@ -549,10 +446,6 @@ impl DatabricksEnvVars {
/// Convert DatabricksEnvVars to a list of string pairs that can be passed as env vars. Consumes `self`.
pub fn to_env_var_list(self) -> Vec<(String, String)> {
if !self.lakebase_mode {
// In neon env, we don't need to pass down the env vars to postgres.
return vec![];
}
vec![
(
Self::DATABRICKS_ENDPOINT_ID_ENVVAR.to_string(),
@@ -589,7 +482,7 @@ impl ComputeNode {
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
// Unset them via connection string options before connecting to the database.
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none";
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
let options = match conn_conf.get_options() {
// Allow the control plane to override any options set by the
// compute
@@ -602,11 +495,7 @@ impl ComputeNode {
let mut new_state = ComputeState::new();
if let Some(spec) = config.spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
if params.lakebase_mode {
ComputeNode::set_spec(&params, &mut new_state, pspec);
} else {
new_state.pspec = Some(pspec);
}
new_state.pspec = Some(pspec);
}
Ok(ComputeNode {
@@ -1204,14 +1093,7 @@ impl ComputeNode {
// If it is something different then create_dir() will error out anyway.
let pgdata = &self.params.pgdata;
let _ok = fs::remove_dir_all(pgdata);
if self.params.lakebase_mode {
// Ignore creation errors if the directory already exists (e.g. mounting it ahead of time).
// If it is something different then PG startup will error out anyway.
let _ok = fs::create_dir(pgdata);
} else {
fs::create_dir(pgdata)?;
}
fs::create_dir(pgdata)?;
fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?;
Ok(())
@@ -1223,10 +1105,12 @@ impl ComputeNode {
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let started = Instant::now();
let (connected, size) = match spec.pageserver_conninfo.prefer_protocol {
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
};
self.fix_zenith_signal_neon_signal()?;
@@ -1264,20 +1148,23 @@ impl ComputeNode {
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
/// the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_index = ShardIndex {
shard_number: ShardNumber(0),
shard_count: spec.pageserver_conninfo.shard_count,
let shard0_connstr = spec
.pageserver_connstr
.split(',')
.next()
.unwrap()
.to_string();
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
0 | 1 => ShardIndex::unsharded(),
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
};
let shard0_url = spec
.pageserver_conninfo
.shard_url(ShardNumber(0), PageserverProtocol::Grpc)?
.to_owned();
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::connect(
shard0_url,
shard0_connstr,
spec.tenant_id,
spec.timeline_id,
shard0_index,
shard_index,
spec.storage_auth_token.clone(),
None, // NB: base backups use payload compression
)
@@ -1309,9 +1196,7 @@ impl ComputeNode {
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
/// when the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_connstr = spec
.pageserver_conninfo
.shard_url(ShardNumber(0), PageserverProtocol::Libpq)?;
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
// Use the storage auth token from the config file, if given.
@@ -1398,7 +1283,10 @@ impl ComputeNode {
return result;
}
Err(ref e) if attempts < max_attempts => {
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
warn!(
"Failed to get basebackup: {} (attempt {}/{})",
e, attempts, max_attempts
);
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
retry_period_ms *= 1.5;
}
@@ -1560,41 +1448,6 @@ impl ComputeNode {
Ok(lsn)
}
fn sync_safekeepers_with_retries(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let max_retries = 5;
let mut attempts = 0;
loop {
let result = self.sync_safekeepers(storage_auth_token.clone());
match &result {
Ok(_) => {
if attempts > 0 {
tracing::info!("sync_safekeepers succeeded after {attempts} retries");
}
return result;
}
Err(e) if attempts < max_retries => {
tracing::info!(
"sync_safekeepers failed, will retry (attempt {attempts}): {e:#}"
);
}
Err(err) => {
tracing::warn!(
"sync_safekeepers still failed after {attempts} retries, giving up: {err:?}"
);
return result;
}
}
// sleep and retry
let backoff = exponential_backoff_duration(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
std::thread::sleep(backoff);
attempts += 1;
}
}
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]
@@ -1630,7 +1483,7 @@ impl ComputeNode {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone())
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn);
@@ -1646,8 +1499,16 @@ impl ComputeNode {
}
};
self.get_basebackup(compute_state, lsn)
.with_context(|| format!("failed to get basebackup@{lsn}"))?;
info!(
"getting basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr
);
self.get_basebackup(compute_state, lsn).with_context(|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr
)
})?;
if let Some(settings) = databricks_settings {
copy_tls_certificates(
@@ -1711,7 +1572,7 @@ impl ComputeNode {
// symlink doesn't affect anything.
//
// See https://github.com/neondatabase/autoscaling/issues/800
std::fs::remove_dir_all(pgdata_path.join("pg_dynshmem"))?;
std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
match spec.mode {
@@ -1726,12 +1587,6 @@ impl ComputeNode {
/// Start and stop a postgres process to warm up the VM for startup.
pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
if self.params.lakebase_mode {
// We are running in Hadron mode. Disabling this prewarming step for now as it could run
// into dblet port conflicts and also doesn't add much value with our current infra.
info!("Skipping postgres prewarming in Hadron mode");
return Ok(());
}
info!("prewarming VM memory");
// Create pgdata
@@ -1793,12 +1648,7 @@ impl ComputeNode {
let databricks_env_vars = {
let state = self.state.lock().unwrap();
let spec = &state.pspec.as_ref().unwrap().spec;
DatabricksEnvVars::new(
spec,
Some(&self.params.compute_id),
self.params.instance_id.clone(),
self.params.lakebase_mode,
)
DatabricksEnvVars::new(spec, Some(&self.params.compute_id))
};
info!(
@@ -1925,7 +1775,7 @@ impl ComputeNode {
// It doesn't matter what were the options before, here we just want
// to connect and create a new superuser role.
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0";
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
zenith_admin_conf.options(ZENITH_OPTIONS);
let mut client =
@@ -1970,15 +1820,7 @@ impl ComputeNode {
/// Do initial configuration of the already started Postgres.
#[instrument(skip_all)]
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
if self.params.lakebase_mode {
// Set a 2-minute statement_timeout for the session applying config. The individual SQL statements
// used in apply_spec_sql() should not take long (they are just creating users and installing
// extensions). If any of them are stuck for an extended period of time it usually indicates a
// pageserver connectivity problem and we should bail out.
conf.options("-c statement_timeout=2min");
}
let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
let conf = Arc::new(conf);
let spec = Arc::new(
@@ -2296,17 +2138,7 @@ impl ComputeNode {
pub fn check_for_core_dumps(&self) -> Result<()> {
let core_dump_dir = match std::env::consts::OS {
"macos" => Path::new("/cores/"),
// BEGIN HADRON
// NB: Read core dump files from a fixed location outside of
// the data directory since `compute_ctl` wipes the data directory
// across container restarts.
_ => {
if self.params.lakebase_mode {
Path::new("/databricks/logs/brickstore")
} else {
Path::new(&self.params.pgdata)
}
} // END HADRON
_ => Path::new(&self.params.pgdata),
};
// Collect core dump paths if any
@@ -2380,13 +2212,13 @@ impl ComputeNode {
let result = client
.simple_query(
"SELECT
pg_catalog.row_to_json(pss)
row_to_json(pg_stat_statements)
FROM
public.pg_stat_statements pss
pg_stat_statements
WHERE
pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid
userid != 'cloud_admin'::regrole::oid
ORDER BY
(pss.mean_exec_time + pss.mean_plan_time) DESC
(mean_exec_time + mean_plan_time) DESC
LIMIT 100",
)
.await;
@@ -2514,11 +2346,11 @@ LIMIT 100",
// check the role grants first - to gracefully handle read-replicas.
let select = "SELECT privilege_type
FROM pg_catalog.pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true
JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid
WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name
AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name";
FROM pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
JOIN pg_user users ON acl.grantee = users.usesysid
WHERE users.usename = $1
AND nspname = $2";
let rows = db_client
.query(select, &[role_name, schema_name])
.await
@@ -2587,9 +2419,8 @@ LIMIT 100",
.await
.with_context(|| format!("Failed to execute query: {query}"))?;
} else {
let query = format!(
"CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}"
);
let query =
format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
db_client
.simple_query(&query)
.await
@@ -2620,7 +2451,7 @@ LIMIT 100",
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
@@ -2639,7 +2470,7 @@ LIMIT 100",
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
preload_libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
@@ -2692,22 +2523,22 @@ LIMIT 100",
/// The operation will time out after a specified duration.
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
let state = self.state.lock().unwrap();
let old_pageserver_conninfo = state
let old_pageserver_connstr = state
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_conninfo
.pageserver_connstr
.clone();
let mut unchanged = true;
let _ = self
.state_changed
.wait_timeout_while(state, duration, |s| {
let pageserver_conninfo = &s
let pageserver_connstr = &s
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_conninfo;
unchanged = pageserver_conninfo == &old_pageserver_conninfo;
.pageserver_connstr;
unchanged = pageserver_connstr == &old_pageserver_connstr;
unchanged
})
.unwrap();
@@ -2965,10 +2796,7 @@ mod tests {
match ParsedSpec::try_from(spec.clone()) {
Ok(_p) => panic!("Failed to detect duplicate entry"),
Err(e) => assert!(
e.to_string()
.starts_with("duplicate entry in safekeeper_connstrings:")
),
Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
};
}
}

View File

@@ -7,8 +7,7 @@ use http::StatusCode;
use reqwest::Client;
use std::mem::replace;
use std::sync::Arc;
use tokio::{io::AsyncReadExt, select, spawn};
use tokio_util::sync::CancellationToken;
use tokio::{io::AsyncReadExt, spawn};
use tracing::{error, info};
#[derive(serde::Serialize, Default)]
@@ -93,35 +92,34 @@ impl ComputeNode {
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
/// Has a failpoint "compute-prewarm"
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
let token: CancellationToken;
{
let state = &mut self.state.lock().unwrap();
token = state.lfc_prewarm_token.clone();
if let LfcPrewarmState::Prewarming =
replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming)
{
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
return false;
}
}
crate::metrics::LFC_PREWARMS.inc();
let this = self.clone();
let cloned = self.clone();
spawn(async move {
let prewarm_state = match this.prewarm_impl(from_endpoint, token).await {
Ok(state) => state,
let state = match cloned.prewarm_impl(from_endpoint).await {
Ok(true) => LfcPrewarmState::Completed,
Ok(false) => {
info!(
"skipping LFC prewarm because LFC state is not found in endpoint storage"
);
LfcPrewarmState::Skipped
}
Err(err) => {
crate::metrics::LFC_PREWARM_ERRORS.inc();
error!(%err, "could not prewarm LFC");
let error = format!("{err:#}");
LfcPrewarmState::Failed { error }
LfcPrewarmState::Failed {
error: format!("{err:#}"),
}
}
};
let state = &mut this.state.lock().unwrap();
if let LfcPrewarmState::Cancelled = prewarm_state {
state.lfc_prewarm_token = CancellationToken::new();
}
state.lfc_prewarm_state = prewarm_state;
cloned.state.lock().unwrap().lfc_prewarm_state = state;
});
true
}
@@ -134,70 +132,47 @@ impl ComputeNode {
/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
async fn prewarm_impl(
&self,
from_endpoint: Option<String>,
token: CancellationToken,
) -> Result<LfcPrewarmState> {
let EndpointStoragePair {
url,
token: storage_token,
} = self.endpoint_storage_pair(from_endpoint)?;
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
#[cfg(feature = "testing")]
fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));
fail::fail_point!("compute-prewarm", |_| {
bail!("prewarm configured to fail because of a failpoint")
});
info!(%url, "requesting LFC state from endpoint storage");
let request = Client::new().get(&url).bearer_auth(storage_token);
let response = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
response = request.send() => response
}
.context("querying endpoint storage")?;
match response.status() {
let request = Client::new().get(&url).bearer_auth(token);
let res = request.send().await.context("querying endpoint storage")?;
match res.status() {
StatusCode::OK => (),
StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
StatusCode::NOT_FOUND => {
return Ok(false);
}
status => bail!("{status} querying endpoint storage"),
}
let mut uncompressed = Vec::new();
let lfc_state = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
lfc_state = response.bytes() => lfc_state
}
.context("getting request body from endpoint storage")?;
let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice());
select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
read = decoder.read_to_end(&mut uncompressed) => read
}
.context("decoding LFC state")?;
let uncompressed_len = uncompressed.len();
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");
// Client connection and prewarm info querying are fast and therefore don't need
// cancellation
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
let lfc_state = res
.bytes()
.await
.context("connecting to postgres")?;
let pg_token = client.cancel_token();
.context("getting request body from endpoint storage")?;
ZstdDecoder::new(lfc_state.iter().as_slice())
.read_to_end(&mut uncompressed)
.await
.context("decoding LFC state")?;
let uncompressed_len = uncompressed.len();
let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed];
select! {
res = client.query_one("select neon.prewarm_local_cache($1)", &params) => res,
_ = token.cancelled() => {
pg_token.cancel_query(postgres::NoTls).await
.context("cancelling neon.prewarm_local_cache()")?;
return Ok(LfcPrewarmState::Cancelled)
}
}
.context("loading LFC state into postgres")
.map(|_| ())?;
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");
Ok(LfcPrewarmState::Completed)
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
.await
.context("loading LFC state into postgres")
.map(|_| ())?;
Ok(true)
}
/// If offload request is ongoing, return false, true otherwise
@@ -225,20 +200,20 @@ impl ComputeNode {
async fn offload_lfc_with_state_update(&self) {
crate::metrics::LFC_OFFLOADS.inc();
let state = match self.offload_lfc_impl().await {
Ok(state) => state,
Err(err) => {
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC");
let error = format!("{err:#}");
LfcOffloadState::Failed { error }
}
let Err(err) = self.offload_lfc_impl().await else {
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
return;
};
self.state.lock().unwrap().lfc_offload_state = state;
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC state to endpoint storage");
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
error: format!("{err:#}"),
};
}
async fn offload_lfc_impl(&self) -> Result<LfcOffloadState> {
async fn offload_lfc_impl(&self) -> Result<()> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
info!(%url, "requesting LFC state from Postgres");
@@ -253,7 +228,7 @@ impl ComputeNode {
.context("deserializing LFC state")?;
let Some(state) = state else {
info!(%url, "empty LFC state, not exporting");
return Ok(LfcOffloadState::Skipped);
return Ok(());
};
let mut compressed = Vec::new();
@@ -267,7 +242,7 @@ impl ComputeNode {
let request = Client::new().put(url).bearer_auth(token).body(compressed);
match request.send().await {
Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed),
Ok(res) if res.status() == StatusCode::OK => Ok(()),
Ok(res) => bail!(
"Request to endpoint storage failed with status: {}",
res.status()
@@ -275,8 +250,4 @@ impl ComputeNode {
Err(err) => Err(err).context("writing to endpoint storage"),
}
}
pub fn cancel_prewarm(self: &Arc<Self>) {
self.state.lock().unwrap().lfc_prewarm_token.cancel();
}
}

View File

@@ -78,7 +78,7 @@ impl ComputeNode {
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
@@ -103,7 +103,7 @@ impl ComputeNode {
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_catalog.pg_reload_conf()", &[])
.query("SELECT pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
@@ -113,7 +113,7 @@ impl ComputeNode {
});
let row = client
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
.query_one("SELECT * FROM pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {

View File

@@ -8,7 +8,7 @@ use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption, PageserverProtocol,
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
};
use crate::compute::ComputeNodeParams;
@@ -18,8 +18,6 @@ use crate::pg_helpers::{
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
use utils::shard::{ShardIndex, ShardNumber};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
@@ -65,111 +63,15 @@ pub fn write_postgres_conf(
writeln!(file, "{conf}")?;
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
match conninfo.prefer_protocol {
PageserverProtocol::Libpq => {
writeln!(file, "neon.use_communicator_worker=false")?;
}
PageserverProtocol::Grpc => {
writeln!(file, "neon.use_communicator_worker=true")?;
}
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_connection_info.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
conninfo.shard_count.0
};
for shard_number in 0..num_shards {
let shard_index = ShardIndex {
shard_number: ShardNumber(shard_number),
shard_count: conninfo.shard_count,
};
let info = conninfo.shards.get(&shard_index).ok_or_else(|| {
anyhow::anyhow!(
"shard {shard_index} missing from pageserver_connection_info shard map"
)
})?;
let first_pageserver = info
.pageservers
.first()
.expect("must have at least one pageserver");
// Add the libpq URL to the array, or if the URL is missing, reset the array
// forgetting any previous entries. All servers must have a libpq URL, or none
// at all.
if let Some(url) = &first_pageserver.libpq_url {
if let Some(ref mut urls) = libpq_urls {
urls.push(url.clone());
}
} else {
libpq_urls = None
}
// Similarly for gRPC URLs
if let Some(url) = &first_pageserver.grpc_url {
if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone());
}
} else {
grpc_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"# derived from compute spec's pageserver_connection_info field"
)?;
writeln!(
file,
"neon.pageserver_connstring={}",
escape_conf_value(&libpq_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}
if let Some(grpc_urls) = grpc_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
"neon.pageserver_grpc_urls={}",
escape_conf_value(&grpc_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_grpc_urls")?;
}
} else {
writeln!(file, "neon.use_communicator_worker=false")?;
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(

View File

@@ -122,11 +122,8 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
// into the type system.
assert_eq!(state.status, ComputeStatus::RefreshConfiguration);
if state
.pspec
.as_ref()
.map(|ps| ps.pageserver_conninfo.clone())
== Some(pspec.pageserver_conninfo.clone())
if state.pspec.as_ref().map(|ps| ps.pageserver_connstr.clone())
== Some(pspec.pageserver_connstr.clone())
{
info!(
"Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration."

View File

@@ -139,15 +139,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/LfcPrewarmState"
delete:
tags:
- Prewarm
summary: Cancel ongoing LFC prewarm
description: ""
operationId: cancelLfcPrewarm
responses:
202:
description: Prewarm cancelled
/lfc/offload:
post:
@@ -645,7 +636,7 @@ components:
properties:
status:
description: LFC offload status
enum: [not_offloaded, offloading, completed, skipped, failed]
enum: [not_offloaded, offloading, completed, failed]
type: string
error:
description: LFC offload error, if any

View File

@@ -46,8 +46,3 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response {
)
}
}
pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode {
compute.cancel_prewarm();
StatusCode::ACCEPTED
}

View File

@@ -99,12 +99,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
);
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route(
"/lfc/prewarm",
get(lfc::prewarm_state)
.post(lfc::prewarm)
.delete(lfc::cancel_prewarm),
)
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable))

View File

@@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
.query(
"SELECT datname FROM pg_catalog.pg_database
WHERE datallowconn
AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4)
AND datconnlimit <> - 2
LIMIT 500",
&[],
)
@@ -67,7 +67,7 @@ pub async fn get_installed_extensions(
let extensions: Vec<(String, String, i32)> = client
.query(
"SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension",
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)
.await?

View File

@@ -4,13 +4,14 @@ use std::thread;
use std::time::{Duration, SystemTime};
use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
use compute_api::spec::{ComputeMode, PageserverProtocol};
use itertools::Itertools as _;
use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::TenantShardId;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
use crate::compute::ComputeNode;
@@ -28,10 +29,7 @@ pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
let compute = compute.clone();
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
let runtime = tokio::runtime::Handle::current();
thread::spawn(move || {
let _rt_guard = runtime.enter();
let _entered = span.entered();
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
// TODO: might need stronger error feedback than logging an warning.
@@ -80,16 +78,17 @@ fn acquire_lsn_lease_with_retry(
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let (conninfo, auth) = {
let (connstrings, auth) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
(
spec.pageserver_conninfo.clone(),
spec.pageserver_connstr.clone(),
spec.storage_auth_token.clone(),
)
};
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
let result =
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
match result {
Ok(Some(res)) => {
return Ok(res);
@@ -113,44 +112,35 @@ fn acquire_lsn_lease_with_retry(
/// Tries to acquire LSN leases on all Pageserver shards.
fn try_acquire_lsn_lease(
conninfo: PageserverConnectionInfo,
connstrings: &str,
auth: Option<&str>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let connstrings = connstrings.split(',').collect_vec();
let shard_count = connstrings.len();
let mut leases = Vec::new();
for (shard_index, shard) in conninfo.shards.into_iter() {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number: shard_index.shard_number,
shard_count: shard_index.shard_count,
for (shard_number, &connstring) in connstrings.iter().enumerate() {
let tenant_shard_id = match shard_count {
0 | 1 => TenantShardId::unsharded(tenant_id),
shard_count => TenantShardId {
tenant_id,
shard_number: ShardNumber(shard_number as u8),
shard_count: ShardCount::new(shard_count as u8),
},
};
// XXX: If there are more than pageserver for the one shard, do we need to get a
// leas on all of them? Currently, that's what we assume, but this is hypothetical
// as of this writing, as we never pass the info for more than one pageserver per
// shard.
for pageserver in shard.pageservers {
let lease = match conninfo.prefer_protocol {
PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
&pageserver.grpc_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?,
PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
&pageserver.libpq_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?,
};
leases.push(lease);
}
let lease = match PageserverProtocol::from_connstring(connstring)? {
PageserverProtocol::Libpq => {
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
PageserverProtocol::Grpc => {
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
};
leases.push(lease);
}
Ok(leases.into_iter().min().flatten())

View File

@@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> {
self.client
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
.await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
self.client
.simple_query(
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",

View File

@@ -15,17 +15,17 @@ DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member')
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_')
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
END IF;
END $$;

View File

@@ -5,9 +5,9 @@ DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END $$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
END IF;

View File

@@ -2,7 +2,7 @@ DO $$
DECLARE
bypassrls boolean;
BEGIN
SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
IF NOT bypassrls THEN
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
END IF;

View File

@@ -4,8 +4,8 @@ DECLARE
BEGIN
FOR role IN
SELECT rolname AS name, rolinherit AS inherit
FROM pg_catalog.pg_roles
WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
FROM pg_roles
WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
IF NOT role.inherit THEN
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
@@ -14,12 +14,12 @@ BEGIN
FOR role IN
SELECT rolname AS name, rolbypassrls AS bypassrls
FROM pg_catalog.pg_roles
WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT pg_catalog.starts_with(rolname, 'pg_')
FROM pg_roles
WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT starts_with(rolname, 'pg_')
LOOP
IF role.bypassrls THEN
RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name);
END IF;
END LOOP;
END $$;

View File

@@ -1,10 +1,10 @@
DO $$
BEGIN
IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
RETURN;
END IF;
IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
END IF;
END $$;

View File

@@ -2,12 +2,12 @@ DO $$
DECLARE
monitor record;
BEGIN
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
admin_option AS admin
INTO monitor
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_monitor'::pg_catalog.regrole
AND member = 'neon_superuser'::pg_catalog.regrole;
FROM pg_auth_members
WHERE roleid = 'pg_monitor'::regrole
AND member = 'neon_superuser'::regrole;
IF monitor IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';

View File

@@ -2,11 +2,11 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute'))
SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
INTO can_execute
FROM pg_catalog.pg_proc
FROM pg_proc
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace;
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
END IF;

View File

@@ -2,9 +2,9 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')
SELECT has_function_privilege('neon_superuser', oid, 'execute')
INTO can_execute
FROM pg_catalog.pg_proc
FROM pg_proc
WHERE proname = 'pg_show_replication_origin_status'
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN

View File

@@ -2,10 +2,10 @@ DO $$
DECLARE
signal_backend record;
BEGIN
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
admin_option AS admin
INTO signal_backend
FROM pg_catalog.pg_auth_members
FROM pg_auth_members
WHERE roleid = 'pg_signal_backend'::regrole
AND member = 'neon_superuser'::regrole;

View File

@@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
// like `postgres_exporter` use it to query Postgres statistics.
// Use explicit 8 bytes type casts to match Rust types.
let stats = cli.query_one(
"SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time,
pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions
FROM pg_catalog.pg_stat_database
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
coalesce(sum(sessions), 0)::bigint AS total_sessions
FROM pg_stat_database
WHERE datname NOT IN (
'postgres',
'template0',
@@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
let mut last_active: Option<DateTime<Utc>> = None;
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli.query(
"SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
FROM pg_stat_activity
WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text
AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid()
AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors?
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);

View File

@@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async(
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid
datconnlimit = - 2 AS invalid
FROM
pg_catalog.pg_database;",
&[],

View File

@@ -13,19 +13,17 @@ use tokio_postgres::Client;
use tokio_postgres::error::SqlState;
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState, create_databricks_roles};
use crate::hadron_metrics::COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS;
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
use crate::pg_helpers::{
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
get_existing_roles_async,
};
use crate::spec_apply::ApplySpecPhase::{
AddDatabricksGrants, AlterDatabricksRoles, CreateAndAlterDatabases, CreateAndAlterRoles,
CreateAvailabilityCheck, CreateDatabricksMisc, CreateDatabricksRoles, CreatePgauditExtension,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleDatabricksAuthExtension, HandleNeonExtension, HandleOtherExtensions,
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
@@ -82,7 +80,7 @@ impl ComputeNode {
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
drop_subscriptions_done = match
client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await {
Ok(result) => !result.is_empty(),
Err(e) =>
{
@@ -168,7 +166,6 @@ impl ComputeNode {
concurrency_token.clone(),
db,
[DropLogicalSubscriptions].to_vec(),
self.params.lakebase_mode,
);
Ok(tokio::spawn(fut))
@@ -189,33 +186,15 @@ impl ComputeNode {
};
}
let phases = if self.params.lakebase_mode {
vec![
CreatePrivilegedRole,
// BEGIN_HADRON
CreateDatabricksRoles,
AlterDatabricksRoles,
// END_HADRON
for phase in [
CreatePrivilegedRole,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
]
} else {
vec![
CreatePrivilegedRole,
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
]
};
for phase in phases {
] {
info!("Applying phase {:?}", &phase);
apply_operations(
params.clone(),
@@ -224,7 +203,6 @@ impl ComputeNode {
jwks_roles.clone(),
phase,
|| async { Ok(&client) },
self.params.lakebase_mode,
)
.await?;
}
@@ -276,7 +254,6 @@ impl ComputeNode {
concurrency_token.clone(),
db,
phases,
self.params.lakebase_mode,
);
Ok(tokio::spawn(fut))
@@ -288,28 +265,12 @@ impl ComputeNode {
handle.await??;
}
let mut phases = if self.params.lakebase_mode {
vec![
HandleOtherExtensions,
HandleNeonExtension, // This step depends on CreateSchemaNeon
// BEGIN_HADRON
HandleDatabricksAuthExtension,
// END_HADRON
CreateAvailabilityCheck,
DropRoles,
// BEGIN_HADRON
AddDatabricksGrants,
CreateDatabricksMisc,
// END_HADRON
]
} else {
vec![
let mut phases = vec![
HandleOtherExtensions,
HandleNeonExtension, // This step depends on CreateSchemaNeon
CreateAvailabilityCheck,
DropRoles,
]
};
];
// This step depends on CreateSchemaNeon
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
@@ -342,7 +303,6 @@ impl ComputeNode {
jwks_roles.clone(),
phase,
|| async { Ok(&client) },
self.params.lakebase_mode,
)
.await?;
}
@@ -368,7 +328,6 @@ impl ComputeNode {
concurrency_token: Arc<tokio::sync::Semaphore>,
db: DB,
subphases: Vec<PerDatabasePhase>,
lakebase_mode: bool,
) -> Result<()> {
let _permit = concurrency_token.acquire().await?;
@@ -396,7 +355,6 @@ impl ComputeNode {
let client = client_conn.as_ref().unwrap();
Ok(client)
},
lakebase_mode,
)
.await?;
}
@@ -519,10 +477,6 @@ pub enum PerDatabasePhase {
#[derive(Clone, Debug)]
pub enum ApplySpecPhase {
CreatePrivilegedRole,
// BEGIN_HADRON
CreateDatabricksRoles,
AlterDatabricksRoles,
// END_HADRON
DropInvalidDatabases,
RenameRoles,
CreateAndAlterRoles,
@@ -535,14 +489,7 @@ pub enum ApplySpecPhase {
DisablePostgresDBPgAudit,
HandleOtherExtensions,
HandleNeonExtension,
// BEGIN_HADRON
HandleDatabricksAuthExtension,
// END_HADRON
CreateAvailabilityCheck,
// BEGIN_HADRON
AddDatabricksGrants,
CreateDatabricksMisc,
// END_HADRON
DropRoles,
FinalizeDropLogicalSubscriptions,
}
@@ -578,7 +525,6 @@ pub async fn apply_operations<'a, Fut, F>(
jwks_roles: Arc<HashSet<String>>,
apply_spec_phase: ApplySpecPhase,
client: F,
lakebase_mode: bool,
) -> Result<()>
where
F: FnOnce() -> Fut,
@@ -625,23 +571,6 @@ where
},
query
);
if !lakebase_mode {
return res;
}
// BEGIN HADRON
if let Err(e) = res.as_ref() {
if let Some(sql_state) = e.code() {
if sql_state.code() == "57014" {
// SQL State 57014 (ERRCODE_QUERY_CANCELED) is used for statement timeouts.
// Increment the counter whenever a statement timeout occurs. Timeouts on
// this configuration path can only occur due to PS connectivity problems that
// Postgres failed to recover from.
COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc();
}
}
}
// END HADRON
res
}
.instrument(inspan)
@@ -679,44 +608,10 @@ async fn get_operations<'a>(
ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
query: format!(
include_str!("sql/create_privileged_role.sql"),
privileged_role_name = params.privileged_role_name,
privileges = if params.lakebase_mode {
"CREATEDB CREATEROLE NOLOGIN BYPASSRLS"
} else {
"CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS"
}
privileged_role_name = params.privileged_role_name
),
comment: None,
}))),
// BEGIN_HADRON
// New Hadron phase
ApplySpecPhase::CreateDatabricksRoles => {
let queries = create_databricks_roles();
let operations = queries.into_iter().map(|query| Operation {
query,
comment: None,
});
Ok(Box::new(operations))
}
// Backfill existing databricks_reader_* roles with statement timeout from GUC
ApplySpecPhase::AlterDatabricksRoles => {
let query = String::from(include_str!(
"sql/alter_databricks_reader_roles_timeout.sql"
));
let operations = once(Operation {
query,
comment: Some(
"Backfill existing databricks_reader_* roles with statement timeout"
.to_string(),
),
});
Ok(Box::new(operations))
}
// End of new Hadron Phase
// END_HADRON
ApplySpecPhase::DropInvalidDatabases => {
let mut ctx = ctx.write().await;
let databases = &mut ctx.dbs;
@@ -1086,10 +981,7 @@ async fn get_operations<'a>(
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
role_name = escaped_role,
outer_tag = outer_tag,
)
// HADRON change:
.replace("neon_superuser", &params.privileged_role_name),
// HADRON change end ,
),
comment: None,
},
// This now will only drop privileges of the role
@@ -1125,8 +1017,7 @@ async fn get_operations<'a>(
comment: None,
},
Operation {
query: String::from(include_str!("sql/default_grants.sql"))
.replace("neon_superuser", &params.privileged_role_name),
query: String::from(include_str!("sql/default_grants.sql")),
comment: None,
},
]
@@ -1142,9 +1033,7 @@ async fn get_operations<'a>(
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("pg_stat_statements") {
return Ok(Box::new(once(Operation {
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public",
),
query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
comment: Some(String::from("create system extensions")),
})));
}
@@ -1152,13 +1041,11 @@ async fn get_operations<'a>(
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public",
),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
@@ -1182,7 +1069,7 @@ async fn get_operations<'a>(
},
Operation {
query: String::from(
"UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false",
"UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
),
comment: Some(String::from("compat/fix: make neon relocatable")),
},
@@ -1199,28 +1086,6 @@ async fn get_operations<'a>(
Ok(Box::new(operations))
}
// BEGIN_HADRON
// Note: we may want to version the extension someday, but for now we just drop it and recreate it.
ApplySpecPhase::HandleDatabricksAuthExtension => {
let operations = vec![
Operation {
query: String::from("DROP EXTENSION IF EXISTS databricks_auth"),
comment: Some(String::from("dropping existing databricks_auth extension")),
},
Operation {
query: String::from("CREATE EXTENSION databricks_auth"),
comment: Some(String::from("creating databricks_auth extension")),
},
Operation {
query: String::from("GRANT SELECT ON databricks_auth_metrics TO pg_monitor"),
comment: Some(String::from("grant select on databricks auth counters")),
},
]
.into_iter();
Ok(Box::new(operations))
}
// END_HADRON
ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")),
comment: None,
@@ -1238,63 +1103,6 @@ async fn get_operations<'a>(
Ok(Box::new(operations))
}
// BEGIN_HADRON
// New Hadron phases
//
// Grants permissions to roles that are used by Databricks.
ApplySpecPhase::AddDatabricksGrants => {
let operations = vec![
Operation {
query: String::from("GRANT USAGE ON SCHEMA neon TO databricks_monitor"),
comment: Some(String::from(
"Permissions needed to execute neon.* functions (in the postgres database)",
)),
},
Operation {
query: String::from(
"GRANT SELECT, INSERT, UPDATE ON health_check TO databricks_monitor",
),
comment: Some(String::from("Permissions needed for read and write probes")),
},
Operation {
query: String::from(
"GRANT EXECUTE ON FUNCTION pg_ls_dir(text) TO databricks_monitor",
),
comment: Some(String::from(
"Permissions needed to monitor .snap file counts",
)),
},
Operation {
query: String::from(
"GRANT SELECT ON neon.neon_perf_counters TO databricks_monitor",
),
comment: Some(String::from(
"Permissions needed to access neon performance counters view",
)),
},
Operation {
query: String::from(
"GRANT EXECUTE ON FUNCTION neon.get_perf_counters() TO databricks_monitor",
),
comment: Some(String::from(
"Permissions needed to execute the underlying performance counters function",
)),
},
]
.into_iter();
Ok(Box::new(operations))
}
// Creates minor objects that are used by Databricks.
ApplySpecPhase::CreateDatabricksMisc => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/create_databricks_misc.sql")),
comment: Some(String::from(
"The function databricks_monitor uses to convert exception to 0 or 1",
)),
}))),
// End of new Hadron phases
// END_HADRON
ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
comment: None,

View File

@@ -3,17 +3,16 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name
AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
WHERE tablename = 'health_check'
)
THEN
CREATE TABLE public.health_check (
id pg_catalog.int4 primary key generated by default as identity,
updated_at pg_catalog.timestamptz default pg_catalog.now()
CREATE TABLE health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = pg_catalog.now();
SET updated_at = now();
END IF;
END
$$

View File

@@ -1,25 +0,0 @@
DO $$
DECLARE
reader_role RECORD;
timeout_value TEXT;
BEGIN
-- Get the current GUC setting for reader statement timeout
SELECT current_setting('databricks.reader_statement_timeout', true) INTO timeout_value;
-- Only proceed if timeout_value is not null/empty and not '0' (disabled)
IF timeout_value IS NOT NULL AND timeout_value != '' AND timeout_value != '0' THEN
-- Find all databricks_reader_* roles and update their statement_timeout
FOR reader_role IN
SELECT r.rolname
FROM pg_roles r
WHERE r.rolname ~ '^databricks_reader_\d+$'
LOOP
-- Apply the timeout setting to the role (will overwrite existing setting)
EXECUTE format('ALTER ROLE %I SET statement_timeout = %L',
reader_role.rolname, timeout_value);
RAISE LOG 'Updated statement_timeout = % for role %', timeout_value, reader_role.rolname;
END LOOP;
END IF;
END
$$;

View File

@@ -0,0 +1,12 @@
DO $$
DECLARE
query varchar;
BEGIN
FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
FROM pg_proc p
JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
WHERE nsp.nspname = 'anon' LOOP
EXECUTE query;
END LOOP;
END
$$;

View File

@@ -1,15 +0,0 @@
ALTER ROLE databricks_monitor SET statement_timeout = '60s';
CREATE OR REPLACE FUNCTION health_check_write_succeeds()
RETURNS INTEGER AS $$
BEGIN
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = now();
RETURN 1;
EXCEPTION WHEN OTHERS THEN
RAISE EXCEPTION '[DATABRICKS_SMGR] health_check failed: [%] %', SQLSTATE, SQLERRM;
RETURN 0;
END;
$$ LANGUAGE plpgsql;

View File

@@ -1,8 +1,8 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
THEN
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
END IF;
END
$$;

View File

@@ -4,14 +4,14 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'
WHERE nspname = 'public'
) AND
pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
current_setting('server_version_num')::int / 10000 >= 15
THEN
IF EXISTS(
SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
WHERE rolname = 'web_access'
)
THEN
GRANT CREATE ON SCHEMA public TO web_access;
@@ -20,7 +20,7 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'
WHERE nspname = 'public'
)
THEN
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;

View File

@@ -2,17 +2,11 @@ DO ${outer_tag}$
DECLARE
subname TEXT;
BEGIN
LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN
SELECT pg_subscription.subname
FROM pg_catalog.pg_subscription
WHERE subdbid OPERATOR(pg_catalog.=) (
SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
)
LOOP
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;

View File

@@ -3,19 +3,19 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
WHERE tablename = 'drop_subscriptions_done'
AND schemaname = 'neon'
)
THEN
CREATE TABLE neon.drop_subscriptions_done
(id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text);
(id serial primary key, timeline_id text);
END IF;
-- preserve the timeline_id of the last drop_subscriptions run
-- to ensure that the cleanup of a timeline is executed only once.
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
ON CONFLICT (id) DO UPDATE
SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
SET timeline_id = current_setting('neon.timeline_id');
END
$$

View File

@@ -15,15 +15,15 @@ BEGIN
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
pg_catalog.format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := pg_catalog.format(
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`

View File

@@ -5,17 +5,17 @@ DO ${outer_tag}$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
WHERE nspname = 'public'
)
THEN
SELECT nspowner::regrole::text
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
WHERE nspname = 'public'
INTO schema_owner;
IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
END IF;
END IF;
END

View File

@@ -3,10 +3,10 @@ DO ${outer_tag}$
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
WHERE datname = {datname}
)
THEN
EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
END IF;
END
${outer_tag}$;

File diff suppressed because it is too large Load Diff

View File

@@ -37,7 +37,7 @@
//! <other PostgreSQL files>
//! ```
//!
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use std::fmt::Display;
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
use std::path::PathBuf;
@@ -58,12 +58,8 @@ use compute_api::responses::{
};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
PageserverShardInfo, PgIdent, RemoteExtSpec, Role,
PgIdent, RemoteExtSpec, Role,
};
// re-export these, because they're used in the reconfigure() function
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
use jsonwebtoken::jwk::{
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -78,11 +74,9 @@ use sha2::{Digest, Sha256};
use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT;
use postgres_connection::parse_host_port;
use utils::shard::ShardStripeSize;
use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
@@ -393,8 +387,9 @@ pub struct EndpointStartArgs {
pub endpoint_storage_addr: String,
pub safekeepers_generation: Option<SafekeeperGeneration>,
pub safekeepers: Vec<NodeId>,
pub pageserver_conninfo: PageserverConnectionInfo,
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
pub remote_ext_base_url: Option<String>,
pub shard_stripe_size: usize,
pub create_test_user: bool,
pub start_timeout: Duration,
pub autoprewarm: bool,
@@ -667,6 +662,14 @@ impl Endpoint {
}
}
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
pageservers
.iter()
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
.collect::<Vec<_>>()
.join(",")
}
/// Map safekeepers ids to the actual connection strings.
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
let mut safekeeper_connstrings = Vec::new();
@@ -712,6 +715,9 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
assert!(!pageserver_connstring.is_empty());
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
// check for file remote_extensions_spec.json
@@ -726,44 +732,6 @@ impl Endpoint {
remote_extensions = None;
};
// For the sake of backwards-compatibility, also fill in 'pageserver_connstring'
//
// XXX: I believe this is not really needed, except to make
// test_forward_compatibility happy.
//
// Use a closure so that we can conviniently return None in the middle of the
// loop.
let pageserver_connstring: Option<String> = (|| {
let num_shards = args.pageserver_conninfo.shard_count.count();
let mut connstrings = Vec::new();
for shard_no in 0..num_shards {
let shard_index = ShardIndex {
shard_count: args.pageserver_conninfo.shard_count,
shard_number: ShardNumber(shard_no),
};
let shard = args
.pageserver_conninfo
.shards
.get(&shard_index)
.ok_or_else(|| {
anyhow!(
"shard {} not found in pageserver_connection_info",
shard_index
)
})?;
let pageserver = shard
.pageservers
.first()
.ok_or(anyhow!("must have at least one pageserver"))?;
if let Some(libpq_url) = &pageserver.libpq_url {
connstrings.push(libpq_url.clone());
} else {
return Ok::<_, anyhow::Error>(None);
}
}
Ok(Some(connstrings.join(",")))
})()?;
// Create config file
let config = {
let mut spec = ComputeSpec {
@@ -808,14 +776,13 @@ impl Endpoint {
branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode,
pageserver_connection_info: Some(args.pageserver_conninfo.clone()),
pageserver_connstring,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: args.auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size
shard_stripe_size: Some(args.shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
@@ -999,7 +966,7 @@ impl Endpoint {
// Update the pageservers in the spec file of the endpoint. This is useful to test the spec refresh scenario.
pub async fn update_pageservers_in_config(
&self,
pageserver_conninfo: &PageserverConnectionInfo,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
) -> Result<()> {
let config_path = self.endpoint_path().join("config.json");
let mut config: ComputeConfig = {
@@ -1007,8 +974,10 @@ impl Endpoint {
serde_json::from_reader(file)?
};
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstring.is_empty());
let mut spec = config.spec.unwrap();
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
spec.pageserver_connstring = Some(pageserver_connstring);
config.spec = Some(spec);
let file = std::fs::File::create(&config_path)?;
@@ -1051,7 +1020,8 @@ impl Endpoint {
pub async fn reconfigure(
&self,
pageserver_conninfo: Option<&PageserverConnectionInfo>,
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
safekeeper_generation: Option<SafekeeperGeneration>,
) -> Result<()> {
@@ -1066,15 +1036,15 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
if let Some(pageserver_conninfo) = pageserver_conninfo {
// If pageservers are provided, we need to ensure that they are not empty.
// This is a requirement for the compute_ctl configuration.
anyhow::ensure!(
!pageserver_conninfo.shards.is_empty(),
"no pageservers provided"
);
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
spec.shard_stripe_size = pageserver_conninfo.stripe_size;
// If pageservers are not specified, don't change them.
if let Some(pageservers) = pageservers {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
}
// If safekeepers are not specified, don't change them.
@@ -1123,9 +1093,11 @@ impl Endpoint {
pub async fn reconfigure_pageservers(
&self,
pageservers: &PageserverConnectionInfo,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
self.reconfigure(Some(pageservers), None, None).await
self.reconfigure(Some(pageservers), stripe_size, None, None)
.await
}
pub async fn reconfigure_safekeepers(
@@ -1133,7 +1105,7 @@ impl Endpoint {
safekeepers: Vec<NodeId>,
generation: SafekeeperGeneration,
) -> Result<()> {
self.reconfigure(None, Some(safekeepers), Some(generation))
self.reconfigure(None, None, Some(safekeepers), Some(generation))
.await
}
@@ -1216,84 +1188,3 @@ impl Endpoint {
)
}
}
/// If caller is telling us what pageserver to use, this is not a tenant which is
/// fully managed by storage controller, therefore not sharded.
pub fn local_pageserver_conf_to_conn_info(
conf: &crate::local_env::PageServerConf,
) -> Result<PageserverConnectionInfo> {
let libpq_url = {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
Some(format!("postgres://no_user@{host}:{port}"))
};
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let ps_conninfo = PageserverShardConnectionInfo {
id: Some(conf.id),
libpq_url,
grpc_url,
};
let shard_info = PageserverShardInfo {
pageservers: vec![ps_conninfo],
};
let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
.into_iter()
.collect();
Ok(PageserverConnectionInfo {
shard_count: ShardCount::unsharded(),
stripe_size: None,
shards,
prefer_protocol: PageserverProtocol::default(),
})
}
pub fn tenant_locate_response_to_conn_info(
response: &pageserver_api::controller_api::TenantLocateResponse,
) -> Result<PageserverConnectionInfo> {
let mut shards = HashMap::new();
for shard in response.shards.iter() {
tracing::info!("parsing {}", shard.listen_pg_addr);
let libpq_url = {
let host = &shard.listen_pg_addr;
let port = shard.listen_pg_port;
Some(format!("postgres://no_user@{host}:{port}"))
};
let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr {
let host = grpc_addr;
let port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let shard_info = PageserverShardInfo {
pageservers: vec![PageserverShardConnectionInfo {
id: Some(shard.node_id),
libpq_url,
grpc_url,
}],
};
shards.insert(shard.shard_id.to_index(), shard_info);
}
let stripe_size = if response.shard_params.count.is_unsharded() {
None
} else {
Some(response.shard_params.stripe_size)
};
Ok(PageserverConnectionInfo {
shard_count: response.shard_params.count,
stripe_size,
shards,
prefer_protocol: PageserverProtocol::default(),
})
}

View File

@@ -571,6 +571,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'basebackup_cache_enabled' as bool")?,
rel_size_v1_access_disabled: settings
.remove("rel_size_v1_access_disabled")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'rel_size_v1_access_disabled' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -120,11 +120,6 @@
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "neon.pageserver_grpc_urls",
"value": "grpc://pageserver:6401/",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",

View File

@@ -1,7 +1,6 @@
broker_endpoint='http://storage_broker:50051'
pg_distrib_dir='/usr/local/'
listen_pg_addr='0.0.0.0:6400'
listen_grpc_addr='0.0.0.0:6401'
listen_http_addr='0.0.0.0:9898'
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address

View File

@@ -1,246 +0,0 @@
# Node deletion API improvement
Created on 2025-07-07
Implemented on _TBD_
## Summary
This RFC describes improvements to the storage controller API for gracefully deleting pageserver
nodes.
## Motivation
The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333)
has several limitations:
- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and
we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone
mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036)
- Process of node deletion is not graceful, i.e. it just imitates a node failure
In this context, "graceful" node deletion means that users do not experience any disruption or
negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers
can handle the workload and all requirements are met). To achieve this, the system must perform
live migration of all tenant shards from the node being deleted while the node is still running
and continue processing all incoming requests. The node is removed only after all tenant shards
have been safely migrated.
Although live migrations can be achieved with the drain functionality, it leads to incorrect shard
placement, such as not matching availability zones. This results in unnecessary work to optimize
the placement that was just recently performed.
If we delete a node before its tenant shards are fully moved, the new node won't have all the
needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at
first. If there are many tenant shards, this slowdown affects a huge amount of users.
Graceful node deletion is more complicated and can introduce new issues. It takes longer because
live migration of each tenant shard can last several minutes. Using non-blocking accessors may
also cause deletion to wait if other processes are holding inner state lock. It also gets trickier
because we need to handle other requests, like drain and fill, at the same time.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
- storage controller
- pageserver (indirectly)
## Proposed implementation
### Tombstones
To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced
as part of the node stored information. Each node has a separate `NodeLifecycle` field with two
possible states: `Active` and `Deleted`. When node deletion completes, the database row is not
deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted`
lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach
and register functionality must be aware of tombstones. Additionally, new debug handlers are
available for listing and deleting tombstones via the `/debug/v1/tombstone` path.
### Gracefulness
The problem of making node deletion graceful is complex and involves several challenges:
- **Cancellable**: The operation must be cancellable to allow administrators to abort the process
if needed, e.g. if run by mistake.
- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node
deletion process. We need clear policies for handling concurrent operations: what happens when a
drain/fill request arrives while deletion is in progress, and what happens when a delete request
arrives while drain/fill is in progress.
- **Persistent**: If the storage controller restarts during this long-running operation, we must
preserve progress and automatically resume the deletion process after the storage controller
restarts.
- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled
for deletion, as this would move shards to irrelevant locations. The drain process expects the
node to return, so it only moves shards to backup locations, not to their preferred AZs. It also
leaves secondary locations unmoved. This could result in unnecessary load on the storage
controller and inefficient resource utilization.
- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when
time constraints or emergency situations require it, bypassing the normal graceful migration
process.
See below for a detailed breakdown of the proposed changes and mechanisms.
#### Node lifecycle
New `NodeLifecycle` enum and a matching database field with these values:
- `Active`: The normal state. All operations are allowed.
- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or
will happen later, but the node will eventually be removed. All operations are allowed.
- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought
back. The only action left is to remove its record from the database. Any attempt to register a
node in this state will fail.
This state persists across storage controller restarts.
**State transition**
```
+--------------------+
+---| Active |<---------------------+
| +--------------------+ |
| ^ |
| start_node_delete | cancel_node_delete |
v | |
+----------------------------------+ |
| ScheduledForDeletion | |
+----------------------------------+ |
| |
| node_register |
| |
| delete_node (at the finish) |
| |
v |
+---------+ tombstone_delete +----------+
| Deleted |-------------------------------->| no row |
+---------+ +----------+
```
#### NodeSchedulingPolicy::Deleting
A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is
running for the node right now. Only one node can have the `Deleting` policy at a time.
The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage
controller restart, any node previously marked as `Deleting` will have its scheduling policy reset
to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is
actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state.
`NodeSchedulingPolicy` transition details:
1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`.
2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the
policy to its previous value. The policy is persisted in storcon DB.
3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since
`NodeLifecycle::Deleted` prevents any further access to this field.
The deletion process cannot be initiated for nodes currently undergoing deployment-related
operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered
once the node transitions to either the `Active` or `Pause` state.
#### OperationTracker
A replacement for `Option<OperationHandler> ongoing_operation`, the `OperationTracker` is a
dedicated service state object responsible for managing all long-running node operations (drain,
fill, delete) with robust concurrency control.
Key responsibilities:
- Orchestrates the execution of operations
- Supports cancellation of currently running operations
- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time
- Persists deletion state, enabling recovery of pending deletions across restarts
- Ensures thread safety across concurrent requests
#### Attached tenant shard processing
When deleting a node, handle each attached tenant shard as follows:
1. Pick the best node to become the new attached (the candidate).
2. If the candidate already has this shard as a secondary:
- Create a new secondary for the shard on another suitable node.
Otherwise:
- Create a secondary for the shard on the candidate node.
3. Wait until all secondaries are ready and pre-warmed.
4. Promote the candidate's secondary to attached.
5. Remove the secondary from the node being deleted.
This process safely moves all attached shards before deleting the node.
#### Secondary tenant shard processing
When deleting a node, handle each secondary tenant shard as follows:
1. Choose the best node to become the new secondary.
2. Create a secondary for the shard on that node.
3. Wait until the new secondary is ready.
4. Remove the secondary from the node being deleted.
This ensures all secondary shards are safely moved before deleting the node.
### Reliability, failure modes and corner cases
In case of a storage controller failure and following restart, the system behavior depends on the
`NodeLifecycle` state:
- If `NodeLifecycle` is `Active`: No action is taken for this node.
- If `NodeLifecycle` is `Deleted`: The node will not be re-added.
- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for
this node.
In case of a pageserver node failure during deletion, the behavior depends on the `force` flag:
- If `force` is set: The node deletion will proceed regardless of the node's availability.
- If `force` is not set: The deletion will be retried a limited number of times. If the node
remains unavailable, the deletion process will pause and automatically resume when the node
becomes healthy again.
### Operations concurrency
The following sections describe the behavior when different types of requests arrive at the storage
controller and how they interact with ongoing operations.
#### Delete request
Handler: `PUT /control/v1/node/:node_id/delete`
1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`:
- Return `200 OK`: there is already an ongoing deletion request for this node
2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion`
3. Persist current scheduling policy
4. If there is no active operation (drain/fill/delete):
- Run deletion process for this node
#### Cancel delete request
Handler: `DELETE /control/v1/node/:node_id/delete`
1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`:
- Return `404 Not Found`: there is no current deletion request for this node
2. If the active operation is deleting this node, cancel it
3. Update & persist lifecycle to `NodeLifecycle::Active`
4. Restore the last scheduling policy from persistence
#### Drain/fill request
1. If there are already ongoing drain/fill processes:
- Return `409 Conflict`: queueing of drain/fill processes is not supported
2. If there is an ongoing delete process:
- Cancel it and wait until it is cancelled
3. Run the drain/fill process
4. After the drain/fill process is cancelled or finished:
- Try to find another candidate to delete and run the deletion process for that node
#### Drain/fill cancel request
1. If the active operation is not the related process:
- Return `400 Bad Request`: cancellation request is incorrect, operations are not the same
2. Cancel the active operation
3. Try to find another candidate to delete and run the deletion process for that node
## Definition of Done
- [x] Fix flaky node scenario and introduce related debug handlers
- [ ] Node deletion intent is persistent - a node will be eventually deleted after a deletion
request regardless of draining/filling requests and restarts
- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to
recommended locations
- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion
process and deletion resumes after drain/fill completes
- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a
pageserver node does not respond)
- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli

View File

@@ -68,15 +68,11 @@ pub enum LfcPrewarmState {
/// We tried to fetch the corresponding LFC state from the endpoint storage,
/// but received `Not Found 404`. This should normally happen only during the
/// first endpoint start after creation with `autoprewarm: true`.
/// This may also happen if LFC is turned off or not initialized
///
/// During the orchestrated prewarm via API, when a caller explicitly
/// provides the LFC state key to prewarm from, it's the caller responsibility
/// to handle this status as an error state in this case.
Skipped,
/// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query
/// has started working before cancellation
Cancelled,
}
impl Display for LfcPrewarmState {
@@ -87,7 +83,6 @@ impl Display for LfcPrewarmState {
LfcPrewarmState::Completed => f.write_str("Completed"),
LfcPrewarmState::Skipped => f.write_str("Skipped"),
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
}
}
}
@@ -102,7 +97,6 @@ pub enum LfcOffloadState {
Failed {
error: String,
},
Skipped,
}
#[derive(Serialize, Debug, Clone, PartialEq)]

View File

@@ -12,9 +12,8 @@ use regex::Regex;
use remote_storage::RemotePath;
use serde::{Deserialize, Serialize};
use url::Url;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};
use crate::responses::TlsConfig;
@@ -106,27 +105,8 @@ pub struct ComputeSpec {
// updated to fill these fields, we can make these non optional.
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
/// Pageserver information can be passed in three different ways:
/// 1. Here in `pageserver_connection_info`
/// 2. In the `pageserver_connstring` field.
/// 3. in `cluster.settings`.
///
/// The goal is to use method 1. everywhere. But for backwards-compatibility with old
/// versions of the control plane, `compute_ctl` will check 2. and 3. if the
/// `pageserver_connection_info` field is missing.
///
/// If both `pageserver_connection_info` and `pageserver_connstring`+`shard_stripe_size` are
/// given, they must contain the same information.
pub pageserver_connection_info: Option<PageserverConnectionInfo>,
pub pageserver_connstring: Option<String>,
/// Stripe size for pageserver sharding, in pages. This is set together with the legacy
/// `pageserver_connstring` field. When the modern `pageserver_connection_info` field is used,
/// the stripe size is stored in `pageserver_connection_info.stripe_size` instead.
pub shard_stripe_size: Option<ShardStripeSize>,
// More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs.
pub project_id: Option<String>,
@@ -159,6 +139,10 @@ pub struct ComputeSpec {
pub pgbouncer_settings: Option<IndexMap<String, String>>,
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<usize>,
/// Local Proxy configuration used for JWT authentication
#[serde(default)]
pub local_proxy_config: Option<LocalProxySpec>,
@@ -233,140 +217,6 @@ pub enum ComputeFeature {
UnknownFeature,
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverConnectionInfo {
/// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
pub shard_count: ShardCount,
/// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
pub stripe_size: Option<ShardStripeSize>,
pub shards: HashMap<ShardIndex, PageserverShardInfo>,
/// If the compute supports both protocols, this indicates which one it should use. The compute
/// may use other available protocols too, if it doesn't support the preferred one. The URL's
/// for the protocol specified here must be present for all shards, i.e. do not mark a protocol
/// as preferred if it cannot actually be used with all the pageservers.
#[serde(default)]
pub prefer_protocol: PageserverProtocol,
}
/// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings.
///
/// This is used for backwards-compatibility, to parse the legacy
/// [ComputeSpec::pageserver_connstring] field, or the 'neon.pageserver_connstring' GUC. Nowadays,
/// the 'pageserver_connection_info' field should be used instead.
impl PageserverConnectionInfo {
pub fn from_connstr(
connstr: &str,
stripe_size: Option<ShardStripeSize>,
) -> Result<PageserverConnectionInfo, anyhow::Error> {
let shard_infos: Vec<_> = connstr
.split(',')
.map(|connstr| PageserverShardInfo {
pageservers: vec![PageserverShardConnectionInfo {
id: None,
libpq_url: Some(connstr.to_string()),
grpc_url: None,
}],
})
.collect();
match shard_infos.len() {
0 => anyhow::bail!("empty connection string"),
1 => {
// We assume that if there's only connection string, it means "unsharded",
// rather than a sharded system with just a single shard. The latter is
// possible in principle, but we never do it.
let shard_count = ShardCount::unsharded();
let only_shard = shard_infos.first().unwrap().clone();
let shards = vec![(ShardIndex::unsharded(), only_shard)];
Ok(PageserverConnectionInfo {
shard_count,
stripe_size: None,
shards: shards.into_iter().collect(),
prefer_protocol: PageserverProtocol::Libpq,
})
}
n => {
if stripe_size.is_none() {
anyhow::bail!("{n} shards but no stripe_size");
}
let shard_count = ShardCount(n.try_into()?);
let shards = shard_infos
.into_iter()
.enumerate()
.map(|(idx, shard_info)| {
(
ShardIndex {
shard_count,
shard_number: ShardNumber(
idx.try_into().expect("shard number fits in u8"),
),
},
shard_info,
)
})
.collect();
Ok(PageserverConnectionInfo {
shard_count,
stripe_size,
shards,
prefer_protocol: PageserverProtocol::Libpq,
})
}
}
}
/// Convenience routine to get the connection string for a shard.
pub fn shard_url(
&self,
shard_number: ShardNumber,
protocol: PageserverProtocol,
) -> anyhow::Result<&str> {
let shard_index = ShardIndex {
shard_number,
shard_count: self.shard_count,
};
let shard = self.shards.get(&shard_index).ok_or_else(|| {
anyhow::anyhow!("shard connection info missing for shard {}", shard_index)
})?;
// Just use the first pageserver in the list. That's good enough for this
// convenience routine; if you need more control, like round robin policy or
// failover support, roll your own. (As of this writing, we never have more than
// one pageserver per shard anyway, but that will change in the future.)
let pageserver = shard
.pageservers
.first()
.ok_or(anyhow::anyhow!("must have at least one pageserver"))?;
let result = match protocol {
PageserverProtocol::Grpc => pageserver
.grpc_url
.as_ref()
.ok_or(anyhow::anyhow!("no grpc_url for shard {shard_index}"))?,
PageserverProtocol::Libpq => pageserver
.libpq_url
.as_ref()
.ok_or(anyhow::anyhow!("no libpq_url for shard {shard_index}"))?,
};
Ok(result)
}
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardInfo {
pub pageservers: Vec<PageserverShardConnectionInfo>,
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardConnectionInfo {
pub id: Option<NodeId>,
pub libpq_url: Option<String>,
pub grpc_url: Option<String>,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
pub public_extensions: Option<Vec<String>>,
@@ -484,12 +334,6 @@ impl ComputeMode {
}
}
impl Display for ComputeMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.to_type_str())
}
}
/// Log level for audit logging
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
@@ -626,15 +470,13 @@ pub struct JwksSettings {
pub jwt_audience: Option<String>,
}
/// Protocol used to connect to a Pageserver.
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum PageserverProtocol {
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
#[default]
#[serde(rename = "libpq")]
Libpq,
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
#[serde(rename = "grpc")]
Grpc,
}

View File

@@ -558,11 +558,11 @@ async fn add_request_id_header_to_response(
mut res: Response<Body>,
req_info: RequestInfo,
) -> Result<Response<Body>, ApiError> {
if let Some(request_id) = req_info.context::<RequestId>()
&& let Ok(request_header_value) = HeaderValue::from_str(&request_id.0)
{
res.headers_mut()
.insert(&X_REQUEST_ID_HEADER, request_header_value);
if let Some(request_id) = req_info.context::<RequestId>() {
if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) {
res.headers_mut()
.insert(&X_REQUEST_ID_HEADER, request_header_value);
};
};
Ok(res)

View File

@@ -72,10 +72,10 @@ impl Server {
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
return true;
}
if let Some(inner) = err.source()
&& let Some(io) = inner.downcast_ref::<std::io::Error>()
{
return suppress_io_error(io);
if let Some(inner) = err.source() {
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
return suppress_io_error(io);
}
}
false
}

View File

@@ -16,5 +16,5 @@ rustc-hash.workspace = true
tempfile = "3.14.0"
[dev-dependencies]
rand = "0.9"
rand_distr = "0.5.1"
rand.workspace = true
rand_distr = "0.5.1"

View File

@@ -1,330 +0,0 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use neon_shmem::hash::HashMapAccess;
use neon_shmem::hash::HashMapInit;
use neon_shmem::hash::entry::Entry;
use rand::distr::{Distribution, StandardUniform};
use rand::prelude::*;
use std::default::Default;
use std::hash::BuildHasher;
// Taken from bindings to C code
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
impl Distribution<FileCacheKey> for StandardUniform {
// questionable, but doesn't need to be good randomness
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
FileCacheKey {
_spc_id: rng.random(),
_db_id: rng.random(),
_rel_number: rng.random(),
_fork_num: rng.random(),
_block_num: rng.random(),
}
}
}
#[derive(Clone, Debug)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
impl FileCacheEntry {
fn dummy() -> Self {
Self {
_offset: 0,
_access_count: 0,
_prev: std::ptr::null_mut(),
_next: std::ptr::null_mut(),
_state: [0; 8],
}
}
}
// Utilities for applying operations.
#[derive(Clone, Debug)]
struct TestOp<K, V>(K, Option<V>);
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
op: TestOp<K, V>,
map: &mut HashMapAccess<K, V, S>,
) {
let entry = map.entry(op.0);
match op.1 {
Some(new) => match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new).unwrap();
None
}
},
None => match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
}
// Hash utilities
struct SeaRandomState {
k1: u64,
k2: u64,
k3: u64,
k4: u64,
}
impl std::hash::BuildHasher for SeaRandomState {
type Hasher = seahash::SeaHasher;
fn build_hasher(&self) -> Self::Hasher {
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
}
}
impl SeaRandomState {
fn new() -> Self {
let mut rng = rand::rng();
Self {
k1: rng.random(),
k2: rng.random(),
k3: rng.random(),
k4: rng.random(),
}
}
}
fn small_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Small maps");
group.sample_size(10);
group.bench_function("small_rehash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_xxhash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(twox_hash::xxhash64::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_ahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(ahash::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_seahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(SeaRandomState::new())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.finish();
}
fn real_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Realistic workloads");
group.sample_size(10);
group.bench_function("real_bulk_insert", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut rng = rand::rng();
b.iter_batched(
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|writer| {
for _ in 0..ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
let entry = writer.entry(key);
match entry {
Entry::Occupied(mut e) => {
std::hint::black_box(e.insert(val));
}
Entry::Vacant(e) => {
let _ = std::hint::black_box(e.insert(val).unwrap());
}
}
}
},
BatchSize::SmallInput,
)
});
group.bench_function("real_rehash", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("real_rehash_hashbrown", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
} else {
None
},
)
});
});
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
group.bench_with_input(
BenchmarkId::new("real_rehash_varied", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
},
);
group.bench_with_input(
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| {
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
})
} else {
None
},
)
});
},
);
}
group.finish();
}
criterion_group!(benches, small_benchs, real_benchs);
criterion_main!(benches);

View File

@@ -16,7 +16,6 @@
//!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
use std::fmt::Debug;
use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;
@@ -41,75 +40,45 @@ pub enum HashMapShrinkError {
#[error("shmem resize failed: {0}")]
ResizeError(shmem::Error),
/// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
#[error("occupied entry in deallocated space found at {2} (in deallocated range of {0}..{1})")]
RemainingEntries(usize, usize, usize),
#[error("occupied entry in deallocated space found at {0}")]
RemainingEntries(usize),
}
/// This represents a hash table that (possibly) lives in shared memory.
/// If a new process is launched with fork(), the child process inherits
/// this struct.
#[must_use]
pub struct HashMapInit<K: 'static, V: 'static, S = rustc_hash::FxBuildHasher> {
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<K, V>,
shared_ptr: *mut HashMapShared<'a, K, V>,
shared_size: usize,
hasher: S,
num_buckets: u32,
}
impl<K, V, S> Debug for HashMapInit<K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapInit")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
.field("shared_size", &self.shared_size)
// .field("hasher", &self.hasher)
.field("num_buckets", &self.num_buckets)
.finish()
}
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
///
/// XXX: We're not making use of it at the moment, but this struct could
/// hold process-local information in the future.
pub struct HashMapAccess<K: 'static, V: 'static, S = rustc_hash::FxBuildHasher> {
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<K, V>,
shared_ptr: *mut HashMapShared<'a, K, V>,
hasher: S,
}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<K, V, S> {}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
impl<K, V, S> Debug for HashMapAccess<K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapAccess")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
// .field("hasher", &self.hasher)
.finish()
}
}
impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table.
///
/// NOTE: This must be called right after creating the hash table,
/// before inserting any entries and before calling attach_writer/reader.
/// Otherwise different accessors could be using different hash function,
/// with confusing results.
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<K, V, T> {
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
HashMapInit {
hasher,
shmem_handle: self.shmem_handle,
@@ -177,7 +146,7 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
}
/// Attach to a hash table for writing.
pub fn attach_writer(self) -> HashMapAccess<K, V, S> {
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
HashMapAccess {
shmem_handle: self.shmem_handle,
shared_ptr: self.shared_ptr,
@@ -189,7 +158,7 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
///
/// This is a holdover from a previous implementation and is being kept around for
/// backwards compatibility reasons.
pub fn attach_reader(self) -> HashMapAccess<K, V, S> {
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
self.attach_writer()
}
}
@@ -206,14 +175,14 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
/// dictionary
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
type HashMapShared<K, V> = RwLock<CoreHashMap<K, V>>;
type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
impl<K, V: 'static> HashMapInit<K, V, rustc_hash::FxBuildHasher>
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
K: Clone + Hash + Eq + 'static,
K: Clone + Hash + Eq,
{
/// Place the hash table within a user-supplied fixed memory area.
pub fn with_fixed(num_buckets: u32, area: &'static mut [MaybeUninit<u8>]) -> Self {
pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
Self::new(
num_buckets,
None,
@@ -269,10 +238,9 @@ where
}
}
impl<K, V, S: BuildHasher> HashMapAccess<K, V, S>
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
K: Clone + Hash + Eq + 'static,
V: 'static
K: Clone + Hash + Eq,
{
/// Hash a key using the map's hasher.
#[inline]
@@ -280,7 +248,7 @@ where
self.hasher.hash_one(key)
}
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'_, K, V> {
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
let dict_pos = hash as usize % map.dictionary.len();
let first = map.dictionary[dict_pos];
@@ -330,9 +298,9 @@ where
/// Get a reference to the entry containing a key.
///
/// NB: This takes a write lock as there's no way to distinguish whether the intention
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
/// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'_, K, V> {
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
let hash = self.get_hash_value(&key);
self.entry_with_hash(key, hash)
}
@@ -366,7 +334,7 @@ where
/// Has more overhead than one would intuitively expect: performs both a clone of the key
/// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
/// to enable repairing the hash chain if the entry is removed.
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'_, K, V>> {
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
if pos >= map.buckets.len() {
return None;
@@ -390,22 +358,12 @@ where
map.get_num_buckets()
}
/// Returns the logical number of buckets in the table (aka the amount of allocatable buckets).
pub fn get_num_logical_buckets(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
if map.alloc_limit == INVALID_POS {
map.get_num_buckets()
} else {
map.alloc_limit as usize
}
}
/// Return the key and value stored in bucket with given index. This can be used to
/// iterate through the hash map.
// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
// _slowly_ iterate through all buckets with its clock hand, without holding a lock.
// If we switch to an Iterator, it must not hold the lock.
pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<'_, (K, V)>> {
pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
if pos >= map.buckets.len() {
return None;
@@ -441,7 +399,7 @@ where
/// in the process.
fn rehash_dict(
&self,
inner: &mut CoreHashMap<K, V>,
inner: &mut CoreHashMap<'a, K, V>,
buckets_ptr: *mut core::Bucket<K, V>,
end_ptr: *mut u8,
num_buckets: u32,
@@ -552,12 +510,12 @@ where
/// # Panics
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
/// greater than the number of buckets in the map.
pub fn begin_shrink(&self, num_buckets: u32) {
pub fn begin_shrink(&mut self, num_buckets: u32) {
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
// assert!(
// num_buckets <= map.get_num_buckets() as u32,
// "shrink called with a larger number of buckets"
// );
assert!(
num_buckets <= map.get_num_buckets() as u32,
"shrink called with a larger number of buckets"
);
_ = self
.shmem_handle
.as_ref()
@@ -602,9 +560,7 @@ where
for i in (num_buckets as usize)..map.buckets.len() {
if map.buckets[i].inner.is_some() {
return Err(HashMapShrinkError::RemainingEntries(
num_buckets as usize, map.buckets.len(), i)
);
return Err(HashMapShrinkError::RemainingEntries(i));
}
}

View File

@@ -1,6 +1,5 @@
//! Simple hash table with chaining.
use std::fmt::Debug;
use std::hash::Hash;
use std::mem::MaybeUninit;
@@ -18,25 +17,12 @@ pub(crate) struct Bucket<K, V> {
pub(crate) inner: Option<(K, V)>,
}
impl<K, V> Debug for Bucket<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Bucket")
.field("next", &self.next)
.field("inner", &self.inner)
.finish()
}
}
/// Core hash table implementation.
pub(crate) struct CoreHashMap<K: 'static, V: 'static> {
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
pub(crate) dictionary: &'static mut [u32],
pub(crate) dictionary: &'a mut [u32],
/// Buckets containing key-value pairs.
pub(crate) buckets: &'static mut [Bucket<K, V>],
pub(crate) buckets: &'a mut [Bucket<K, V>],
/// Head of the freelist.
pub(crate) free_head: u32,
/// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
@@ -45,27 +31,11 @@ pub(crate) struct CoreHashMap<K: 'static, V: 'static> {
pub(crate) buckets_in_use: u32,
}
impl<K, V> Debug for CoreHashMap<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CoreHashMap")
.field("dictionary", &self.dictionary)
.field("buckets", &self.buckets)
.field("free_head", &self.free_head)
.field("alloc_limit", &self.alloc_limit)
.field("buckets_in_use", &self.buckets_in_use)
.finish()
}
}
/// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)]
pub struct FullError;
impl<K: Clone + Hash + Eq, V> CoreHashMap<K, V> {
impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
const FILL_FACTOR: f32 = 0.60;
/// Estimate the size of data contained within the the hash map.
@@ -83,8 +53,8 @@ impl<K: Clone + Hash + Eq, V> CoreHashMap<K, V> {
}
pub fn new(
buckets: &'static mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'static mut [MaybeUninit<u32>],
buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'a mut [MaybeUninit<u32>],
) -> Self {
// Initialize the buckets
for i in 0..buckets.len() {

View File

@@ -6,9 +6,9 @@ use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
use std::hash::Hash;
use std::mem;
pub enum Entry<'b, K: 'static, V: 'static> {
Occupied(OccupiedEntry<'b, K, V>),
Vacant(VacantEntry<'b, K, V>),
pub enum Entry<'a, 'b, K, V> {
Occupied(OccupiedEntry<'a, 'b, K, V>),
Vacant(VacantEntry<'a, 'b, K, V>),
}
/// Enum representing the previous position within a chain.
@@ -22,9 +22,9 @@ pub(crate) enum PrevPos {
Unknown(u64),
}
pub struct OccupiedEntry<'b, K: 'static, V: 'static> {
pub struct OccupiedEntry<'a, 'b, K, V> {
/// Mutable reference to the map containing this entry.
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<K, V>>,
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
/// The key of the occupied entry
pub(crate) _key: K,
/// The index of the previous entry in the chain.
@@ -33,7 +33,7 @@ pub struct OccupiedEntry<'b, K: 'static, V: 'static> {
pub(crate) bucket_pos: u32,
}
impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
impl<K, V> OccupiedEntry<'_, '_, K, V> {
pub fn get(&self) -> &V {
&self.map.buckets[self.bucket_pos as usize]
.inner
@@ -61,10 +61,6 @@ impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
///
/// This may result in multiple bucket accesses if the entry was obtained by index as the
/// previous chain entry needs to be discovered in this case.
///
/// # Panics
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
/// the entry was obtained via calling something like [`super::HashMapAccess::entry_at_bucket`].
pub fn remove(mut self) -> V {
// If this bucket was queried by index, go ahead and follow its chain from the start.
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
@@ -108,16 +104,16 @@ impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
}
/// An abstract view into a vacant entry within the map.
pub struct VacantEntry<'b, K: 'static, V: 'static> {
pub struct VacantEntry<'a, 'b, K, V> {
/// Mutable reference to the map containing this entry.
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<K, V>>,
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
/// The key to be inserted into this entry.
pub(crate) key: K,
/// The position within the dictionary corresponding to the key's hash.
pub(crate) dict_pos: u32,
}
impl<'b, K: Clone + Hash + Eq + 'static, V: 'static> VacantEntry<'b, K, V> {
impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
/// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
///
/// # Errors

View File

@@ -21,7 +21,6 @@ use nix::unistd::ftruncate as nix_ftruncate;
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
/// future.
#[derive(Debug)]
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
@@ -36,7 +35,6 @@ pub struct ShmemHandle {
}
/// This is stored at the beginning in the shared memory area.
#[derive(Debug)]
struct SharedStruct {
max_size: usize,

View File

@@ -651,6 +651,9 @@ pub struct TenantConfigToml {
// FIXME: Remove skip_serializing_if when the feature is stable.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub basebackup_cache_enabled: bool,
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub rel_size_v1_access_disabled: bool,
}
pub mod defaults {
@@ -959,6 +962,7 @@ impl Default for TenantConfigToml {
sampling_ratio: None,
relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
basebackup_cache_enabled: false,
rel_size_v1_access_disabled: false,
}
}
}

View File

@@ -646,6 +646,8 @@ pub struct TenantConfigPatch {
pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub basebackup_cache_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub rel_size_v1_access_disabled: FieldPatch<bool>,
}
/// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -783,6 +785,9 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub basebackup_cache_enabled: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub rel_size_v1_access_disabled: Option<bool>,
}
impl TenantConfig {
@@ -830,6 +835,7 @@ impl TenantConfig {
mut sampling_ratio,
mut relsize_snapshot_cache_capacity,
mut basebackup_cache_enabled,
mut rel_size_v1_access_disabled,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -939,6 +945,9 @@ impl TenantConfig {
patch
.basebackup_cache_enabled
.apply(&mut basebackup_cache_enabled);
patch
.rel_size_v1_access_disabled
.apply(&mut rel_size_v1_access_disabled);
Ok(Self {
checkpoint_distance,
@@ -980,6 +989,7 @@ impl TenantConfig {
sampling_ratio,
relsize_snapshot_cache_capacity,
basebackup_cache_enabled,
rel_size_v1_access_disabled,
})
}
@@ -1094,6 +1104,9 @@ impl TenantConfig {
basebackup_cache_enabled: self
.basebackup_cache_enabled
.unwrap_or(global_conf.basebackup_cache_enabled),
rel_size_v1_access_disabled: self
.rel_size_v1_access_disabled
.unwrap_or(global_conf.rel_size_v1_access_disabled),
}
}
}
@@ -1526,12 +1539,15 @@ pub struct OffloadedTimelineInfo {
pub archived_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
/// The state of the rel size migration. This is persisted in the index part.
/// compatibility.
#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
#[default]
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the storage. The read path will read both formats and validate them.

View File

@@ -9,7 +9,10 @@ regex.workspace = true
bytes.workspace = true
anyhow.workspace = true
crc32c.workspace = true
criterion.workspace = true
once_cell.workspace = true
log.workspace = true
memoffset.workspace = true
pprof.workspace = true
thiserror.workspace = true
serde.workspace = true
@@ -19,7 +22,6 @@ tracing.workspace = true
postgres_versioninfo.workspace = true
[dev-dependencies]
criterion.workspace = true
env_logger.workspace = true
postgres.workspace = true

View File

@@ -34,8 +34,9 @@ const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
impl ControlFileData {
/// Compute the offset of the `crc` field within the `ControlFileData` struct.
/// Equivalent to offsetof(ControlFileData, crc) in C.
const fn pg_control_crc_offset() -> usize {
std::mem::offset_of!(ControlFileData, crc)
// Someday this can be const when the right compiler features land.
fn pg_control_crc_offset() -> usize {
memoffset::offset_of!(ControlFileData, crc)
}
///

View File

@@ -4,11 +4,12 @@
use crate::pg_constants;
use crate::transaction_id_precedes;
use bytes::BytesMut;
use log::*;
use super::bindings::MultiXactId;
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
tracing::trace!(
trace!(
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
status
);

View File

@@ -14,6 +14,7 @@ use super::xlog_utils::*;
use crate::WAL_SEGMENT_SIZE;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::min;
use std::num::NonZeroU32;
use utils::lsn::Lsn;
@@ -235,7 +236,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder {
// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
let next_lsn = if xlogrec.is_xlog_switch_record() {
tracing::trace!("saw xlog switch record at {}", self.lsn);
trace!("saw xlog switch record at {}", self.lsn);
self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64)
} else {
// Pad to an 8-byte boundary

View File

@@ -23,6 +23,8 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
use log::*;
use serde::Serialize;
use std::ffi::{CString, OsStr};
use std::fs::File;
@@ -233,7 +235,7 @@ pub fn find_end_of_wal(
let mut curr_lsn = start_lsn;
let mut buf = [0u8; XLOG_BLCKSZ];
let pg_version = MY_PGVERSION;
tracing::debug!("find_end_of_wal PG_VERSION: {}", pg_version);
debug!("find_end_of_wal PG_VERSION: {}", pg_version);
let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
@@ -245,7 +247,7 @@ pub fn find_end_of_wal(
match open_wal_segment(&seg_file_path)? {
None => {
// no more segments
tracing::debug!(
debug!(
"find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
result, seg_file_path
);
@@ -258,7 +260,7 @@ pub fn find_end_of_wal(
while curr_lsn.segment_number(wal_seg_size) == segno {
let bytes_read = segment.read(&mut buf)?;
if bytes_read == 0 {
tracing::debug!(
debug!(
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
result,
seg_file_path,
@@ -274,7 +276,7 @@ pub fn find_end_of_wal(
match decoder.poll_decode() {
Ok(Some(record)) => result = record.0,
Err(e) => {
tracing::debug!(
debug!(
"find_end_of_wal reached end at {:?}, decode error: {:?}",
result, e
);

View File

@@ -9,7 +9,7 @@ use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody};
pub use self::sqlstate::*;
#[allow(clippy::unreadable_literal)]
pub mod sqlstate;
mod sqlstate;
/// The severity of a Postgres error or notice.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]

View File

@@ -49,7 +49,7 @@ impl PerfSpan {
}
}
pub fn enter(&self) -> PerfSpanEntered<'_> {
pub fn enter(&self) -> PerfSpanEntered {
if let Some(ref id) = self.inner.id() {
self.dispatch.enter(id);
}

View File

@@ -34,16 +34,13 @@ macro_rules! critical {
#[macro_export]
macro_rules! critical_timeline {
($tenant_shard_id:expr, $timeline_id:expr, $corruption_detected:expr, $($arg:tt)*) => {{
($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{
if cfg!(debug_assertions) {
panic!($($arg)*);
}
// Increment both metrics
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
$crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string());
if let Some(c) = $corruption_detected.as_ref() {
c.store(true, std::sync::atomic::Ordering::Relaxed);
}
let backtrace = std::backtrace::Backtrace::capture();
tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}",
$tenant_shard_id, $timeline_id, format!($($arg)*));

View File

@@ -310,11 +310,6 @@ impl AtomicLsn {
}
}
/// Consumes the atomic and returns the contained value.
pub const fn into_inner(self) -> Lsn {
Lsn(self.inner.into_inner())
}
/// Atomically retrieve the `Lsn` value from memory.
pub fn load(&self) -> Lsn {
Lsn(self.inner.load(Ordering::Acquire))

View File

@@ -32,9 +32,6 @@ pub struct PageserverFeedback {
pub replytime: SystemTime,
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
pub shard_number: u32,
/// If true, the pageserver has detected corruption and the safekeeper and postgres
/// should stop sending WAL.
pub corruption_detected: bool,
}
impl PageserverFeedback {
@@ -46,7 +43,6 @@ impl PageserverFeedback {
disk_consistent_lsn: Lsn::INVALID,
replytime: *PG_EPOCH,
shard_number: 0,
corruption_detected: false,
}
}
@@ -105,13 +101,6 @@ impl PageserverFeedback {
buf.put_u32(self.shard_number);
}
if self.corruption_detected {
nkeys += 1;
buf.put_slice(b"corruption_detected\0");
buf.put_i32(1);
buf.put_u8(1);
}
buf[buf_ptr] = nkeys;
}
@@ -158,11 +147,6 @@ impl PageserverFeedback {
assert_eq!(len, 4);
rf.shard_number = buf.get_u32();
}
b"corruption_detected" => {
let len = buf.get_i32();
assert_eq!(len, 1);
rf.corruption_detected = buf.get_u8() != 0;
}
_ => {
let len = buf.get_i32();
warn!(
@@ -222,26 +206,6 @@ mod tests {
assert_eq!(rf, rf_parsed);
}
// Test that databricks-specific fields added to the PageserverFeedback message are serialized
// and deserialized correctly, in addition to the existing fields from upstream.
#[test]
fn test_replication_feedback_databricks_fields() {
let mut rf = PageserverFeedback::empty();
rf.current_timeline_size = 12345678;
rf.last_received_lsn = Lsn(23456789);
rf.disk_consistent_lsn = Lsn(34567890);
rf.remote_consistent_lsn = Lsn(45678901);
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
rf.shard_number = 1;
rf.corruption_detected = true;
let mut data = BytesMut::new();
rf.serialize(&mut data);
let rf_parsed = PageserverFeedback::parse(data.freeze());
assert_eq!(rf, rf_parsed);
}
#[test]
fn test_replication_feedback_unknown_key() {
let mut rf = PageserverFeedback::empty();

Some files were not shown because too many files have changed in this diff Show More